Re: LTP write03 writev07 xfs failures

From: Brian Foster
Date: Tue Feb 28 2017 - 11:11:56 EST


On Tue, Feb 28, 2017 at 07:11:35AM -0800, Christoph Hellwig wrote:
> On Tue, Feb 28, 2017 at 09:59:40AM -0500, Brian Foster wrote:
> > Heh. I've appended what I'm currently playing around with. It's
> > certainly uglier, but not terrible IMO (outside of the fact that we have
> > to look at the buffer_heads). This seems to address the problem, but
> > still only lightly tested...
> >
> > An entirely different approach may be to somehow or another
> > differentiate allocated delalloc blocks from "found" delalloc blocks in
> > the iomap_begin() handler, and then perhaps encode that into the iomap
> > such that the iomap_end() handler has an explicit reference of what to
> > punch. Personally, I wouldn't mind doing something like the below short
> > term to fix the regression and then incorporate an iomap enhancement to
> > break the buffer_head dependency.
>
> We actually have a IOMAP_F_NEW for this already, but so far it's
> only used by the DIO and DAX code.

Ok. I think that has the potential to provide a more clean and simple
solution. I don't think it's as straightforward of a change to enable
that for the buffered write code, however. I don't think we can just set
the NEW flag when we do xfs_bmapi_reserve_delalloc() because something
like the following would still break:

xfs_io -fc "pwrite 16k 4k" -c "pwrite -b 32k 0 32k" <file>

If the second write happened to fail, AFAICT it would still punch out
the block allocated by the first. So I suppose we'd have to tweak
reserve_delalloc() to trim the returned extent, or perhaps add a flag
that skips the xfs_bmbt_get_all() call to update got after we insert the
extent, and thus only return what was allocated..?

(The latter actually seems to work on a very quick test, see appended
diff..).

Brian

---8<---

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a9c66d4..18b927d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4159,7 +4159,8 @@ xfs_bmapi_reserve_delalloc(
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
xfs_extnum_t *lastx,
- int eof)
+ int eof,
+ int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4242,7 +4243,8 @@ xfs_bmapi_reserve_delalloc(
* Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
* might have merged it into one of the neighbouring ones.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+ if (flags & XFS_BMAPI_ENTIRE)
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);

/*
* Tag the inode if blocks were preallocated. Note that COW fork
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87d..459ba6b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -243,7 +243,8 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
+ struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof,
+ int flags);

enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 41662fb..9b1b2a4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -613,7 +613,8 @@ xfs_file_iomap_begin_delay(

retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
+ end_fsb - offset_fsb, prealloc_blocks, &got, &idx,
+ eof, 0);
switch (error) {
case 0:
break;
@@ -629,7 +630,7 @@ xfs_file_iomap_begin_delay(
default:
goto out_unlock;
}
-
+ iomap->flags = IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
done:
if (isnullstartblock(got.br_startblock))
@@ -1071,16 +1072,22 @@ xfs_file_iomap_end_delalloc(
struct xfs_inode *ip,
loff_t offset,
loff_t length,
- ssize_t written)
+ ssize_t written,
+ struct iomap *iomap)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_fsb;
xfs_fileoff_t end_fsb;
int error = 0;

+ trace_printk("%d: ino 0x%llx offset %llu len %llu writ %ld new %d\n", __LINE__,
+ ip->i_ino, offset, length, written, !!(iomap->flags & IOMAP_F_NEW));
+
/* behave as if the write failed if drop writes is enabled */
- if (xfs_mp_drop_writes(mp))
+ if (xfs_mp_drop_writes(mp)) {
+ iomap->flags |= IOMAP_F_NEW;
written = 0;
+ }

/*
* start_fsb refers to the first unused block after a short write. If
@@ -1101,7 +1108,7 @@ xfs_file_iomap_end_delalloc(
* across the reserve/allocate/unreserve calls. If there are delalloc
* blocks in the range, they are ours.
*/
- if (start_fsb < end_fsb) {
+ if (iomap->flags & IOMAP_F_NEW && start_fsb < end_fsb) {
truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
XFS_FSB_TO_B(mp, end_fsb) - 1);

@@ -1131,7 +1138,7 @@ xfs_file_iomap_end(
{
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
- length, written);
+ length, written, iomap);
return 0;
}

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index da6d08f..d76a2b6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -313,7 +313,8 @@ xfs_reflink_reserve_cow(
return error;

error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &idx, eof);
+ imap->br_blockcount, 0, &got, &idx, eof,
+ XFS_BMAPI_ENTIRE);
if (error == -ENOSPC || error == -EDQUOT)
trace_xfs_reflink_cow_enospc(ip, imap);
if (error)