Re: [PATCH 1/2] fsdax,xfs: fix warning messages at dax_[dis]associate_entry()
From: Darrick J. Wong
Date: Tue Nov 29 2022 - 23:09:27 EST
On Thu, Nov 24, 2022 at 02:54:53PM +0000, Shiyang Ruan wrote:
> This patch fixes the warning message reported in dax_associate_entry()
> and dax_disassociate_entry().
Hmm, that's quite a bit to put in a single patch, but I'll try to get
through this...
> 1. reset page->mapping and ->index when refcount counting down to 0.
> 2. set IOMAP_F_SHARED flag when iomap read to allow one dax page to be
> associated more than once for not only write but also read.
That makes sense, I think.
> 3. should zero the edge (when not aligned) if srcmap is HOLE or
When is IOMAP_F_SHARED set on the /source/ mapping?
> UNWRITTEN.
> 4. iterator of two files in dedupe should be executed side by side, not
> nested.
Why? Also, this seems like a separate change?
> 5. use xfs_dax_write_iomap_ops for xfs zero and truncate.
Makes sense.
> Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx>
> ---
> fs/dax.c | 114 ++++++++++++++++++++++++++-------------------
> fs/xfs/xfs_iomap.c | 6 +--
> 2 files changed, 69 insertions(+), 51 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 1c6867810cbd..5ea7c0926b7f 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -398,7 +398,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
> WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> if (dax_mapping_is_cow(page->mapping)) {
> /* keep the CoW flag if this page is still shared */
> - if (page->index-- > 0)
> + if (page->index-- > 1)
Hmm. So if the fsdax "page" sharing factor drops from 2 to 1, we'll now
null out the mapping and index? Before, we only did that when it
dropped from 1 to 0.
Does this leave the page with no mapping? And I guess a subsequent
access will now take a fault to map it back in?
> continue;
> } else
> WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> @@ -840,12 +840,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
> (iter->iomap.flags & IOMAP_F_DIRTY);
> }
>
> -static bool dax_fault_is_cow(const struct iomap_iter *iter)
> -{
> - return (iter->flags & IOMAP_WRITE) &&
> - (iter->iomap.flags & IOMAP_F_SHARED);
> -}
> -
> /*
> * By this point grab_mapping_entry() has ensured that we have a locked entry
> * of the appropriate size so we don't have to worry about downgrading PMDs to
> @@ -859,13 +853,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
> {
> struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> void *new_entry = dax_make_entry(pfn, flags);
> - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
> - bool cow = dax_fault_is_cow(iter);
> + bool write = iter->flags & IOMAP_WRITE;
> + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
> + bool shared = iter->iomap.flags & IOMAP_F_SHARED;
>
> if (dirty)
> __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
>
> - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
> + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
Ah, ok, so now we're yanking the mapping if the extent is shared,
presumably so that...
> unsigned long index = xas->xa_index;
> /* we are replacing a zero page with block mapping */
> if (dax_is_pmd_entry(entry))
> @@ -877,12 +872,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
>
> xas_reset(xas);
> xas_lock_irq(xas);
> - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> void *old;
>
> dax_disassociate_entry(entry, mapping, false);
> dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
> - cow);
> + shared);
...down here we can rebuild the association, but this time we'll set the
page->mapping to PAGE_MAPPING_DAX_COW? I see a lot of similar changes,
so I'm guessing this is how you fixed the failures that were a result of
read file A -> reflink A to B -> read file B sequences?
> /*
> * Only swap our new entry into the page cache if the current
> * entry is a zero page or an empty entry. If a normal PTE or
> @@ -902,7 +897,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
> if (dirty)
> xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
>
> - if (cow)
> + if (write && shared)
> xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
>
> xas_unlock_irq(xas);
> @@ -1107,23 +1102,35 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
I think this function isn't well named. It's copying into the parts of
the @daddr page that are *not* covered by @pos/@length. In other words,
it's really copying *around* the range that's supplied, isn't it?
> loff_t end = pos + length;
> loff_t pg_end = round_up(end, align_size);
> bool copy_all = head_off == 0 && end == pg_end;
> + /* write zero at edge if srcmap is a HOLE or IOMAP_UNWRITTEN */
> + bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
When is IOMAP_F_SHARED set on the /source/ mapping? I don't understand
that circumstance, so I don't understand why we want to zero around in
that case.
> + srcmap->type == IOMAP_UNWRITTEN;
Though it's self evident why we'd do that if the source map is
unwritten.
> void *saddr = 0;
> int ret = 0;
>
> - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> - if (ret)
> - return ret;
> + if (!zero_edge) {
> + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> + if (ret)
> + return ret;
> + }
>
> if (copy_all) {
> - ret = copy_mc_to_kernel(daddr, saddr, length);
> - return ret ? -EIO : 0;
> + if (zero_edge)
> + memset(daddr, 0, size);
> + else
> + ret = copy_mc_to_kernel(daddr, saddr, length);
> + goto out;
> }
>
> /* Copy the head part of the range */
> if (head_off) {
> - ret = copy_mc_to_kernel(daddr, saddr, head_off);
> - if (ret)
> - return -EIO;
> + if (zero_edge)
> + memset(daddr, 0, head_off);
> + else {
> + ret = copy_mc_to_kernel(daddr, saddr, head_off);
> + if (ret)
> + return -EIO;
> + }
> }
>
> /* Copy the tail part of the range */
> @@ -1131,12 +1138,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
> loff_t tail_off = head_off + length;
> loff_t tail_len = pg_end - end;
>
> - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
> - tail_len);
> - if (ret)
> - return -EIO;
> + if (zero_edge)
> + memset(daddr + tail_off, 0, tail_len);
> + else {
> + ret = copy_mc_to_kernel(daddr + tail_off,
> + saddr + tail_off, tail_len);
> + if (ret)
> + return -EIO;
> + }
> }
> - return 0;
> +out:
> + if (zero_edge)
> + dax_flush(srcmap->dax_dev, daddr, size);
> + return ret ? -EIO : 0;
> }
>
> /*
> @@ -1235,13 +1249,9 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
> if (ret < 0)
> return ret;
> memset(kaddr + offset, 0, size);
> - if (srcmap->addr != iomap->addr) {
> - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
> - kaddr);
> - if (ret < 0)
> - return ret;
> - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
> - } else
> + if (iomap->flags & IOMAP_F_SHARED)
> + ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, kaddr);
> + else
> dax_flush(iomap->dax_dev, kaddr + offset, size);
> return ret;
> }
> @@ -1258,6 +1268,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
> if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
> return length;
>
> + /*
> + * invalidate the pages whose sharing state is to be changed
> + * because of CoW.
> + */
> + if (iomap->flags & IOMAP_F_SHARED)
> + invalidate_inode_pages2_range(iter->inode->i_mapping,
> + pos >> PAGE_SHIFT,
> + (pos + length - 1) >> PAGE_SHIFT);
> +
> do {
> unsigned offset = offset_in_page(pos);
> unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> @@ -1318,12 +1337,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> struct iov_iter *iter)
> {
> const struct iomap *iomap = &iomi->iomap;
> - const struct iomap *srcmap = &iomi->srcmap;
> + const struct iomap *srcmap = iomap_iter_srcmap(iomi);
> loff_t length = iomap_length(iomi);
> loff_t pos = iomi->pos;
> struct dax_device *dax_dev = iomap->dax_dev;
> loff_t end = pos + length, done = 0;
> bool write = iov_iter_rw(iter) == WRITE;
> + bool cow = write && iomap->flags & IOMAP_F_SHARED;
> ssize_t ret = 0;
> size_t xfer;
> int id;
> @@ -1350,7 +1370,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> * into page tables. We have to tear down these mappings so that data
> * written by write(2) is visible in mmap.
> */
> - if (iomap->flags & IOMAP_F_NEW) {
> + if (iomap->flags & IOMAP_F_NEW || cow) {
> invalidate_inode_pages2_range(iomi->inode->i_mapping,
> pos >> PAGE_SHIFT,
> (end - 1) >> PAGE_SHIFT);
> @@ -1384,8 +1404,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> break;
> }
>
> - if (write &&
> - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
> + if (cow) {
> ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
> kaddr);
> if (ret)
> @@ -1532,7 +1551,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
> struct xa_state *xas, void **entry, bool pmd)
> {
> const struct iomap *iomap = &iter->iomap;
> - const struct iomap *srcmap = &iter->srcmap;
> + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
> loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
> bool write = iter->flags & IOMAP_WRITE;
> @@ -1563,8 +1582,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>
> *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
>
> - if (write &&
> - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
> + if (write && iomap->flags & IOMAP_F_SHARED) {
> err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
> if (err)
> return dax_fault_return(err);
> @@ -1936,15 +1954,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
Does the dedupe change need to be in this patch? It looks ok both
before and after, so I don't know why it's necessary.
Welp, thank you for fixing the problems, at least. After a couple of
days it looks like the serious problems have cleared up.
--D
> .len = len,
> .flags = IOMAP_DAX,
> };
> - int ret;
> + int ret, compared = 0;
>
> - while ((ret = iomap_iter(&src_iter, ops)) > 0) {
> - while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
> - dst_iter.processed = dax_range_compare_iter(&src_iter,
> - &dst_iter, len, same);
> - }
> - if (ret <= 0)
> - src_iter.processed = ret;
> + while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
> + (ret = iomap_iter(&dst_iter, ops)) > 0) {
> + compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
> + same);
> + if (compared < 0)
> + return ret;
> + src_iter.processed = dst_iter.processed = compared;
> }
> return ret;
> }
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 07da03976ec1..d9401d0300ad 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1215,7 +1215,7 @@ xfs_read_iomap_begin(
> return error;
> error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> &nimaps, 0);
> - if (!error && (flags & IOMAP_REPORT))
> + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
> error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> xfs_iunlock(ip, lockmode);
>
> @@ -1370,7 +1370,7 @@ xfs_zero_range(
>
> if (IS_DAX(inode))
> return dax_zero_range(inode, pos, len, did_zero,
> - &xfs_direct_write_iomap_ops);
> + &xfs_dax_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> @@ -1385,7 +1385,7 @@ xfs_truncate_page(
>
> if (IS_DAX(inode))
> return dax_truncate_page(inode, pos, did_zero,
> - &xfs_direct_write_iomap_ops);
> + &xfs_dax_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> --
> 2.38.1
>