[PATCH v2 01/14] dax: Introduce holder for dax_device

From: Shiyang Ruan
Date: Fri Jun 03 2022 - 01:38:27 EST


To easily track the filesystem from a pmem device, we introduce a holder for
the dax_device structure, along with its operations. This holder is used to
remember who is using this dax_device:
- When it is the backend of a filesystem, the holder will be the
instance of this filesystem.
- When this pmem device is one of the targets in a mapped device, the
holder will be this mapped device. In this case, the mapped device
has its own dax_device, which follows the first rule, so that we
can ultimately trace back to the filesystem we need.

The holder and holder_ops will be set when a filesystem is being mounted,
or a target device is being activated.

Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx>
Reviewed-by: Christoph Hellwig <hch@xxxxxx>
Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>
Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++-
drivers/md/dm.c | 2 +-
fs/erofs/super.c | 10 ++++---
fs/ext2/super.c | 7 +++--
fs/ext4/super.c | 9 +++---
fs/xfs/xfs_buf.c | 5 ++--
include/linux/dax.h | 33 ++++++++++++++++------
7 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 50a08b2ec247..9b5e2a5eb0ae 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -22,6 +22,8 @@
* @private: dax driver private data
* @flags: state and boolean properties
* @ops: operations for this device
+ * @holder_data: holder of a dax_device: could be filesystem or mapped device
+ * @holder_ops: operations for the inner holder
*/
struct dax_device {
struct inode inode;
@@ -29,6 +31,8 @@ struct dax_device {
void *private;
unsigned long flags;
const struct dax_operations *ops;
+ void *holder_data;
+ const struct dax_holder_operations *holder_ops;
};

static dev_t dax_devt;
@@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
* fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
* @start_off: returns the byte offset into the dax_device that @bdev starts
+ * @holder: filesystem or mapped device inside the dax_device
+ * @ops: operations for the inner holder
*/
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+ void *holder, const struct dax_holder_operations *ops)
{
struct dax_device *dax_dev;
u64 part_size;
@@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
dax_dev = NULL;
+ else if (holder) {
+ if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
+ dax_dev->holder_ops = ops;
+ else
+ dax_dev = NULL;
+ }
dax_read_unlock(id);

return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+
+void fs_put_dax(struct dax_device *dax_dev, void *holder)
+{
+ if (dax_dev && holder && /* no device or no holder given: nothing to release */
+ cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
+ dax_dev->holder_ops = NULL; /* reached only if @holder was the recorded holder */
+ put_dax(dax_dev); /* called unconditionally, including for a NULL dax_dev */
+}
+EXPORT_SYMBOL_GPL(fs_put_dax);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
@@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_recovery_write);

+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
+ u64 len, int mf_flags)
+{
+ int rc, id;
+
+ id = dax_read_lock(); /* released at the out: label on every path */
+ if (!dax_alive(dax_dev)) {
+ rc = -ENXIO; /* device is not alive */
+ goto out;
+ }
+
+ if (!dax_dev->holder_ops) {
+ rc = -EOPNOTSUPP; /* no holder operations are registered */
+ goto out;
+ }
+
+ rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
+out:
+ dax_read_unlock(id);
+ return rc; /* -ENXIO, -EOPNOTSUPP, or the callback's own result */
+}
+EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
+
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev)
if (!dax_dev)
return;

+ if (dax_dev->holder_data != NULL)
+ dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+
clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
synchronize_srcu(&dax_srcu);
+
+ /* clear holder data */
+ dax_dev->holder_ops = NULL;
+ dax_dev->holder_data = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

@@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev)
}
EXPORT_SYMBOL_GPL(put_dax);

+/**
+ * dax_holder() - obtain the holder of a dax device
+ * @dax_dev: a dax_device instance
+ *
+ * Return: the holder's data which represents the holder if registered,
+ * otherwise NULL.
+ */
+void *dax_holder(struct dax_device *dax_dev)
+{
+ return dax_dev->holder_data;
+}
+EXPORT_SYMBOL_GPL(dax_holder);
+
/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb0a551bd88..3de8167a3905 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -760,7 +760,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}

td->dm_dev.bdev = bdev;
- td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
+ td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
return 0;
}

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 95addc5c9d34..3173debeaa5a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
}

dif->blocks = le32_to_cpu(dis->blocks);
@@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}

sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off);
+ &sbi->dax_part_off,
+ NULL, NULL);
}

err = erofs_read_superblock(sb);
@@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;

- fs_put_dax(dif->dax_dev);
+ fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
erofs_fscache_unregister_cookie(&dif->fscache);
@@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
return;

erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev);
+ fs_put_dax(sbi->dax_dev, NULL);
erofs_fscache_unregister_cookie(&sbi->s_fscache);
erofs_fscache_unregister_fs(sb);
kfree(sbi->opt.fsid);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..4638946251b9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);

spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1204,7 +1205,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
failed_mount:
brelse(bh);
failed_sbi:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 450c918d68fc..0e91243b9616 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -4262,7 +4262,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;

kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -4274,7 +4274,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;

- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);

sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4286,7 +4287,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4aa9c9cf5b6e..1ec2a7b6d44e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1911,7 +1911,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);

blkdev_issue_flush(btp->bt_bdev);
- fs_put_dax(btp->bt_daxdev);
+ fs_put_dax(btp->bt_daxdev, NULL);

kmem_free(btp);
}
@@ -1964,7 +1964,8 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL,
+ NULL);

/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e7b81634c52a..cf85fc36da5f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -43,8 +43,21 @@ struct dax_operations {
void *addr, size_t bytes, struct iov_iter *iter);
};

+struct dax_holder_operations {
+ /*
+ * notify_failure - notify memory failure into inner holder device
+ * @dax_dev: the dax device which contains the holder
+ * @offset: offset on this dax device where memory failure occurs
+ * @len: length of this memory failure event
+ * @mf_flags: action flags for memory failure handler
+ */
+ int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
+ u64 len, int mf_flags);
+};
+
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
return dax_synchronous(dax_dev);
}
#else
+static inline void *dax_holder(struct dax_device *dax_dev)
+{
+ return NULL; /* stub used when CONFIG_DAX is not enabled */
+}
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
@@ -114,12 +131,9 @@ struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off);
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
- put_dax(dax_dev);
-}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+ void *holder, const struct dax_holder_operations *ops);
+void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
@@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk)
{
}
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off)
+ u64 *start_off, void *holder,
+ const struct dax_holder_operations *ops)
{
return NULL;
}
-static inline void fs_put_dax(struct dax_device *dax_dev)
+static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
@@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages);
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
+ int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);

ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
--
2.36.1