[PATCH v12 1/7] dax: Introduce holder for dax_device

From: Shiyang Ruan
Date: Sun Apr 10 2022 - 12:09:27 EST


To easily track filesystem from a pmem device, we introduce a holder for
dax_device structure, and also its operation. This holder is used to
remember who is using this dax_device:
- When it is the backend of a filesystem, the holder will be the
instance of this filesystem.
- When this pmem device is one of the targets in a mapped device, the
holder will be this mapped device. In this case, the mapped device
has its own dax_device and it will follow the first rule. So that we
can finally track to the filesystem we needed.

The holder and holder_ops will be set when filesystem is being mounted,
or an target device is being activated.

Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx>
---
drivers/dax/super.c | 66 ++++++++++++++++++++++++++++++++++++++++++++-
drivers/md/dm.c | 2 +-
fs/erofs/super.c | 10 ++++---
fs/ext2/super.c | 7 ++---
fs/ext4/super.c | 9 ++++---
fs/xfs/xfs_buf.c | 5 ++--
include/linux/dax.h | 33 +++++++++++++++++------
7 files changed, 109 insertions(+), 23 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0211e6f7b47a..619a05615497 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -22,6 +22,8 @@
* @private: dax driver private data
* @flags: state and boolean properties
* @ops: operations for this device
+ * @holder_data: holder of a dax_device: could be filesystem or mapped device
+ * @holder_ops: operations for the inner holder
*/
struct dax_device {
struct inode inode;
@@ -29,6 +31,8 @@ struct dax_device {
void *private;
unsigned long flags;
const struct dax_operations *ops;
+ void *holder_data;
+ const struct dax_holder_operations *holder_ops;
};

static dev_t dax_devt;
@@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
* fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
* @start_off: returns the byte offset into the dax_device that @bdev starts
+ * @holder: filesystem or mapped device inside the dax_device
+ * @ops: operations for the inner holder
*/
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+ void *holder, const struct dax_holder_operations *ops)
{
struct dax_device *dax_dev;
u64 part_size;
@@ -92,11 +99,25 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
dax_dev = NULL;
+ else if (holder) {
+ if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
+ dax_dev->holder_ops = ops;
+ else
+ dax_dev = NULL;
+ }
dax_read_unlock(id);

return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+
+void fs_put_dax(struct dax_device *dax_dev, void *holder)
+{
+ if (holder && cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
+ dax_dev->holder_ops = NULL;
+ put_dax(dax_dev);
+}
+EXPORT_SYMBOL_GPL(fs_put_dax);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
@@ -194,6 +215,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
+ u64 len, int mf_flags)
+{
+ int rc, id;
+
+ id = dax_read_lock();
+ if (!dax_alive(dax_dev)) {
+ rc = -ENXIO;
+ goto out;
+ }
+
+ if (!dax_dev->holder_ops) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
+out:
+ dax_read_unlock(id);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
+
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -267,8 +311,15 @@ void kill_dax(struct dax_device *dax_dev)
if (!dax_dev)
return;

+ if (dax_dev->holder_data != NULL)
+ dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+
clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
synchronize_srcu(&dax_srcu);
+
+ /* clear holder data */
+ dax_dev->holder_ops = NULL;
+ dax_dev->holder_data = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

@@ -410,6 +461,19 @@ void put_dax(struct dax_device *dax_dev)
}
EXPORT_SYMBOL_GPL(put_dax);

+/**
+ * dax_holder() - obtain the holder of a dax device
+ * @dax_dev: a dax_device instance
+
+ * Return: the holder's data which represents the holder if registered,
+ * otherwize NULL.
+ */
+void *dax_holder(struct dax_device *dax_dev)
+{
+ return dax_dev->holder_data;
+}
+EXPORT_SYMBOL_GPL(dax_holder);
+
/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8200a7498baf..d165ab828b04 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -729,7 +729,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}

td->dm_dev.bdev = bdev;
- td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
+ td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
return 0;
}

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0c4b41130c2f..d0eab646ef94 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -267,7 +267,8 @@ static int erofs_init_devices(struct super_block *sb,
break;
}
dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
dif->blocks = le32_to_cpu(dis->blocks);
dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
sbi->total_blocks += dif->blocks;
@@ -597,7 +598,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)

sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off,
+ NULL, NULL);
sbi->devs = ctx->devs;
ctx->devs = NULL;

@@ -687,7 +689,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;

- fs_put_dax(dif->dax_dev);
+ fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
kfree(dif->path);
@@ -756,7 +758,7 @@ static void erofs_kill_sb(struct super_block *sb)
return;

erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev);
+ fs_put_dax(sbi->dax_dev, NULL);
kfree(sbi);
sb->s_fs_info = NULL;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..4638946251b9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);

spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1204,7 +1205,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
failed_mount:
brelse(bh);
failed_sbi:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f5f4f2606ab2..59d5b2e8f838 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1300,7 +1300,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -4351,7 +4351,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;

kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -4363,7 +4363,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;

- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);

sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4375,7 +4376,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e1afb9e503e1..f9ca08398d32 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1911,7 +1911,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);

blkdev_issue_flush(btp->bt_bdev);
- fs_put_dax(btp->bt_daxdev);
+ fs_put_dax(btp->bt_daxdev, NULL);

kmem_free(btp);
}
@@ -1964,7 +1964,8 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL,
+ NULL);

/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9fc5f99a0ae2..9c426a207ba8 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -32,8 +32,21 @@ struct dax_operations {
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
};

+struct dax_holder_operations {
+ /*
+ * notify_failure - notify memory failure into inner holder device
+ * @dax_dev: the dax device which contains the holder
+ * @offset: offset on this dax device where memory failure occurs
+ * @len: length of this memory failure event
+ * @flags: action flags for memory failure handler
+ */
+ int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
+ u64 len, int mf_flags);
+};
+
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -53,6 +66,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
return dax_synchronous(dax_dev);
}
#else
+static inline void *dax_holder(struct dax_device *dax_dev)
+{
+ return NULL;
+}
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
@@ -96,12 +113,9 @@ struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off);
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
- put_dax(dax_dev);
-}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+ void *holder, const struct dax_holder_operations *ops);
+void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
@@ -111,11 +125,12 @@ static inline void dax_remove_host(struct gendisk *disk)
{
}
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off)
+ u64 *start_off, void *holder,
+ const struct dax_holder_operations *ops)
{
return NULL;
}
-static inline void fs_put_dax(struct dax_device *dax_dev)
+static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
@@ -185,6 +200,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages);
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
+ int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);

ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
--
2.35.1