[PATCH 3/3] block: fix the lock inversion caused by kobject_del under sysfs_lock

From: Jianchao Wang
Date: Fri Jun 22 2018 - 03:10:46 EST


Currently, the kobject_del for sysfs mq and hctx entry is invoked
under q->sysfs_lock. lock inversion will came up when someone try
to access the sysfs entries concurrently. The scenario is as below:
hold q->sysfs_lock access mq,hctx sysfs entries
kobject_del
__kernfs_remove kernfs_get_active
kernfs_drain
wait kn->active require q->sysfs_lock
To fix this issue, we do as following:
- still reserve q->sysfs_lock as the sync method between queue, mq
hctx sysfs entries' show and store method.
- use QUEUE_FLAG_REGISTERED to sync the blk_register/unregister_queue
with sysfs entries.
- use QUEUE_FLAG_REGISTERED to sync the blk_mq_register/unregister_dev
with blk_mq_sysfs_register/unregister.
- change q->mq_sysfs_init_done to q->mq_sysfs_ready to sync
blk_mq_register/unregister_dev and blk_mq_sysfs_register/unregister
with mq,hctx entries.
Then we don't need sysfs_lock on kobject_del anymore.

Signed-off-by: Jianchao Wang <jianchao.w.wang@xxxxxxxxxx>
---
block/blk-mq-sysfs.c | 66 +++++++++++++++++++++++++++++++++++++-------------
block/blk-sysfs.c | 39 +++++++++++++++--------------
block/blk-wbt.c | 4 ---
block/elevator.c | 3 ++-
include/linux/blkdev.h | 2 +-
5 files changed, 73 insertions(+), 41 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ec26745..0923c2c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -55,7 +55,9 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,

res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
+ if (!blk_queue_dying(q) &&
+ blk_queue_registered(q) &&
+ q->mq_sysfs_ready)
res = entry->show(ctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -78,7 +80,9 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,

res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
+ if (!blk_queue_dying(q) &&
+ blk_queue_registered(q) &&
+ q->mq_sysfs_ready)
res = entry->store(ctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -101,7 +105,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,

res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
+ if (!blk_queue_dying(q) &&
+ blk_queue_registered(q) &&
+ q->mq_sysfs_ready)
res = entry->show(hctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -125,7 +131,9 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,

res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
+ if (!blk_queue_dying(q) &&
+ blk_queue_registered(q) &&
+ q->mq_sysfs_ready)
res = entry->store(hctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -253,7 +261,9 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;

- lockdep_assert_held(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_lock);
+ q->mq_sysfs_ready = false;
+ mutex_unlock(&q->sysfs_lock);

queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -262,7 +272,6 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
kobject_del(&q->mq_kobj);
kobject_put(&dev->kobj);

- q->mq_sysfs_init_done = false;
}

void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -295,13 +304,22 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}

+/*
+ * blk_mq_register_dev/blk_mq_unregister_dev are only invoked by
+ * blk_register_queue/blk_unregister_queue. So we could use
+ * QUEUE_FLAG_REGISTERED to sync with blk_mq_sysfs_register/unregister.
+ * If QUEUE_FLAG_REGISTERED is set, q->mq_kobj is ok, otherwise, it is
+ * not there.
+ * blk_mq_register/unregister_dev blk_mq_sysfs_register/unregister all
+ * use q->mq_sysfs_ready to sync with mq and hctx sysfs entries' store
+ * and show method.
+ */
int blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int ret, i;

WARN_ON_ONCE(!q->kobj.parent);
- lockdep_assert_held(&q->sysfs_lock);

ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -315,7 +333,9 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
goto unreg;
}

- q->mq_sysfs_init_done = true;
+ mutex_lock(&q->sysfs_lock);
+ q->mq_sysfs_ready = true;
+ mutex_unlock(&q->sysfs_lock);

out:
return ret;
@@ -335,15 +355,20 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;

+ /*
+ * If QUEUE_FLAG_REGISTERED is not set, q->mq_kobj
+ * is not ready.
+ */
mutex_lock(&q->sysfs_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q)) {
+ mutex_unlock(&q->sysfs_lock);
+ return;
+ }
+ q->mq_sysfs_ready = false;
+ mutex_unlock(&q->sysfs_lock);

queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
-
-unlock:
- mutex_unlock(&q->sysfs_lock);
}

int blk_mq_sysfs_register(struct request_queue *q)
@@ -351,9 +376,16 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;

+ /*
+ * If QUEUE_FLAG_REGISTERED is not set, q->mq_kobj
+ * is not ready.
+ */
mutex_lock(&q->sysfs_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q)) {
+ mutex_unlock(&q->sysfs_lock);
+ return ret;
+ }
+ mutex_unlock(&q->sysfs_lock);

queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
@@ -361,8 +393,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
break;
}

-unlock:
+ mutex_lock(&q->sysfs_lock);
+ q->mq_sysfs_ready = true;
mutex_unlock(&q->sysfs_lock);
-
return ret;
}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 96dcbb9..a3ef681 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -741,7 +741,8 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
mutex_lock(&q->sysfs_lock);
- if (blk_queue_dying(q)) {
+ if (blk_queue_dying(q) ||
+ !blk_queue_registered(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
@@ -763,7 +764,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,

q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
- if (blk_queue_dying(q)) {
+ if (blk_queue_dying(q) ||
+ !blk_queue_registered(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
@@ -866,7 +868,6 @@ int blk_register_queue(struct gendisk *disk)
WARN_ONCE(blk_queue_registered(q),
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
- queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);

/*
* SCSI probing may synchronously create and destroy a lot of
@@ -887,17 +888,16 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;

- /* Prevent changes through sysfs until registration is completed. */
- mutex_lock(&q->sysfs_lock);
-
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
blk_trace_remove_sysfs(dev);
- goto unlock;
+ return ret;
}

if (q->mq_ops) {
- blk_mq_register_dev(dev, q);
+ ret = blk_mq_register_dev(dev, q);
+ if (ret)
+ goto err_kobj;
blk_mq_debugfs_register(q);
}

@@ -907,20 +907,25 @@ int blk_register_queue(struct gendisk *disk)

blk_throtl_register_queue(q);

+ /* Prevent changes through sysfs until registration is completed. */
+ mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
mutex_unlock(&q->sysfs_lock);
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
- blk_trace_remove_sysfs(dev);
- kobject_put(&dev->kobj);
- return ret;
+ goto err_kobj;
}
}
- ret = 0;
-unlock:
+
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
+ return 0;
+
+err_kobj:
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+ blk_trace_remove_sysfs(dev);
+ kobject_put(&dev->kobj);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -949,16 +954,14 @@ void blk_unregister_queue(struct gendisk *disk)
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
-
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
-
+ mutex_unlock(&q->sysfs_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
- mutex_unlock(&q->sysfs_lock);

kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 43ae265..c26deca 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -706,10 +706,6 @@ void wbt_enable_default(struct request_queue *q)
if (q->rq_wb)
return;

- /* Queue not registered? Maybe shutting down... */
- if (!blk_queue_registered(q))
- return;
-
if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
(q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
wbt_init(q);
diff --git a/block/elevator.c b/block/elevator.c
index a574841..a68e3e6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,7 +869,8 @@ void elv_unregister_queue(struct request_queue *q)
kobject_del(&e->kobj);
e->registered = 0;
/* Re-enable throttling in case elevator disabled it */
- wbt_enable_default(q);
+ if (blk_queue_registered(q))
+ wbt_enable_default(q);
}
}

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d6174ed..df53e8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -666,7 +666,7 @@ struct request_queue {
struct dentry *sched_debugfs_dir;
#endif

- bool mq_sysfs_init_done;
+ bool mq_sysfs_ready;

size_t cmd_size;
void *rq_alloc_data;
--
2.7.4