Re: [PATCH v5 5/7] blktrace: fix debugfs use after free

From: Christoph Hellwig
Date: Tue May 19 2020 - 12:37:36 EST


I don't think we need any of that symlink stuff. Even if we want it
(which I don't), it should not be in a bug fix patch.

In fact, to fix the blktrace race I think we only need something like
the fairly trivial patch below (completely untested so far).

(and with that we can also drop the previous patch, as blk-debugfs.c
becomes rather pointless)


diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 15df3a36e9fa4..a2800bc56fb4d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -824,9 +824,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;

- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
-
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);

/*
@@ -857,9 +854,7 @@ void blk_mq_debugfs_register(struct request_queue *q)

void blk_mq_debugfs_unregister(struct request_queue *q)
{
- debugfs_remove_recursive(q->debugfs_dir);
q->sched_debugfs_dir = NULL;
- q->debugfs_dir = NULL;
}

static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 561624d4cc4e7..8e6ea4a13f550 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -11,6 +11,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
+#include <linux/debugfs.h>

#include "blk.h"
#include "blk-mq.h"
@@ -918,6 +919,7 @@ static void blk_release_queue(struct kobject *kobj)

blk_trace_shutdown(q);

+ debugfs_remove_recursive(q->debugfs_dir);
if (queue_is_mq(q))
blk_mq_debugfs_unregister(q);

@@ -989,6 +991,27 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}

+ /*
+ * Blktrace needs a debugfs name even for queues that don't register
+ * a gendisk, so it lazily registers the debugfs directory. But that
+ * can get us into a situation where a SCSI device is found, with no
+ * driver for it (yet). Then blktrace is used on the device, creating
+ * the debugfs directory, and only after that a driver is loaded. In
+ * that case we might already have a debugfs directory registered here.
+ * Even worse we could be racing with blktrace to register it.
+ */
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ mutex_lock(&q->blk_trace_mutex);
+ if (!q->debugfs_dir) {
+ q->debugfs_dir =
+ debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
+ }
+ mutex_unlock(&q->blk_trace_mutex);
+#else
+ blk_queue_debugfs_register(q);
+#endif
+
if (queue_is_mq(q)) {
__blk_mq_register_dev(dev, q);
blk_mq_debugfs_register(q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8801f3d7cf4a3..7a4de524f408f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -574,8 +574,8 @@ struct request_queue {
struct list_head tag_set_list;
struct bio_set bio_split;

-#ifdef CONFIG_BLK_DEBUG_FS
struct dentry *debugfs_dir;
+#ifdef CONFIG_BLK_DEBUG_FS
struct dentry *sched_debugfs_dir;
struct dentry *rqos_debugfs_dir;
#endif
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3b6ff5902edce..eb6db276e2931 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -22,7 +22,6 @@ struct blk_trace {
u64 end_lba;
u32 pid;
u32 dev;
- struct dentry *dir;
struct dentry *dropped_file;
struct dentry *msg_file;
struct list_head running_list;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ca39dc3230cb3..1b622e970cede 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,7 +311,6 @@ static void blk_trace_free(struct blk_trace *bt)
debugfs_remove(bt->msg_file);
debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -476,15 +475,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct blk_user_trace_setup *buts)
{
struct blk_trace *bt = NULL;
- struct dentry *dir = NULL;
int ret;

if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;

- if (!blk_debugfs_root)
- return -ENOENT;
-
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';

@@ -494,6 +489,25 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');

+ /*
+ * For queues that do not have a gendisk attached to them, the debugfs
+ * directory will not have been created at setup time. Create it here
+ * lazily; it will only be removed when the queue is torn down.
+ *
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (!q->debugfs_dir) {
+ q->debugfs_dir =
+ debugfs_create_dir(buts->name, blk_debugfs_root);
+ }
+ if (IS_ERR_OR_NULL(q->debugfs_dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ return -ENOENT;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -507,23 +521,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;

- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
- bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
-
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);

ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
- &blk_dropped_fops);
+ bt->dropped_file = debugfs_create_file("dropped", 0444, q->debugfs_dir,
+ bt, &blk_dropped_fops);

- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ bt->msg_file = debugfs_create_file("msg", 0222, q->debugfs_dir, bt,
+ &blk_msg_fops);

- bt->rchan = relay_open("trace", dir, buts->buf_size,
+ bt->rchan = relay_open("trace", q->debugfs_dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
if (!bt->rchan)
goto err;
@@ -551,8 +560,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,

ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;