[PATCH 4.19 08/45] libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock

From: Greg Kroah-Hartman
Date: Thu Aug 08 2019 - 15:09:03 EST


commit ca6bf264f6d856f959c4239cda1047b587745c67 upstream.

A multithreaded namespace creation/destruction stress test currently
deadlocks with the following lockup signature:

INFO: task ndctl:2924 blocked for more than 122 seconds.
Tainted: G OE 5.2.0-rc4+ #3382
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ndctl D 0 2924 1176 0x00000000
Call Trace:
? __schedule+0x27e/0x780
schedule+0x30/0xb0
wait_nvdimm_bus_probe_idle+0x8a/0xd0 [libnvdimm]
? finish_wait+0x80/0x80
uuid_store+0xe6/0x2e0 [libnvdimm]
kernfs_fop_write+0xf0/0x1a0
vfs_write+0xb7/0x1b0
ksys_write+0x5c/0xd0
do_syscall_64+0x60/0x240

INFO: task ndctl:2923 blocked for more than 122 seconds.
Tainted: G OE 5.2.0-rc4+ #3382
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ndctl D 0 2923 1175 0x00000000
Call Trace:
? __schedule+0x27e/0x780
? __mutex_lock+0x489/0x910
schedule+0x30/0xb0
schedule_preempt_disabled+0x11/0x20
__mutex_lock+0x48e/0x910
? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
? __lock_acquire+0x23f/0x1710
? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
__dax_pmem_probe+0x5e/0x210 [dax_pmem_core]
? nvdimm_bus_probe+0x1d0/0x2c0 [libnvdimm]
dax_pmem_probe+0xc/0x20 [dax_pmem]
nvdimm_bus_probe+0x90/0x2c0 [libnvdimm]
really_probe+0xef/0x390
driver_probe_device+0xb4/0x100

In this sequence an 'nd_dax' device is being probed and trying to take
the lock on its backing namespace to validate that the 'nd_dax' device
indeed has exclusive access to the backing namespace. Meanwhile, another
thread is trying to update the uuid property of that same backing
namespace. So one thread is in the probe path trying to acquire the
lock, and the other thread has acquired the lock and tries to flush the
probe path.

Fix this deadlock by not holding the namespace device_lock over the
wait_nvdimm_bus_probe_idle() synchronization step. In turn this requires
the device_lock to be held on entry to wait_nvdimm_bus_probe_idle() and
subsequently dropped internally to wait_nvdimm_bus_probe_idle().

Cc: <stable@xxxxxxxxxxxxxxx>
Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx>
Tested-by: Jane Chu <jane.chu@xxxxxxxxxx>
Link: https://lore.kernel.org/r/156341210094.292348.2384694131126767789.stgit@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
---
drivers/nvdimm/bus.c | 14 +++++++++-----
drivers/nvdimm/region_devs.c | 4 ++++
2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5abcdb4faa644..2ba22cd1331b0 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -865,10 +865,12 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
do {
if (nvdimm_bus->probe_active == 0)
break;
- nvdimm_bus_unlock(&nvdimm_bus->dev);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
wait_event(nvdimm_bus->wait,
nvdimm_bus->probe_active == 0);
- nvdimm_bus_lock(&nvdimm_bus->dev);
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
} while (true);
}

@@ -994,7 +996,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
case ND_CMD_ARS_START:
case ND_CMD_CLEAR_ERROR:
case ND_CMD_CALL:
- dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
+ dev_dbg(dev, "'%s' command while read-only.\n",
nvdimm ? nvdimm_cmd_name(cmd)
: nvdimm_bus_cmd_name(cmd));
return -EPERM;
@@ -1083,7 +1085,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
goto out;
}

- nvdimm_bus_lock(&nvdimm_bus->dev);
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
if (rc)
goto out_unlock;
@@ -1103,7 +1106,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
rc = -EFAULT;

out_unlock:
- nvdimm_bus_unlock(&nvdimm_bus->dev);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
out:
kfree(in_env);
kfree(out_env);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e7377f1028ef6..0303296e6d5b6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -425,10 +425,12 @@ static ssize_t available_size_show(struct device *dev,
* memory nvdimm_bus_lock() is dropped, but that's userspace's
* problem to not race itself.
*/
+ device_lock(dev);
nvdimm_bus_lock(dev);
wait_nvdimm_bus_probe_idle(dev);
available = nd_region_available_dpa(nd_region);
nvdimm_bus_unlock(dev);
+ device_unlock(dev);

return sprintf(buf, "%llu\n", available);
}
@@ -440,10 +442,12 @@ static ssize_t max_available_extent_show(struct device *dev,
struct nd_region *nd_region = to_nd_region(dev);
unsigned long long available = 0;

+ device_lock(dev);
nvdimm_bus_lock(dev);
wait_nvdimm_bus_probe_idle(dev);
available = nd_region_allocatable_dpa(nd_region);
nvdimm_bus_unlock(dev);
+ device_unlock(dev);

return sprintf(buf, "%llu\n", available);
}
--
2.20.1