[PATCH 4.14 14/21] nvme-pci: allocate device queues storage space at probe

From: Greg Kroah-Hartman
Date: Tue Aug 07 2018 - 14:53:56 EST


4.14-stable review patch. If anyone has any objections, please let me know.

------------------

From: Sagi Grimberg <sagi@xxxxxxxxxxx>

commit 147b27e4bd08406a6abebedbb478b431ec197be1 upstream.

It may cause race by setting 'nvmeq' in nvme_init_request()
because .init_request is called inside switching io scheduler, which
may happen when the NVMe device is being resetted and its nvme queues
are being freed and created. We don't have any sync between the two
pathes.

This patch changes the nvmeq allocation to occur at probe time so
there is no way we can dereference it at init_request.

[ 93.268391] kernel BUG at drivers/nvme/host/pci.c:408!
[ 93.274146] invalid opcode: 0000 [#1] SMP
[ 93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss
nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat
intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel
kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt
intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me
ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei
shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod
mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt
fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3
megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod
[ 93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4
[ 93.356256] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017
[ 93.364801] task: 00000000fb8abf2a task.stack: 0000000028bd82d1
[ 93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme]
[ 93.377333] RSP: 0018:ffffc90002537ca8 EFLAGS: 00010246
[ 93.383161] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008
[ 93.391122] RDX: 0000000000000000 RSI: ffff880276ae0000 RDI: ffff88047bae9008
[ 93.399084] RBP: ffff88047bae9008 R08: ffff88047bae9008 R09: 0000000009dabc00
[ 93.407045] R10: 0000000000000004 R11: 000000000000299c R12: ffff880186bc1f00
[ 93.415007] R13: ffff880276ae0000 R14: 0000000000000000 R15: 0000000000000071
[ 93.422969] FS: 00007f33cf288740(0000) GS:ffff88047ba80000(0000) knlGS:0000000000000000
[ 93.431996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 93.438407] CR2: 00007f33cf28e000 CR3: 000000047e5bb006 CR4: 00000000001606e0
[ 93.446368] Call Trace:
[ 93.449103] blk_mq_alloc_rqs+0x231/0x2a0
[ 93.453579] blk_mq_sched_alloc_tags.isra.8+0x42/0x80
[ 93.459214] blk_mq_init_sched+0x7e/0x140
[ 93.463687] elevator_switch+0x5a/0x1f0
[ 93.467966] ? elevator_get.isra.17+0x52/0xc0
[ 93.472826] elv_iosched_store+0xde/0x150
[ 93.477299] queue_attr_store+0x4e/0x90
[ 93.481580] kernfs_fop_write+0xfa/0x180
[ 93.485958] __vfs_write+0x33/0x170
[ 93.489851] ? __inode_security_revalidate+0x4c/0x60
[ 93.495390] ? selinux_file_permission+0xda/0x130
[ 93.500641] ? _cond_resched+0x15/0x30
[ 93.504815] vfs_write+0xad/0x1a0
[ 93.508512] SyS_write+0x52/0xc0
[ 93.512113] do_syscall_64+0x61/0x1a0
[ 93.516199] entry_SYSCALL64_slow_path+0x25/0x25
[ 93.521351] RIP: 0033:0x7f33ce96aab0
[ 93.525337] RSP: 002b:00007ffe57570238 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 93.533785] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f33ce96aab0
[ 93.541746] RDX: 0000000000000006 RSI: 00007f33cf28e000 RDI: 0000000000000001
[ 93.549707] RBP: 00007f33cf28e000 R08: 000000000000000a R09: 00007f33cf288740
[ 93.557669] R10: 00007f33cf288740 R11: 0000000000000246 R12: 00007f33cec42400
[ 93.565630] R13: 0000000000000006 R14: 0000000000000001 R15: 0000000000000000
[ 93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0
74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f>
0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89
[ 93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: ffffc90002537ca8
[ 93.602273] ---[ end trace 810dde3993e5f14e ]---

Reported-by: Yi Zhang <yi.zhang@xxxxxxxxxx>
Signed-off-by: Sagi Grimberg <sagi@xxxxxxxxxxx>
Signed-off-by: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Jon Derrick <jonathan.derrick@xxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
drivers/nvme/host/pci.c | 61 +++++++++++++++++++-----------------------------
1 file changed, 25 insertions(+), 36 deletions(-)

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -77,7 +77,7 @@ static void nvme_dev_disable(struct nvme
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
- struct nvme_queue **queues;
+ struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -348,7 +348,7 @@ static int nvme_admin_init_hctx(struct b
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];

WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -370,7 +370,7 @@ static int nvme_init_hctx(struct blk_mq_
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -386,7 +386,7 @@ static int nvme_init_request(struct blk_
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
- struct nvme_queue *nvmeq = dev->queues[queue_idx];
+ struct nvme_queue *nvmeq = &dev->queues[queue_idx];

BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -900,7 +900,7 @@ static int nvme_poll(struct blk_mq_hw_ct
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c;

memset(&c, 0, sizeof(c));
@@ -1146,7 +1146,6 @@ static void nvme_free_queue(struct nvme_
if (nvmeq->sq_cmds)
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
- kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1154,10 +1153,8 @@ static void nvme_free_queues(struct nvme
int i;

for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
- struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--;
- dev->queues[i] = NULL;
- nvme_free_queue(nvmeq);
+ nvme_free_queue(&dev->queues[i]);
}
}

@@ -1189,10 +1186,8 @@ static int nvme_suspend_queue(struct nvm

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];

- if (!nvmeq)
- return;
if (nvme_suspend_queue(nvmeq))
return;

@@ -1246,13 +1241,10 @@ static int nvme_alloc_sq_cmds(struct nvm
return 0;
}

-static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int node)
+static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
+ int depth, int node)
{
- struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
- node);
- if (!nvmeq)
- return NULL;
+ struct nvme_queue *nvmeq = &dev->queues[qid];

nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1271,17 +1263,15 @@ static struct nvme_queue *nvme_alloc_que
nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
- dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;

- return nvmeq;
+ return 0;

free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
- kfree(nvmeq);
- return NULL;
+ return -ENOMEM;
}

static int queue_request_irq(struct nvme_queue *nvmeq)
@@ -1468,14 +1458,12 @@ static int nvme_pci_configure_admin_queu
if (result < 0)
return result;

- nvmeq = dev->queues[0];
- if (!nvmeq) {
- nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
- dev_to_node(dev->dev));
- if (!nvmeq)
- return -ENOMEM;
- }
+ result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+ dev_to_node(dev->dev));
+ if (result)
+ return result;

+ nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;

@@ -1505,7 +1493,7 @@ static int nvme_create_io_queues(struct

for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
/* vector == qid - 1, match nvme_create_queue */
- if (!nvme_alloc_queue(dev, i, dev->q_depth,
+ if (nvme_alloc_queue(dev, i, dev->q_depth,
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
ret = -ENOMEM;
break;
@@ -1514,7 +1502,7 @@ static int nvme_create_io_queues(struct

max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) {
- ret = nvme_create_queue(dev->queues[i], i);
+ ret = nvme_create_queue(&dev->queues[i], i);
if (ret)
break;
}
@@ -1770,7 +1758,7 @@ static int nvme_setup_host_mem(struct nv

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
- struct nvme_queue *adminq = dev->queues[0];
+ struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, nr_io_queues;
unsigned long size;
@@ -1896,7 +1884,7 @@ static void nvme_disable_io_queues(struc
retry:
timeout = ADMIN_TIMEOUT;
for (; i > 0; i--, sent++)
- if (nvme_delete_queue(dev->queues[i], opcode))
+ if (nvme_delete_queue(&dev->queues[i], opcode))
break;

while (sent--) {
@@ -2081,7 +2069,7 @@ static void nvme_dev_disable(struct nvme

queues = dev->online_queues - 1;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(dev->queues[i]);
+ nvme_suspend_queue(&dev->queues[i]);

if (dead) {
/* A device might become IO incapable very soon during
@@ -2089,7 +2077,7 @@ static void nvme_dev_disable(struct nvme
* queue_count can be 0 here.
*/
if (dev->ctrl.queue_count)
- nvme_suspend_queue(dev->queues[0]);
+ nvme_suspend_queue(&dev->queues[0]);
} else {
nvme_disable_io_queues(dev, queues);
nvme_disable_admin_queue(dev, shutdown);
@@ -2345,7 +2333,8 @@ static int nvme_probe(struct pci_dev *pd
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
- dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
+
+ dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(struct nvme_queue),
GFP_KERNEL, node);
if (!dev->queues)
goto free;