Re: [PATCH V3] NVMe: basic conversion to blk-mq

From: Keith Busch
Date: Wed May 28 2014 - 23:07:55 EST


On Wed, 28 May 2014, Matias Bjørling wrote:
This converts the current NVMe driver to utilize the blk-mq layer.

I am concerned about device hot removal since the h/w queues can be
freed at any time. I *think* blk-mq helps with this in that the driver
will not see a new request after calling blk_cleanup_queue. If you can
confirm that's true and that blk-mq waits for all requests active in the
driver to return to the block layer, then we're probably okay in this
path. That wasn't true as a bio-based driver, which is why we are cautious
with all the locking and barriers there. But what about the IOCTL paths?
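
For reference, the ordering I have in mind on the removal side is roughly
the following (illustrative sketch, not code from this patch):

        static void nvme_ns_remove_sketch(struct nvme_ns *ns)
        {
                del_gendisk(ns->disk);          /* no new submitters */
                blk_cleanup_queue(ns->queue);   /* must block further ->queue_rq
                                                   calls and wait for requests
                                                   still in the driver */
                /* only now would it be safe to free the h/w queues */
        }

If blk_cleanup_queue() really gives us both of those guarantees, the
driver-side locking can be a lot simpler than it is today.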

It also doesn't look like we're handling the case where the SGL can't
map to a PRP.
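
The constraint I mean: an SGL is only expressible as a PRP list if every
element except the first starts on a page boundary and every element except
the last ends on one. Something like this illustrative helper (not in the
driver) is the check I'd expect somewhere in the submission path:

        static bool nvme_sgl_fits_prp(struct scatterlist *sgl, int nents)
        {
                struct scatterlist *sg;
                int i;

                for_each_sg(sgl, sg, nents, i) {
                        if (i && sg->offset)
                                return false;
                        if (i != nents - 1 &&
                            (sg->offset + sg->length) & (PAGE_SIZE - 1))
                                return false;
                }
                return true;
        }

The bio-based driver dealt with this by splitting the bio and resubmitting;
I don't see where the blk-mq version handles it.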

+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
                                                struct nvme_completion *cqe)
{
        struct nvme_iod *iod = ctx;
-       struct bio *bio = iod->private;
+       struct request *req = iod->private;
+
        u16 status = le16_to_cpup(&cqe->status) >> 1;

-       if (unlikely(status)) {
-               if (!(status & NVME_SC_DNR ||
-                               bio->bi_rw & REQ_FAILFAST_MASK) &&
-                               (jiffies - iod->start_time) < IOD_TIMEOUT) {
-                       if (!waitqueue_active(&nvmeq->sq_full))
-                               add_wait_queue(&nvmeq->sq_full,
-                                               &nvmeq->sq_cong_wait);
-                       list_add_tail(&iod->node, &nvmeq->iod_bio);
-                       wake_up(&nvmeq->sq_full);
-                       return;
-               }
-       }
        if (iod->nents) {
-               dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
-                       bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               nvme_end_io_acct(bio, iod->start_time);
+               dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+                       rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        }
        nvme_free_iod(nvmeq->dev, iod);
-       if (status)
-               bio_endio(bio, -EIO);
+
+       if (unlikely(status))
+               req->errors = -EIO;
        else
-               bio_endio(bio, 0);
+               req->errors = 0;
+
+       blk_mq_complete_request(req);
}

Is blk-mq going to retry intermittently failed commands for me? It
doesn't look like it will.
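
If it won't, I'd expect the driver to keep doing its own requeue of
intermittent failures, something along these lines in req_completion()
(sketch only; as far as I know blk-mq doesn't export a requeue helper
today, so blk_mq_requeue_request() here is an assumption):

        if (unlikely(status)) {
                if (!(status & NVME_SC_DNR ||
                                req->cmd_flags & REQ_FAILFAST_MASK)) {
                        /* would also have to keep the iod and DMA mapping
                           alive across the retry */
                        blk_mq_requeue_request(req);
                        return;
                }
                req->errors = -EIO;
        }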

+static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+{
+       struct request *req;
+       struct nvme_command cmnd;
+
+       req = blk_mq_alloc_request(ns->queue, WRITE, GFP_KERNEL, false);
+       if (!req)
+               return -ENOMEM;
+
+       nvme_setup_flush(&cmnd, ns, req->tag);
+       nvme_submit_sync_cmd(req, &cmnd, NULL, NVME_IO_TIMEOUT);

        return 0;
}

It looks like this function above is being called from an interrupt
context where we are already holding a spinlock. The sync command will
try to take that same lock.
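
To spell out the deadlock I'm worried about (assumed call chain, based on
how I read the patch):

        <irq context, nvmeq->q_lock held>
          nvme_submit_flush_sync()
            nvme_submit_sync_cmd()      <- takes nvmeq->q_lock again to post
                                           the command, then sleeps waiting
                                           for the completion

Sleeping in interrupt context and re-taking the same spinlock are each a
problem on their own.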

/*
- * Called with local interrupts disabled and the q_lock held. May not sleep.
+ * Called with preemption disabled, may not sleep.
 */
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-                                                               struct bio *bio)
+static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+                                                               struct request *req)
{
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
        struct nvme_iod *iod;
-       int psegs = bio_phys_segments(ns->queue, bio);
-       int result;
+       enum dma_data_direction dma_dir;
+       int psegs = req->nr_phys_segments;

-       if ((bio->bi_rw & REQ_FLUSH) && psegs)
-               return nvme_split_flush_data(nvmeq, bio);
-
-       iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+       iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
        if (!iod)
-               return -ENOMEM;
+               return BLK_MQ_RQ_QUEUE_BUSY;
+
+       iod->private = req;
+
+       nvme_set_info(cmd, iod, req_completion);
+
+       if ((req->cmd_flags & REQ_FLUSH) && psegs) {
+               struct flush_cmd_info *flush_cmd = kmalloc(
+                               sizeof(struct flush_cmd_info), GFP_KERNEL);

The comment above says "may not sleep", but this is using GFP_KERNEL here.
I actually think it is safe to sleep here, though, since you're not taking
a lock until later, so maybe you can change all the allocs to GFP_KERNEL?
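
i.e. something like (sketch):

-       iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
+       iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_KERNEL);

plus updating the comment above the function to say sleeping is allowed.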

+static struct blk_mq_ops nvme_mq_admin_ops = {
+       .queue_rq       = nvme_queue_request,

I think you would get some unpredictable behavior if a request really
did go through nvme_queue_request on the admin queue. I don't see how
that would happen; just sayin.
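
If you want to make that explicit, the admin tag set could get its own stub
(sketch, assuming the current two-argument queue_rq prototype):

+static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+{
+       WARN_ON_ONCE(1);        /* admin commands are submitted synchronously */
+       return BLK_MQ_RQ_QUEUE_ERROR;
+}

But as I said, I don't see a path that would actually hit it.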

static void nvme_create_io_queues(struct nvme_dev *dev)
{
-       unsigned i, max;
+       unsigned i;

-       max = min(dev->max_qid, num_online_cpus());
-       for (i = dev->queue_count; i <= max; i++)
+       for (i = dev->queue_count; i <= dev->max_qid; i++)
                if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
                        break;

-       max = min(dev->queue_count - 1, num_online_cpus());
-       for (i = dev->online_queues; i <= max; i++)
-               if (nvme_create_queue(raw_nvmeq(dev, i), i))
+       for (i = dev->online_queues; i <= dev->max_qid; i++)
+               if (nvme_create_queue(dev->queues[i], i))
                        break;
}

I think your loop criteria are off. We may not have successfully allocated
every queue we would ask the controller to create on its side, so I don't
think you want to use max_qid as the bound.
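
For the second loop I'd expect something like this (sketch), bounded by what
was actually allocated:

-       for (i = dev->online_queues; i <= dev->max_qid; i++)
+       for (i = dev->online_queues; i < dev->queue_count; i++)
                if (nvme_create_queue(dev->queues[i], i))
                        break;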

+       dev->tagset.ops = &nvme_mq_ops;
+       dev->tagset.nr_hw_queues = dev->queue_count - 1;

The queue_count is how many queues the driver allocated, but the controller
may not have successfully created all of them. The online_queues count is
how many the controller actually has available for use.
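
i.e. (sketch):

-       dev->tagset.nr_hw_queues = dev->queue_count - 1;
+       dev->tagset.nr_hw_queues = dev->online_queues - 1;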