Re: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

From: Sagi Grimberg
Date: Tue Aug 20 2019 - 13:33:44 EST

From: Long Li <longli@xxxxxxxxxxxxx>

When a NVMe hardware queue is mapped to several CPU queues, it is possible
that the CPU this hardware queue is bound to is flooded by returning I/O for
other CPUs.

For example, consider the following scenario:
1. CPU 0, 1, 2 and 3 share the same hardware queue
2. the hardware queue interrupts CPU 0 for I/O response
3. processes from CPU 1, 2 and 3 keep sending I/Os

CPU 0 may be flooded with interrupts from NVMe device that are I/O responses
for CPU 1, 2 and 3. Under heavy I/O load, it is possible that CPU 0 spends
all the time serving NVMe and other system interrupts, but doesn't have a
chance to run in process context.

To fix this, CPU 0 can schedule a work to complete the I/O request when it
detects the scheduler is not making progress. This serves multiple purposes:

1. This CPU has to be scheduled to complete the request. The other CPUs can't
issue more I/Os until some previous I/Os are completed. This helps this CPU
get out of NVMe interrupts.

2. This acts a throttling mechanisum for NVMe devices, in that it can not
starve a CPU while servicing I/Os from other CPUs.

3. This CPU can make progress on RCU and other work items on its queue.

The problem is indeed real, but this is the wrong approach in my mind.

We already have irqpoll which takes care proper budgeting polling
cycles and not hogging the cpu.

I've sent rfc for this particular problem before [1]. At the time IIRC,
Christoph suggested that we will poll the first batch directly from
the irq context and reap the rest in irqpoll handler.


How about something like this instead:
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 71127a366d3c..84bf16d75109 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -24,6 +24,7 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>
+#include <linux/irq_poll.h>

#include "trace.h"
#include "nvme.h"
@@ -32,6 +33,7 @@
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))

* These can be higher, but we need to ensure that any command doesn't
@@ -189,6 +191,7 @@ struct nvme_queue {
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+ struct irq_poll iop;
struct completion delete_done;

@@ -1015,6 +1018,23 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
return found;

+static int nvme_irqpoll_handler(struct irq_poll *iop, int budget)
+ struct nvme_queue *nvmeq = container_of(iop, struct nvme_queue, iop);
+ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+ u16 start, end;
+ int completed;
+ completed = nvme_process_cq(nvmeq, &start, &end, budget);
+ nvme_complete_cqes(nvmeq, start, end);
+ if (completed < budget) {
+ irq_poll_complete(&nvmeq->iop);
+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ }
+ return completed;
static irqreturn_t nvme_irq(int irq, void *data)
struct nvme_queue *nvmeq = data;
@@ -1028,12 +1048,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
if (nvmeq->cq_head != nvmeq->last_cq_head)
- nvme_process_cq(nvmeq, &start, &end, -1);
+ nvme_process_cq(nvmeq, &start, &end, NVME_POLL_BUDGET_IRQ);
nvmeq->last_cq_head = nvmeq->cq_head;

if (start != end) {
nvme_complete_cqes(nvmeq, start, end);
+ if (nvme_cqe_pending(nvmeq)) {
+ disable_irq_nosync(irq);
+ irq_poll_sched(&nvmeq->iop);
+ }

@@ -1347,6 +1371,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)

static void nvme_free_queue(struct nvme_queue *nvmeq)
+ irq_poll_disable(&nvmeq->iop);
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
@@ -1481,6 +1506,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->dev = dev;
+ irq_poll_init(&nvmeq->iop, NVME_POLL_BUDGET_IRQ, nvme_irqpoll_handler);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];