RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

From: Long Li
Date: Wed Aug 21 2019 - 04:39:17 EST


>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>
>>>> From: Long Li <longli@xxxxxxxxxxxxx>
>>>>
>>>> When a NVMe hardware queue is mapped to several CPU queues, it is
>>>> possible that the CPU this hardware queue is bound to is flooded by
>>>> returning I/O for other CPUs.
>>>>
>>>> For example, consider the following scenario:
>>>> 1. CPU 0, 1, 2 and 3 share the same hardware queue 2. the hardware
>>>> queue interrupts CPU 0 for I/O response 3. processes from CPU 1, 2 and
>>>> 3 keep sending I/Os
>>>>
>>>> CPU 0 may be flooded with interrupts from NVMe device that are I/O
>>>> responses for CPU 1, 2 and 3. Under heavy I/O load, it is possible
>>>> that CPU 0 spends all the time serving NVMe and other system
>>>> interrupts, but doesn't have a chance to run in process context.
>>>>
>>>> To fix this, CPU 0 can schedule a work to complete the I/O request
>>>> when it detects the scheduler is not making progress. This serves multiple
>>>purposes:
>>>>
>>>> 1. This CPU has to be scheduled to complete the request. The other
>>>> CPUs can't issue more I/Os until some previous I/Os are completed.
>>>> This helps this CPU get out of NVMe interrupts.
>>>>
>>>> 2. This acts a throttling mechanisum for NVMe devices, in that it can
>>>> not starve a CPU while servicing I/Os from other CPUs.
>>>>
>>>> 3. This CPU can make progress on RCU and other work items on its queue.
>>>
>>>The problem is indeed real, but this is the wrong approach in my mind.
>>>
>>>We already have irqpoll which takes care proper budgeting polling cycles
>>>and not hogging the cpu.
>>>
>>>I've sent rfc for this particular problem before [1]. At the time IIRC,
>>>Christoph suggested that we will poll the first batch directly from the irq
>>>context and reap the rest in irqpoll handler.

Thanks for the pointer. I will test and report back.

>>>
>>>[1]:
>>>https://nam06.safelinks.protection.outlook.com/?url=http%3A%2F%2Flists.
>>>infradead.org%2Fpipermail%2Flinux-nvme%2F2016-
>>>October%2F006497.html&amp;data=02%7C01%7Clongli%40microsoft.com%
>>>7C0ebf36eff15c4182116608d725948b93%7C72f988bf86f141af91ab2d7cd011d
>>>b47%7C1%7C0%7C637019192254250361&amp;sdata=fJ%2Fkc8HLSmfzaY3BY
>>>E66zlZKD6FjcXgMJZzVGCVqI%2FU%3D&amp;reserved=0
>>>
>>>How about something like this instead:
>>>--
>>>diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index
>>>71127a366d3c..84bf16d75109 100644
>>>--- a/drivers/nvme/host/pci.c
>>>+++ b/drivers/nvme/host/pci.c
>>>@@ -24,6 +24,7 @@
>>> #include <linux/io-64-nonatomic-lo-hi.h>
>>> #include <linux/sed-opal.h>
>>> #include <linux/pci-p2pdma.h>
>>>+#include <linux/irq_poll.h>
>>>
>>> #include "trace.h"
>>> #include "nvme.h"
>>>@@ -32,6 +33,7 @@
>>> #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
>>>
>>> #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
>>>+#define NVME_POLL_BUDGET_IRQ 256
>>>
>>> /*
>>> * These can be higher, but we need to ensure that any command doesn't
>>>@@ -189,6 +191,7 @@ struct nvme_queue {
>>> u32 *dbbuf_cq_db;
>>> u32 *dbbuf_sq_ei;
>>> u32 *dbbuf_cq_ei;
>>>+ struct irq_poll iop;
>>> struct completion delete_done;
>>> };
>>>
>>>@@ -1015,6 +1018,23 @@ static inline int nvme_process_cq(struct
>>>nvme_queue *nvmeq, u16 *start,
>>> return found;
>>> }
>>>
>>>+static int nvme_irqpoll_handler(struct irq_poll *iop, int budget) {
>>>+ struct nvme_queue *nvmeq = container_of(iop, struct nvme_queue,
>>>iop);
>>>+ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
>>>+ u16 start, end;
>>>+ int completed;
>>>+
>>>+ completed = nvme_process_cq(nvmeq, &start, &end, budget);
>>>+ nvme_complete_cqes(nvmeq, start, end);
>>>+ if (completed < budget) {
>>>+ irq_poll_complete(&nvmeq->iop);
>>>+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
>>>+ }
>>>+
>>>+ return completed;
>>>+}
>>>+
>>> static irqreturn_t nvme_irq(int irq, void *data)
>>> {
>>> struct nvme_queue *nvmeq = data; @@ -1028,12 +1048,16 @@ static
>>>irqreturn_t nvme_irq(int irq, void *data)
>>> rmb();
>>> if (nvmeq->cq_head != nvmeq->last_cq_head)
>>> ret = IRQ_HANDLED;
>>>- nvme_process_cq(nvmeq, &start, &end, -1);
>>>+ nvme_process_cq(nvmeq, &start, &end, NVME_POLL_BUDGET_IRQ);
>>> nvmeq->last_cq_head = nvmeq->cq_head;
>>> wmb();
>>>
>>> if (start != end) {
>>> nvme_complete_cqes(nvmeq, start, end);
>>>+ if (nvme_cqe_pending(nvmeq)) {
>>>+ disable_irq_nosync(irq);
>>>+ irq_poll_sched(&nvmeq->iop);
>>>+ }
>>> return IRQ_HANDLED;
>>> }
>>>
>>>@@ -1347,6 +1371,7 @@ static enum blk_eh_timer_return
>>>nvme_timeout(struct request *req, bool reserved)
>>>
>>> static void nvme_free_queue(struct nvme_queue *nvmeq)
>>> {
>>>+ irq_poll_disable(&nvmeq->iop);
>>> dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
>>> (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
>>> if (!nvmeq->sq_cmds)
>>>@@ -1481,6 +1506,7 @@ static int nvme_alloc_queue(struct nvme_dev
>>>*dev, int qid, int depth)
>>> nvmeq->dev = dev;
>>> spin_lock_init(&nvmeq->sq_lock);
>>> spin_lock_init(&nvmeq->cq_poll_lock);
>>>+ irq_poll_init(&nvmeq->iop, NVME_POLL_BUDGET_IRQ,
>>>nvme_irqpoll_handler);
>>> nvmeq->cq_head = 0;
>>> nvmeq->cq_phase = 1;
>>> nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
>>>--