[PATCH 1/3] scsi: lpfc: Fix race conditions in ELS retry handling

From: Kyle Mahlkuch

Date: Thu Apr 09 2026 - 11:13:06 EST


This patch addresses critical race conditions in the lpfc driver's ELS
retry event handling that can lead to a use-after-free.

The primary issue is a TOCTOU (Time-of-Check to Time-of-Use) race between
lpfc_cancel_retry_delay_tmo() and the worker
lpfc_els_retry_delay_handler(), where the NLP_DELAY_TMO flag is cleared
before acquiring the lock to check if the retry event is queued. This
creates a window where the timer can be rescheduled and fire, causing
both the cancel path and the worker thread to release the same
reference, resulting in a double-put and use-after-free.

Fix the primary TOCTOU race by:
- moving the flag check inside the section protected by hbalock
- adding a NULL check in the work handler to gracefully handle cases
where the event payload has been consumed by the cancel path

Signed-off-by: Thinh Tran <thinhtr@xxxxxxxxxxxxx>
Signed-off-by: Kyle Mahlkuch <kmahlkuc@xxxxxxxxxxxxx>
---
drivers/scsi/lpfc/lpfc_els.c | 48 +++++++++++++++++++++++++-------
drivers/scsi/lpfc/lpfc_hbadisc.c | 9 ++++++
2 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index b71db7d7d747..ccc0734f5daa 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -4329,18 +4329,40 @@ lpfc_issue_els_edc(struct lpfc_vport *vport, uint8_t retry)
void
lpfc_cancel_retry_delay_tmo(struct lpfc_vport *vport, struct lpfc_nodelist *nlp)
{
- struct lpfc_work_evt *evtp;
+ struct lpfc_hba *phba = vport->phba;
+ struct lpfc_work_evt *evtp = &nlp->els_retry_evt;
+ struct lpfc_nodelist *arg_ndlp = NULL;
+ unsigned long flags;

- if (!test_and_clear_bit(NLP_DELAY_TMO, &nlp->nlp_flag))
+ /*
+ * Check and clear NLP_DELAY_TMO flag inside critical section to
+ * prevent TOCTOU race with timer rescheduling. If retry event is
+ * queued, remove it and consume its payload to prevent double-put.
+ * This protects against concurrent execution with lpfc_work_list_done()
+ * which may be processing this event. The event holds a reference to
+ * the nodelist that must be released exactly once.
+ */
+ spin_lock_irqsave(&phba->hbalock, flags);
+ if (!test_and_clear_bit(NLP_DELAY_TMO, &nlp->nlp_flag)) {
+ spin_unlock_irqrestore(&phba->hbalock, flags);
return;
+ }
+
+ if (!list_empty(&evtp->evt_listp)) {
+ list_del_init(&evtp->evt_listp);
+ arg_ndlp = (struct lpfc_nodelist *)evtp->evt_arg1;
+ evtp->evt_arg1 = NULL;
+ }
+ spin_unlock_irqrestore(&phba->hbalock, flags);
+
+ /* Delete timer and clear state outside the lock */
timer_delete_sync(&nlp->nlp_delayfunc);
nlp->nlp_last_elscmd = 0;
- if (!list_empty(&nlp->els_retry_evt.evt_listp)) {
- list_del_init(&nlp->els_retry_evt.evt_listp);
- /* Decrement nlp reference count held for the delayed retry */
- evtp = &nlp->els_retry_evt;
- lpfc_nlp_put((struct lpfc_nodelist *)evtp->evt_arg1);
- }
+
+ /* Drop the event-held reference */
+ if (arg_ndlp)
+ lpfc_nlp_put(arg_ndlp);
+
if (test_and_clear_bit(NLP_NPR_2B_DISC, &nlp->nlp_flag)) {
if (vport->num_disc_nodes) {
if (vport->port_state < LPFC_VPORT_READY) {
@@ -4422,10 +4444,16 @@ lpfc_els_retry_delay_handler(struct lpfc_nodelist *ndlp)
spin_lock_irq(&ndlp->lock);
cmd = ndlp->nlp_last_elscmd;
ndlp->nlp_last_elscmd = 0;
- spin_unlock_irq(&ndlp->lock);

- if (!test_and_clear_bit(NLP_DELAY_TMO, &ndlp->nlp_flag))
+ /*
+ * Check and clear NLP_DELAY_TMO flag inside critical section to
+ * prevent TOCTOU race with lpfc_cancel_retry_delay_tmo()
+ */
+ if (!test_and_clear_bit(NLP_DELAY_TMO, &ndlp->nlp_flag)) {
+ spin_unlock_irq(&ndlp->lock);
return;
+ }
+ spin_unlock_irq(&ndlp->lock);

/*
* If a discovery event readded nlp_delayfunc after timer
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 43d246c5c049..e318e3f5aa7c 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -846,6 +846,15 @@ lpfc_work_list_done(struct lpfc_hba *phba)
switch (evtp->evt) {
case LPFC_EVT_ELS_RETRY:
ndlp = (struct lpfc_nodelist *) (evtp->evt_arg1);
+ /*
+ * Consume the payload to prevent reuse or double-put.
+ * evt_arg1 was populated when event was queued.
+ */
+ evtp->evt_arg1 = NULL;
+ if (!ndlp) {
+ /* Event already consumed by cancel path */
+ break;
+ }
if (!hba_pci_err) {
lpfc_els_retry_delay_handler(ndlp);
free_evt = 0; /* evt is part of ndlp */
--
2.52.0