[PATCH v2 1/5] vfio/pci: Latch disable_idle_d3 per device

From: Alex Williamson

Date: Thu Jun 11 2026 - 17:36:37 EST


When disable_idle_d3 was introduced, vfio-pci directly manipulated the
device power state with pci_set_power_state(). There were no refcounts
to maintain or operations to balance; we could unconditionally bring the
device to D0 and conditionally move it to D3hot. Therefore the module
parameter was made writable.

Later, in commit c61302aa48f7 ("vfio/pci: Move module parameters to
vfio_pci.c"), as part of the vfio-pci-core split, the writable aspect
of the module parameter was nullified. The parameter value could still
be changed through sysfs, but the vfio-pci driver only latched the
values into vfio-pci-core globals at module init. Loading the vfio-pci
module, or unloading and reloading it, with non-default or different
values could therefore change the globals relative to existing devices
bound to vfio-pci variant drivers.

Runtime PM was introduced in commit 7ab5e10eda02 ("vfio/pci: Move the
unused device into low power state with runtime PM"), which marks the
point where power states became refcounted. PM get and put operations
need to be balanced, but the same module operations noted above can
change the global variables relative to those devices already bound to
vfio-pci variant drivers. This introduces a window where PM operations
can now become unbalanced.

To resolve this with a narrow footprint for stable backports, latch the
disable_idle_d3 flag into the vfio_pci_core_device at initialization,
in vfio_pci_core_init_dev(), such that each device always operates with
a consistent value.

Fixes: 7ab5e10eda02 ("vfio/pci: Move the unused device into low power state with runtime PM")
Cc: stable@xxxxxxxxxxxxxxx
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---
drivers/vfio/pci/vfio_pci_core.c | 30 ++++++++++++++++++++----------
include/linux/vfio_pci_core.h | 1 +
2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a28f1e99362c..9f71eae0cc94 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -538,7 +538,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
u16 cmd;
u8 msix_pos;

- if (!disable_idle_d3) {
+ if (!vdev->disable_idle_d3) {
ret = pm_runtime_resume_and_get(&pdev->dev);
if (ret < 0)
return ret;
@@ -617,7 +617,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
out_disable_device:
pci_disable_device(pdev);
out_power:
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(&pdev->dev);
return ret;
}
@@ -753,7 +753,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);

/* Put the pm-runtime usage counter acquired during enable */
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(&pdev->dev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
@@ -2144,6 +2144,8 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);

+ vdev->disable_idle_d3 = disable_idle_d3;
+
return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);
@@ -2239,7 +2241,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)

dev->driver->pm = &vfio_pci_core_pm_ops;
pm_runtime_allow(dev);
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(dev);

ret = vfio_register_group_dev(&vdev->vdev);
@@ -2248,7 +2250,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
return 0;

out_power:
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_get_noresume(dev);

pm_runtime_forbid(dev);
@@ -2267,7 +2269,7 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
vfio_pci_vf_uninit(vdev);
vfio_pci_vga_uninit(vdev);

- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_get_noresume(&vdev->pdev->dev);

pm_runtime_forbid(&vdev->pdev->dev);
@@ -2429,6 +2431,8 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
int ret;

list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+ if (cur->disable_idle_d3)
+ continue;
ret = pm_runtime_resume_and_get(&cur->pdev->dev);
if (ret)
goto unwind;
@@ -2438,8 +2442,11 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)

unwind:
list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
- vdev.dev_set_list)
+ vdev.dev_set_list) {
+ if (cur->disable_idle_d3)
+ continue;
pm_runtime_put(&cur->pdev->dev);
+ }

return ret;
}
@@ -2552,8 +2559,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
up_write(&vdev->memory_lock);
}

- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
+ if (vdev->disable_idle_d3)
+ continue;
pm_runtime_put(&vdev->pdev->dev);
+ }

err_unlock:
mutex_unlock(&dev_set->lock);
@@ -2599,7 +2609,7 @@ static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
* state. Increment the usage count for all the devices in the dev_set
* before reset and decrement the same after reset.
*/
- if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
+ if (vfio_pci_dev_set_pm_runtime_get(dev_set))
return;

if (!pci_reset_bus(pdev))
@@ -2609,7 +2619,7 @@ static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
if (reset_done)
cur->needs_reset = false;

- if (!disable_idle_d3)
+ if (!cur->disable_idle_d3)
pm_runtime_put(&cur->pdev->dev);
}
}
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 5fc6ce4dd786..27aab3fdbb91 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -127,6 +127,7 @@ struct vfio_pci_core_device {
bool needs_pm_restore:1;
bool pm_intx_masked:1;
bool pm_runtime_engaged:1;
+ bool disable_idle_d3:1;
bool sriov_active;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
--
2.53.0