Re: [PATCH 1/1] vfio/pci: add PCIe TPH device ioctl
From: Wathsala Vithanage
Date: Thu Jan 29 2026 - 09:06:39 EST
Hi Alex,
Just checking back on the VFIO PCI TPH patch below. You’d mentioned wanting more time to evaluate the implications, so I wanted to see if you had any remaining concerns or if you’d like me to rework this in a different direction.
Thanks,
Wathsala
On 11/6/25 17:19, Wathsala Vithanage wrote:
On 11/5/25 13:15, Alex Williamson wrote:
On Mon, 27 Oct 2025 09:33:33 -0500
Wathsala Vithanage <wathsala.vithanage@xxxxxxx> wrote:
On 10/16/25 16:41, Jeremy Linton wrote:
TBH, I'm not sure why we didn't use a DEVICE_FEATURE for this. Seems
Hi,
On 10/13/25 11:35 AM, Wathsala Vithanage wrote:
TLP Processing Hints (TPH) let a requester provide steering hints that
A quick look at this, it seems it's following the way the existing vfio
can enable direct cache injection on supported platforms and PCIe
devices. The PCIe core already exposes TPH handling to kernel drivers.
This change adds the VFIO_DEVICE_PCI_TPH ioctl and exposes TPH control
to user space to reduce memory latency and improve throughput for
polling drivers (e.g., DPDK poll-mode drivers). Through this interface,
user-space drivers can:
- enable or disable TPH for the device function
- program steering tags in device-specific mode
The ioctl is available only when the device advertises the TPH
Capability. Invalid modes or tags are rejected. No functional change
occurs unless the ioctl is used.
Signed-off-by: Wathsala Vithanage <wathsala.vithanage@xxxxxxx>
---
drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h | 36 ++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_core.c
b/drivers/vfio/pci/vfio_pci_core.c
index 7dcf5439dedc..0646d9a483fb 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -28,6 +28,7 @@
#include <linux/nospec.h>
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
+#include <linux/pci-tph.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -1443,6 +1444,77 @@ static int vfio_pci_ioctl_ioeventfd(struct
vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+static int vfio_pci_tph_set_st(struct vfio_pci_core_device *vdev,
+ const struct vfio_pci_tph_entry *ent)
+{
+ int ret, mem_type;
+ u16 st;
+ u32 cpu_id = ent->cpu_id;
+
+ if (cpu_id >= nr_cpu_ids || !cpu_present(cpu_id))
+ return -EINVAL;
+
+ if (!cpumask_test_cpu(cpu_id, current->cpus_ptr))
+ return -EINVAL;
+
+ switch (ent->mem_type) {
+ case VFIO_TPH_MEM_TYPE_VMEM:
+ mem_type = TPH_MEM_TYPE_VM;
+ break;
+ case VFIO_TPH_MEM_TYPE_PMEM:
+ mem_type = TPH_MEM_TYPE_PM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ ret = pcie_tph_get_cpu_st(vdev->pdev, mem_type,
topology_core_id(cpu_id),
+ &st);
+ if (ret)
+ return ret;
+ /*
+ * PCI core enforces table bounds and disables TPH on error.
+ */
+ return pcie_tph_set_st_entry(vdev->pdev, ent->index, st);
+}
+
+static int vfio_pci_tph_enable(struct vfio_pci_core_device *vdev,
int mode)
+{
+ /* IV mode is not supported. */
+ if (mode == PCI_TPH_ST_IV_MODE)
+ return -EINVAL;
+ /* PCI core validates 'mode' and returns -EINVAL on bad values. */
+ return pcie_enable_tph(vdev->pdev, mode);
+}
+
+static int vfio_pci_tph_disable(struct vfio_pci_core_device *vdev)
+{
+ pcie_disable_tph(vdev->pdev);
+ return 0;
+}
+
+static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
+ void __user *uarg)
+{
+ struct vfio_pci_tph tph;
+
+ if (copy_from_user(&tph, uarg, sizeof(struct vfio_pci_tph)))
+ return -EFAULT;
+
+ if (tph.argsz != sizeof(struct vfio_pci_tph))
+ return -EINVAL;
+
+ switch (tph.op) {
+ case VFIO_DEVICE_TPH_ENABLE:
+ return vfio_pci_tph_enable(vdev, tph.mode);
+ case VFIO_DEVICE_TPH_DISABLE:
+ return vfio_pci_tph_disable(vdev);
+ case VFIO_DEVICE_TPH_SET_ST:
+ return vfio_pci_tph_set_st(vdev, &tph.ent);
+ default:
+ return -EINVAL;
+ }
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned
int cmd,
unsigned long arg)
{
@@ -1467,6 +1539,8 @@ long vfio_pci_core_ioctl(struct vfio_device
*core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_PCI_TPH:
+ return vfio_pci_ioctl_tph(vdev, uarg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..cfdee851031e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -873,6 +873,42 @@ struct vfio_device_ioeventfd {
#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16)
+/**
+ * VFIO_DEVICE_PCI_TPH - _IO(VFIO_TYPE, VFIO_BASE + 22)
+ *
+ * Control PCIe TLP Processing Hints (TPH) on a PCIe device.
+ *
+ * Supported operations:
+ * - VFIO_DEVICE_TPH_ENABLE: enable TPH in no-steering-tag (NS) or
+ * device-specific (DS) mode. IV mode is not supported via this ioctl
+ * and returns -EINVAL.
+ * - VFIO_DEVICE_TPH_DISABLE: disable TPH on the device.
+ * - VFIO_DEVICE_TPH_SET_ST: program an entry in the device TPH
Steering-Tag
+ * (ST) table. The kernel derives the ST from cpu_id and mem_type;
the
+ * value is not returned to userspace.
+ */
+struct vfio_pci_tph_entry {
+ __u32 cpu_id; /* CPU logical ID */
+ __u8 mem_type;
+#define VFIO_TPH_MEM_TYPE_VMEM 0 /* Request volatile memory
ST */
+#define VFIO_TPH_MEM_TYPE_PMEM 1 /* Request persistent
memory ST */
+ __u8 rsvd[1];
+ __u16 index; /* ST-table index */
+};
+
+struct vfio_pci_tph {
+ __u32 argsz; /* Size of vfio_pci_tph */
+ __u32 mode; /* NS and DS modes; IV not supported */
+ __u32 op;
+#define VFIO_DEVICE_TPH_ENABLE 0
+#define VFIO_DEVICE_TPH_DISABLE 1
+#define VFIO_DEVICE_TPH_SET_ST 2
+ struct vfio_pci_tph_entry ent;
+};
+
+#define VFIO_DEVICE_PCI_TPH _IO(VFIO_TYPE, VFIO_BASE + 22)
IOCTLs are defined, yet two of them (ENABLE and DISABLE) likely won't
really change their structure, or don't need a structure in the case
of disable. Why not use _IOW() and let the kernel error handling deal
with those two as independent ioctls?
Thanks,
It will require two IOCTLs. I’m ok with having two IOCTLs for this
feature if the maintainers are fine with it.
like we could implement a SET operation that does enable/disable and
Thanks Alex, it was implemented as a DEVICE_FEATURE in RFC v1,
except that it had a GET operation to return the tag to the user, which we
decided to drop.
another for steering tags. I still need to fully grasp the
This is now the same as the already merged RDMA TPH feature.
implications of this support though. Thanks,
https://lore.kernel.org/linux-rdma/cover.1751907231.git.leon@xxxxxxxxxx/
--wathsala