[PATCH RFCv2 09/13] iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE ioctls

From: Nicolin Chen
Date: Fri Jan 10 2025 - 22:35:47 EST


For systems that require MSI pages to be mapped into the IOMMU translation
the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
recommended IOVA window to place these mappings. However, there is nothing
special about this address. And to support the RMR trick in VMM for nested
translation, the VMM needs to know what sw_msi window the kernel is using.
As there is no particular reason to force VMM to adopt the kernel default,
provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
to directly specify the sw_msi window that it wants to use, which replaces
and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
to build an API to discover the default IOMMU_RESV_SW_MSI.

Since iommufd now has its own sw_msi function, this is easy to implement.

To keep things simple, the parameters are global to the entire iommufd FD,
and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
the values before creating any hwpt's to have any effect.

Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx>
---
drivers/iommu/iommufd/iommufd_private.h | 4 +++
include/uapi/linux/iommufd.h | 18 ++++++++++++-
drivers/iommu/iommufd/device.c | 4 +++
drivers/iommu/iommufd/io_pagetable.c | 4 ++-
drivers/iommu/iommufd/ioas.c | 34 +++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 6 +++++
6 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 3e83bbb5912c..9f071609f00b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -45,6 +45,9 @@ struct iommufd_ctx {
struct mutex sw_msi_lock;
struct list_head sw_msi_list;
unsigned int sw_msi_id;
+ /* User-programmed SW_MSI region, to override igroup->sw_msi_start */
+ phys_addr_t sw_msi_start;
+ size_t sw_msi_size;

u8 account_mode;
/* Compatibility with VFIO no iommu */
@@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx);

int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
int iommufd_check_iova_range(struct io_pagetable *iopt,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 34810f6ae2b5..c864a201e502 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -294,7 +294,9 @@ struct iommu_ioas_unmap {

/**
* enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
- * ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * ioctl(IOMMU_OPTION_HUGE_PAGES) and
+ * ioctl(IOMMU_OPTION_SW_MSI_START) and
+ * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
* @IOMMU_OPTION_RLIMIT_MODE:
* Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
* to invoke this. Value 0 (default) is user based accounting, 1 uses process
@@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
* iommu mappings. Value 0 disables combining, everything is mapped to
* PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
* option, the object_id must be the IOAS ID.
+ * @IOMMU_OPTION_SW_MSI_START:
+ * Change the base address of the IOMMU mapping region for MSI doorbell(s).
+ * It must be set this before attaching a device to an IOAS/HWPT, otherwise
+ * this option will be not effective on that IOAS/HWPT. User can choose to
+ * let kernel pick a base address, by simply ignoring this option or setting
+ * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
+ * @IOMMU_OPTION_SW_MSI_SIZE:
+ * Change the size of the IOMMU mapping region for MSI doorbell(s). It must
+ * be set this before attaching a device to an IOAS/HWPT, otherwise it won't
+ * be effective on that IOAS/HWPT. The value is in MB, and the minimum value
+ * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
+ * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must be 0
*/
enum iommufd_option {
IOMMU_OPTION_RLIMIT_MODE = 0,
IOMMU_OPTION_HUGE_PAGES = 1,
+ IOMMU_OPTION_SW_MSI_START = 2,
+ IOMMU_OPTION_SW_MSI_SIZE = 3,
};

/**
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index f75b3c23cd41..093a3bd798db 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -445,10 +445,14 @@ static int
iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
struct iommufd_hwpt_paging *hwpt_paging)
{
+ struct iommufd_ctx *ictx = idev->ictx;
int rc;

lockdep_assert_held(&idev->igroup->lock);

+ /* Override it with a user-programmed SW_MSI region */
+ if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
+ idev->igroup->sw_msi_start = ictx->sw_msi_start;
rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
idev->dev,
&idev->igroup->sw_msi_start);
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 8a790e597e12..5d7f5ca1eecf 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
num_hw_msi++;
if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
- *sw_msi_start = resv->start;
+ /* Bypass the driver-defined SW_MSI region, if preset */
+ if (*sw_msi_start == PHYS_ADDR_MAX)
+ *sw_msi_start = resv->start;
num_sw_msi++;
}

diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a8..3f4e25b660f9 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
return -EOPNOTSUPP;
}

+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
+{
+ if (cmd->object_id)
+ return -EOPNOTSUPP;
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ cmd->val64 = (u64)ictx->sw_msi_start;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ cmd->val64 = (u64)ictx->sw_msi_size;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ ictx->sw_msi_start = (phys_addr_t)cmd->val64;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ ictx->sw_msi_size = (size_t)cmd->val64;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
struct iommufd_ioas *ioas)
{
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 7cc9497b7193..026297265c71 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
init_waitqueue_head(&ictx->destroy_wait);
mutex_init(&ictx->sw_msi_lock);
INIT_LIST_HEAD(&ictx->sw_msi_list);
+ ictx->sw_msi_start = PHYS_ADDR_MAX;
+ ictx->sw_msi_size = 0;
filp->private_data = ictx;
return 0;
}
@@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
case IOMMU_OPTION_RLIMIT_MODE:
rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
break;
+ case IOMMU_OPTION_SW_MSI_START:
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
+ break;
case IOMMU_OPTION_HUGE_PAGES:
rc = iommufd_ioas_option(ucmd);
break;
--
2.43.0