[RFC PATCHES 08/17] iommufd: IO page fault delivery initialization and release

From: Lu Baolu
Date: Tue May 30 2023 - 01:40:55 EST


Add some housekeeping code for IO page fault delivery. Add a fault field
in the iommufd_hw_pagetable structure to store pending IO page faults and
other related data.

The fault field is allocated when an IOPF-capable user HWPT (indicated by
IOMMU_HWPT_ALLOC_FLAGS_IOPF_CAPABLE being set in the allocation user data)
is allocated. This field exists until the HWPT is destroyed. This also
implies that it is possible to determine whether a HWPT is IOPF capable by
checking the fault field.

When an IOPF-capable HWPT is attached to a device (could also be a PASID of
a device in the future), a fault cookie is allocated and set on the device.
The cookie is cleared and freed when the HWPT is detached from the device.

Signed-off-by: Yi Liu <yi.l.liu@xxxxxxxxx>
Signed-off-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
---
drivers/iommu/iommufd/iommufd_private.h | 12 +++++
drivers/iommu/iommufd/device.c | 61 +++++++++++++++++++++++--
drivers/iommu/iommufd/hw_pagetable.c | 55 ++++++++++++++++++++++
3 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index e951815f5707..5ff139acc5c0 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -236,6 +236,13 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,

int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);

+struct hw_pgtable_fault { /* fault state referenced by iommufd_hw_pagetable::fault */
+ struct mutex mutex; /* NOTE(review): what this guards is not visible in this patch */
+ struct list_head deliver; /* presumably faults awaiting delivery - TODO confirm */
+ struct list_head response; /* presumably faults awaiting a response - TODO confirm */
+ struct eventfd_ctx *trigger; /* eventfd context looked up from the fd given at allocation */
+};
+
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
* allows directly creating and inspecting the domains. Domains that have kernel
@@ -252,6 +259,7 @@ struct iommufd_hw_pagetable {
bool msi_cookie : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
+ struct hw_pgtable_fault *fault;
};

struct iommufd_hw_pagetable *
@@ -314,6 +322,10 @@ struct iommufd_device {
bool has_user_data;
};

+struct iommufd_fault_cookie { /* object installed through iommu_set_device_fault_cookie() */
+ struct iommufd_device *idev; /* device the cookie was created for */
+};
+
static inline struct iommufd_device *
iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id)
{
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 29b212714e2c..3408f1fc3e9f 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -374,6 +374,44 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
return 0;
}

+static int iommufd_device_set_fault_cookie(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_fault_cookie *fcookie, *curr;
+ /* Allocate a cookie for @idev/@pasid and install it; returns 0 or a negative errno. */
+ if (!hwpt->fault)
+ return 0; /* @hwpt carries no fault data, so no cookie is needed */
+
+ fcookie = kzalloc(sizeof(*fcookie), GFP_KERNEL);
+ if (!fcookie)
+ return -ENOMEM;
+ fcookie->idev = idev;
+ /* A non-error return value is freed below, i.e. treated as the cookie being replaced. */
+ curr = iommu_set_device_fault_cookie(idev->dev, pasid, fcookie);
+ if (IS_ERR(curr)) {
+ kfree(fcookie); /* the new cookie is released here on failure */
+ return PTR_ERR(curr);
+ }
+ kfree(curr);
+
+ return 0;
+}
+/* Clear and free the fault cookie of @idev/@pasid; a no-op when @hwpt has no fault data. */
+static void iommufd_device_unset_fault_cookie(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_fault_cookie *curr;
+
+ if (!hwpt->fault)
+ return;
+
+ curr = iommu_set_device_fault_cookie(idev->dev, pasid, NULL);
+ if (!WARN_ON(IS_ERR(curr))) /* an error pointer is not an allocation: never kfree() it */
+ kfree(curr);
+}
+
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
struct iommufd_device *idev)
{
@@ -398,6 +436,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
if (rc)
goto err_unlock;

+ rc = iommufd_device_set_fault_cookie(hwpt, idev, 0);
+ if (rc)
+ goto err_unresv;
+
/*
* Only attach to the group once for the first device that is in the
* group. All the other devices will follow this attachment. The user
@@ -408,17 +450,21 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
if (list_empty(&idev->igroup->device_list)) {
rc = iommufd_group_setup_msi(idev->igroup, hwpt);
if (rc)
- goto err_unresv;
+ goto err_unset;

rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
if (rc)
- goto err_unresv;
+ goto err_unset;
idev->igroup->hwpt = hwpt;
}
+
refcount_inc(&hwpt->obj.users);
list_add_tail(&idev->group_item, &idev->igroup->device_list);
mutex_unlock(&idev->igroup->lock);
return 0;
+
+err_unset:
+ iommufd_device_unset_fault_cookie(hwpt, idev, 0);
err_unresv:
iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
err_unlock:
@@ -433,6 +479,7 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)

mutex_lock(&idev->igroup->lock);
list_del(&idev->group_item);
+ iommufd_device_unset_fault_cookie(hwpt, idev, 0);
if (list_empty(&idev->igroup->device_list)) {
iommu_detach_group(hwpt->domain, idev->igroup->group);
idev->igroup->hwpt = NULL;
@@ -502,9 +549,14 @@ iommufd_device_do_replace(struct iommufd_device *idev,
if (rc)
goto err_unresv;

+ iommufd_device_unset_fault_cookie(old_hwpt, idev, 0);
+ rc = iommufd_device_set_fault_cookie(hwpt, idev, 0);
+ if (rc)
+ goto err_unresv;
+
rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
if (rc)
- goto err_unresv;
+ goto err_replace;

if (hwpt->ioas != old_hwpt->ioas) {
list_for_each_entry(cur, &igroup->device_list, group_item)
@@ -526,6 +578,9 @@ iommufd_device_do_replace(struct iommufd_device *idev,

/* Caller must destroy old_hwpt */
return old_hwpt;
+err_replace:
+ iommufd_device_unset_fault_cookie(hwpt, idev, 0);
+ iommufd_device_set_fault_cookie(old_hwpt, idev, 0);
err_unresv:
list_for_each_entry(cur, &igroup->device_list, group_item)
iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 47ec7ddd5f5d..d6d550c3d0cc 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -3,12 +3,16 @@
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
#include <linux/iommu.h>
+#include <linux/eventfd.h>
#include <uapi/linux/iommufd.h>

#include "../iommu-priv.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

+static struct hw_pgtable_fault *hw_pagetable_fault_alloc(int eventfd);
+static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault);
+
void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
{
struct iommufd_hw_pagetable *hwpt =
@@ -27,6 +31,9 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)

if (hwpt->parent)
refcount_dec(&hwpt->parent->obj.users);
+
+ if (hwpt->fault)
+ hw_pagetable_fault_free(hwpt->fault);
refcount_dec(&hwpt->ioas->obj.users);
}

@@ -255,6 +262,11 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
goto out_put_pt;
}

+ if (!parent && (cmd->flags & IOMMU_HWPT_ALLOC_FLAGS_IOPF_CAPABLE)) {
+ rc = -EINVAL;
+ goto out_put_pt;
+ }
+
if (klen) {
if (!cmd->data_len) {
rc = -EINVAL;
@@ -282,6 +294,14 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
goto out_unlock;
}

+ if (cmd->flags & IOMMU_HWPT_ALLOC_FLAGS_IOPF_CAPABLE) {
+ hwpt->fault = hw_pagetable_fault_alloc(cmd->event_fd);
+ if (IS_ERR(hwpt->fault)) {
+ rc = PTR_ERR(hwpt->fault);
+ goto out_hwpt;
+ }
+ }
+
cmd->out_hwpt_id = hwpt->obj.id;
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
@@ -346,3 +366,38 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
iommufd_put_object(&hwpt->obj);
return rc;
}
+/* Allocate fault state tied to the eventfd @eventfd; returns the object or an ERR_PTR(). */
+static struct hw_pgtable_fault *hw_pagetable_fault_alloc(int eventfd)
+{
+ struct hw_pgtable_fault *fault;
+ int rc;
+
+ fault = kzalloc(sizeof(*fault), GFP_KERNEL);
+ if (!fault)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&fault->deliver);
+ INIT_LIST_HEAD(&fault->response);
+ mutex_init(&fault->mutex);
+ /* Look up the caller-supplied fd; released by eventfd_ctx_put() in hw_pagetable_fault_free(). */
+ fault->trigger = eventfd_ctx_fdget(eventfd);
+ if (IS_ERR(fault->trigger)) {
+ rc = PTR_ERR(fault->trigger);
+ goto out_free;
+ }
+
+ return fault;
+
+out_free:
+ kfree(fault); /* no eventfd context was obtained on this path */
+ return ERR_PTR(rc);
+}
+/* Release @fault: warn if either list still has entries, put the eventfd context, free it. */
+static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
+{
+ WARN_ON(!list_empty(&fault->deliver));
+ WARN_ON(!list_empty(&fault->response));
+
+ eventfd_ctx_put(fault->trigger);
+ kfree(fault);
+}
--
2.34.1