Re: [RFC PATCH 2/2] tracing/user_events: Fixup enable faults asyncly

From: Mathieu Desnoyers
Date: Fri Oct 28 2022 - 18:19:13 EST


On 2022-10-27 18:40, Beau Belgrave wrote:
When events are enabled within the various tracing facilities, such as
ftrace/perf, the event_mutex is held. As events are enabled, pages are
accessed. We do not want page faults to occur under this lock. Instead,
queue the fault to a workqueue to be handled in a process-context-safe
way without the lock.

The enable address is disabled while the async fault-in occurs. This
ensures that we don't attempt fault-in more than is necessary. Once the
page has been faulted in, the address write is attempted again. If the
page couldn't fault-in, then we wait until the next time the event is
enabled to prevent any potential infinite loops.

I'm also unclear about how the system call initiating the enabled state change is delayed (or not) when a page fault is queued.

I would expect that, when a page fault is needed, the system call initiating the state change would enqueue work to the worker thread, release the user events mutex, and then wait for a completion. That completion would be signaled by the worker thread either when the page fault fails or when the state change is done.

Thoughts?

Thanks,

Mathieu


Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
---
kernel/trace/trace_events_user.c | 125 ++++++++++++++++++++++++++++++-
1 file changed, 121 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 633f24c2a1ac..f1eb8101e053 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -81,11 +81,22 @@ struct user_event_enabler {
struct list_head link;
struct mm_struct *mm;
struct file *file;
+ refcount_t refcnt;
unsigned long enable_addr;
unsigned int enable_bit: 5,
- __reserved: 27;
+ __reserved: 26,
+ disabled: 1;
};
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+ struct work_struct work;
+ struct user_event_enabler *enabler;
+ struct user_event *event;
+};
+
+static struct kmem_cache *fault_cache;
+
/*
* Stores per-event properties, as users register events
* within a file a user_event might be created if it does not
@@ -236,6 +247,19 @@ static void user_event_enabler_destroy(struct user_event_enabler *enabler)
kfree(enabler);
}
+static __always_inline struct user_event_enabler
+*user_event_enabler_get(struct user_event_enabler *enabler)
+{
+ refcount_inc(&enabler->refcnt);
+ return enabler;
+}
+
+static void user_event_enabler_put(struct user_event_enabler *enabler)
+{
+ if (refcount_dec_and_test(&enabler->refcnt))
+ user_event_enabler_destroy(enabler);
+}
+
static void user_event_enabler_remove(struct file *file,
struct user_event *user)
{
@@ -249,13 +273,93 @@ static void user_event_enabler_remove(struct file *file,
if (enabler->file != file)
continue;
+ enabler->disabled = 0;
list_del(&enabler->link);
- user_event_enabler_destroy(enabler);
+ user_event_enabler_put(enabler);
}
mutex_unlock(&event_mutex);
}
+static void user_event_enabler_write(struct user_event_enabler *enabler,
+ struct user_event *user);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+ struct user_event_enabler_fault *fault = container_of(
+ work, struct user_event_enabler_fault, work);
+ struct user_event_enabler *enabler = fault->enabler;
+ struct user_event *user = fault->event;
+ struct mm_struct *mm = enabler->mm;
+ unsigned long uaddr = enabler->enable_addr;
+ bool unlocked = false;
+ int ret;
+
+ might_sleep();
+
+ mmap_read_lock(mm);
+
+ ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ &unlocked);
+
+ mmap_read_unlock(mm);
+
+ if (ret)
+ pr_warn("user_events: Fixup fault failed with %d "
+ "for mm: 0x%pK offset: 0x%llx event: %s\n", ret, mm,
+ (unsigned long long)uaddr, EVENT_NAME(user));
+
+ /* Prevent state changes from racing */
+ mutex_lock(&event_mutex);
+
+ /*
+ * If we managed to get the page, re-issue the write. We do not
+ * want to get into a possible infinite loop, which is why we only
+ * attempt again directly if the page came in. If we couldn't get
+ * the page here, then we will try again the next time the event is
+ * enabled/disabled.
+ */
+ enabler->disabled = 0;
+
+ if (!ret)
+ user_event_enabler_write(enabler, user);
+
+ mutex_unlock(&event_mutex);
+
+ refcount_dec(&user->refcnt);
+ user_event_enabler_put(enabler);
+ kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_enabler *enabler,
+ struct user_event *user)
+{
+ struct user_event_enabler_fault *fault;
+
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+ if (!fault)
+ return false;
+
+ INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+ fault->enabler = user_event_enabler_get(enabler);
+ fault->event = user;
+
+ refcount_inc(&user->refcnt);
+ enabler->disabled = 1;
+
+ if (!schedule_work(&fault->work)) {
+ enabler->disabled = 0;
+ refcount_dec(&user->refcnt);
+ user_event_enabler_put(enabler);
+ kmem_cache_free(fault_cache, fault);
+
+ return false;
+ }
+
+ return true;
+}
+
static void user_event_enabler_write(struct user_event_enabler *enabler,
struct user_event *user)
{
@@ -266,6 +370,11 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
void *kaddr;
int ret;
+ lockdep_assert_held(&event_mutex);
+
+ if (unlikely(enabler->disabled))
+ return;
+
mmap_read_lock(mm);
ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
@@ -273,8 +382,10 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
mmap_read_unlock(mm);
- if (ret <= 0) {
- pr_warn("user_events: Enable write failed\n");
+ if (unlikely(ret <= 0)) {
+ if (!user_event_enabler_queue_fault(enabler, user))
+ pr_warn("user_events: Unable to queue fault handler\n");
+
return;
}
@@ -321,6 +432,7 @@ static struct user_event_enabler
enabler->file = file;
enabler->enable_addr = (unsigned long)reg->enable_addr;
enabler->enable_bit = reg->enable_bit;
+ refcount_set(&enabler->refcnt, 1);
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);
@@ -1902,6 +2014,11 @@ static int __init trace_events_user_init(void)
{
int ret;
+ fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+ if (!fault_cache)
+ return -ENOMEM;
+
init_group = user_event_group_create(&init_user_ns);
if (!init_group)

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com