[RFC PATCH 11/17] perf: Implement mlock accounting for shmem ring buffers

From: Alexander Shishkin
Date: Tue Sep 05 2017 - 09:44:13 EST


With shmem ring buffers, at most nr_pages * nr_cpus pages can be mlocked
at any given time, so the accounting only needs to be done once, when the
event is created (by means of sys_perf_event_open()). This patch implements
such accounting by adding a shared reference counter: when the first
reference is taken (0 -> 1), we account the pages; when the last reference
is dropped (back to 0), we undo the accounting.

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
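To illustrate the scheme in isolation, here is a minimal userspace-only
sketch of the "account once, share a refcount, unaccount on the last put"
idea described above. This is not kernel code: acct_ref, acct_get() and
acct_put() are made-up stand-ins for rb->acct_refcount and the
rb_shmem_account()/rb_shmem_unaccount() pair, and locked_vm stands in for
the per-user locked_vm counter.

/* Standalone illustration only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the shared accounting state. */
struct acct_ref {
	int refcount;           /* shared by parent and inherited buffers */
	unsigned long pages;    /* pages charged exactly once */
};

static unsigned long locked_vm;     /* stands in for user->locked_vm */

/*
 * No parent: charge the pages and create the counter (can fail).
 * With a parent: inherit its counter and bump it (cannot fail).
 */
static struct acct_ref *acct_get(struct acct_ref *parent, unsigned long pages)
{
	struct acct_ref *ref;

	if (parent) {
		parent->refcount++;
		return parent;
	}

	ref = malloc(sizeof(*ref));
	if (!ref)
		return NULL;

	ref->refcount = 1;
	ref->pages = pages;
	locked_vm += pages;         /* accounted exactly once */

	return ref;
}

/* The last put undoes the accounting and frees the counter. */
static void acct_put(struct acct_ref *ref)
{
	if (--ref->refcount)
		return;

	locked_vm -= ref->pages;
	free(ref);
}

int main(void)
{
	struct acct_ref *parent = acct_get(NULL, 64);   /* syscall event */
	struct acct_ref *child;

	if (!parent)
		return 1;

	child = acct_get(parent, 64);                   /* inherited event */

	printf("locked_vm with two events: %lu\n", locked_vm); /* 64, not 128 */

	acct_put(child);
	acct_put(parent);

	printf("locked_vm after teardown: %lu\n", locked_vm);  /* 0 */

	return 0;
}

The point of the sketch is that two events sharing one counter charge
locked_vm only once, and the charge is dropped on the last put, which is
what the shmem path below does with rb->acct_refcount.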
kernel/events/core.c | 12 +++--
kernel/events/internal.h | 5 +-
kernel/events/ring_buffer.c | 124 +++++++++++++++++++++++++++++++++++++-------
3 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c80ffcdb5c..1fed69d4ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4309,7 +4309,6 @@ static void _free_event(struct perf_event *event)

event->attach_state &= ~PERF_ATTACH_DETACHED;

- ring_buffer_unaccount(event->rb, false);
rb_free_detached(event->rb, event);
}

@@ -9525,9 +9524,11 @@ static void account_event(struct perf_event *event)
account_pmu_sb_event(event);
}

-static int perf_event_detach(struct perf_event *event, struct task_struct *task,
- struct mm_struct *mm)
+static int
+perf_event_detach(struct perf_event *event, struct perf_event *parent_event,
+ struct task_struct *task, struct mm_struct *mm)
{
+ struct ring_buffer *parent_rb = parent_event ? parent_event->rb : NULL;
char *filename;
int err;

@@ -9545,7 +9546,7 @@ static int perf_event_detach(struct perf_event *event, struct task_struct *task,
if (!event->dent)
return -ENOMEM;

- err = rb_alloc_detached(event, task, mm);
+ err = rb_alloc_detached(event, task, mm, parent_rb);
if (err) {
tracefs_remove(event->dent);
event->dent = NULL;
@@ -11015,7 +11016,8 @@ inherit_event(struct perf_event *parent_event,
if (detached) {
int err;

- err = perf_event_detach(child_event, child, NULL);
+ err = perf_event_detach(child_event, parent_event, child,
+ NULL);
if (err) {
perf_free_event(child_event, child_ctx);
mutex_unlock(&parent_event->child_mutex);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 80d36a7277..3dc66961d9 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -53,6 +53,8 @@ struct ring_buffer {
void **aux_pages;
void *aux_priv;

+ atomic_t *acct_refcount;
+
/* tmpfs file for kernel-owned ring buffers */
struct file *shmem_file;
unsigned long shmem_file_addr;
@@ -93,7 +95,8 @@ extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
extern void rb_free_aux(struct ring_buffer *rb);
extern int rb_alloc_detached(struct perf_event *event,
struct task_struct *task,
- struct mm_struct *mm);
+ struct mm_struct *mm,
+ struct ring_buffer *parent_rb);
extern void rb_free_detached(struct ring_buffer *rb, struct perf_event *event);
extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
extern void ring_buffer_put(struct ring_buffer *rb);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 771dfdb71f..896d441642 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -570,8 +570,8 @@ void *perf_get_aux(struct perf_output_handle *handle)
* error out. Otherwise, keep track of the pages used in the ring_buffer so
* that the accounting can be undone when the pages are freed.
*/
-static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
- unsigned long nr_pages, bool aux)
+static int __ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+ unsigned long nr_pages, unsigned long *locked)
{
unsigned long total, limit, pinned;

@@ -589,6 +589,9 @@ static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,

total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;

+ free_uid(rb->mmap_user);
+ rb->mmap_user = NULL;
+
pinned = 0;
if (total > limit) {
/*
@@ -609,27 +612,33 @@ static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
return -EPERM;
}

- if (aux)
- rb->aux_mmap_locked = pinned;
- else
- rb->mmap_locked = pinned;
-
+ *locked = pinned;
mm->pinned_vm += pinned;
}

if (!rb->mmap_mapping)
rb->mmap_mapping = mm;

- /* account for user page */
- if (!aux)
- nr_pages++;
-
rb->mmap_user = get_current_user();
atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);

return 0;
}

+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+ unsigned long nr_pages, bool aux)
+{
+ int ret;
+
+ /* account for user page */
+ if (!aux)
+ nr_pages++;
+ ret = __ring_buffer_account(rb, mm, nr_pages,
+ aux ? &rb->aux_mmap_locked : &rb->mmap_locked);
+
+ return ret;
+}
+
/*
* Undo the mlock pages accounting done in ring_buffer_account().
*/
@@ -641,6 +650,9 @@ void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
if (!rb->nr_pages && !rb->aux_nr_pages)
return;

+ if (WARN_ON_ONCE(!rb->mmap_user))
+ return;
+
atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
if (rb->mmap_mapping)
rb->mmap_mapping->pinned_vm -= pinned;
@@ -850,7 +862,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount)) {
- ring_buffer_unaccount(rb, true);
+ if (!rb->shmem_file)
+ ring_buffer_unaccount(rb, true);

__rb_free_aux(rb);
}
@@ -1087,13 +1100,68 @@ void rb_put_kernel_pages(struct ring_buffer *rb, bool final)
WRITE_ONCE(rb->shmem_pages_in, 0);
}

+/*
+ * SHMEM memory is accounted once per user-allocated event (via
+ * the syscall), since we can have at most NR_CPUS * nr_pages
+ * pinned pages at any given point in time, regardless of how
+ * many events there actually are.
+ *
+ * The first ring buffer (parent_rb == NULL) is where the accounting
+ * is done; it is also the one created from the syscall, so if the
+ * accounting fails, the error is handed back to the user.
+ * The others just inherit the counter and bump it; this can't fail.
+ */
+static int
+rb_shmem_account(struct ring_buffer *rb, struct ring_buffer *parent_rb)
+{
+ unsigned long nr_pages = perf_rb_size(rb) >> PAGE_SHIFT;
+ int ret = 0;
+
+ if (parent_rb) {
+ /* the "parent" rb *must* have an accounting refcount */
+ if (WARN_ON_ONCE(!parent_rb->acct_refcount))
+ return -EINVAL;
+
+ rb->acct_refcount = parent_rb->acct_refcount;
+ atomic_inc(rb->acct_refcount);
+
+ return 0;
+ }
+
+ /* All (data + aux + user page) in one go */
+ ret = __ring_buffer_account(rb, NULL, nr_pages,
+ &rb->mmap_locked);
+ if (ret)
+ return ret;
+
+ rb->acct_refcount = kmalloc(sizeof(*rb->acct_refcount),
+ GFP_KERNEL);
+ if (!rb->acct_refcount)
+ return -ENOMEM;
+
+ atomic_set(rb->acct_refcount, 1);
+
+ return 0;
+}
+
+static void rb_shmem_unaccount(struct ring_buffer *rb)
+{
+ if (!atomic_dec_and_test(rb->acct_refcount)) {
+ rb->acct_refcount = NULL;
+ return;
+ }
+
+ ring_buffer_unaccount(rb, false);
+ kfree(rb->acct_refcount);
+}
+
/*
* Allocate a ring_buffer for a detached event and attach it to this event.
* There's one ring_buffer per detached event and vice versa, so
* ring_buffer_attach() does not apply.
*/
int rb_alloc_detached(struct perf_event *event, struct task_struct *task,
- struct mm_struct *mm)
+ struct mm_struct *mm, struct ring_buffer *parent_rb)
{
int aux_nr_pages = event->attr.detached_aux_nr_pages;
int nr_pages = event->attr.detached_nr_pages;
@@ -1116,18 +1184,22 @@ int rb_alloc_detached(struct perf_event *event, struct task_struct *task,
if (IS_ERR(rb))
return PTR_ERR(rb);

+ if (flags & RING_BUFFER_SHMEM) {
+ ret = rb_shmem_account(rb, parent_rb);
+ if (ret)
+ goto err_free;
+ }
+
if (aux_nr_pages) {
ret = rb_alloc_aux(rb, event, pgoff, aux_nr_pages, 0, flags);
if (ret)
- goto err_free;
+ goto err_unaccount;
}

if (flags & RING_BUFFER_SHMEM) {
ret = rb_shmem_setup(event, task, rb);
- if (ret) {
- rb_free_aux(rb);
- goto err_free;
- }
+ if (ret)
+ goto err_free_aux;

rb_toggle_paused(rb, true);
} else {
@@ -1146,8 +1218,19 @@ int rb_alloc_detached(struct perf_event *event, struct task_struct *task,

return 0;

+err_free_aux:
+ if (!(flags & RING_BUFFER_SHMEM))
+ rb_free_aux(rb);
+
+err_unaccount:
+ if (flags & RING_BUFFER_SHMEM)
+ rb_shmem_unaccount(rb);
+
err_free:
- rb_free(rb);
+ if (flags & RING_BUFFER_SHMEM)
+ kfree(rb);
+ else
+ rb_free(rb);

return ret;
}
@@ -1161,6 +1244,9 @@ void rb_free_detached(struct ring_buffer *rb, struct perf_event *event)
rb_shmem_unmap(event);
shmem_truncate_range(rb->shmem_file->f_inode, 0, (loff_t)-1);
rb_put_kernel_pages(rb, true);
+ rb_shmem_unaccount(rb);
+ } else {
+ ring_buffer_unaccount(rb, false);
}

atomic_set(&rb->aux_mmap_count, 0);
--
2.14.1