[RFC PATCH 02/17] perf: Factor out mlock accounting

From: Alexander Shishkin
Date: Tue Sep 05 2017 - 09:31:01 EST


Move ring buffer memory accounting down into the rb_alloc() path so that
its callers no longer need to do it themselves. As a side effect, this
also slightly cleans up perf_mmap().
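
With this, rb_alloc() takes the consumer's mm_struct and reports failure
via ERR_PTR() rather than NULL, since the accounting it now performs can
fail with -EPERM as well as -ENOMEM. The caller side in perf_mmap()
reduces to (quoting the hunk below):

	rb = rb_alloc(vma->vm_mm, nr_pages,
		      event->attr.watermark ? event->attr.wakeup_watermark : 0,
		      event->cpu, flags);

	if (IS_ERR(rb)) {
		ret = PTR_ERR(rb);
		rb = NULL;
		goto unlock;
	}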

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
kernel/events/core.c | 67 +++-----------------
kernel/events/internal.h | 5 +-
kernel/events/ring_buffer.c | 149 ++++++++++++++++++++++++++++++++++++++------
3 files changed, 139 insertions(+), 82 deletions(-)
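
For intuition, a stand-alone userspace sketch of the arithmetic that
ring_buffer_account() below performs; all numbers are made-up examples
rather than kernel defaults, and the perf paranoia / CAP_IPC_LOCK escape
hatch is reduced to a comment:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assuming 4k pages */

int main(void)
{
	unsigned long perf_event_mlock_kb = 516;	/* kernel.perf_event_mlock_kb */
	unsigned long num_online_cpus = 4;
	unsigned long user_locked_vm = 100;	/* user's locked_vm, in pages */
	unsigned long mm_pinned_vm = 0;		/* mm's pinned_vm, in pages */
	unsigned long memlock_limit = 1024;	/* RLIMIT_MEMLOCK >> PAGE_SHIFT */
	unsigned long nr_pages = 1024 + 1;	/* data pages plus the user page */

	/* the sysctl is in kbytes: 516k -> 129 pages, scaled by CPUs -> 516 */
	unsigned long limit = (perf_event_mlock_kb >> (PAGE_SHIFT - 10)) *
			      num_online_cpus;
	unsigned long total = user_locked_vm + nr_pages;
	unsigned long pinned = 0;

	if (total > limit) {
		/* everything over the sysctl limit gets pinned on the mm... */
		pinned = total - limit;
		/* ...subject to RLIMIT_MEMLOCK, unless the caller holds
		 * CAP_IPC_LOCK (or perf isn't paranoid about it) */
		if (mm_pinned_vm + pinned > memlock_limit) {
			printf("-EPERM\n");
			return 1;
		}
	}

	printf("locked_vm += %lu pages, pinned_vm += %lu pages\n",
	       nr_pages, pinned);
	return 0;
}

With these numbers it prints "locked_vm += 1025 pages, pinned_vm += 609
pages": the 516-page sysctl allowance stays against the user's locked_vm
and only the overflow is pinned on the mm, which is exactly the split
that ring_buffer_unaccount() later undoes.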

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9389e27cb0..24099ed9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5122,6 +5122,8 @@ void ring_buffer_put(struct ring_buffer *rb)
if (!atomic_dec_and_test(&rb->refcount))
return;

+ ring_buffer_unaccount(rb, false);
+
WARN_ON_ONCE(!list_empty(&rb->event_list));

call_rcu(&rb->rcu_head, rb_free_rcu);
@@ -5156,9 +5158,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;

struct ring_buffer *rb = ring_buffer_get(event);
- struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);

if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5178,11 +5177,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
perf_pmu_output_stop(event);

- /* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
-
- /* this has to be the last one */
+ /* now it's safe to free the pages; this has to be the last one */
rb_free_aux(rb);
WARN_ON_ONCE(atomic_read(&rb->aux_refcount));

@@ -5243,19 +5238,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
}
rcu_read_unlock();

- /*
- * It could be there's still a few 0-ref events on the list; they'll
- * get cleaned up by free_event() -- they'll also still have their
- * ref on the rb and will free it whenever they are done with it.
- *
- * Aside from that, this buffer is 'fully' detached and unmapped,
- * undo the VM accounting.
- */
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
- free_uid(mmap_user);
-
out_put:
ring_buffer_put(rb); /* could be last */
}
@@ -5270,13 +5252,9 @@ static const struct vm_operations_struct perf_mmap_vmops = {
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
- struct user_struct *user = current_user();
- unsigned long locked, lock_limit;
struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
- long user_extra = 0, extra = 0;
int ret = 0, flags = 0;

/*
@@ -5347,7 +5325,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
}

atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;

goto accounting;
}
@@ -5384,49 +5361,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
goto unlock;
}

- user_extra = nr_pages + 1;
-
accounting:
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
-
- /*
- * Increase the limit linearly with more CPUs:
- */
- user_lock_limit *= num_online_cpus();
-
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
- if (user_locked > user_lock_limit)
- extra = user_locked - user_lock_limit;
-
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
-
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
-
WARN_ON(!rb && event->rb);

if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;

if (!rb) {
- rb = rb_alloc(nr_pages,
+ rb = rb_alloc(vma->vm_mm, nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);

- if (!rb) {
- ret = -ENOMEM;
+ if (IS_ERR(rb)) {
+ ret = PTR_ERR(rb);
+ rb = NULL;
goto unlock;
}

atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;

ring_buffer_attach(event, rb);

@@ -5435,15 +5387,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
- rb->aux_mmap_locked = extra;
}

unlock:
if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
-
atomic_inc(&event->mmap_count);
} else if (rb) {
atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e970473..3e603c45eb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -36,6 +36,7 @@ struct ring_buffer {
atomic_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
+ struct mm_struct *mmap_mapping;

/* AUX area */
long aux_head;
@@ -56,6 +57,7 @@ struct ring_buffer {
};

extern void rb_free(struct ring_buffer *rb);
+extern void ring_buffer_unaccount(struct ring_buffer *rb, bool aux);

static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
@@ -74,7 +76,8 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
}

extern struct ring_buffer *
-rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+ int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12..d36f169cae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -505,6 +505,88 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}

+/*
+ * Check if the current user can afford @nr_pages, considering the
+ * perf_event_mlock sysctl and their mlock limit. If the former is exceeded,
+ * pin the remainder on their mm; if the latter is not sufficient either,
+ * error out. Otherwise, keep track of the pages used in the ring_buffer so
+ * that the accounting can be undone when the pages are freed.
+ */
+static int ring_buffer_account(struct ring_buffer *rb, struct mm_struct *mm,
+ unsigned long nr_pages, bool aux)
+{
+ unsigned long total, limit, pinned;
+
+ if (!mm)
+ mm = rb->mmap_mapping;
+
+ rb->mmap_user = current_user();
+
+ limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ limit *= num_online_cpus();
+
+ total = atomic_long_read(&rb->mmap_user->locked_vm) + nr_pages;
+
+ pinned = 0;
+ if (total > limit) {
+ /*
+ * Everything that's over the sysctl_perf_event_mlock
+ * limit needs to be accounted to the consumer's mm.
+ */
+ if (!mm)
+ return -EPERM;
+
+ pinned = total - limit;
+
+ limit = rlimit(RLIMIT_MEMLOCK);
+ limit >>= PAGE_SHIFT;
+ total = mm->pinned_vm + pinned;
+
+ if ((total > limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
+ return -EPERM;
+ }
+
+ if (aux)
+ rb->aux_mmap_locked = pinned;
+ else
+ rb->mmap_locked = pinned;
+
+ mm->pinned_vm += pinned;
+ }
+
+ if (!rb->mmap_mapping)
+ rb->mmap_mapping = mm;
+
+ /* account for user page */
+ if (!aux)
+ nr_pages++;
+
+ rb->mmap_user = get_current_user();
+ atomic_long_add(nr_pages, &rb->mmap_user->locked_vm);
+
+ return 0;
+}
+
+/*
+ * Undo the mlock pages accounting done in ring_buffer_account().
+ */
+void ring_buffer_unaccount(struct ring_buffer *rb, bool aux)
+{
+ unsigned long nr_pages = aux ? rb->aux_nr_pages : rb->nr_pages + 1;
+ unsigned long pinned = aux ? rb->aux_mmap_locked : rb->mmap_locked;
+
+ atomic_long_sub(nr_pages, &rb->mmap_user->locked_vm);
+ if (rb->mmap_mapping)
+ rb->mmap_mapping->pinned_vm -= pinned;
+
+ free_uid(rb->mmap_user);
+}
+
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)

static struct page *rb_alloc_aux_page(int node, int order)
@@ -574,11 +656,16 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order = 0;
+ int ret, max_order = 0;

if (!has_aux(event))
return -EOPNOTSUPP;

+ ret = ring_buffer_account(rb, NULL, nr_pages, true);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
/*
* We need to start with the max_order that fits in nr_pages,
@@ -593,7 +680,9 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
!overwrite) {
- if (!max_order)
- return -EINVAL;
+ if (!max_order) {
+ ret = -EINVAL;
+ goto out;
+ }

max_order--;
}
@@ -654,18 +743,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);

out:
- if (!ret)
+ if (!ret) {
rb->aux_pgoff = pgoff;
- else
+ } else {
+ ring_buffer_unaccount(rb, true);
__rb_free_aux(rb);
+ }

return ret;
}

void rb_free_aux(struct ring_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (atomic_dec_and_test(&rb->aux_refcount)) {
+ ring_buffer_unaccount(rb, true);
+
__rb_free_aux(rb);
+ }
}

#ifndef CONFIG_PERF_USE_VMALLOC
@@ -699,22 +793,25 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}

-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+ int cpu, int flags)
{
+ unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
struct ring_buffer *rb;
- unsigned long size;
- int i;
-
- size = sizeof(struct ring_buffer);
- size += nr_pages * sizeof(void *);
+ int i, ret = -ENOMEM;

rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;

+ ret = ring_buffer_account(rb, mm, nr_pages, false);
+ if (ret)
+ goto fail_free_rb;
+
+ ret = -ENOMEM;
rb->user_page = perf_mmap_alloc_page(cpu);
if (!rb->user_page)
- goto fail_user_page;
+ goto fail_unaccount;

for (i = 0; i < nr_pages; i++) {
rb->data_pages[i] = perf_mmap_alloc_page(cpu);
@@ -734,11 +831,14 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)

free_page((unsigned long)rb->user_page);

-fail_user_page:
+fail_unaccount:
+ ring_buffer_unaccount(rb, false);
+
+fail_free_rb:
kfree(rb);

fail:
- return NULL;
+ return ERR_PTR(ret);
}

static void perf_mmap_free_page(unsigned long addr)
@@ -805,19 +905,23 @@ void rb_free(struct ring_buffer *rb)
schedule_work(&rb->work);
}

-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
+ int cpu, int flags)
{
+ unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
struct ring_buffer *rb;
- unsigned long size;
void *all_buf;
-
- size = sizeof(struct ring_buffer);
- size += sizeof(void *);
+ int ret = -ENOMEM;

rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;

+ ret = ring_buffer_account(rb, mm, nr_pages, false);
+ if (ret)
+ goto fail_free;
+
+ ret = -ENOMEM;
INIT_WORK(&rb->work, rb_free_work);

all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
@@ -836,10 +940,13 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
return rb;

fail_all_buf:
+ ring_buffer_unaccount(rb, false);
+
+fail_free:
kfree(rb);

fail:
- return NULL;
+ return ERR_PTR(ret);
}

#endif
--
2.14.1