[169/200] perf_events: Fix races and clean up perf_event and perf_mmap_data interaction

From: Greg KH
Date: Thu Jul 01 2010 - 17:33:15 EST


2.6.34-stable review patch. If anyone has any objections, please let me know.

------------------

From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>

commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318 upstream.

In order to move toward separate buffer objects, rework the whole
perf_mmap_data construct to be a more self-sufficient entity, one
with its own lifetime rules.

This greatly sanitizes the whole output redirection code, which
was riddled with bugs and races.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxx>


---
include/linux/perf_event.h | 5 -
kernel/perf_event.c | 217 ++++++++++++++++++++++++++-------------------
2 files changed, 129 insertions(+), 93 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -531,6 +531,7 @@ enum perf_event_active_state {
struct file;

struct perf_mmap_data {
+ atomic_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
@@ -538,7 +539,6 @@ struct perf_mmap_data {
int data_order;
int nr_pages; /* nr of data pages */
int writable; /* are we writable */
- int nr_locked; /* nr pages mlocked */

atomic_t poll; /* POLL_ for wakeups */
atomic_t events; /* event_id limit */
@@ -582,7 +582,6 @@ struct perf_event {
int nr_siblings;
int group_flags;
struct perf_event *group_leader;
- struct perf_event *output;
const struct pmu *pmu;

enum perf_event_active_state state;
@@ -643,6 +642,8 @@ struct perf_event {
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
+ int mmap_locked;
+ struct user_struct *mmap_user;
struct perf_mmap_data *data;

/* poll related */
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1832,6 +1832,7 @@ static void free_event_rcu(struct rcu_he
}

static void perf_pending_sync(struct perf_event *event);
+static void perf_mmap_data_put(struct perf_mmap_data *data);

static void free_event(struct perf_event *event)
{
@@ -1847,9 +1848,9 @@ static void free_event(struct perf_event
atomic_dec(&nr_task_events);
}

- if (event->output) {
- fput(event->output->filp);
- event->output = NULL;
+ if (event->data) {
+ perf_mmap_data_put(event->data);
+ event->data = NULL;
}

if (event->destroy)
@@ -2154,7 +2155,27 @@ unlock:
return ret;
}

-static int perf_event_set_output(struct perf_event *event, int output_fd);
+static const struct file_operations perf_fops;
+
+static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+{
+ struct file *file;
+
+ file = fget_light(fd, fput_needed);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &perf_fops) {
+ fput_light(file, *fput_needed);
+ *fput_needed = 0;
+ return ERR_PTR(-EBADF);
+ }
+
+ return file->private_data;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2181,7 +2202,23 @@ static long perf_ioctl(struct file *file
return perf_event_period(event, (u64 __user *)arg);

case PERF_EVENT_IOC_SET_OUTPUT:
- return perf_event_set_output(event, arg);
+ {
+ struct perf_event *output_event = NULL;
+ int fput_needed = 0;
+ int ret;
+
+ if (arg != -1) {
+ output_event = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_event))
+ return PTR_ERR(output_event);
+ }
+
+ ret = perf_event_set_output(event, output_event);
+ if (output_event)
+ fput_light(output_event->filp, fput_needed);
+
+ return ret;
+ }

case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
@@ -2306,8 +2343,6 @@ perf_mmap_data_alloc(struct perf_event *
unsigned long size;
int i;

- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += nr_pages * sizeof(void *);

@@ -2414,8 +2449,6 @@ perf_mmap_data_alloc(struct perf_event *
unsigned long size;
void *all_buf;

- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += sizeof(void *);

@@ -2495,7 +2528,7 @@ perf_mmap_data_init(struct perf_event *e
if (!data->watermark)
data->watermark = max_size / 2;

-
+ atomic_set(&data->refcount, 1);
rcu_assign_pointer(event->data, data);
}

@@ -2507,13 +2540,26 @@ static void perf_mmap_data_free_rcu(stru
perf_mmap_data_free(data);
}

-static void perf_mmap_data_release(struct perf_event *event)
+static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
{
- struct perf_mmap_data *data = event->data;
+ struct perf_mmap_data *data;
+
+ rcu_read_lock();
+ data = rcu_dereference(event->data);
+ if (data) {
+ if (!atomic_inc_not_zero(&data->refcount))
+ data = NULL;
+ }
+ rcu_read_unlock();

- WARN_ON(atomic_read(&event->mmap_count));
+ return data;
+}
+
+static void perf_mmap_data_put(struct perf_mmap_data *data)
+{
+ if (!atomic_dec_and_test(&data->refcount))
+ return;

- rcu_assign_pointer(event->data, NULL);
call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}

@@ -2528,15 +2574,18 @@ static void perf_mmap_close(struct vm_ar
{
struct perf_event *event = vma->vm_file->private_data;

- WARN_ON_ONCE(event->ctx->parent_ctx);
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
unsigned long size = perf_data_size(event->data);
- struct user_struct *user = current_user();
+ struct user_struct *user = event->mmap_user;
+ struct perf_mmap_data *data = event->data;

atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->locked_vm -= event->data->nr_locked;
- perf_mmap_data_release(event);
+ vma->vm_mm->locked_vm -= event->mmap_locked;
+ rcu_assign_pointer(event->data, NULL);
mutex_unlock(&event->mmap_mutex);
+
+ perf_mmap_data_put(data);
+ free_uid(user);
}
}

@@ -2580,13 +2629,10 @@ static int perf_mmap(struct file *file,

WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->output) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (atomic_inc_not_zero(&event->mmap_count)) {
- if (nr_pages != event->data->nr_pages)
+ if (event->data) {
+ if (event->data->nr_pages == nr_pages)
+ atomic_inc(&event->data->refcount);
+ else
ret = -EINVAL;
goto unlock;
}
@@ -2618,21 +2664,23 @@ static int perf_mmap(struct file *file,
WARN_ON(event->data);

data = perf_mmap_data_alloc(event, nr_pages);
- ret = -ENOMEM;
- if (!data)
+ if (!data) {
+ ret = -ENOMEM;
goto unlock;
+ }

- ret = 0;
perf_mmap_data_init(event, data);
-
- atomic_set(&event->mmap_count, 1);
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->locked_vm += extra;
- event->data->nr_locked = extra;
if (vma->vm_flags & VM_WRITE)
event->data->writable = 1;

+ atomic_long_add(user_extra, &user->locked_vm);
+ event->mmap_locked = extra;
+ event->mmap_user = get_current_user();
+ vma->vm_mm->locked_vm += event->mmap_locked;
+
unlock:
+ if (!ret)
+ atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);

vma->vm_flags |= VM_RESERVED;
@@ -2962,7 +3010,6 @@ int perf_output_begin(struct perf_output
struct perf_event *event, unsigned int size,
int nmi, int sample)
{
- struct perf_event *output_event;
struct perf_mmap_data *data;
unsigned long tail, offset, head;
int have_lost;
@@ -2979,10 +3026,6 @@ int perf_output_begin(struct perf_output
if (event->parent)
event = event->parent;

- output_event = rcu_dereference(event->output);
- if (output_event)
- event = output_event;
-
data = rcu_dereference(event->data);
if (!data)
goto out;
@@ -4746,54 +4789,41 @@ err_size:
goto out;
}

-static int perf_event_set_output(struct perf_event *event, int output_fd)
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_event *output_event = NULL;
- struct file *output_file = NULL;
- struct perf_event *old_output;
- int fput_needed = 0;
+ struct perf_mmap_data *data = NULL, *old_data = NULL;
int ret = -EINVAL;

- if (!output_fd)
+ if (!output_event)
goto set;

- output_file = fget_light(output_fd, &fput_needed);
- if (!output_file)
- return -EBADF;
-
- if (output_file->f_op != &perf_fops)
- goto out;
-
- output_event = output_file->private_data;
-
- /* Don't chain output fds */
- if (output_event->output)
- goto out;
-
- /* Don't set an output fd when we already have an output channel */
- if (event->data)
+ /* don't allow circular references */
+ if (event == output_event)
goto out;

- atomic_long_inc(&output_file->f_count);
-
set:
mutex_lock(&event->mmap_mutex);
- old_output = event->output;
- rcu_assign_pointer(event->output, output_event);
- mutex_unlock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;

- if (old_output) {
- /*
- * we need to make sure no existing perf_output_*()
- * is still referencing this event.
- */
- synchronize_rcu();
- fput(old_output->filp);
+ if (output_event) {
+ /* get the buffer we want to redirect to */
+ data = perf_mmap_data_get(output_event);
+ if (!data)
+ goto unlock;
}

+ old_data = event->data;
+ rcu_assign_pointer(event->data, data);
ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+ if (old_data)
+ perf_mmap_data_put(old_data);
out:
- fput_light(output_file, fput_needed);
return ret;
}

@@ -4809,7 +4839,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
- struct perf_event *event, *group_leader;
+ struct perf_event *event, *group_leader = NULL, *output_event = NULL;
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
@@ -4840,6 +4870,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;

+ if (group_fd != -1) {
+ group_leader = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_leader)) {
+ err = PTR_ERR(group_leader);
+ goto err_put_context;
+ }
+ group_file = group_leader->filp;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
+
/*
* Get the target context (task or percpu):
*/
@@ -4852,16 +4895,9 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Look up the group leader (we will attach this event to it):
*/
- group_leader = NULL;
- if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ if (group_leader) {
err = -EINVAL;
- group_file = fget_light(group_fd, &fput_needed);
- if (!group_file)
- goto err_put_context;
- if (group_file->f_op != &perf_fops)
- goto err_put_context;

- group_leader = group_file->private_data;
/*
* Do not allow a recursive hierarchy (this new sibling
* becoming part of another group-sibling):
@@ -4883,9 +4919,16 @@ SYSCALL_DEFINE5(perf_event_open,

event = perf_event_alloc(&attr, cpu, ctx, group_leader,
NULL, NULL, GFP_KERNEL);
- err = PTR_ERR(event);
- if (IS_ERR(event))
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
goto err_put_context;
+ }
+
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_free_put_context;
+ }

event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
if (IS_ERR(event_file)) {
@@ -4893,12 +4936,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_free_put_context;
}

- if (flags & PERF_FLAG_FD_OUTPUT) {
- err = perf_event_set_output(event, group_fd);
- if (err)
- goto err_fput_free_put_context;
- }
-
event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
@@ -4916,8 +4953,6 @@ SYSCALL_DEFINE5(perf_event_open,
fd_install(event_fd, event_file);
return event_fd;

-err_fput_free_put_context:
- fput(event_file);
err_free_put_context:
free_event(event);
err_put_context:


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/