[tip:perf/core] perf core: Allow setting up max frame stack depth via sysctl

From: tip-bot for Arnaldo Carvalho de Melo
Date: Wed Apr 27 2016 - 11:41:13 EST


Commit-ID: c5dfd78eb79851e278b7973031b9ca363da87a7e
Gitweb: http://git.kernel.org/tip/c5dfd78eb79851e278b7973031b9ca363da87a7e
Author: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
AuthorDate: Thu, 21 Apr 2016 12:28:50 -0300
Committer: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
CommitDate: Wed, 27 Apr 2016 10:20:39 -0300

perf core: Allow setting up max frame stack depth via sysctl

The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.

And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place tho.

The new file is:

# cat /proc/sys/kernel/perf_event_max_stack
127

Chaging it:

# echo 256 > /proc/sys/kernel/perf_event_max_stack
# cat /proc/sys/kernel/perf_event_max_stack
256

But as soon as there is some event using callchains we get:

# echo 512 > /proc/sys/kernel/perf_event_max_stack
-bash: echo: write error: Device or resource busy
#

Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, its just a matter
of having no callchain users at that point.

Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@xxxxxxxxx>
Reviewed-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Acked-by: Alexei Starovoitov <ast@xxxxxxxxxx>
Acked-by: David Ahern <dsahern@xxxxxxxxx>
Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: He Kuang <hekuang@xxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Masami Hiramatsu <mhiramat@xxxxxxxxxx>
Cc: Milian Wolff <milian.wolff@xxxxxxxx>
Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Vince Weaver <vincent.weaver@xxxxxxxxx>
Cc: Wang Nan <wangnan0@xxxxxxxxxx>
Cc: Zefan Li <lizefan@xxxxxxxxxx>
Link: http://lkml.kernel.org/r/20160426002928.GB16708@xxxxxxxxxx
Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
---
Documentation/sysctl/kernel.txt | 14 ++++++++++++++
arch/arm/kernel/perf_callchain.c | 2 +-
arch/arm64/kernel/perf_callchain.c | 4 ++--
arch/metag/kernel/perf_callchain.c | 2 +-
arch/mips/kernel/perf_event.c | 4 ++--
arch/powerpc/perf/callchain.c | 4 ++--
arch/sparc/kernel/perf_event.c | 6 +++---
arch/x86/events/core.c | 4 ++--
arch/xtensa/kernel/perf_event.c | 4 ++--
include/linux/perf_event.h | 8 ++++++--
kernel/bpf/stackmap.c | 8 ++++----
kernel/events/callchain.c | 35 +++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 12 ++++++++++++
13 files changed, 84 insertions(+), 23 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a4..260cde0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1.

==============================================================

+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:

PID allocation wrap value. When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5..27563be 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)

tail = (struct frame_tail __user *)regs->ARM_fp - 1;

- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff46654..32c3c6e 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,

tail = (struct frame_tail __user *)regs->regs[29];

- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,

tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;

- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 3156334..252abc1 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)

--frame;

- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}

diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6..5021c54 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a675..22d9015 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);

- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);

- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66..a4b8b5a 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}

static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}

static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}

void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442..41d93d0 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)

fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;

pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f0118..a6b00b3 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}

void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae..a090700 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {

struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};

struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);

+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);

+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);

static inline bool perf_paranoid_tracepoint_raw(void)
{
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e9..f5a1954 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);

/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;

/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f..b9325e7 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};

+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;

- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;

for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)

cpu = smp_processor_id();

- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}

static void
@@ -215,3 +224,25 @@ exit_put:

return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f..c8b3186 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif

/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &six_hundred_forty_kb,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{