Introduce helper bpf_get_task_stack(), which dumps stack trace of given
task. This is different to bpf_get_stack(), which gets stack track of
current task. One potential use case of bpf_get_task_stack() is to call
it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
bpf_get_task_stack() uses stack_trace_save_tsk() instead of
get_perf_callchain() for kernel stack. The benefit of this choice is that
stack_trace_save_tsk() doesn't require changes in arch/. The downside of
using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
stack trace to unsigned long array. For 32-bit systems, we need to
translate it to u64 array.
Signed-off-by: Song Liu <songliubraving@xxxxxx>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 35 ++++++++++++++-
kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
kernel/trace/bpf_trace.c | 2 +
scripts/bpf_helpers_doc.py | 2 +
tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
6 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 07052d44bca1c..cee31ee56367b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1607,6 +1607,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 19684813faaed..7638412987354 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3252,6 +3252,38 @@ union bpf_attr {
* case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
* is returned or the error code -EACCES in case the skb is not
* subject to CHECKSUM_UNNECESSARY.
+ *
+ * int bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ * Description
+ * Return a user or a kernel stack in bpf program provided buffer.
+ * To achieve this, the helper needs *task*, which is a valid
+ * pointer to struct task_struct. To store the stacktrace, the
+ * bpf program provides *buf* with a nonnegative *size*.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID**
+ * Collect buildid+offset instead of ips for user stack,
+ * only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * **bpf_get_task_stack**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * to sufficient large buffer size. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * A non-negative value equal to or less than *size* on success,
+ * or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3389,7 +3421,8 @@ union bpf_attr {
FN(ringbuf_submit), \
FN(ringbuf_discard), \
FN(ringbuf_query), \
- FN(csum_level),
+ FN(csum_level), \
+ FN(get_task_stack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 599488f25e404..64b7843057a23 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -348,6 +348,44 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
}
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+ struct perf_callchain_entry *entry;
+ int rctx;
+
+ entry = get_callchain_entry(&rctx);
+
+ if (rctx == -1)
+ return NULL;
+[...]
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = init_nr +
+ stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
+ sysctl_perf_event_max_stack - init_nr, 0);
+
+ /* stack_trace_save_tsk() works on unsigned long array, while
+ * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+ * necessary to fix this mismatch.
+ */
+ if (__BITS_PER_LONG != 64) {
+ unsigned long *from = (unsigned long *) entry->ip;
+ u64 *to = entry->ip;
+ int i;
+
+ /* copy data from the end to avoid using extra buffer */
+ for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ to[i] = (u64)(from[i]);
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
@@ -448,8 +486,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
- u64, flags)