[RFC PATCH 11/11] dtrace: make use of writable buffers in BPF

From: Kris Van Hees
Date: Tue May 21 2019 - 16:43:29 EST


This commit modifies the tiny proof-of-concept DTrace utility to use
the writable-buffer support in BPF along with the new helpers for
buffer reservation and commit. The dtrace_finalize_context() helper
is updated to clear the buffer pointer (setting it to NULL, with size
0), and is therefore now marked with ctx_update.
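
For reference, the fixed 48-byte record that probe1_bpf.c writes into
the reserved buffer (and that dt_buffer.c parses by raw byte offset)
corresponds to the layout sketched below. The struct is illustrative
only (it is not part of the patch); the field names follow the
consumer code:

	struct dt_sample {		/* illustrative, not in the patch */
		u32	probe_id;	/* offset  0: ECB probe ID */
		u32	flags;		/* offset  4: always 0 for now */
		u32	pid;		/* offset  8 */
		u32	ppid;		/* offset 12 */
		u32	cpu;		/* offset 16 */
		u32	euid;		/* offset 20 */
		u32	egid;		/* offset 24 */
		u32	pad0;		/* offset 28: hole before the u64 */
		u64	task;		/* offset 32 */
		u32	tag;		/* offset 40: 0xdace marker */
		u32	pad1;		/* offset 44: pads the record to 48 */
	};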

Signed-off-by: Kris Van Hees <kris.van.hees@xxxxxxxxxx>
Reviewed-by: Nick Alcock <nick.alcock@xxxxxxxxxx>
---
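Both new helpers are marked ctx_update since they modify the buffer
fields in the context; accordingly, the example program loads ctx->buf
and ctx->buf_end only after calling bpf_buffer_reserve(). As with the
packet data/data_end pair, the explicit bounds check against
ctx->buf_end is what lets stores through the buffer pointer pass the
verifier (the buf and buf_end fields are mapped to PTR_TO_BUFFER and
PTR_TO_BUFFER_END in dtrace_is_valid_access() below). A minimal sketch
of the producer-side flow, with BUF_ID as defined in probe1_bpf.c:

	if (bpf_buffer_reserve(ctx, BUF_ID, &buffer_map, 48) < 0)
		return 0;		/* no buffer space: drop the data */

	buf = ctx->buf;			/* reload after the ctx update */
	buf_end = ctx->buf_end;		/* ctx->buf + ctx->buf_len */
	if (buf + 48 > buf_end)		/* satisfies the verifier */
		return 0;

	/* ... store the 48-byte record through buf ... */

	bpf_buffer_commit(ctx, BUF_ID, &buffer_map);
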
 include/uapi/linux/dtrace.h |   4 +
 kernel/trace/dtrace/bpf.c   | 150 ++++++++++++++++++++++++++++++++++++
 tools/dtrace/dt_buffer.c    |  54 +++++--------
 tools/dtrace/probe1_bpf.c   |  47 ++++++-----
 4 files changed, 198 insertions(+), 57 deletions(-)

diff --git a/include/uapi/linux/dtrace.h b/include/uapi/linux/dtrace.h
index bbe2562c11f2..3fcc075a429f 100644
--- a/include/uapi/linux/dtrace.h
+++ b/include/uapi/linux/dtrace.h
@@ -33,6 +33,10 @@ struct dtrace_bpf_context {
u32 gid; /* from_kgid(&init_user_ns, current_real_cred()->gid) */
u32 euid; /* from_kuid(&init_user_ns, current_real_cred()->euid) */
u32 egid; /* from_kgid(&init_user_ns, current_real_cred()->egid) */
+
+ /* General output buffer */
+ __bpf_md_ptr(u8 *, buf);
+ __bpf_md_ptr(u8 *, buf_end);
};

/*
diff --git a/kernel/trace/dtrace/bpf.c b/kernel/trace/dtrace/bpf.c
index 95f4103d749e..93bd2f0319cc 100644
--- a/kernel/trace/dtrace/bpf.c
+++ b/kernel/trace/dtrace/bpf.c
@@ -7,6 +7,7 @@
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
+#include <linux/perf_event.h>

/*
* Actual kernel definition of the DTrace BPF context.
@@ -16,6 +17,9 @@ struct dtrace_bpf_ctx {
u32 ecb_id;
u32 probe_id;
struct task_struct *task;
+ struct perf_output_handle handle;
+ u64 buf_len;
+ u8 *buf;
};

/*
@@ -55,6 +59,8 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx,

ctx->ecb_id = ecb->id;
ctx->probe_id = ecb->probe_id;
+ ctx->buf_len = 0;
+ ctx->buf = NULL;

return 0;
}
@@ -62,17 +68,119 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx,
static const struct bpf_func_proto dtrace_finalize_context_proto = {
.func = dtrace_finalize_context,
.gpl_only = false,
+ .ctx_update = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX, /* ctx */
.arg2_type = ARG_CONST_MAP_PTR, /* map */
};

+BPF_CALL_4(dtrace_buffer_reserve, struct dtrace_bpf_ctx *, ctx,
+ int, id, struct bpf_map *, map, int, size)
+{
+ struct bpf_array *arr = container_of(map, struct bpf_array, map);
+ int cpu = smp_processor_id();
+ struct bpf_event_entry *ee;
+ struct perf_event *ev;
+ int err;
+
+ /*
+ * Make sure the writable-buffer id is valid. We use the default, which
+ * is the offset of the start-of-buffer pointer in the public context.
+ */
+ if (id != offsetof(struct dtrace_bpf_context, buf))
+ return -EINVAL;
+
+ /*
+ * Verify whether we have an uncommitted reserve. If so, we deny this
+ * request.
+ */
+ if (ctx->handle.rb)
+ return -EBUSY;
+
+ /*
+ * Perform sanity checks.
+ */
+ if (cpu >= arr->map.max_entries)
+ return -E2BIG;
+ ee = READ_ONCE(arr->ptrs[cpu]);
+ if (!ee)
+ return -ENOENT;
+ ev = ee->event;
+ if (unlikely(ev->attr.type != PERF_TYPE_SOFTWARE ||
+ ev->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+ return -EINVAL;
+ if (unlikely(ev->oncpu != cpu))
+ return -EOPNOTSUPP;
+
+ size = round_up(size, sizeof(u64));
+
+ err = perf_output_begin_forward_in_page(&ctx->handle, ev, size);
+ if (err < 0)
+ return err;
+
+ ctx->buf_len = size;
+ ctx->buf = ctx->handle.addr;
+
+ return 0;
+}
+
+static const struct bpf_func_proto dtrace_buffer_reserve_proto = {
+ .func = dtrace_buffer_reserve,
+ .gpl_only = false,
+ .ctx_update = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* ctx */
+ .arg2_type = ARG_ANYTHING, /* id */
+ .arg3_type = ARG_CONST_MAP_PTR, /* map */
+ .arg4_type = ARG_ANYTHING, /* size */
+};
+
+BPF_CALL_3(dtrace_buffer_commit, struct dtrace_bpf_ctx *, ctx,
+ int, id, struct bpf_map *, map)
+{
+ /*
+ * Make sure the writable-buffer id is valid. We use the default, which
+ * is the offset of the start-of-buffer pointer in the public context.
+ */
+ if (id != offsetof(struct dtrace_bpf_context, buf))
+ return -EINVAL;
+
+ /*
+ * Verify that we have an uncommitted reserve. If not, there is really
+ * nothing to be done here.
+ */
+ if (!ctx->handle.rb)
+ return 0;
+
+ perf_output_end(&ctx->handle);
+
+ ctx->handle.rb = NULL;
+ ctx->buf_len = 0;
+ ctx->buf = NULL;
+
+ return 0;
+}
+
+static const struct bpf_func_proto dtrace_buffer_commit_proto = {
+ .func = dtrace_buffer_commit,
+ .gpl_only = false,
+ .ctx_update = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* ctx */
+ .arg2_type = ARG_ANYTHING, /* id */
+ .arg3_type = ARG_CONST_MAP_PTR, /* map */
+};
+
static const struct bpf_func_proto *
dtrace_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_finalize_context:
return &dtrace_finalize_context_proto;
+ case BPF_FUNC_buffer_reserve:
+ return &dtrace_buffer_reserve_proto;
+ case BPF_FUNC_buffer_commit:
+ return &dtrace_buffer_commit_proto;
case BPF_FUNC_perf_event_output:
return bpf_get_perf_event_output_proto();
case BPF_FUNC_trace_printk:
@@ -131,6 +239,22 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
if (bpf_ctx_narrow_access_ok(off, size, sizeof(u32)))
return true;
break;
+ case bpf_ctx_range(struct dtrace_bpf_context, buf):
+ info->reg_type = PTR_TO_BUFFER;
+ info->buf_id = offsetof(struct dtrace_bpf_context, buf);
+
+ bpf_ctx_record_field_size(info, sizeof(u64));
+ if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64)))
+ return true;
+ break;
+ case bpf_ctx_range(struct dtrace_bpf_context, buf_end):
+ info->reg_type = PTR_TO_BUFFER_END;
+ info->buf_id = offsetof(struct dtrace_bpf_context, buf);
+
+ bpf_ctx_record_field_size(info, sizeof(u64));
+ if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64)))
+ return true;
+ break;
default:
if (size == sizeof(unsigned long))
return true;
@@ -152,6 +276,10 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
* si->dst_reg = ((type *)si->src_reg)->member
* target_size = sizeof(((type *)si->src_reg)->member)
*
+ * BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size)
+ * dst = ((type *)si->src_reg)->member
+ * target_size = sizeof(((type *)si->src_reg)->member)
+ *
* BPF_LDX_LNK_FIELD(type, member, si, target_size)
* si->dst_reg = ((type *)si->dst_reg)->member
* target_size = sizeof(((type *)si->dst_reg)->member)
@@ -172,6 +300,13 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
*(target_size) = FIELD_SIZEOF(type, member); \
offsetof(type, member); \
}))
+#define BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
+ (dst), (si)->src_reg, \
+ ({ \
+ *(target_size) = FIELD_SIZEOF(type, member); \
+ offsetof(type, member); \
+ }))
#define BPF_LDX_LNK_FIELD(type, member, si, target_size) \
BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
(si)->dst_reg, (si)->dst_reg, \
@@ -261,6 +396,18 @@ static u32 dtrace_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_LNK_PTR(struct task_struct, cred, si);
*insn++ = BPF_LDX_LNK_FIELD(struct cred, egid, si, target_size);
break;
+ case offsetof(struct dtrace_bpf_context, buf):
+ *insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+ target_size);
+ break;
+ case offsetof(struct dtrace_bpf_context, buf_end):
+ /* buf_end = ctx->buf + ctx->buf_len */
+ *insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+ target_size);
+ *insn++ = BPF_LDX_CTX_FIELD_DST(struct dtrace_bpf_ctx, buf_len,
+ BPF_REG_AX, si, target_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+ break;
default:
*insn++ = BPF_LDX_CTX_PTR(struct dtrace_bpf_ctx, regs, si);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
@@ -308,6 +455,9 @@ static void *dtrace_convert_ctx(enum bpf_prog_type stype, void *ctx)
gctx = this_cpu_ptr(&dtrace_ctx);
gctx->regs = (struct pt_regs *)ctx;
gctx->task = current;
+ gctx->handle.rb = NULL;
+ gctx->buf_len = 0;
+ gctx->buf = NULL;

return gctx;
}
diff --git a/tools/dtrace/dt_buffer.c b/tools/dtrace/dt_buffer.c
index 65c107ca8ac4..28fac9036d69 100644
--- a/tools/dtrace/dt_buffer.c
+++ b/tools/dtrace/dt_buffer.c
@@ -282,33 +282,27 @@ static void write_rb_tail(volatile struct perf_event_mmap_page *rb_page,
*/
static int output_event(u64 *buf)
{
- u8 *data = (u8 *)buf;
- struct perf_event_header *hdr;
- u32 size;
- u64 probe_id, task;
- u32 pid, ppid, cpu, euid, egid, tag;
+ u8 *data = (u8 *)buf;
+ u32 probe_id;
+ u32 flags;
+ u64 task;
+ u32 pid, ppid, cpu, euid, egid, tag;

- hdr = (struct perf_event_header *)data;
- data += sizeof(struct perf_event_header);
+ probe_id = *(u32 *)&(data[0]);

- if (hdr->type != PERF_RECORD_SAMPLE)
- return 1;
+ if (probe_id == PERF_RECORD_LOST) { /* first word aliases header.type */
+ u16 size;
+ u64 lost;

- size = *(u32 *)data;
- data += sizeof(u32);
+ size = *(u16 *)&(data[6]);
+ lost = *(u64 *)&(data[16]);

- /*
- * The sample should only take up 48 bytes, but as a result of how the
- * BPF program stores the data (filling in a struct that resides on the
- * stack, and sending that off using bpf_perf_event_output()), there is
- * some internal padding
- */
- if (size != 52) {
- printf("Sample size is wrong (%d vs expected %d)\n", size, 52);
- goto out;
+ printf("[%ld probes dropped]\n", lost);
+
+ return size;
}

- probe_id = *(u64 *)&(data[0]);
+ flags = *(u32 *)&(data[4]);
pid = *(u32 *)&(data[8]);
ppid = *(u32 *)&(data[12]);
cpu = *(u32 *)&(data[16]);
@@ -318,19 +312,14 @@ static int output_event(u64 *buf)
tag = *(u32 *)&(data[40]);

if (probe_id != 123)
- printf("Corrupted data (probe_id = %ld)\n", probe_id);
+ printf("Corrupted data (probe_id = %d)\n", probe_id);
if (tag != 0xdace)
printf("Corrupted data (tag = %x)\n", tag);

- printf("CPU-%d: EPID %ld PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
- cpu, probe_id, pid, ppid, euid, egid, task);
+ printf("CPU-%d: [%d/%d] PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
+ cpu, probe_id, flags, pid, ppid, euid, egid, task);

-out:
- /*
- * We processed the perf_event_header, the size, and ;size; bytes of
- * probe data.
- */
- return sizeof(struct perf_event_header) + sizeof(u32) + size;
+ return 48;
}

/*
@@ -351,10 +340,9 @@ static void process_data(struct dtrace_buffer *buf)

/*
* Ensure that the buffer contains enough data for at least one
- * sample (header + sample size + sample data).
+ * sample.
*/
- if (head - tail < sizeof(struct perf_event_header) +
- sizeof(u32) + 48)
+ if (head - tail < 48)
break;

if (*ptr)
diff --git a/tools/dtrace/probe1_bpf.c b/tools/dtrace/probe1_bpf.c
index 5b34edb61412..a3196261e66e 100644
--- a/tools/dtrace/probe1_bpf.c
+++ b/tools/dtrace/probe1_bpf.c
@@ -37,25 +37,16 @@ struct bpf_map_def SEC("maps") buffer_map = {
.max_entries = 2,
};

-struct sample {
- u64 probe_id;
- u32 pid;
- u32 ppid;
- u32 cpu;
- u32 euid;
- u32 egid;
- u64 task;
- u32 tag;
-};
-
#define DPROG(F) SEC("dtrace/"__stringify(F)) int bpf_func_##F
+#define BUF_ID offsetof(struct dtrace_bpf_context, buf)

/* we jump here when syscall number == __NR_write */
DPROG(__NR_write)(struct dtrace_bpf_context *ctx)
{
int cpu = bpf_get_smp_processor_id();
struct dtrace_ecb *ecb;
- struct sample smpl;
+ u8 *buf, *buf_end;
+ int err;

bpf_finalize_context(ctx, &probemap);

@@ -63,17 +54,25 @@ DPROG(__NR_write)(struct dtrace_bpf_context *ctx)
if (!ecb)
return 0;

- memset(&smpl, 0, sizeof(smpl));
- smpl.probe_id = ecb->probe_id;
- smpl.pid = ctx->pid;
- smpl.ppid = ctx->ppid;
- smpl.cpu = ctx->cpu;
- smpl.euid = ctx->euid;
- smpl.egid = ctx->egid;
- smpl.task = ctx->task;
- smpl.tag = 0xdace;
-
- bpf_perf_event_output(ctx, &buffer_map, cpu, &smpl, sizeof(smpl));
+ err = bpf_buffer_reserve(ctx, BUF_ID, &buffer_map, 48);
+ if (err < 0)
+ return -1;
+ buf = ctx->buf; /* reload: bpf_buffer_reserve() updated the context */
+ buf_end = ctx->buf_end;
+ if (buf + 48 > buf_end) /* bounds check required by the verifier */
+ return -1;
+
+ *(u32 *)(&buf[0]) = ecb->probe_id;
+ *(u32 *)(&buf[4]) = 0;
+ *(u32 *)(&buf[8]) = ctx->pid;
+ *(u32 *)(&buf[12]) = ctx->ppid;
+ *(u32 *)(&buf[16]) = ctx->cpu;
+ *(u32 *)(&buf[20]) = ctx->euid;
+ *(u32 *)(&buf[24]) = ctx->egid;
+ *(u64 *)(&buf[32]) = ctx->task;
+ *(u32 *)(&buf[40]) = 0xdace;
+
+ bpf_buffer_commit(ctx, BUF_ID, &buffer_map);

return 0;
}
@@ -84,7 +83,7 @@ int bpf_prog1(struct pt_regs *ctx)
struct dtrace_ecb ecb;
int cpu = bpf_get_smp_processor_id();

- ecb.id = 1;
+ ecb.id = 3;
ecb.probe_id = 123;

bpf_map_update_elem(&probemap, &cpu, &ecb, BPF_ANY);
--
2.20.1