Re: latency histogram with BPF

From: Wangnan (F)
Date: Fri Jun 12 2015 - 02:59:29 EST




On 2015/6/11 15:25, Daniel Wagner wrote:
Hi Steven and Tom,

I was playing a bit with BPF to see if it would be possible to
get a preempt-off latency histogram. On paper this looks doable,
but adding kprobes on trace_preempt_[on|off] is not a clever idea.
Who would have thought? :)

In both cases, whether based on BPF or on Tom's 'hist' triggers
patches, some trickery is necessary to get this working. The BPF
approach is more flexible in what you can measure and how you can
present it, but I suspect it will be harder to get working/accepted.

Anyway, here is some code to laugh at.

From e3b9e7122f78953248eae54376b5a65f8beba362 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@xxxxxxxxxxxx>
Date: Thu, 11 Jun 2015 09:19:03 +0200
Subject: [PATCH] hwlathist: BPF based latency tracing

BPF offers another way to generate latency histograms. We attach
kprobes at trace_preempt_off and trace_preempt_on and measure the
time between the off and on transitions.

The first hash map stores the start time stamp, keyed by the CPU
id. The second hash map stores the histogram counts, keyed by
log2(time diff). To get a per-CPU histogram, the CPU id is placed
in the upper 32 bits of that key.
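
Put differently, the key layout looks like this (an illustrative
sketch only; hist_key() is a made-up name, not code from the patch):

	/* upper 32 bits: CPU id, lower 32 bits: log2 latency bucket */
	static inline u64 hist_key(unsigned int cpu, unsigned int bucket)
	{
		return ((u64)cpu << 32) | bucket;
	}

	/* user space decodes it the same way:
	 *   cpu    = key >> 32;
	 *   bucket = key & 0xffffffff;
	 */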

Obviously, adding kprobes to trace_preempt_[on|off] is
asking for problems:

[SNIP]


CPU 0
  latency            : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 3602     |************************************************* |
      16 -> 31       : 22       |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 1
  latency            : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 7533     |************************************************* |
      16 -> 31       : 125      |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 2
  latency            : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 3044     |************************************************* |
      16 -> 31       : 13       |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 3
  latency            : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 25182    |************************************************* |
      16 -> 31       : 1675     |**                                                |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |

The numbers look a bit too nice and low. I suspect something is going
wrong.

All this is based on the tracex3 example written by
Alexei Starovoitov <ast@xxxxxxxxxxxx>.

Not for inclusion!

Not-signed-off-by: Daniel Wagner <daniel.wagner@xxxxxxxxxxxx>
---
 kernel/trace/bpf_trace.c     |   2 +
 samples/bpf/Makefile         |   6 ++-
 samples/bpf/bpf_helpers.h    |   2 +
 samples/bpf/hwlathist_kern.c |  92 ++++++++++++++++++++++++++++++++++++
 samples/bpf/hwlathist_user.c | 108 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 209 insertions(+), 1 deletion(-)
create mode 100644 samples/bpf/hwlathist_kern.c
create mode 100644 samples/bpf/hwlathist_user.c

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce5..e4d3e76 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
		return &bpf_probe_read_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_trace_printk:
		/*
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8fdbd73..8420648 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
+hostprogs-y += hwlathist
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -20,6 +21,7 @@ tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
+hwlathist-objs := bpf_load.o libbpf.o hwlathist_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -30,6 +32,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tcbpf1_kern.o
+always += hwlathist_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -40,9 +43,10 @@ HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
+HOSTLOADLIBES_hwlathist += -lelf
# point this to your LLVM backend with bpf support
-LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
+LLC = /opt/llvm/bin/llc
$(obj)/%.o: $(src)/%.c
	clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f960b5f..c99bdb6 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
	(void *) BPF_FUNC_ktime_get_ns;
static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
	(void *) BPF_FUNC_trace_printk;
+static unsigned int (*bpf_get_smp_processor_id)(void) =
+	(void *) BPF_FUNC_get_smp_processor_id;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
new file mode 100644
index 0000000..7a97e7e
--- /dev/null
+++ b/samples/bpf/hwlathist_kern.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(unsigned int),
+	.value_size = sizeof(u64),
+	.max_entries = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+

Is it possible to make 'uapi/linux/bpf.h' hold something useful for
eBPF programming, so we can get rid of bpf_helpers.h?

Like this:

#ifndef _UAPI__LINUX_BPF_H__
#define _UAPI__LINUX_BPF_H__

#include <linux/types.h>
#include <linux/bpf_common.h>
...

#ifdef __BPF__

#define SEC(NAME) __attribute__((section(NAME), used))

...

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;
...

static unsigned int log2l(unsigned long v)
{
	unsigned int hi = v >> 32;

	if (hi)
		return log2(hi) + 32;
	else
		return log2(v);
}
...

#endif /* __BPF__ */
#endif /* _UAPI__LINUX_BPF_H__ */
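
A sample like hwlathist_kern.c could then drop the private header and
include the uapi one directly, roughly like this (a hypothetical sketch,
assuming the definitions above, including struct bpf_map_def, are moved
into the uapi header):

#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>	/* SEC(), bpf_* helpers, log2l(), bpf_map_def */

struct bpf_map_def SEC("maps") my_map = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(unsigned int),
	.value_size = sizeof(u64),
	.max_entries = 1024,
};

SEC("kprobe/trace_preempt_off")
int bpf_prog1(struct pt_regs *ctx)
{
	int cpu = bpf_get_smp_processor_id();
	u64 ts = bpf_ktime_get_ns();

	bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
	return 0;
}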

Then by using the shell script Alexei suggested in

https://lkml.kernel.org/r/55777A17.7030000@xxxxxxxxxxxx

and the automatic kernel header detector I posted today, users could easily
find the helpers, even if the source file is not in 'samples/bpf'.

Thank you.

+SEC("kprobe/trace_preempt_off")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ int cpu = bpf_get_smp_processor_id();
+ u64 ts = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
+
+ return 0;
+}
+
+struct bpf_map_def SEC("maps") my_lat = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(unsigned long long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("kprobe/trace_preempt_on")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long init_val = 1;
+ long *value;
+ u64 *ts;
+ int cpu;
+
+ cpu = bpf_get_smp_processor_id();
+ ts = bpf_map_lookup_elem(&my_map, &cpu);
+ if (!ts)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = log2l(cur_time - *ts);
+
+ bpf_map_delete_elem(&my_map, &cpu);
+
+ u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff);
+ value = bpf_map_lookup_elem(&my_lat, &key);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ else
+ bpf_map_update_elem(&my_lat, &key, &init_val, BPF_ANY);
+
+ return 0;
+
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
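
By the way, bpf_prog2 above seems to apply log2l() twice: 'delta'
already holds log2l(cur_time - *ts), and the key then takes
log2l(delta) again. With that, every latency from roughly 0.25us to
65us lands in the "8 -> 15" bucket, which might explain why the
numbers look so nice and low. A minimal sketch of the fix (my reading,
untested):

	u64 cur_time = bpf_ktime_get_ns();
	u64 delta = cur_time - *ts;	/* keep the raw time difference */

	bpf_map_delete_elem(&my_map, &cpu);

	/* apply log2l() exactly once, when building the bucket key */
	u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff);
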
diff --git a/samples/bpf/hwlathist_user.c b/samples/bpf/hwlathist_user.c
new file mode 100644
index 0000000..7e19f2c
--- /dev/null
+++ b/samples/bpf/hwlathist_user.c
@@ -0,0 +1,108 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_ENTRIES 10
+#define MAX_CPU 8
+#define MAX_STARS 50
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+struct cpu_hist {
+	long data[MAX_ENTRIES];
+	unsigned long max;
+};
+
+static struct cpu_hist cpu_hist[MAX_CPU];
+
+static void get_data(int fd)
+{
+	unsigned long long key, next_key, k;
+	long value;
+	int i, cpu;
+
+	for (i = 0; i < MAX_CPU; i++)
+		cpu_hist[i].max = 0;
+
+	key = 0;
+	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+		bpf_lookup_elem(fd, &next_key, &value);
+		cpu = next_key >> 32;
+		if (cpu >= MAX_CPU)
+			goto next;
+
+		k = next_key & 0xffffffff;
+		if (k >= MAX_ENTRIES)
+			k = MAX_ENTRIES - 1;
+
+		cpu_hist[cpu].data[k] = value;
+		if (value > cpu_hist[cpu].max)
+			cpu_hist[cpu].max = value;
+
+next:
+		key = next_key;
+	}
+}
+
+static void print_hist(void)
+{
+	char starstr[MAX_STARS];
+	struct cpu_hist *hist;
+	int i, j;
+
+	printf("\033[2J");
+
+	for (j = 0; j < MAX_CPU; j++) {
+		hist = &cpu_hist[j];
+		if (hist->max == 0)
+			continue;
+		printf("CPU %d\n", j);
+		printf("  latency            : count     distribution\n");
+		for (i = 1; i <= MAX_ENTRIES; i++) {
+			stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
+			printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+				(1l << i) >> 1, (1l << i) - 1,
+				hist->data[i - 1], MAX_STARS, starstr);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	while (1) {
+		get_data(map_fd[1]);
+		print_hist();
+		sleep(5);
+	}
+
+	return 0;
+}
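
(For completeness: I assume this is built and run like the other bpf
samples, i.e. something along the lines of

	make samples/bpf/
	sudo ./samples/bpf/hwlathist

mirroring how tracex1-4 are handled; I have not verified the exact
invocation.)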

