[PATCH RESEND] tcp_estats: ebpf hacks

From: Martin KaFai Lau
Date: Thu Feb 04 2016 - 00:39:43 EST


Signed-off-by: Martin KaFai Lau <kafai@xxxxxx>
---
kernel/trace/bpf_trace.c | 20 ++
samples/Makefile | 2 +-
samples/bpf/Makefile | 11 +-
samples/bpf/bpf_helpers.h | 4 +
samples/bpf/bpf_load.c | 44 +++--
samples/bpf/tcp_trace.h | 51 +++++
samples/bpf/tcp_trace_kern.c | 454 +++++++++++++++++++++++++++++++++++++++++++
samples/bpf/tcp_trace_user.c | 115 +++++++++++
tools/net/Makefile | 6 +-
9 files changed, 689 insertions(+), 18 deletions(-)
create mode 100644 samples/bpf/tcp_trace.h
create mode 100644 samples/bpf/tcp_trace_kern.c
create mode 100644 samples/bpf/tcp_trace_user.c

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 47febbe..977702e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -68,6 +68,7 @@ static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
void *unsafe_ptr = (void *) (long) r3;

return probe_kernel_read(dst, unsafe_ptr, size);
+ /* return __bpf_probe_read_hack(dst, unsafe_ptr, size); */
}

static const struct bpf_func_proto bpf_probe_read_proto = {
@@ -79,6 +80,25 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};

+/*
+ * bpf_probe_read_u32 - experimental variant of bpf_probe_read()
+ * @r1: destination buffer
+ * @r2: number of bytes to copy
+ * @r3: kernel address to read from; supplied by the BPF program, may be bogus
+ * @r4: unused
+ * @r5: unused
+ *
+ * The copy is done exclusively through probe_kernel_read().  @r3 must not be
+ * dereferenced directly: it is an arbitrary value and a plain load from it
+ * can fault.
+ *
+ * Returns whatever probe_kernel_read() returns.
+ */
+static u64 bpf_probe_read_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+/*
+ * Verifier prototype for bpf_probe_read_u32(); the three argument types
+ * correspond to its (dst, size, unsafe_ptr) parameters.
+ */
+static const struct bpf_func_proto bpf_probe_read_u32_proto = {
+	.func		= bpf_probe_read_u32,
+	.gpl_only	= true,
+	/* the helper hands back probe_kernel_read()'s status, so not RET_VOID */
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
diff --git a/samples/Makefile b/samples/Makefile
index f00257b..fb87be5 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,4 +1,4 @@
# Makefile for Linux samples code

obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \
- hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/
+ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ bpf/
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 97e5243..02885ae 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex4
hostprogs-y += tracex5
hostprogs-y += trace_output
hostprogs-y += lathist
+hostprogs-y += tcp_trace

test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -28,6 +29,7 @@ tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o
+tcp_trace-objs := bpf_load.o libbpf.o tcp_trace_user.o

# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -42,6 +44,7 @@ always += tracex5_kern.o
always += trace_output_kern.o
always += tcbpf1_kern.o
always += lathist_kern.o
+always += tcp_trace_kern.o

HOSTCFLAGS += -I$(objtree)/usr/include

@@ -56,14 +59,16 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
HOSTLOADLIBES_trace_output += -lelf -lrt
HOSTLOADLIBES_lathist += -lelf
+HOSTLOADLIBES_tcp_trace += -lelf

# point this to your LLVM backend with bpf support
-LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
+# Defaults are looked up in $PATH; override with "make LLC=... CLANG=...".
+LLC ?= llc
+CLANG ?= clang

$(obj)/%.o: $(src)/%.c
- clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
+ $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
- clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
+ $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=asm -o $@.s
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index e84dd3c..df3f00e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -33,6 +33,10 @@ static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) =
(void *) BPF_FUNC_perf_event_output;
+static unsigned long long (*bpf_get_prandom_u32)(void) =
+ (void *) BPF_FUNC_get_prandom_u32;
+static unsigned long long (*bpf_probe_read_u32)(void *dst, int size, void *unsafe_ptr) =
+ (void *) BPF_FUNC_probe_read_u32;

/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index da86a8e..408e429 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -68,12 +68,17 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
return -1;
}

+ printf("%s:%d event=%s prog_cnt=%d\n", __FUNCTION__, __LINE__,
+ event, prog_cnt);
+
fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
return -1;
}

+ /* printf("bpf_prog_load() fd=%d\n%s", fd, bpf_log_buf); */
+
prog_fd[prog_cnt++] = fd;

if (is_socket) {
@@ -103,8 +108,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
return populate_prog_array(event, fd);

snprintf(buf, sizeof(buf),
- "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
- is_kprobe ? 'p' : 'r', event, event);
+ "echo '%c:%s%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+ is_kprobe ? 'p' : 'r', is_kprobe ? "" : "r", event, event);
err = system(buf);
if (err < 0) {
printf("failed to create kprobe '%s' error '%s'\n",
@@ -115,6 +120,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)

strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
+ if (is_kretprobe)
+ strcat(buf, "r");
strcat(buf, event);
strcat(buf, "/id");

@@ -229,20 +236,28 @@ int load_bpf_file(char *path)
Elf_Data *data, *data_prog, *symbols = NULL;
char *shname, *shname_prog;

- if (elf_version(EV_CURRENT) == EV_NONE)
+ if (elf_version(EV_CURRENT) == EV_NONE) {
+ printf("%s:%d\n", __FUNCTION__, __LINE__);
return 1;
+ }

fd = open(path, O_RDONLY, 0);
- if (fd < 0)
+ if (fd < 0) {
+ printf("%s:%d\n", __FUNCTION__, __LINE__);
return 1;
+ }

elf = elf_begin(fd, ELF_C_READ, NULL);

- if (!elf)
+ if (!elf) {
+ printf("%s:%d\n", __FUNCTION__, __LINE__);
return 1;
+ }

- if (gelf_getehdr(elf, &ehdr) != &ehdr)
+ if (gelf_getehdr(elf, &ehdr) != &ehdr) {
+ printf("%s:%d\n", __FUNCTION__, __LINE__);
return 1;
+ }

/* clear all kprobes */
i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
@@ -271,8 +286,10 @@ int load_bpf_file(char *path)
memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
processed_sec[i] = true;
- if (load_maps(data->d_buf, data->d_size))
+ if (load_maps(data->d_buf, data->d_size)) {
+ printf("%s:%d\n", __FUNCTION__, __LINE__);
return 1;
+ }
} else if (shdr.sh_type == SHT_SYMTAB) {
symbols = data;
}
@@ -280,7 +297,6 @@ int load_bpf_file(char *path)

/* load programs that need map fixup (relocations) */
for (i = 1; i < ehdr.e_shnum; i++) {
-
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (shdr.sh_type == SHT_REL) {
@@ -290,6 +306,8 @@ int load_bpf_file(char *path)
&shdr_prog, &data_prog))
continue;

+ /* printf("%s:%d %s\n", __FUNCTION__, __LINE__, shname_prog); */
+
insns = (struct bpf_insn *) data_prog->d_buf;

processed_sec[shdr.sh_info] = true;
@@ -300,24 +318,28 @@ int load_bpf_file(char *path)

if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
- memcmp(shname_prog, "socket", 6) == 0)
+ memcmp(shname_prog, "socket", 6) == 0) {
+ /* printf("%s:%d %s\n", __FUNCTION__, __LINE__, shname_prog); */
load_and_attach(shname_prog, insns, data_prog->d_size);
+ }
}
}

/* load programs that don't use maps */
for (i = 1; i < ehdr.e_shnum; i++) {
-
if (processed_sec[i])
continue;

if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;

+ /* printf("%s:%d %s\n", __FUNCTION__, __LINE__, shname); */
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
- memcmp(shname, "socket", 6) == 0)
+ memcmp(shname, "socket", 6) == 0) {
+ /* printf("%s:%d %s\n", __FUNCTION__, __LINE__, shname); */
load_and_attach(shname, data->d_buf, data->d_size);
+ }
}

close(fd);
diff --git a/samples/bpf/tcp_trace.h b/samples/bpf/tcp_trace.h
new file mode 100644
index 0000000..d6e7ea4
--- /dev/null
+++ b/samples/bpf/tcp_trace.h
@@ -0,0 +1,51 @@
+#ifndef __TCP_TRACE_H
+#define __TCP_TRACE_H
+
+/*
+ * Types shared by tcp_trace_kern.c and tcp_trace_user.c.
+ * The includer has to provide u32/u64 (tcp_trace_user.c typedefs them
+ * before including this header).
+ */
+
+/*
+struct tcp_trace_flow {
+ u32 dst[1];
+};
+*/
+
+/* Key of dst_rack_map4: the value read from inet_daddr of the socket. */
+struct tcp_trace_flow4 {
+ __be32 dst;
+};
+
+/*
+ * Key of dst_rack_map6: two 32-bit words, which log_ttf6() prints as the
+ * first 64 bits of an IPv6 address.
+ */
+struct tcp_trace_flow6 {
+ __be32 dst0;
+ __be32 dst1;
+};
+
+/*
+ * Counters stored as the value of every map in tcp_trace_kern.c.
+ * Fields that are commented out are not collected.
+ */
+struct tcp_estats {
+ u64 data_octets_out;
+ u32 data_segs_out;
+ u32 octets_retrans;
+ u32 fast_retrans;
+ u32 timeouts;
+
+ u32 data_segs_in;
+ u64 data_octets_in;
+ u32 segs_in;
+ u32 dup_acks_in;
+ /* u32 sacks_rcvd; */
+ /* u32 sack_blocks_rcvd */
+ u32 dup_acks_out;
+ u32 dup_ack_episodes;
+ u32 sum_octets_reordered;
+
+ /* u64 sndlim_state_ts; */
+ /* u64 sndlim_time[TCP_ESTATS_SNDLIM_NSTATS]; */
+ /* u64 sndlim_trans[TCP_ESTATS_SNDLIM_NSTATS]; */
+ /* u8 sndlim_state; */
+
+ /* u64 rtt_sample_us; */
+ /* u64 max_rtt_us; */
+ /* u64 min_rtt_us; */
+
+ u32 cong_signals;
+ u32 slow_start;
+ u32 cong_avoid;
+ u64 ts;
+};
+
+#endif
diff --git a/samples/bpf/tcp_trace_kern.c b/samples/bpf/tcp_trace_kern.c
new file mode 100644
index 0000000..fd4039f
--- /dev/null
+++ b/samples/bpf/tcp_trace_kern.c
@@ -0,0 +1,454 @@
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include <net/inet_sock.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include "bpf_helpers.h"
+#include "tcp_trace.h"
+
+/*
+ * Fetch P, an lvalue that lives in kernel memory, via bpf_probe_read() and
+ * evaluate to the fetched value.  The temporary is set to 0 before the read.
+ */
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+#ifdef memset
+#undef memset
+#endif
+
+/*
+ * Stats keyed by a pointer-sized value: trace_init_sock() inserts and
+ * trace_destroy_sock() deletes entries using the struct sock pointer.
+ */
+struct bpf_map_def SEC("maps") tcp_flow_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(void *),
+ .value_size = sizeof(struct tcp_estats),
+ .max_entries = 10000,
+};
+
+/* Stats keyed by struct tcp_trace_flow4 (AF_INET sockets). */
+struct bpf_map_def SEC("maps") dst_rack_map4 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct tcp_trace_flow4),
+ .value_size = sizeof(struct tcp_estats),
+ .max_entries = 10000,
+};
+
+/* Stats keyed by struct tcp_trace_flow6 (AF_INET6 sockets). */
+struct bpf_map_def SEC("maps") dst_rack_map6 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct tcp_trace_flow6),
+ .value_size = sizeof(struct tcp_estats),
+ .max_entries = 10000,
+};
+
+/*
+ * The 16-bit data-offset/flags word of a TCP header, readable either as a
+ * raw __u16 ('flags') or field by field.
+ *
+ * The bit-fields sit in an anonymous struct on purpose: bit-fields declared
+ * directly inside a union are independent members that all start at bit 0,
+ * so without the struct every one of them would alias the same low bits of
+ * 'flags'.  Access stays 'x.u.doff', 'x.u.syn', ...
+ */
+struct tcphdr_flags {
+	union {
+		__u16 flags;
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u16	res1:4,
+				doff:4,
+				fin:1,
+				syn:1,
+				rst:1,
+				psh:1,
+				ack:1,
+				urg:1,
+				ece:1,
+				cwr:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+			__u16	doff:4,
+				res1:4,
+				cwr:1,
+				ece:1,
+				urg:1,
+				ack:1,
+				psh:1,
+				rst:1,
+				syn:1,
+				fin:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+		};
+	} u;
+};
+
+/* Transport header address: skb->head advanced by skb->transport_header. */
+static __always_inline unsigned char *__skb_transport_header(struct sk_buff *skb)
+{
+	unsigned char *head = _(skb->head);
+
+	return head + _(skb->transport_header);
+}
+
+/* Transport header of @skb, typed as a TCP header. */
+static __always_inline struct tcphdr *__tcp_hdr(struct sk_buff *skb)
+{
+	unsigned char *transport = __skb_transport_header(skb);
+
+	return (struct tcphdr *)transport;
+}
+
+/*
+ * Read the 16-bit word that directly follows ack_seq in the TCP header at
+ * @th (a kernel address, fetched with bpf_probe_read()).
+ *
+ * Returns the word in a struct tcphdr_flags; it is preset to 0 before the
+ * read.
+ */
+static __always_inline struct tcphdr_flags __tcp_hdr_flags(struct tcphdr *th)
+{
+	struct tcphdr_flags f;
+
+	f.u.flags = 0;
+
+	/*
+	 * "&th->ack_seq + 1" is the first byte past ack_seq.  Adding
+	 * sizeof(th->ack_seq) to the typed pointer would be scaled by the
+	 * element size and land 16 bytes further on instead of 4.
+	 */
+	bpf_probe_read(&f.u.flags, sizeof(f.u.flags), &th->ack_seq + 1);
+	return f;
+}
+
+/*
+ * End of the skb data area, as a pointer.  With NET_SKBUFF_DATA_USES_OFFSET
+ * skb->end is added to skb->head; otherwise skb->end is returned directly.
+ */
+static __always_inline unsigned char *__skb_end_pointer(struct sk_buff *skb)
+{
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	return _(skb->head) + _(skb->end);
+#else
+	return _(skb->end);
+#endif
+}
+
+/* End of the skb data area, as a distance from skb->head. */
+static __always_inline unsigned int __skb_end_offset(struct sk_buff *skb)
+{
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	return _(skb->end);
+#else
+	return _(skb->end) - _(skb->head);
+#endif
+}
+
+/* The skb_shared_info is taken to start at the skb's end pointer. */
+static __always_inline struct skb_shared_info *__skb_shinfo(struct sk_buff *skb)
+{
+	unsigned char *end = __skb_end_pointer(skb);
+
+	return (struct skb_shared_info *)end;
+}
+
+/* Sequence space covered by @skb: end_seq - seq from its TCP control block. */
+static __always_inline unsigned int skb_get_data_len(struct sk_buff *skb)
+{
+	u32 end_seq = _(TCP_SKB_CB(skb)->end_seq);
+	u32 seq = _(TCP_SKB_CB(skb)->seq);
+
+	return end_seq - seq;
+}
+
+/*
+ * Return the low six bits of the byte stored immediately before
+ * icsk_retransmits in the inet_connection_sock of @sk.
+ *
+ * NOTE(review): this presumably relies on icsk_ca_state being the 6-bit
+ * bit-field in that byte and on little-endian bit-field order -- confirm
+ * against struct inet_connection_sock.
+ */
+static __always_inline u8 inet_csk_get_ca_state(struct sock *sk)
+{
+	/* FIXME: it is really ugly. We need to find a better solution.
+	 * How about a bpf helper to access some common sk bit fields?
+	 */
+
+	/* preset like every other bpf_probe_read() target in this file, so
+	 * the result is defined even when nothing is copied
+	 */
+	u8 s = 0;
+
+	bpf_probe_read(&s, sizeof(u8), &inet_csk(sk)->icsk_retransmits - 1);
+
+	return (s & 0x3F);
+}
+
+/* rcv_wup + rcv_wnd - rcv_nxt of @tp, clamped so it never goes below 0. */
+static __always_inline u32 __tcp_receive_window(struct tcp_sock *tp)
+{
+	s32 win = _(tp->rcv_wup) + _(tp->rcv_wnd) - _(tp->rcv_nxt);
+
+	return win < 0 ? 0 : (u32) win;
+}
+
+/* True when the head's next pointer refers back to the head itself. */
+static __always_inline bool __skb_queue_empty(struct sk_buff_head *list)
+{
+	struct sk_buff *first = _(list->next);
+
+	return first == (struct sk_buff *) list;
+}
+
+#if 0
+/*
+ * Disabled experiments: two alternative versions of
+ * tcp_trace_flow_by_dst_rack().  Both operate on struct tcp_trace_flow,
+ * whose declaration is commented out in tcp_trace.h, so this region cannot
+ * be enabled as it stands.
+ */
+static __always_inline void tcp_trace_flow_by_dst_rack(struct sock *sk,
+ struct tcp_trace_flow *ttf)
+{
+ unsigned short family = _(sk->sk_family);
+
+ memset(ttf, 0, sizeof(*ttf));
+
+ if (family == AF_INET) {
+ ttf->family = AF_INET;
+ ttf->dst[0] = _(inet_sk(sk)->inet_daddr);
+ } else {
+ ttf->family = AF_INET6;
+ ttf->dst[0] = _(sk->sk_v6_daddr.s6_addr32[0]);
+ ttf->dst[1] = _(sk->sk_v6_daddr.s6_addr32[1]);
+ ttf->dst[2] = _(sk->sk_v6_daddr.s6_addr32[2]);
+ ttf->dst[3] = _(sk->sk_v6_daddr.s6_addr32[3]);
+ }
+}
+
+static __always_inline void tcp_trace_flow_by_dst_rack(struct sock *sk,
+ struct tcp_trace_flow *ttf)
+{
+ /* char fmt[] = "%x\n"; */
+
+ unsigned short family = _(sk->sk_family);
+
+ memset(ttf, 0, sizeof(*ttf));
+
+ if (family == AF_INET) {
+ ttf->dst[0] = 0xFFFFFFFF;
+ return;
+ }
+
+ /* ttf->dst[0] = bpf_get_prandom_u32() & 0x07FF; */
+ /* ttf->dst[0] = _(inet_sk(sk)->inet_daddr); */
+ ttf->family = family;
+ /* ttf->dst[0] = _(sk->sk_daddr); */
+ /* ttf->dst[0] = _(sk->sk_txhash); */
+ ttf->dst[0] = 0xFFEEFFEE;
+ /* bpf_probe_read_u32(&ttf->dst[0], sizeof(ttf->dst[0]), &inet_sk(sk)->inet_daddr); */
+ /* bpf_trace_printk(fmt, sizeof(fmt), ttf->dst[0]); */
+
+ /* ttf->dst[1] = 0; */
+}
+
+#endif
+
+/*
+ * Return the tcp_estats entry stored under @key in @map, inserting a zeroed
+ * entry first when there is none.  Returns NULL when no entry can be found
+ * after the insert attempt (for example because the insert failed).
+ */
+static __always_inline struct tcp_estats *tcp_estats_lookup_or_init(void *map,
+								     void *key)
+{
+	struct tcp_estats zeroed;
+	struct tcp_estats *tpes;
+
+	tpes = bpf_map_lookup_elem(map, key);
+	if (tpes)
+		return tpes;
+
+	memset(&zeroed, 0, sizeof(zeroed));
+
+	/*
+	 * BPF_NOEXIST: if the entry was created by someone else since the
+	 * lookup above, keep it rather than overwriting its counters with
+	 * zeroes.  The result is deliberately ignored; the lookup below
+	 * decides what the caller gets.
+	 */
+	bpf_map_update_elem(map, key, &zeroed, BPF_NOEXIST);
+
+	return bpf_map_lookup_elem(map, key);
+}
+
+/*
+ * Find, or create on first use, the stats entry for the destination of @sk.
+ *
+ * AF_INET sockets are keyed by inet_daddr in dst_rack_map4 and AF_INET6
+ * sockets use dst_rack_map6.  Returns NULL for any other family or when no
+ * entry could be obtained.
+ */
+static __always_inline struct tcp_estats *tcp_estats_get_by_dst_rack(struct sock *sk)
+{
+	unsigned short family = _(sk->sk_family);
+
+	if (family == AF_INET) {
+		struct tcp_trace_flow4 ttf;
+
+		memset(&ttf, 0, sizeof(ttf));
+		ttf.dst = _(inet_sk(sk)->inet_daddr);
+
+		return tcp_estats_lookup_or_init(&dst_rack_map4, &ttf);
+	}
+
+	if (family == AF_INET6) {
+		struct tcp_trace_flow6 ttf;
+
+		/*
+		 * The key is left all-zero, so every AF_INET6 socket
+		 * currently shares a single entry.
+		 * TODO(review): dst0/dst1 were presumably meant to come from
+		 * sk->sk_v6_daddr.s6_addr32[0] and [1]; the reads were
+		 * commented out in the original and are still missing.
+		 */
+		memset(&ttf, 0, sizeof(ttf));
+
+		return tcp_estats_lookup_or_init(&dst_rack_map6, &ttf);
+	}
+
+	return NULL;
+}
+
+/*
+ * Stats entry stored in tcp_flow_map under the socket pointer itself.
+ * Returns NULL for a NULL @sk or when the map has no such entry.
+ */
+static __always_inline struct tcp_estats *tcp_estats_get_by_sk(struct sock *sk)
+{
+	if (sk)
+		return bpf_map_lookup_elem(&tcp_flow_map, &sk);
+
+	return NULL;
+}
+
+/*
+ * Single lookup entry point used by all probes below; currently forwards to
+ * the per-destination maps (tcp_estats_get_by_sk() is not used).
+ */
+static __always_inline struct tcp_estats *tcp_estats_get(struct sock *sk)
+{
+
+ return tcp_estats_get_by_dst_rack(sk);
+}
+
+/*
+ * kprobe on tcp_rcv_established(): bump segs_in in the stats entry of the
+ * receiving socket.  Always returns 0.
+ *
+ * The "#if 0" regions hold accounting that is not enabled; the locals they
+ * need stay declared so that a region can be switched on by itself.  The TCP
+ * header is only looked at by disabled code, so it is fetched inside that
+ * region instead of costing two probe reads on every packet (and doing so
+ * before @skb has been checked for NULL).
+ */
+SEC("kprobe/tcp_rcv_established")
+int trace_rcv_established(struct pt_regs *ctx)
+{
+	struct skb_shared_info *shinfo;
+	struct tcphdr_flags thflags;
+	struct tcp_estats *tpes;
+	unsigned int data_len;
+	struct sk_buff *skb;
+	struct tcp_sock *tp;
+	struct tcphdr *th;
+	struct sock *sk;
+
+	sk = (struct sock *) PT_REGS_PARM1(ctx);
+	skb = (struct sk_buff *) PT_REGS_PARM2(ctx);
+
+	if (!sk || !skb)
+		return 0;
+
+#if 0
+	th = __tcp_hdr(skb);
+	thflags = __tcp_hdr_flags(th);
+	if (_(skb->len) < thflags.u.doff << 2)
+		return 0;
+#endif
+
+	tpes = tcp_estats_get(sk);
+	if (!tpes)
+		return 0;
+
+	tpes->segs_in++;
+
+
+#if 0
+	shinfo = __skb_shinfo(skb);
+	tp = tcp_sk(sk);
+
+
+	data_len = skb_get_data_len(skb);
+	if (data_len) {
+		tpes->data_segs_in += max_t(u16, 1, _(shinfo->gso_segs));
+		tpes->data_octets_in += data_len;
+
+		/* OOO */
+		if (after(_(TCP_SKB_CB(skb)->seq), _(tp->rcv_nxt)) &&
+		    before(_(TCP_SKB_CB(skb)->seq),
+			   _(tp->rcv_nxt) + __tcp_receive_window(tp))) {
+			tpes->dup_acks_out++;
+
+			if (__skb_queue_empty(&tp->out_of_order_queue))
+				tpes->dup_ack_episodes++;
+		}
+	} else {
+		/* Pure Ack */
+		if (_(TCP_SKB_CB(skb)->ack_seq) == _(tp->snd_una))
+			tpes->dup_acks_in++;
+	}
+#endif
+
+#if 0
+	if (inet_csk_get_ca_state(sk) == TCP_CA_Disorder) {
+		u32 prior_snd_una = _(tcp_sk(sk)->snd_una);
+		u32 ack = _(TCP_SKB_CB(skb)->ack_seq);
+
+		if (after(ack, prior_snd_una))
+			tpes->sum_octets_reordered += (ack - prior_snd_una);
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * kprobe on tcp_transmit_skb(): for an skb whose end_seq differs from its
+ * seq, add its tcp_gso_segs to data_segs_out and the difference to
+ * data_octets_out.  Always returns 0.
+ */
+SEC("kprobe/tcp_transmit_skb")
+int trace_transmit_skb(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+	struct tcp_estats *stats = tcp_estats_get(sk);
+	unsigned int payload;
+	struct sk_buff *skb;
+
+	if (!stats)
+		return 0;
+
+	skb = (struct sk_buff *) PT_REGS_PARM2(ctx);
+	payload = skb_get_data_len(skb);
+#if 0
+	if (unlikely(_(TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_FIN))
+		payload--;
+#endif
+	if (!payload)
+		return 0;
+
+	stats->data_segs_out += _(TCP_SKB_CB(skb)->tcp_gso_segs);
+	stats->data_octets_out += payload;
+
+	return 0;
+}
+
+/* kprobe on tcp_slow_start(): count one slow-start call.  Always returns 0. */
+SEC("kprobe/tcp_slow_start")
+int trace_slow_start(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+	struct tcp_estats *stats = tcp_estats_get(sk);
+
+	if (stats)
+		stats->slow_start++;
+
+	return 0;
+}
+
+/* kprobe on tcp_cong_avoid_ai(): count one call.  Always returns 0. */
+SEC("kprobe/tcp_cong_avoid_ai")
+int trace_cong_avoid_ai(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+	struct tcp_estats *stats = tcp_estats_get(sk);
+
+	if (stats)
+		stats->cong_avoid++;
+
+	return 0;
+}
+
+/*
+ * kprobe on tcp_cwnd_reduction(): count one congestion signal, and one fast
+ * retransmit when the probed function's third argument is non-zero.
+ * Always returns 0.
+ */
+SEC("kprobe/tcp_cwnd_reduction")
+int trace_cwnd_reduction(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+	struct tcp_estats *stats = tcp_estats_get(sk);
+
+	if (!stats)
+		return 0;
+
+	if ((int) PT_REGS_PARM3(ctx))
+		stats->fast_retrans++;
+	stats->cong_signals++;
+
+	return 0;
+}
+
+/*
+ * kprobe on tcp_init_sock(): store a zeroed stats entry in tcp_flow_map
+ * under the socket pointer, replacing any existing one.  Always returns 0.
+ */
+SEC("kprobe/tcp_init_sock")
+int trace_init_sock(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+	struct tcp_estats zeroed;
+
+	memset(&zeroed, 0, sizeof(zeroed));
+	bpf_map_update_elem(&tcp_flow_map, &sk, &zeroed, BPF_ANY);
+
+	return 0;
+}
+
+/*
+ * kprobe on tcp_v4_destroy_sock(): remove the socket's entry from
+ * tcp_flow_map.  Always returns 0.
+ */
+SEC("kprobe/tcp_v4_destroy_sock")
+int trace_destroy_sock(struct pt_regs *ctx)
+{
+	struct sock *sk = (struct sock *) PT_REGS_PARM1(ctx);
+
+	bpf_map_delete_elem(&tcp_flow_map, &sk);
+
+	return 0;
+}
+
+/*
+ * Placed in the "license" and "version" ELF sections; presumably picked up
+ * by the loader in bpf_load.c and passed to bpf_prog_load() -- confirm there.
+ */
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tcp_trace_user.c b/samples/bpf/tcp_trace_user.c
new file mode 100644
index 0000000..c4d4752
--- /dev/null
+++ b/samples/bpf/tcp_trace_user.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <linux/bpf.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+#include "tcp_trace.h"
+
+/*
+ * Print an IPv6 map key.  The two key words become the first 64 bits of the
+ * printed address; the remaining 64 bits are shown as zero.
+ */
+static void log_ttf6(const struct tcp_trace_flow6 *ttf)
+{
+	u32 addr[4] = { ttf->dst0, ttf->dst1, 0, 0 };
+	char text[INET6_ADDRSTRLEN];
+
+	inet_ntop(AF_INET6, addr, text, sizeof(text));
+
+	/* prefix length and port are not part of the key; printed as 0 */
+	printf("family:%d dst: \"[%s]/%d:%d\"\n", AF_INET6, text, 0, 0);
+}
+
+/* Print an IPv4 map key as a dotted-quad destination address. */
+static void log_ttf4(const struct tcp_trace_flow4 *ttf)
+{
+	char text[INET6_ADDRSTRLEN];
+
+	inet_ntop(AF_INET, &ttf->dst, text, sizeof(text));
+
+	/* prefix length and port are not part of the key; printed as 0 */
+	printf("family:%d dst: \"[%s]/%d:%d\"\n", AF_INET, text, 0, 0);
+}
+
+/*
+ * Print the counters of @s on two tab-indented lines.
+ *
+ * The two 64-bit counters are cast to unsigned long long and printed with
+ * %llu: u64 is uint64_t here, which is not 'unsigned long' on every ABI, so
+ * %lu would not match the argument type everywhere.
+ */
+static void log_stats(const struct tcp_estats *s)
+{
+	printf("\tslow_start:%u cong_avoid:%u cong_signals:%u "
+
+	       "\tdata_segs_out:%u data_octets_out:%llu "
+	       "\tdup_acks_out:%u dup_ack_episodes:%u sum_octets_reordered:%u "
+	       "\tfast_retrans:%u octets_retrans:%u "
+	       "\ttimeouts:%u\n"
+
+	       "\tsegs_in:%u data_segs_in:%u data_octets_in:%llu dup_acks_in:%u\n",
+
+	       s->slow_start, s->cong_avoid, s->cong_signals,
+
+	       s->data_segs_out, (unsigned long long) s->data_octets_out,
+	       s->dup_acks_out, s->dup_ack_episodes, s->sum_octets_reordered,
+	       s->fast_retrans, s->octets_retrans,
+	       s->timeouts,
+
+	       s->segs_in, s->data_segs_in,
+	       (unsigned long long) s->data_octets_in, s->dup_acks_in);
+}
+
+/* Print one IPv6 map entry: the key via log_ttf6(), then the counters via log_stats(). */
+static void tcp_estats_log6(const struct tcp_trace_flow6 *ttf,
+ const struct tcp_estats *tpes)
+{
+ log_ttf6(ttf);
+ log_stats(tpes);
+}
+
+/* Print one IPv4 map entry: the key via log_ttf4(), then the counters via log_stats(). */
+static void tcp_estats_log4(const struct tcp_trace_flow4 *ttf,
+ const struct tcp_estats *tpes)
+{
+ log_ttf4(ttf);
+ log_stats(tpes);
+}
+
+/*
+ * Load "<argv[0]>_kern.o", then every 10 seconds print every entry of the
+ * IPv4 and IPv6 destination maps followed by the number of keys visited.
+ * Returns 1 if the BPF object cannot be loaded; otherwise loops forever.
+ *
+ * map_fd[1] and map_fd[2] are used for dst_rack_map4 and dst_rack_map6;
+ * presumably map_fd[] follows the order of the map definitions in
+ * tcp_trace_kern.c -- confirm against load_maps() in bpf_load.c.
+ */
+int main(int ac, char **argv)
+{
+	struct tcp_trace_flow4 ttf4, next_ttf4;
+	struct tcp_trace_flow6 ttf6, next_ttf6;
+	struct tcp_estats tpes;
+	char filename[256];
+	unsigned int c;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	while (1) {
+		c = 0;
+
+		/*
+		 * bpf_get_next_key() yields the key that follows the one
+		 * passed in and starts from the first key only when the
+		 * passed key is not in the map.  Seeding the walk with an
+		 * all-zero key therefore skips an entry whose key is
+		 * all-zero -- which is the only key the kernel side
+		 * currently inserts into the IPv6 map.  Seed with all-ones
+		 * instead: 255.255.255.255 and an ffff:... prefix are
+		 * broadcast/multicast addresses and cannot be the
+		 * destination of a TCP connection.
+		 */
+		memset(&ttf4, 0xff, sizeof(ttf4));
+		while (bpf_get_next_key(map_fd[1], &ttf4, &next_ttf4) == 0) {
+			if (!bpf_lookup_elem(map_fd[1], &next_ttf4, &tpes))
+				tcp_estats_log4(&next_ttf4, &tpes);
+			ttf4 = next_ttf4;
+			c++;
+		}
+
+		memset(&ttf6, 0xff, sizeof(ttf6));
+		while (bpf_get_next_key(map_fd[2], &ttf6, &next_ttf6) == 0) {
+			if (!bpf_lookup_elem(map_fd[2], &next_ttf6, &tpes))
+				tcp_estats_log6(&next_ttf6, &tpes);
+			ttf6 = next_ttf6;
+			c++;
+		}
+
+		printf("c=%u\n", c);
+		sleep(10);
+	}
+
+	return 0;
+}
diff --git a/tools/net/Makefile b/tools/net/Makefile
index ee577ea..2528d02 100644
--- a/tools/net/Makefile
+++ b/tools/net/Makefile
@@ -12,15 +12,15 @@ YACC = bison

all : bpf_jit_disasm bpf_dbg bpf_asm

-bpf_jit_disasm : CFLAGS = -Wall -O2 -DPACKAGE='bpf_jit_disasm'
+bpf_jit_disasm : CFLAGS = -Wall -O2 -DPACKAGE='bpf_jit_disasm' -I '../../include/uapi' -I '../../include'
bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl
bpf_jit_disasm : bpf_jit_disasm.o

-bpf_dbg : CFLAGS = -Wall -O2
+bpf_dbg : CFLAGS = -Wall -O2 -I '../../include/uapi' -I '../../include'
bpf_dbg : LDLIBS = -lreadline
bpf_dbg : bpf_dbg.o

-bpf_asm : CFLAGS = -Wall -O2 -I.
+bpf_asm : CFLAGS = -Wall -O2 -I. -I '../../include/uapi' -I '../../include'
bpf_asm : LDLIBS =
bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o
bpf_exp.lex.o : bpf_exp.yacc.c
--
2.5.1