Commit c4f6699

Alexei Starovoitov authored and borkmann committed
bpf: introduce BPF_RAW_TRACEPOINT
Introduce the BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access
kernel-internal arguments of tracepoints in their raw form.

From the bpf program's point of view, access to the arguments looks like:

    struct bpf_raw_tracepoint_args {
           __u64 args[0];
    };

    int bpf_prog(struct bpf_raw_tracepoint_args *ctx)
    {
      // program can read args[N] where N depends on tracepoint
      // and is statically verified at program load+attach time
    }

The kprobe+bpf infrastructure allows programs to access function
arguments. This feature allows programs to access raw tracepoint
arguments. Similar to the proposed 'dynamic ftrace events', there are
no ABI guarantees as to what the tracepoint arguments are or what their
meaning is. The program needs to type-cast args properly and use the
bpf_probe_read() helper to access struct fields when an argument is a
pointer.

For every tracepoint a __bpf_trace_##call function is prepared. In
assembler it looks like:

    (gdb) disassemble __bpf_trace_xdp_exception
    Dump of assembler code for function __bpf_trace_xdp_exception:
       0xffffffff81132080 <+0>:  mov    %ecx,%ecx
       0xffffffff81132082 <+2>:  jmpq   0xffffffff811231f0 <bpf_trace_run3>

where

    TRACE_EVENT(xdp_exception,
            TP_PROTO(const struct net_device *dev,
                     const struct bpf_prog *xdp, u32 act),

The above assembler snippet casts the 32-bit 'act' field into 'u64' to
pass into bpf_trace_run3(), while the 'dev' and 'xdp' args are passed
as-is. All ~500 __bpf_trace_*() functions are only 5-10 bytes long, and
in total this approach adds 7k bytes to .text.

This approach gives the lowest possible overhead when calling
trace_xdp_exception() from kernel C code and transitioning into bpf
land. Since tracepoint+bpf is used at speeds of 1M+ events per second,
this is a valuable optimization.

The new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced. It
returns an anon_inode FD of a 'bpf-raw-tracepoint' object. The user
space usage looks like:

    // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
    prog_fd = bpf_prog_load(...);
    // receive anon_inode fd for given bpf_raw_tracepoint
    // with prog attached
    raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of the tracing daemon or cmdline tool that uses this feature
will automatically detach the bpf program, unload it and unregister the
tracepoint probe.

On the kernel side, the __bpf_raw_tp_map section of pointers to the
tracepoint definition and to the __bpf_trace_*() probe function is used
to find a tracepoint with the "xdp_exception" name and the
corresponding __bpf_trace_xdp_exception() probe function, which are
passed to tracepoint_probe_register() to connect the probe with the
tracepoint.

The addition of bpf_raw_tracepoint doesn't interfere with the ftrace
and perf tracepoint mechanisms. perf_event_open() can be used in
parallel on the same tracepoint. Multiple
bpf_raw_tracepoint_open("xdp_exception", prog_fd) calls are permitted,
each with its own bpf program. The kernel will execute all tracepoint
probes and all attached bpf programs.

In the future, bpf_raw_tracepoints can be extended with
query/introspection logic.

The __bpf_raw_tp_map section logic was contributed by Steven Rostedt.

Signed-off-by: Alexei Starovoitov <[email protected]>
Signed-off-by: Steven Rostedt (VMware) <[email protected]>
Acked-by: Steven Rostedt (VMware) <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
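For reference, here is a minimal user-space sketch of a
bpf_raw_tracepoint_open() wrapper like the one used above, driving the
new command through the raw bpf(2) syscall against the patched
linux/bpf.h. The wrapper itself is illustrative and not part of this
commit:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int bpf_raw_tracepoint_open(const char *name, int prog_fd)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            /* kernel copies the name with strncpy_from_user() */
            attr.raw_tracepoint.name = (__u64)(unsigned long)name;
            attr.raw_tracepoint.prog_fd = prog_fd;

            /* returns an anon_inode fd; closing it detaches the program */
            return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN,
                           &attr, sizeof(attr));
    }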
1 parent cf14f27 commit c4f6699

File tree: 9 files changed, +424 −0 lines

Diff for: include/asm-generic/vmlinux.lds.h (+10 lines)

@@ -178,6 +178,15 @@
 #define TRACE_SYSCALLS()
 #endif
 
+#ifdef CONFIG_BPF_EVENTS
+#define BPF_RAW_TP() STRUCT_ALIGN();                            \
+                     VMLINUX_SYMBOL(__start__bpf_raw_tp) = .;   \
+                     KEEP(*(__bpf_raw_tp_map))                  \
+                     VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .;
+#else
+#define BPF_RAW_TP()
+#endif
+
 #ifdef CONFIG_SERIAL_EARLYCON
 #define EARLYCON_TABLE() STRUCT_ALIGN();                        \
                          VMLINUX_SYMBOL(__earlycon_table) = .;  \
@@ -249,6 +258,7 @@
        LIKELY_PROFILE()                                        \
        BRANCH_PROFILE()                                        \
        TRACE_PRINTKS()                                         \
+       BPF_RAW_TP()                                            \
        TRACEPOINT_STR()
 
 /*
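The two linker symbols bracket an array of struct bpf_raw_event_map
entries (see the tracepoint-defs.h hunk below), one per DEFINE_EVENT,
so the kernel can find a tracepoint by name with a linear scan. A
minimal sketch of that lookup; the real implementation lives in
kernel/trace/bpf_trace.c, which is not shown in this excerpt:

    extern struct bpf_raw_event_map __start__bpf_raw_tp[];
    extern struct bpf_raw_event_map __stop__bpf_raw_tp[];

    struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
    {
            struct bpf_raw_event_map *btp = __start__bpf_raw_tp;

            /* walk every entry emitted into the __bpf_raw_tp_map section */
            for (; btp < __stop__bpf_raw_tp; btp++) {
                    if (!strcmp(btp->tp->name, name))
                            return btp;
            }
            return NULL;
    }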

Diff for: include/linux/bpf_types.h (+1 line)

@@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)

Diff for: include/linux/trace_events.h (+42 lines)

@@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
        return -EOPNOTSUPP;
 }
+static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+{
+       return -EOPNOTSUPP;
+}
+static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+{
+       return -EOPNOTSUPP;
+}
+static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+{
+       return NULL;
+}
 #endif
 
 enum {
@@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
+void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
+void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3);
+void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4);
+void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4, u64 arg5);
+void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
+void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+                    u64 arg8);
+void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+                    u64 arg8, u64 arg9);
+void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+                     u64 arg8, u64 arg9, u64 arg10);
+void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+                     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
+void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+                     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+                     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                                struct trace_event_call *call, u64 count,
                                struct pt_regs *regs, struct hlist_head *head,
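The twelve bpf_trace_runN() functions declared above are implemented in
kernel/trace/bpf_trace.c (not shown in this excerpt), where they are
generated by macros. Conceptually, each one packs its u64 arguments
into an array that becomes the program's context; a hand-written sketch
of that idea, not the actual macro-generated code:

    static __always_inline
    void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
    {
            rcu_read_lock();
            preempt_disable();
            /* args[] is what the prog sees as bpf_raw_tracepoint_args */
            (void) BPF_PROG_RUN(prog, args);
            preempt_enable();
            rcu_read_unlock();
    }

    void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3)
    {
            u64 args[3] = { arg1, arg2, arg3 };

            __bpf_trace_run(prog, args);
    }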

Diff for: include/linux/tracepoint-defs.h (+6 lines)

@@ -35,4 +35,10 @@ struct tracepoint {
        struct tracepoint_func __rcu *funcs;
 };
 
+struct bpf_raw_event_map {
+       struct tracepoint       *tp;
+       void                    *bpf_func;
+       u32                     num_args;
+} __aligned(32);
+
 #endif

Diff for: include/trace/bpf_probe.h (new file, +92 lines)

@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef TRACE_SYSTEM_VAR
+
+#ifdef CONFIG_BPF_EVENTS
+
+#undef __entry
+#define __entry entry
+
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)     \
+               ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
+
+#undef __get_dynamic_array_len
+#define __get_dynamic_array_len(field) \
+               ((__entry->__data_loc_##field >> 16) & 0xffff)
+
+#undef __get_str
+#define __get_str(field) ((char *)__get_dynamic_array(field))
+
+#undef __get_bitmask
+#define __get_bitmask(field) (char *)__get_dynamic_array(field)
+
+#undef __perf_count
+#define __perf_count(c) (c)
+
+#undef __perf_task
+#define __perf_task(t) (t)
+
+/* cast any integer, pointer, or small struct to u64 */
+#define UINTTYPE(size) \
+       __typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
+                  __builtin_choose_expr(size == 2, (u16)2, \
+                  __builtin_choose_expr(size == 4, (u32)3, \
+                  __builtin_choose_expr(size == 8, (u64)4, \
+                                        (void)5)))))
+#define __CAST_TO_U64(x) ({ \
+       typeof(x) __src = (x); \
+       UINTTYPE(sizeof(x)) __dst; \
+       memcpy(&__dst, &__src, sizeof(__dst)); \
+       (u64)__dst; })
+
+#define __CAST1(a,...) __CAST_TO_U64(a)
+#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
+#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
+#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
+#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
+#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
+#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
+#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
+#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
+#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
+#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
+#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
+/* tracepoints with more than 12 arguments will hit build error */
+#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
+static notrace void                                                    \
+__bpf_trace_##call(void *__data, proto)                                \
+{                                                                      \
+       struct bpf_prog *prog = __data;                                 \
+       CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \
+}
+
+/*
+ * This part is compiled out, it is only here as a build time check
+ * to make sure that if the tracepoint handling changes, the
+ * bpf probe will fail to compile unless it too is updated.
+ */
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)                      \
+static inline void bpf_test_probe_##call(void)                         \
+{                                                                      \
+       check_trace_callback_type_##call(__bpf_trace_##template);       \
+}                                                                      \
+static struct bpf_raw_event_map __used                                 \
+       __attribute__((section("__bpf_raw_tp_map")))                    \
+__bpf_trace_tp_map_##call = {                                          \
+       .tp             = &__tracepoint_##call,                         \
+       .bpf_func       = (void *)__bpf_trace_##template,               \
+       .num_args       = COUNT_ARGS(args),                             \
+};
+
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_BPF_EVENTS */
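To make the macro machinery concrete, here is roughly what the
DECLARE_EVENT_CLASS definition above expands to for the xdp_exception
tracepoint from the commit message. This is an illustrative expansion,
not code from the patch:

    static notrace void
    __bpf_trace_xdp_exception(void *__data, const struct net_device *dev,
                              const struct bpf_prog *xdp, u32 act)
    {
            struct bpf_prog *prog = __data;

            /* COUNT_ARGS(args) == 3, so CONCATENATE() selects
             * bpf_trace_run3(); __CAST_TO_U64() widens the 32-bit
             * 'act' and passes the two pointers through bit-for-bit.
             */
            bpf_trace_run3(prog, __CAST_TO_U64(dev), __CAST_TO_U64(xdp),
                           __CAST_TO_U64(act));
    }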

Diff for: include/trace/define_trace.h (+1 line)

@@ -95,6 +95,7 @@
 #ifdef TRACEPOINTS_ENABLED
 #include <trace/trace_events.h>
 #include <trace/perf.h>
+#include <trace/bpf_probe.h>
 #endif
 
 #undef TRACE_EVENT

Diff for: include/uapi/linux/bpf.h (+11 lines)

@@ -94,6 +94,7 @@ enum bpf_cmd {
        BPF_MAP_GET_FD_BY_ID,
        BPF_OBJ_GET_INFO_BY_FD,
        BPF_PROG_QUERY,
+       BPF_RAW_TRACEPOINT_OPEN,
 };
 
 enum bpf_map_type {
@@ -134,6 +135,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_SK_SKB,
        BPF_PROG_TYPE_CGROUP_DEVICE,
        BPF_PROG_TYPE_SK_MSG,
+       BPF_PROG_TYPE_RAW_TRACEPOINT,
 };
 
 enum bpf_attach_type {
@@ -344,6 +346,11 @@ union bpf_attr {
                __aligned_u64   prog_ids;
                __u32           prog_cnt;
        } query;
+
+       struct {
+               __u64 name;
+               __u32 prog_fd;
+       } raw_tracepoint;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -1152,4 +1159,8 @@ struct bpf_cgroup_dev_ctx {
        __u32 minor;
 };
 
+struct bpf_raw_tracepoint_args {
+       __u64 args[0];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
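With these uapi additions, a raw-tracepoint program reads its arguments
through struct bpf_raw_tracepoint_args. A minimal sketch for
xdp_exception, assuming args[] mirrors TP_PROTO(dev, xdp, act); as the
commit message notes, there is no ABI guarantee on this layout, and
pointer arguments must be read with bpf_probe_read() before use:

    #include <linux/bpf.h>

    int bpf_prog(struct bpf_raw_tracepoint_args *ctx)
    {
            /* args[0] is 'dev', args[1] is 'xdp': kernel pointers,
             * only dereferencable via bpf_probe_read().
             * args[2] is the u32 'act', zero-extended to u64.
             */
            __u64 act = ctx->args[2];

            return act == 0;        /* e.g. flag XDP_ABORTED */
    }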

Diff for: kernel/bpf/syscall.c (+78 lines)

@@ -1315,6 +1315,81 @@ static int bpf_obj_get(const union bpf_attr *attr)
                                attr->file_flags);
 }
 
+struct bpf_raw_tracepoint {
+       struct bpf_raw_event_map *btp;
+       struct bpf_prog *prog;
+};
+
+static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
+{
+       struct bpf_raw_tracepoint *raw_tp = filp->private_data;
+
+       if (raw_tp->prog) {
+               bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
+               bpf_prog_put(raw_tp->prog);
+       }
+       kfree(raw_tp);
+       return 0;
+}
+
+static const struct file_operations bpf_raw_tp_fops = {
+       .release        = bpf_raw_tracepoint_release,
+       .read           = bpf_dummy_read,
+       .write          = bpf_dummy_write,
+};
+
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+       struct bpf_raw_tracepoint *raw_tp;
+       struct bpf_raw_event_map *btp;
+       struct bpf_prog *prog;
+       char tp_name[128];
+       int tp_fd, err;
+
+       if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
+                             sizeof(tp_name) - 1) < 0)
+               return -EFAULT;
+       tp_name[sizeof(tp_name) - 1] = 0;
+
+       btp = bpf_find_raw_tracepoint(tp_name);
+       if (!btp)
+               return -ENOENT;
+
+       raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
+       if (!raw_tp)
+               return -ENOMEM;
+       raw_tp->btp = btp;
+
+       prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
+                                BPF_PROG_TYPE_RAW_TRACEPOINT);
+       if (IS_ERR(prog)) {
+               err = PTR_ERR(prog);
+               goto out_free_tp;
+       }
+
+       err = bpf_probe_register(raw_tp->btp, prog);
+       if (err)
+               goto out_put_prog;
+
+       raw_tp->prog = prog;
+       tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
+                                O_CLOEXEC);
+       if (tp_fd < 0) {
+               bpf_probe_unregister(raw_tp->btp, prog);
+               err = tp_fd;
+               goto out_put_prog;
+       }
+       return tp_fd;
+
+out_put_prog:
+       bpf_prog_put(prog);
+out_free_tp:
+       kfree(raw_tp);
+       return err;
+}
+
 #ifdef CONFIG_CGROUP_BPF
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
@@ -1925,6 +2000,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
        case BPF_OBJ_GET_INFO_BY_FD:
                err = bpf_obj_get_info_by_fd(&attr, uattr);
                break;
+       case BPF_RAW_TRACEPOINT_OPEN:
+               err = bpf_raw_tracepoint_open(&attr);
+               break;
        default:
                err = -EINVAL;
                break;
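Note the error-path ordering in bpf_raw_tracepoint_open(): the probe is
registered before the fd is created, so a failing anon_inode_getfd()
must explicitly unregister it. Once created, the fd is the sole handle
on the attachment; a usage sketch building on the wrapper from earlier
(prog_fd from BPF_PROG_LOAD, error handling elided):

    int raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

    if (raw_tp_fd < 0)
            return raw_tp_fd;

    /* ... trace ... */

    /* close() (or process exit, e.g. on Ctrl-C) invokes
     * bpf_raw_tracepoint_release(): the tracepoint probe is
     * unregistered and the prog reference is dropped.
     */
    close(raw_tp_fd);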
