Skip to content

Commit 2541517

Browse files
Alexei StarovoitovIngo Molnar
Alexei Starovoitov
authored and
Ingo Molnar
committed
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wrapper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov <[email protected]> Reviewed-by: Steven Rostedt <[email protected]> Reviewed-by: Masami Hiramatsu <[email protected]> Cc: Andrew Morton <[email protected]> Cc: Arnaldo Carvalho de Melo <[email protected]> Cc: Arnaldo Carvalho de Melo <[email protected]> Cc: Daniel Borkmann <[email protected]> Cc: David S. Miller <[email protected]> Cc: Jiri Olsa <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Namhyung Kim <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Peter Zijlstra <[email protected]> Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Ingo Molnar <[email protected]>
1 parent 72cbbc8 commit 2541517

File tree

8 files changed

+219
-1
lines changed

8 files changed

+219
-1
lines changed

include/linux/ftrace_event.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ struct trace_array;
1313
struct trace_buffer;
1414
struct tracer;
1515
struct dentry;
16+
struct bpf_prog;
1617

1718
struct trace_print_flags {
1819
unsigned long mask;
@@ -306,6 +307,7 @@ struct ftrace_event_call {
306307
#ifdef CONFIG_PERF_EVENTS
307308
int perf_refcount;
308309
struct hlist_head __percpu *perf_events;
310+
struct bpf_prog *prog;
309311

310312
int (*perf_perm)(struct ftrace_event_call *,
311313
struct perf_event *);
@@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
551553
event_triggers_post_call(file, tt);
552554
}
553555

556+
#ifdef CONFIG_BPF_SYSCALL
557+
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
558+
#else
559+
static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
560+
{
561+
return 1;
562+
}
563+
#endif
564+
554565
enum {
555566
FILTER_OTHER = 0,
556567
FILTER_STATIC_STRING,

include/uapi/linux/bpf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ enum bpf_map_type {
118118
enum bpf_prog_type {
119119
BPF_PROG_TYPE_UNSPEC,
120120
BPF_PROG_TYPE_SOCKET_FILTER,
121+
BPF_PROG_TYPE_KPROBE,
121122
};
122123

123124
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
151152
__u32 log_level; /* verbosity level of verifier */
152153
__u32 log_size; /* size of user buffer */
153154
__aligned_u64 log_buf; /* user supplied buffer */
155+
__u32 kern_version; /* checked when prog_type=kprobe */
154156
};
155157
} __attribute__((aligned(8)));
156158

@@ -162,6 +164,7 @@ enum bpf_func_id {
162164
BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
163165
BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
164166
BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
167+
BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */
165168
__BPF_FUNC_MAX_ID,
166169
};
167170

include/uapi/linux/perf_event.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ struct perf_event_attr {
381381
#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5)
382382
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
383383
#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
384+
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
384385

385386
enum perf_event_ioc_flags {
386387
PERF_IOC_FLAG_GROUP = 1U << 0,

kernel/bpf/syscall.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/file.h>
1717
#include <linux/license.h>
1818
#include <linux/filter.h>
19+
#include <linux/version.h>
1920

2021
static LIST_HEAD(bpf_map_types);
2122

@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
467468
}
468469

469470
/* last field in 'union bpf_attr' used by this command */
470-
#define BPF_PROG_LOAD_LAST_FIELD log_buf
471+
#define BPF_PROG_LOAD_LAST_FIELD kern_version
471472

472473
static int bpf_prog_load(union bpf_attr *attr)
473474
{
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
492493
if (attr->insn_cnt >= BPF_MAXINSNS)
493494
return -EINVAL;
494495

496+
if (type == BPF_PROG_TYPE_KPROBE &&
497+
attr->kern_version != LINUX_VERSION_CODE)
498+
return -EINVAL;
499+
495500
/* plain bpf_prog allocation */
496501
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
497502
if (!prog)

kernel/events/core.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
#include <linux/module.h>
4343
#include <linux/mman.h>
4444
#include <linux/compat.h>
45+
#include <linux/bpf.h>
46+
#include <linux/filter.h>
4547

4648
#include "internal.h"
4749

@@ -3407,6 +3409,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
34073409
}
34083410

34093411
static void perf_event_free_filter(struct perf_event *event);
3412+
static void perf_event_free_bpf_prog(struct perf_event *event);
34103413

34113414
static void free_event_rcu(struct rcu_head *head)
34123415
{
@@ -3416,6 +3419,7 @@ static void free_event_rcu(struct rcu_head *head)
34163419
if (event->ns)
34173420
put_pid_ns(event->ns);
34183421
perf_event_free_filter(event);
3422+
perf_event_free_bpf_prog(event);
34193423
kfree(event);
34203424
}
34213425

@@ -3928,6 +3932,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
39283932
static int perf_event_set_output(struct perf_event *event,
39293933
struct perf_event *output_event);
39303934
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3935+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
39313936

39323937
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
39333938
{
@@ -3981,6 +3986,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
39813986
case PERF_EVENT_IOC_SET_FILTER:
39823987
return perf_event_set_filter(event, (void __user *)arg);
39833988

3989+
case PERF_EVENT_IOC_SET_BPF:
3990+
return perf_event_set_bpf_prog(event, arg);
3991+
39843992
default:
39853993
return -ENOTTY;
39863994
}
@@ -6455,6 +6463,49 @@ static void perf_event_free_filter(struct perf_event *event)
64556463
ftrace_profile_free_filter(event);
64566464
}
64576465

6466+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6467+
{
6468+
struct bpf_prog *prog;
6469+
6470+
if (event->attr.type != PERF_TYPE_TRACEPOINT)
6471+
return -EINVAL;
6472+
6473+
if (event->tp_event->prog)
6474+
return -EEXIST;
6475+
6476+
if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6477+
/* bpf programs can only be attached to kprobes */
6478+
return -EINVAL;
6479+
6480+
prog = bpf_prog_get(prog_fd);
6481+
if (IS_ERR(prog))
6482+
return PTR_ERR(prog);
6483+
6484+
if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
6485+
/* valid fd, but invalid bpf program type */
6486+
bpf_prog_put(prog);
6487+
return -EINVAL;
6488+
}
6489+
6490+
event->tp_event->prog = prog;
6491+
6492+
return 0;
6493+
}
6494+
6495+
static void perf_event_free_bpf_prog(struct perf_event *event)
6496+
{
6497+
struct bpf_prog *prog;
6498+
6499+
if (!event->tp_event)
6500+
return;
6501+
6502+
prog = event->tp_event->prog;
6503+
if (prog) {
6504+
event->tp_event->prog = NULL;
6505+
bpf_prog_put(prog);
6506+
}
6507+
}
6508+
64586509
#else
64596510

64606511
static inline void perf_tp_register(void)
@@ -6470,6 +6521,14 @@ static void perf_event_free_filter(struct perf_event *event)
64706521
{
64716522
}
64726523

6524+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6525+
{
6526+
return -ENOENT;
6527+
}
6528+
6529+
static void perf_event_free_bpf_prog(struct perf_event *event)
6530+
{
6531+
}
64736532
#endif /* CONFIG_EVENT_TRACING */
64746533

64756534
#ifdef CONFIG_HAVE_HW_BREAKPOINT

kernel/trace/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
5353
endif
5454
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
5555
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56+
obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
5657
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
5758
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
5859
ifeq ($(CONFIG_PM),y)

kernel/trace/bpf_trace.c

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*/
7+
#include <linux/kernel.h>
8+
#include <linux/types.h>
9+
#include <linux/slab.h>
10+
#include <linux/bpf.h>
11+
#include <linux/filter.h>
12+
#include <linux/uaccess.h>
13+
#include "trace.h"
14+
15+
static DEFINE_PER_CPU(int, bpf_prog_active);
16+
17+
/**
18+
* trace_call_bpf - invoke BPF program
19+
* @prog: BPF program
20+
* @ctx: opaque context pointer
21+
*
22+
* kprobe handlers execute BPF programs via this helper.
23+
* Can be used from static tracepoints in the future.
24+
*
25+
* Return: BPF programs always return an integer which is interpreted by
26+
* kprobe handler as:
27+
* 0 - return from kprobe (event is filtered out)
28+
* 1 - store kprobe event into ring buffer
29+
* Other values are reserved and currently alias to 1
30+
*/
31+
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
32+
{
33+
unsigned int ret;
34+
35+
if (in_nmi()) /* not supported yet */
36+
return 1;
37+
38+
preempt_disable();
39+
40+
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
41+
/*
42+
* since some bpf program is already running on this cpu,
43+
* don't call into another bpf program (same or different)
44+
* and don't send kprobe event into ring-buffer,
45+
* so return zero here
46+
*/
47+
ret = 0;
48+
goto out;
49+
}
50+
51+
rcu_read_lock();
52+
ret = BPF_PROG_RUN(prog, ctx);
53+
rcu_read_unlock();
54+
55+
out:
56+
__this_cpu_dec(bpf_prog_active);
57+
preempt_enable();
58+
59+
return ret;
60+
}
61+
EXPORT_SYMBOL_GPL(trace_call_bpf);
62+
63+
static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
64+
{
65+
void *dst = (void *) (long) r1;
66+
int size = (int) r2;
67+
void *unsafe_ptr = (void *) (long) r3;
68+
69+
return probe_kernel_read(dst, unsafe_ptr, size);
70+
}
71+
72+
static const struct bpf_func_proto bpf_probe_read_proto = {
73+
.func = bpf_probe_read,
74+
.gpl_only = true,
75+
.ret_type = RET_INTEGER,
76+
.arg1_type = ARG_PTR_TO_STACK,
77+
.arg2_type = ARG_CONST_STACK_SIZE,
78+
.arg3_type = ARG_ANYTHING,
79+
};
80+
81+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
82+
{
83+
switch (func_id) {
84+
case BPF_FUNC_map_lookup_elem:
85+
return &bpf_map_lookup_elem_proto;
86+
case BPF_FUNC_map_update_elem:
87+
return &bpf_map_update_elem_proto;
88+
case BPF_FUNC_map_delete_elem:
89+
return &bpf_map_delete_elem_proto;
90+
case BPF_FUNC_probe_read:
91+
return &bpf_probe_read_proto;
92+
default:
93+
return NULL;
94+
}
95+
}
96+
97+
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
98+
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
99+
{
100+
/* check bounds */
101+
if (off < 0 || off >= sizeof(struct pt_regs))
102+
return false;
103+
104+
/* only read is allowed */
105+
if (type != BPF_READ)
106+
return false;
107+
108+
/* disallow misaligned access */
109+
if (off % size != 0)
110+
return false;
111+
112+
return true;
113+
}
114+
115+
static struct bpf_verifier_ops kprobe_prog_ops = {
116+
.get_func_proto = kprobe_prog_func_proto,
117+
.is_valid_access = kprobe_prog_is_valid_access,
118+
};
119+
120+
static struct bpf_prog_type_list kprobe_tl = {
121+
.ops = &kprobe_prog_ops,
122+
.type = BPF_PROG_TYPE_KPROBE,
123+
};
124+
125+
static int __init register_kprobe_prog_ops(void)
126+
{
127+
bpf_register_prog_type(&kprobe_tl);
128+
return 0;
129+
}
130+
late_initcall(register_kprobe_prog_ops);

kernel/trace/trace_kprobe.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,11 +1134,15 @@ static void
11341134
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
11351135
{
11361136
struct ftrace_event_call *call = &tk->tp.call;
1137+
struct bpf_prog *prog = call->prog;
11371138
struct kprobe_trace_entry_head *entry;
11381139
struct hlist_head *head;
11391140
int size, __size, dsize;
11401141
int rctx;
11411142

1143+
if (prog && !trace_call_bpf(prog, regs))
1144+
return;
1145+
11421146
head = this_cpu_ptr(call->perf_events);
11431147
if (hlist_empty(head))
11441148
return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
11651169
struct pt_regs *regs)
11661170
{
11671171
struct ftrace_event_call *call = &tk->tp.call;
1172+
struct bpf_prog *prog = call->prog;
11681173
struct kretprobe_trace_entry_head *entry;
11691174
struct hlist_head *head;
11701175
int size, __size, dsize;
11711176
int rctx;
11721177

1178+
if (prog && !trace_call_bpf(prog, regs))
1179+
return;
1180+
11731181
head = this_cpu_ptr(call->perf_events);
11741182
if (hlist_empty(head))
11751183
return;

0 commit comments

Comments
 (0)