Skip to content

Commit 1be7f75

Browse files
Alexei Starovoitov authored and davem330 (David S. Miller) committed
bpf: enable non-root eBPF programs
In order to let unprivileged users load and execute eBPF programs teach verifier to prevent pointer leaks. Verifier will prevent - any arithmetic on pointers (except R10+Imm which is used to compute stack addresses) - comparison of pointers (except if (map_value_ptr == 0) ... ) - passing pointers to helper functions - indirectly passing pointers in stack to helper functions - returning pointer from bpf program - storing pointers into ctx or maps Spill/fill of pointers into stack is allowed, but mangling of pointers stored in the stack or reading them byte by byte is not. Within bpf programs the pointers do exist, since programs need to be able to access maps, pass skb pointer to LD_ABS insns, etc but programs cannot pass such pointer values to the outside or obfuscate them. Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs, so that socket filters (tcpdump), af_packet (quic acceleration) and future kcm can use it. tracing and tc cls/act program types still require root permissions, since tracing actually needs to be able to see all kernel pointers and tc is for root only. For example, the following unprivileged socket filter program is allowed: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += skb->len; return 0; } but the following program is not: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += (u64) skb; return 0; } since it would leak the kernel address into the map. 
Unprivileged socket filter bpf programs have access to the following helper functions: - map lookup/update/delete (but they cannot store kernel pointers into them) - get_random (it's already exposed to unprivileged user space) - get_smp_processor_id - tail_call into another socket filter program - ktime_get_ns The feature is controlled by sysctl kernel.unprivileged_bpf_disabled. This toggle defaults to off (0), but can be set true (1). Once true, bpf programs and maps cannot be accessed from an unprivileged process, and the toggle cannot be set back to false. Signed-off-by: Alexei Starovoitov <[email protected]> Reviewed-by: Kees Cook <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 0fa2887 commit 1be7f75

File tree

5 files changed

+120
-15
lines changed

5 files changed

+120
-15
lines changed

include/linux/bpf.h

+2
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog);
167167
struct bpf_map *bpf_map_get(struct fd f);
168168
void bpf_map_put(struct bpf_map *map);
169169

170+
extern int sysctl_unprivileged_bpf_disabled;
171+
170172
/* verify correctness of eBPF program */
171173
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
172174
#else

kernel/bpf/syscall.c

+6-5
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,8 @@
1818
#include <linux/filter.h>
1919
#include <linux/version.h>
2020

21+
int sysctl_unprivileged_bpf_disabled __read_mostly;
22+
2123
static LIST_HEAD(bpf_map_types);
2224

2325
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -544,6 +546,9 @@ static int bpf_prog_load(union bpf_attr *attr)
544546
attr->kern_version != LINUX_VERSION_CODE)
545547
return -EINVAL;
546548

549+
if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
550+
return -EPERM;
551+
547552
/* plain bpf_prog allocation */
548553
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
549554
if (!prog)
@@ -599,11 +604,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
599604
union bpf_attr attr = {};
600605
int err;
601606

602-
/* the syscall is limited to root temporarily. This restriction will be
603-
* lifted when security audit is clean. Note that eBPF+tracing must have
604-
* this restriction, since it may pass kernel data to user space
605-
*/
606-
if (!capable(CAP_SYS_ADMIN))
607+
if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
607608
return -EPERM;
608609

609610
if (!access_ok(VERIFY_READ, uattr, 1))

kernel/bpf/verifier.c

+97-9
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,7 @@ struct verifier_env {
199199
struct verifier_state_list **explored_states; /* search pruning optimization */
200200
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
201201
u32 used_map_cnt; /* number of used maps */
202+
bool allow_ptr_leaks;
202203
};
203204

204205
/* verbose verifier prints what it's seeing
@@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size)
538539
return -EINVAL;
539540
}
540541

542+
static bool is_spillable_regtype(enum bpf_reg_type type)
543+
{
544+
switch (type) {
545+
case PTR_TO_MAP_VALUE:
546+
case PTR_TO_MAP_VALUE_OR_NULL:
547+
case PTR_TO_STACK:
548+
case PTR_TO_CTX:
549+
case FRAME_PTR:
550+
case CONST_PTR_TO_MAP:
551+
return true;
552+
default:
553+
return false;
554+
}
555+
}
556+
541557
/* check_stack_read/write functions track spill/fill of registers,
542558
* stack boundary and alignment are checked in check_mem_access()
543559
*/
@@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
550566
*/
551567

552568
if (value_regno >= 0 &&
553-
(state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
554-
state->regs[value_regno].type == PTR_TO_STACK ||
555-
state->regs[value_regno].type == PTR_TO_CTX)) {
569+
is_spillable_regtype(state->regs[value_regno].type)) {
556570

557571
/* register containing pointer is being spilled into stack */
558572
if (size != BPF_REG_SIZE) {
@@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
643657
return -EACCES;
644658
}
645659

660+
static bool is_pointer_value(struct verifier_env *env, int regno)
661+
{
662+
if (env->allow_ptr_leaks)
663+
return false;
664+
665+
switch (env->cur_state.regs[regno].type) {
666+
case UNKNOWN_VALUE:
667+
case CONST_IMM:
668+
return false;
669+
default:
670+
return true;
671+
}
672+
}
673+
646674
/* check whether memory at (regno + off) is accessible for t = (read | write)
647675
* if t==write, value_regno is a register which value is stored into memory
648676
* if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
669697
}
670698

671699
if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
700+
if (t == BPF_WRITE && value_regno >= 0 &&
701+
is_pointer_value(env, value_regno)) {
702+
verbose("R%d leaks addr into map\n", value_regno);
703+
return -EACCES;
704+
}
672705
err = check_map_access(env, regno, off, size);
673706
if (!err && t == BPF_READ && value_regno >= 0)
674707
mark_reg_unknown_value(state->regs, value_regno);
675708

676709
} else if (state->regs[regno].type == PTR_TO_CTX) {
710+
if (t == BPF_WRITE && value_regno >= 0 &&
711+
is_pointer_value(env, value_regno)) {
712+
verbose("R%d leaks addr into ctx\n", value_regno);
713+
return -EACCES;
714+
}
677715
err = check_ctx_access(env, off, size, t);
678716
if (!err && t == BPF_READ && value_regno >= 0)
679717
mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
684722
verbose("invalid stack off=%d size=%d\n", off, size);
685723
return -EACCES;
686724
}
687-
if (t == BPF_WRITE)
725+
if (t == BPF_WRITE) {
726+
if (!env->allow_ptr_leaks &&
727+
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
728+
size != BPF_REG_SIZE) {
729+
verbose("attempt to corrupt spilled pointer on stack\n");
730+
return -EACCES;
731+
}
688732
err = check_stack_write(state, off, size, value_regno);
689-
else
733+
} else {
690734
err = check_stack_read(state, off, size, value_regno);
735+
}
691736
} else {
692737
verbose("R%d invalid mem access '%s'\n",
693738
regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
775820
return -EACCES;
776821
}
777822

778-
if (arg_type == ARG_ANYTHING)
823+
if (arg_type == ARG_ANYTHING) {
824+
if (is_pointer_value(env, regno)) {
825+
verbose("R%d leaks addr into helper function\n", regno);
826+
return -EACCES;
827+
}
779828
return 0;
829+
}
780830

781831
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
782832
arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id)
9501000
}
9511001

9521002
/* check validity of 32-bit and 64-bit arithmetic operations */
953-
static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1003+
static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
9541004
{
1005+
struct reg_state *regs = env->cur_state.regs;
9551006
u8 opcode = BPF_OP(insn->code);
9561007
int err;
9571008

@@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
9761027
if (err)
9771028
return err;
9781029

1030+
if (is_pointer_value(env, insn->dst_reg)) {
1031+
verbose("R%d pointer arithmetic prohibited\n",
1032+
insn->dst_reg);
1033+
return -EACCES;
1034+
}
1035+
9791036
/* check dest operand */
9801037
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
9811038
if (err)
@@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
10121069
*/
10131070
regs[insn->dst_reg] = regs[insn->src_reg];
10141071
} else {
1072+
if (is_pointer_value(env, insn->src_reg)) {
1073+
verbose("R%d partial copy of pointer\n",
1074+
insn->src_reg);
1075+
return -EACCES;
1076+
}
10151077
regs[insn->dst_reg].type = UNKNOWN_VALUE;
10161078
regs[insn->dst_reg].map_ptr = NULL;
10171079
}
@@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
10611123
/* pattern match 'bpf_add Rx, imm' instruction */
10621124
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
10631125
regs[insn->dst_reg].type == FRAME_PTR &&
1064-
BPF_SRC(insn->code) == BPF_K)
1126+
BPF_SRC(insn->code) == BPF_K) {
10651127
stack_relative = true;
1128+
} else if (is_pointer_value(env, insn->dst_reg)) {
1129+
verbose("R%d pointer arithmetic prohibited\n",
1130+
insn->dst_reg);
1131+
return -EACCES;
1132+
} else if (BPF_SRC(insn->code) == BPF_X &&
1133+
is_pointer_value(env, insn->src_reg)) {
1134+
verbose("R%d pointer arithmetic prohibited\n",
1135+
insn->src_reg);
1136+
return -EACCES;
1137+
}
10661138

10671139
/* check dest operand */
10681140
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
11011173
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
11021174
if (err)
11031175
return err;
1176+
1177+
if (is_pointer_value(env, insn->src_reg)) {
1178+
verbose("R%d pointer comparison prohibited\n",
1179+
insn->src_reg);
1180+
return -EACCES;
1181+
}
11041182
} else {
11051183
if (insn->src_reg != BPF_REG_0) {
11061184
verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
11551233
regs[insn->dst_reg].type = CONST_IMM;
11561234
regs[insn->dst_reg].imm = 0;
11571235
}
1236+
} else if (is_pointer_value(env, insn->dst_reg)) {
1237+
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
1238+
return -EACCES;
11581239
} else if (BPF_SRC(insn->code) == BPF_K &&
11591240
(opcode == BPF_JEQ || opcode == BPF_JNE)) {
11601241

@@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env)
16581739
}
16591740

16601741
if (class == BPF_ALU || class == BPF_ALU64) {
1661-
err = check_alu_op(regs, insn);
1742+
err = check_alu_op(env, insn);
16621743
if (err)
16631744
return err;
16641745

@@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env)
18161897
if (err)
18171898
return err;
18181899

1900+
if (is_pointer_value(env, BPF_REG_0)) {
1901+
verbose("R0 leaks addr as return value\n");
1902+
return -EACCES;
1903+
}
1904+
18191905
process_bpf_exit:
18201906
insn_idx = pop_stack(env, &prev_insn_idx);
18211907
if (insn_idx < 0) {
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
21442230
if (ret < 0)
21452231
goto skip_full_check;
21462232

2233+
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
2234+
21472235
ret = do_check(env);
21482236

21492237
skip_full_check:

kernel/sysctl.c

+13
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,7 @@
6464
#include <linux/binfmts.h>
6565
#include <linux/sched/sysctl.h>
6666
#include <linux/kexec.h>
67+
#include <linux/bpf.h>
6768

6869
#include <asm/uaccess.h>
6970
#include <asm/processor.h>
@@ -1138,6 +1139,18 @@ static struct ctl_table kern_table[] = {
11381139
.mode = 0644,
11391140
.proc_handler = timer_migration_handler,
11401141
},
1142+
#endif
1143+
#ifdef CONFIG_BPF_SYSCALL
1144+
{
1145+
.procname = "unprivileged_bpf_disabled",
1146+
.data = &sysctl_unprivileged_bpf_disabled,
1147+
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
1148+
.mode = 0644,
1149+
/* only handle a transition from default "0" to "1" */
1150+
.proc_handler = proc_dointvec_minmax,
1151+
.extra1 = &one,
1152+
.extra2 = &one,
1153+
},
11411154
#endif
11421155
{ }
11431156
};

net/core/filter.c

+2-1
Original file line number | Diff line number | Diff line change
@@ -1640,7 +1640,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
16401640
case BPF_FUNC_ktime_get_ns:
16411641
return &bpf_ktime_get_ns_proto;
16421642
case BPF_FUNC_trace_printk:
1643-
return bpf_get_trace_printk_proto();
1643+
if (capable(CAP_SYS_ADMIN))
1644+
return bpf_get_trace_printk_proto();
16441645
default:
16451646
return NULL;
16461647
}

0 commit comments

Comments
 (0)