
Commit d83525c

Alexei Starovoitov authored, Daniel Borkmann (borkmann) committed
bpf: introduce bpf_spin_lock
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf program serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take one bpf_spin_lock at a time, since two or more can
  cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  It drastically simplifies implementation yet allows bpf program to use
  any number of bpf_spin_locks.
- when bpf_spin_lock is taken, calls (either bpf2bpf or helpers) are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use bpf_spin_lock() helper the BTF description of map value must be
  a struct and have 'struct bpf_spin_lock anyname;' field at the top level.
  Nested lock inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside a networking packet.
  bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside a spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks.

Implementation details:
- cgroup-bpf class of programs can nest with xdp/tc programs. Hence
  bpf_spin_lock is equivalent to spin_lock_irqsave. Other solutions to avoid
  nested bpf_spin_lock are possible, like making sure that all networking
  progs run with softirq disabled. spin_lock_irqsave is the simplest and
  doesn't add overhead to the programs that don't use it.
- arch_spinlock_t is used when it is implemented as queued_spin_lock.
- archs can force their own arch_spinlock_t.
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32), a trivial lock is used.
- presence of bpf_spin_lock inside map value could have been indicated via
  an extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes.

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper to
  request the kernel to grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements.

Acked-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
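For orientation, a minimal sketch of a complete program using the new helpers.
It is written in today's libbpf BTF-defined-map style, which post-dates this
commit; the names hash_map and bump and the "tc" section are illustrative.
The essential point is that the map's BTF value type carries the
'struct bpf_spin_lock' member, so the kernel knows its offset.

struct hash_elem {
	struct bpf_spin_lock lock;	/* must be a top-level member of the value */
	int cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, int);
	__type(value, struct hash_elem);
} hash_map SEC(".maps");

SEC("tc")
int bump(struct __sk_buff *skb)
{
	int key = 0;
	struct hash_elem *val;

	val = bpf_map_lookup_elem(&hash_map, &key);
	if (val) {
		bpf_spin_lock(&val->lock);	/* no helper or bpf2bpf calls until unlock */
		val->cnt++;
		bpf_spin_unlock(&val->lock);	/* must unlock before returning */
	}
	return 0;
}

char _license[] SEC("license") = "GPL";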
1 parent 1832f4e commit d83525c

14 files changed: +386, -26 lines

include/linux/bpf.h

+34-3
@@ -72,14 +72,15 @@ struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
-	u32 pages;
+	int spin_lock_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
+	u32 pages;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	/* 51 bytes hole */

 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };

+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+	return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+	if (likely(!map_value_has_spin_lock(map)))
+		return;
+	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+		(struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	if (unlikely(map_value_has_spin_lock(map))) {
+		u32 off = map->spin_lock_off;
+
+		memcpy(dst, src, off);
+		memcpy(dst + off + sizeof(struct bpf_spin_lock),
+		       src + off + sizeof(struct bpf_spin_lock),
+		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	} else {
+		memcpy(dst, src, map->value_size);
+	}
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;

@@ -162,6 +191,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
+	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 };

 /* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
-
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;

 /* Shared helpers among cBPF and eBPF. */

include/linux/bpf_verifier.h

+1
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	u32 active_spin_lock;
 	bool speculative;
 };

include/linux/btf.h

+1
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
			   const struct btf_member *m,
			   u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);

 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);

include/uapi/linux/bpf.h

+6-1
@@ -2422,7 +2422,9 @@ union bpf_attr {
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
 	FN(msg_pop_data),		\
-	FN(rc_pointer_rel),
+	FN(rc_pointer_rel),		\
+	FN(spin_lock),			\
+	FN(spin_unlock),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
 	__u32	line_col;
 };

+struct bpf_spin_lock {
+	__u32	val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
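On the program side, the two new helper IDs are reachable through the generated
BPF_FUNC_spin_lock and BPF_FUNC_spin_unlock constants. A hypothetical
program-side declaration in the style of the selftests' bpf_helpers.h (the
wrapper declarations below are illustrative, not part of this patch):

static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) =
	(void *) BPF_FUNC_spin_lock;
static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) =
	(void *) BPF_FUNC_spin_unlock;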

kernel/Kconfig.locks

+3
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
 	depends on SMP

+config BPF_ARCH_SPINLOCK
+	bool
+
 config ARCH_USE_QUEUED_RWLOCKS
 	bool

kernel/bpf/arraymap.c

+4-3
@@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+		copy_map_value(map,
+			       array->value +
+			       array->elem_size * (index & array->index_mask),
+			       value);
 	return 0;
 }

kernel/bpf/btf.c

+42
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }

+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }

+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								    member->type);
+		if (!__btf_type_is_struct(member_type))
+			continue;
+		if (member_type->size != sizeof(struct bpf_spin_lock))
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+			   "bpf_spin_lock"))
+			continue;
+		if (off != -ENOENT)
+			/* only one 'struct bpf_spin_lock' is allowed */
+			return -E2BIG;
+		off = btf_member_bit_offset(t, member);
+		if (off % 8)
+			/* valid C code cannot generate such BTF */
+			return -EINVAL;
+		off /= 8;
+		if (off % __alignof__(struct bpf_spin_lock))
+			/* valid struct bpf_spin_lock will be 4 byte aligned */
+			return -EINVAL;
+	}
+	return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
				u32 type_id, void *data, u8 bits_offset,
				struct seq_file *m)
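In practice the lock must be a direct member of the map value type:
btf_find_spin_lock() only matches a member whose type is a struct literally
named bpf_spin_lock. A few illustrative value layouts (the type names below
are made up, not from this patch):

/* accepted: returns offsetof(struct elem_ok, lock) */
struct elem_ok {
	struct bpf_spin_lock lock;
	int cnt;
};

/* not found (-ENOENT): the member's type is 'wrapped', not 'bpf_spin_lock',
 * so a lock nested inside another struct is ignored
 */
struct wrapped {
	struct bpf_spin_lock lock;
};
struct elem_nested {
	struct wrapped w;
	int cnt;
};

/* rejected (-E2BIG): only one lock per map element is allowed */
struct elem_two {
	struct bpf_spin_lock l1;
	struct bpf_spin_lock l2;
};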

kernel/bpf/core.c

+2
@@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;

 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;

kernel/bpf/hashtab.c

+10-11
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 	       BITS_PER_LONG == 64;
 }

-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
-	u32 size = htab->map.value_size;
-
-	if (percpu || fd_htab_map_needs_adjust(htab))
-		size = round_up(size, 8);
-	return size;
-}
-
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
 {
-	u32 size = htab_size_value(htab, percpu);
+	u32 size = htab->map.value_size;
 	bool prealloc = htab_is_prealloc(htab);
 	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
+		check_and_init_map_lock(&htab->map,
+					l_new->key + round_up(key_size, 8));
 	}

 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
+		size = round_up(size, 8);
 		if (prealloc) {
 			pptr = htab_elem_get_ptr(l_new, key_size);
 		} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,

 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
-	} else {
+	} else if (fd_htab_map_needs_adjust(htab)) {
+		size = round_up(size, 8);
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	} else {
+		copy_map_value(&htab->map,
+			       l_new->key + round_up(key_size, 8),
+			       value);
 	}

 	l_new->hash = hash;

kernel/bpf/helpers.c

+80
@@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg2_type	= ARG_CONST_SIZE,
 };

+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+	union {
+		__u32 val;
+		arch_spinlock_t lock;
+	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+	arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+
+	arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+	do {
+		atomic_cond_read_relaxed(l, !VAL);
+	} while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__bpf_spin_lock(lock);
+	__this_cpu_write(irqsave_flags, flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+	.func		= bpf_spin_lock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	flags = __this_cpu_read(irqsave_flags);
+	__bpf_spin_unlock(lock);
+	local_irq_restore(flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+	.func		= bpf_spin_unlock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {

kernel/bpf/map_in_map.c

+5
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		return ERR_PTR(-EINVAL);
 	}

+	if (map_value_has_spin_lock(inner_map)) {
+		fdput(f);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
 	inner_map_meta_size = sizeof(*inner_map_meta);
 	/* In some cases verifier needs to access beyond just base map. */
 	if (inner_map->ops == &array_map_ops)

kernel/bpf/syscall.c

+19-2
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }

-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;

+	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+	if (map_value_has_spin_lock(map)) {
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+		    map->value_size) {
+			WARN_ONCE(1,
+				  "verifier bug spin_lock_off %d value_size %d\n",
+				  map->spin_lock_off, map->value_size);
+			return -EFAULT;
+		}
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);

@@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 		map->btf_key_type_id = attr->btf_key_type_id;
 		map->btf_value_type_id = attr->btf_value_type_id;
+	} else {
+		map->spin_lock_off = -EINVAL;
 	}

 	err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			memcpy(value, ptr, value_size);
+			copy_map_value(map, value, ptr);
 		}
 		rcu_read_unlock();
 	}
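Since the lock offset is derived in map_check_btf(), the map has to be created
with BTF describing its value type; without BTF, spin_lock_off stays -EINVAL
and bpf_spin_lock() cannot be used with that map. A rough user-space fragment
of such a map_create call (error handling and the preceding BPF_BTF_LOAD step
omitted; btf_fd, key_type_id and value_type_id are assumed to come from that
earlier load, and struct hash_elem is the value type from the commit message):

	union bpf_attr attr = {
		.map_type          = BPF_MAP_TYPE_HASH,
		.key_size          = sizeof(int),
		.value_size        = sizeof(struct hash_elem),
		.max_entries       = 1024,
		.btf_fd            = btf_fd,          /* fd from a prior BPF_BTF_LOAD */
		.btf_key_type_id   = key_type_id,     /* BTF type id of the key type */
		.btf_value_type_id = value_type_id,   /* BTF type id of 'struct hash_elem' */
	};
	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));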
