
Commit b741f16

rgushchin authored and borkmann committed
bpf: introduce per-cpu cgroup local storage
bpf: introduce per-cpu cgroup local storage

This commit introduces per-cpu cgroup local storage. Per-cpu cgroup local storage is very similar to simple cgroup storage (let's call it shared), except that all the data is per-cpu. The main goal of the per-cpu variant is to implement super-fast counters (e.g. packet counters), which require neither lookups nor atomic operations.

From userspace's point of view, accessing a per-cpu cgroup storage is similar to other per-cpu map types (e.g. per-cpu hashmaps and arrays). Writing to a per-cpu cgroup storage is not atomic, but is performed by copying longs, so some minimal atomicity is guaranteed, exactly as with other per-cpu maps.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Daniel Borkmann <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Acked-by: Song Liu <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
1 parent f294b37 commit b741f16

File tree

8 files changed (+179, -28 lines)


Diff for: include/linux/bpf-cgroup.h

+19 -1

@@ -37,7 +37,10 @@ struct bpf_storage_buffer {
 };
 
 struct bpf_cgroup_storage {
-	struct bpf_storage_buffer *buf;
+	union {
+		struct bpf_storage_buffer *buf;
+		void __percpu *percpu_buf;
+	};
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_cgroup_storage_key key;
 	struct list_head list;
@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return BPF_CGROUP_STORAGE_PERCPU;
+
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
+				     void *value, u64 flags);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
 ({ \
@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
+static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
+						 void *value) {
+	return 0;
+}
+static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
+						   void *key, void *value, u64 flags) {
+	return 0;
+}
 
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)

Diff for: include/linux/bpf.h

+1

@@ -274,6 +274,7 @@ struct bpf_prog_offload {
 
 enum bpf_cgroup_storage_type {
 	BPF_CGROUP_STORAGE_SHARED,
+	BPF_CGROUP_STORAGE_PERCPU,
 	__BPF_CGROUP_STORAGE_MAX
 };
Diff for: include/linux/bpf_types.h

+1

@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)

Diff for: include/uapi/linux/bpf.h

+1

@@ -127,6 +127,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {

Diff for: kernel/bpf/helpers.c

+7 -1

@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
 	struct bpf_cgroup_storage *storage;
+	void *ptr;
 
 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
 
-	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		ptr = &READ_ONCE(storage->buf)->data[0];
+	else
+		ptr = this_cpu_ptr(storage->percpu_buf);
+
+	return (unsigned long)ptr;
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
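
The counter use case from the commit message maps directly onto this helper: a cgroup skb program obtains a pointer into its own CPU's slot and increments it with plain stores. Below is a minimal sketch, assuming the era's bpf_map_def map-definition style and the bpf_helpers.h wrapper for bpf_get_local_storage(); all names are illustrative, not part of this commit:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* SEC(), bpf_get_local_storage() wrapper */

struct pkt_cnt {
	__u64 packets;
	__u64 bytes;
};

struct bpf_map_def SEC("maps") percpu_cnt_map = {
	.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(struct pkt_cnt),
};

SEC("cgroup/skb")
int count_packets(struct __sk_buff *skb)
{
	/* pointer into this CPU's slot: no lookup, no atomics needed */
	struct pkt_cnt *cnt = bpf_get_local_storage(&percpu_cnt_map, 0);

	cnt->packets++;
	cnt->bytes += skb->len;

	return 1;	/* allow the packet */
}

char _license[] SEC("license") = "GPL";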

Diff for: kernel/bpf/local_storage.c

+130 -20

@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+				   void *value)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(storage->percpu_buf, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+				     void *value, u64 map_flags)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+		return -EINVAL;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+				value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 				       void *_next_key)
 {
@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 	spin_unlock_bh(&map->lock);
 }
 
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+	size_t size;
+
+	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+		size = sizeof(struct bpf_storage_buffer) + map->value_size;
+		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	} else {
+		size = map->value_size;
+		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
				  PAGE_SIZE) >> PAGE_SHIFT;
+	}
+
+	return size;
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
+	gfp_t flags;
+	size_t size;
 	u32 pages;
 
 	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	size = bpf_cgroup_storage_calculate_size(map, &pages);
+
 	if (bpf_map_charge_memlock(map, pages))
 		return ERR_PTR(-EPERM);
 
 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
 			       __GFP_ZERO | GFP_USER, map->numa_node);
-	if (!storage) {
-		bpf_map_uncharge_memlock(map, pages);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!storage)
+		goto enomem;
 
-	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
-				    map->value_size, __GFP_ZERO | GFP_USER,
-				    map->numa_node);
-	if (!storage->buf) {
-		bpf_map_uncharge_memlock(map, pages);
-		kfree(storage);
-		return ERR_PTR(-ENOMEM);
+	flags = __GFP_ZERO | GFP_USER;
+
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		storage->buf = kmalloc_node(size, flags, map->numa_node);
+		if (!storage->buf)
+			goto enomem;
+	} else {
+		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+		if (!storage->percpu_buf)
+			goto enomem;
 	}
 
 	storage->map = (struct bpf_cgroup_storage_map *)map;
 
 	return storage;
+
+enomem:
+	bpf_map_uncharge_memlock(map, pages);
+	kfree(storage);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	kfree(storage->buf);
+	kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	free_percpu(storage->percpu_buf);
+	kfree(storage);
 }
 
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
 {
-	u32 pages;
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_map *map;
+	u32 pages;
 
 	if (!storage)
 		return;
 
 	map = &storage->map->map;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	bpf_cgroup_storage_calculate_size(map, &pages);
 	bpf_map_uncharge_memlock(map, pages);
 
-	kfree_rcu(storage->buf, rcu);
-	kfree_rcu(storage, rcu);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+	else
+		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
 }
 
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
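
To make the charging logic concrete, a worked example with hypothetical numbers: with value_size = 12 and four possible CPUs, the per-cpu branch returns size = 12 and charges round_up(round_up(12, 8) * 4, PAGE_SIZE) >> PAGE_SHIFT = round_up(64, 4096) >> 12 = 1 page, after which __alloc_percpu_gfp() is called with that size and 8-byte alignment. The shared branch instead charges for sizeof(struct bpf_cgroup_storage) + sizeof(struct bpf_storage_buffer) + 12 bytes, as before.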

Diff for: kernel/bpf/syscall.c

+9 -2

@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		value_size = sizeof(u32);
@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
 	} else if (IS_FD_ARRAY(map)) {
@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else
 		value_size = map->value_size;
@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       attr->flags);
 	} else if (IS_FD_ARRAY(map)) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
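
On the syscall side, a lookup on a per-cpu cgroup storage map now behaves like the other per-cpu map types: userspace receives round_up(value_size, 8) bytes per possible CPU. A hedged user-space sketch using libbpf's bpf_map_lookup_elem() wrapper; n_cpus must be the number of possible CPUs (e.g. parsed from /sys/devices/system/cpu/possible), and an 8-byte value is assumed so each CPU's slot is exactly one __u64:

#include <stdlib.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>	/* bpf_map_lookup_elem() syscall wrapper */

/* Sum a per-cpu cgroup storage counter across all possible CPUs.
 * Assumes map_fd refers to a BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE map
 * whose value_size is 8.
 */
static int read_total(int map_fd, const struct bpf_cgroup_storage_key *key,
		      int n_cpus, unsigned long long *total)
{
	unsigned long long *values;
	int cpu, err;

	/* one round_up(value_size, 8)-byte slot per possible CPU */
	values = calloc(n_cpus, sizeof(*values));
	if (!values)
		return -1;

	err = bpf_map_lookup_elem(map_fd, key, values);
	if (!err) {
		*total = 0;
		for (cpu = 0; cpu < n_cpus; cpu++)
			*total += values[cpu];
	}

	free(values);
	return err;
}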

Diff for: kernel/bpf/verifier.c

+11 -4

@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
+	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
 		if (func_id != BPF_FUNC_get_local_storage)
 			goto error;
 		break;
@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_get_local_storage:
-		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 			goto error;
 		break;
 	case BPF_FUNC_sk_select_reuseport:
@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 		}
 		env->used_maps[env->used_map_cnt++] = map;
 
-		if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+		if (bpf_map_is_cgroup_storage(map) &&
 		    bpf_cgroup_storage_assign(env->prog, map)) {
-			verbose(env,
-				"only one cgroup storage is allowed\n");
+			verbose(env, "only one cgroup storage of each type is allowed\n");
 			fdput(f);
 			return -EBUSY;
 		}
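
The reworded verifier message reflects the relaxed rule: bpf_cgroup_storage_assign() is now keyed by storage type, so a single program may reference one shared and one per-cpu cgroup storage map at the same time; only a second map of the same type fails with -EBUSY. A sketch with illustrative names, assuming the bpf_map_def style:

/* Both definitions can coexist in one program after this commit;
 * a second map of either *same* type would still be rejected:
 * "only one cgroup storage of each type is allowed".
 */
struct bpf_map_def SEC("maps") shared_cnt = {
	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};

struct bpf_map_def SEC("maps") percpu_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};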
