Skip to content

Commit 546ac1f

Browse files
jrfastab authored and davem330 committed
bpf: add devmap, a map for storing net device references
Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to lookup a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key to support efficient multicast and broadcast support. Signed-off-by: John Fastabend <[email protected]> Acked-by: Daniel Borkmann <[email protected]> Acked-by: Jesper Dangaard Brouer <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 5acaee0 commit 546ac1f

File tree

6 files changed

+294
-0
lines changed

6 files changed

+294
-0
lines changed

include/linux/bpf_types.h

+3
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
3535
#endif
3636
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
3737
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
38+
#ifdef CONFIG_NET
39+
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
40+
#endif

include/uapi/linux/bpf.h

+1
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ enum bpf_map_type {
104104
BPF_MAP_TYPE_LPM_TRIE,
105105
BPF_MAP_TYPE_ARRAY_OF_MAPS,
106106
BPF_MAP_TYPE_HASH_OF_MAPS,
107+
BPF_MAP_TYPE_DEVMAP,
107108
};
108109

109110
enum bpf_prog_type {

kernel/bpf/Makefile

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ obj-y := core.o
22

33
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
44
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
5+
ifeq ($(CONFIG_NET),y)
6+
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
7+
endif
58
ifeq ($(CONFIG_PERF_EVENTS),y)
69
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
710
endif

kernel/bpf/devmap.c

+264
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*
7+
* This program is distributed in the hope that it will be useful, but
8+
* WITHOUT ANY WARRANTY; without even the implied warranty of
9+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10+
* General Public License for more details.
11+
*/
12+
13+
/* Devmaps primary use is as a backend map for XDP BPF helper call
14+
* bpf_redirect_map(). Because XDP is mostly concerned with performance we
15+
* spent some effort to ensure the datapath with redirect maps does not use
16+
* any locking. This is a quick note on the details.
17+
*
18+
* We have three possible paths to get into the devmap control plane bpf
19+
* syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
20+
* will invoke an update, delete, or lookup operation. To ensure updates and
21+
* deletes appear atomic from the datapath side xchg() is used to modify the
22+
* netdev_map array. Then because the datapath does a lookup into the netdev_map
23+
* array (read-only) from an RCU critical section we use call_rcu() to wait for
24+
* an rcu grace period before free'ing the old data structures. This ensures the
25+
* datapath always has a valid copy. However, the datapath does a "flush"
26+
* operation that pushes any pending packets in the driver outside the RCU
27+
* critical section. Each bpf_dtab_netdev tracks these pending operations using
28+
* an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
29+
* until all bits are cleared indicating outstanding flush operations have
30+
* completed.
31+
*
32+
* BPF syscalls may race with BPF program calls on any of the update, delete
33+
* or lookup operations. As noted above the xchg() operation also keep the
34+
* netdev_map consistent in this case. From the devmap side BPF programs
35+
* calling into these operations are the same as multiple user space threads
36+
* making system calls.
37+
*/
38+
#include <linux/bpf.h>
39+
#include <linux/jhash.h>
40+
#include <linux/filter.h>
41+
#include <linux/rculist_nulls.h>
42+
#include "percpu_freelist.h"
43+
#include "bpf_lru_list.h"
44+
#include "map_in_map.h"
45+
46+
/* Per-slot entry: a reference to one net_device plus the bookkeeping
 * needed to release it via RCU once the slot is replaced or deleted.
 */
struct bpf_dtab_netdev {
	struct net_device *dev;	/* refcounted via dev_get_by_index(); dropped with dev_put() */
	int key;		/* slot index this entry occupies in dtab->netdev_map */
	struct rcu_head rcu;	/* used by call_rcu(..., __dev_map_entry_free) */
	struct bpf_dtab *dtab;	/* back-pointer to the owning map */
};

/* The devmap itself: the generic bpf_map header plus a flat array of
 * entry pointers indexed by the u32 map key. Slots are swapped with
 * xchg() so updates/deletes appear atomic to RCU readers.
 */
struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map;
};
57+
58+
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
59+
{
60+
struct bpf_dtab *dtab;
61+
u64 cost;
62+
int err;
63+
64+
/* check sanity of attributes */
65+
if (attr->max_entries == 0 || attr->key_size != 4 ||
66+
attr->value_size != 4 || attr->map_flags)
67+
return ERR_PTR(-EINVAL);
68+
69+
/* if value_size is bigger, the user space won't be able to
70+
* access the elements.
71+
*/
72+
if (attr->value_size > KMALLOC_MAX_SIZE)
73+
return ERR_PTR(-E2BIG);
74+
75+
dtab = kzalloc(sizeof(*dtab), GFP_USER);
76+
if (!dtab)
77+
return ERR_PTR(-ENOMEM);
78+
79+
/* mandatory map attributes */
80+
dtab->map.map_type = attr->map_type;
81+
dtab->map.key_size = attr->key_size;
82+
dtab->map.value_size = attr->value_size;
83+
dtab->map.max_entries = attr->max_entries;
84+
dtab->map.map_flags = attr->map_flags;
85+
86+
err = -ENOMEM;
87+
88+
/* make sure page count doesn't overflow */
89+
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
90+
if (cost >= U32_MAX - PAGE_SIZE)
91+
goto free_dtab;
92+
93+
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
94+
95+
/* if map size is larger than memlock limit, reject it early */
96+
err = bpf_map_precharge_memlock(dtab->map.pages);
97+
if (err)
98+
goto free_dtab;
99+
100+
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
101+
sizeof(struct bpf_dtab_netdev *));
102+
if (!dtab->netdev_map)
103+
goto free_dtab;
104+
105+
return &dtab->map;
106+
107+
free_dtab:
108+
kfree(dtab);
109+
return ERR_PTR(err);
110+
}
111+
112+
/* Tear down the map once both the map refcount and all attached program
 * refcounts have dropped to zero. Releases every remaining netdev
 * reference and frees the slot array and the map itself.
 */
static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete. The rcu critical section only guarantees
	 * no further reads against netdev_map. It does __not__ ensure pending
	 * flush operations (if any) are complete.
	 */
	synchronize_rcu();

	/* No readers remain, so plain loads are fine here: drop the device
	 * reference taken at update time and free each live entry.
	 */
	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev;

		dev = dtab->netdev_map[i];
		if (!dev)
			continue;

		dev_put(dev->dev);
		kfree(dev);
	}

	/* At this point bpf program is detached and all pending operations
	 * _must_ be complete
	 */
	bpf_map_area_free(dtab->netdev_map);
	kfree(dtab);
}
143+
144+
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
145+
{
146+
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
147+
u32 index = key ? *(u32 *)key : U32_MAX;
148+
u32 *next = (u32 *)next_key;
149+
150+
if (index >= dtab->map.max_entries) {
151+
*next = 0;
152+
return 0;
153+
}
154+
155+
if (index == dtab->map.max_entries - 1)
156+
return -ENOENT;
157+
158+
*next = index + 1;
159+
return 0;
160+
}
161+
162+
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
163+
* update happens in parallel here a dev_put wont happen until after reading the
164+
* ifindex.
165+
*/
166+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
167+
{
168+
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
169+
struct bpf_dtab_netdev *dev;
170+
u32 i = *(u32 *)key;
171+
172+
if (i >= map->max_entries)
173+
return NULL;
174+
175+
dev = READ_ONCE(dtab->netdev_map[i]);
176+
return dev ? &dev->dev->ifindex : NULL;
177+
}
178+
179+
/* RCU callback: runs after a grace period once the entry has been removed
 * from netdev_map, so no reader can still reach it. Drops the device
 * reference taken by dev_get_by_index() and frees the entry.
 */
static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *old_dev;

	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_put(old_dev->dev);
	kfree(old_dev);
}
187+
188+
/* Clear the slot at *key. Returns 0 on success — including when the slot
 * was already empty — or -EINVAL for an out-of-range key.
 */
static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use synchronize_rcu() here to ensure any rcu critical sections
	 * have completed, but this does not guarantee a flush has happened
	 * yet. Because driver side rcu_read_lock/unlock only protects the
	 * running XDP program. However, for pending flush operations the
	 * dev and ctx are stored in another per cpu map. And additionally,
	 * the driver tear down ensures all soft irqs are complete before
	 * removing the net device in the case of dev_put equals zero.
	 *
	 * NOTE(review): the comment above says synchronize_rcu() but the
	 * code below defers the free with call_rcu() — confirm which is
	 * intended; the call_rcu() path is what actually runs here.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);	/* atomic vs concurrent update */
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}
210+
211+
/* Install or replace the entry at *key with the netdev identified by the
 * u32 ifindex in *value; an ifindex of 0 clears the slot. BPF_NOEXIST is
 * rejected (-EEXIST) since every slot of this array-style map notionally
 * always exists. Returns 0 on success or a negative errno.
 */
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct net *net = current->nsproxy->net_ns;
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 i = *(u32 *)key;
	u32 ifindex = *(u32 *)value;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		/* ifindex 0 means "empty this slot" */
		dev = NULL;
	} else {
		/* GFP_ATOMIC — presumably because this path may run in a
		 * non-sleepable context; TODO(review) confirm callers.
		 */
		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
		if (!dev)
			return -ENOMEM;

		/* takes a device reference; released by __dev_map_entry_free() */
		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			kfree(dev);
			return -EINVAL;
		}

		dev->key = i;
		dev->dtab = dtab;
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed
	 * Remembering the driver side flush operation will happen before the
	 * net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}
256+
257+
/* Devmap operations, wired up via
 * BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) in bpf_types.h.
 * Note: the verifier rejects map_lookup_elem calls on this map type from
 * BPF programs (see check_map_func_compatibility()); lookup here serves
 * the syscall path only.
 */
const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
};

kernel/bpf/verifier.c

+8
Original file line numberDiff line numberDiff line change
@@ -1276,6 +1276,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
12761276
func_id != BPF_FUNC_current_task_under_cgroup)
12771277
goto error;
12781278
break;
1279+
/* devmap returns a pointer to a live net_device ifindex that we cannot
1280+
* allow to be modified from bpf side. So do not allow lookup elements
1281+
* for now.
1282+
*/
1283+
case BPF_MAP_TYPE_DEVMAP:
1284+
if (func_id == BPF_FUNC_map_lookup_elem)
1285+
goto error;
1286+
break;
12791287
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
12801288
case BPF_MAP_TYPE_HASH_OF_MAPS:
12811289
if (func_id != BPF_FUNC_map_lookup_elem)

tools/testing/selftests/bpf/test_maps.c

+15
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,21 @@ static void test_arraymap_percpu_many_keys(void)
438438
close(fd);
439439
}
440440

441+
static void test_devmap(int task, void *data)
442+
{
443+
int next_key, fd;
444+
__u32 key, value;
445+
446+
fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value),
447+
2, 0);
448+
if (fd < 0) {
449+
printf("Failed to create arraymap '%s'!\n", strerror(errno));
450+
exit(1);
451+
}
452+
453+
close(fd);
454+
}
455+
441456
#define MAP_SIZE (32 * 1024)
442457

443458
static void test_map_large(void)

0 commit comments

Comments
 (0)