Skip to content

Commit 0f8e4bd

Browse files
Alexei Starovoitov, davem330
Alexei Starovoitov
authored and committed
bpf: add hashtable type of eBPF maps
add new map type BPF_MAP_TYPE_HASH and its implementation

- maps are created/destroyed by userspace. Both userspace and eBPF programs
  can lookup/update/delete elements from the map
- eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism
  for concurrent updates
- key/value are opaque range of bytes (aligned to 8 bytes)
- user space provides 3 configuration attributes via BPF syscall:
  key_size, value_size, max_entries
- map takes care of allocating/freeing key/value pairs
- map_update_elem() must fail to insert new element when max_entries
  limit is reached to make sure that eBPF programs cannot exhaust memory
- map_update_elem() replaces elements in an atomic way
- optimized for speed of lookup() which can be called multiple times from
  eBPF program which itself is triggered by high volume of events
  . in the future JIT compiler may recognize lookup() call and optimize it
    further, since key_size is constant for life of eBPF program

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 3274f52 commit 0f8e4bd

File tree

3 files changed

+364
-1
lines changed

3 files changed

+364
-1
lines changed

include/uapi/linux/bpf.h

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ enum bpf_cmd {
111111

112112
/* Map implementations selectable when a map is created. */
enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
	/* hash table; key and value are opaque byte ranges */
	BPF_MAP_TYPE_HASH,
};
115116

116117
enum bpf_prog_type {

kernel/bpf/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
obj-y := core.o
2-
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
2+
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
33
ifdef CONFIG_TEST_BPF
44
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
55
endif

kernel/bpf/hashtab.c

+362
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,362 @@
1+
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*
7+
* This program is distributed in the hope that it will be useful, but
8+
* WITHOUT ANY WARRANTY; without even the implied warranty of
9+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10+
* General Public License for more details.
11+
*/
12+
#include <linux/bpf.h>
13+
#include <linux/jhash.h>
14+
#include <linux/filter.h>
15+
#include <linux/vmalloc.h>
16+
17+
struct bpf_htab {
18+
struct bpf_map map;
19+
struct hlist_head *buckets;
20+
spinlock_t lock;
21+
u32 count; /* number of elements in this hashtable */
22+
u32 n_buckets; /* number of hash buckets */
23+
u32 elem_size; /* size of each element in bytes */
24+
};
25+
26+
/* each htab element is struct htab_elem + key + value */
27+
struct htab_elem {
28+
struct hlist_node hash_node;
29+
struct rcu_head rcu;
30+
u32 hash;
31+
char key[0] __aligned(8);
32+
};
33+
34+
/* Called from syscall */
35+
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36+
{
37+
struct bpf_htab *htab;
38+
int err, i;
39+
40+
htab = kzalloc(sizeof(*htab), GFP_USER);
41+
if (!htab)
42+
return ERR_PTR(-ENOMEM);
43+
44+
/* mandatory map attributes */
45+
htab->map.key_size = attr->key_size;
46+
htab->map.value_size = attr->value_size;
47+
htab->map.max_entries = attr->max_entries;
48+
49+
/* check sanity of attributes.
50+
* value_size == 0 may be allowed in the future to use map as a set
51+
*/
52+
err = -EINVAL;
53+
if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54+
htab->map.value_size == 0)
55+
goto free_htab;
56+
57+
/* hash table size must be power of 2 */
58+
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59+
60+
err = -E2BIG;
61+
if (htab->map.key_size > MAX_BPF_STACK)
62+
/* eBPF programs initialize keys on stack, so they cannot be
63+
* larger than max stack size
64+
*/
65+
goto free_htab;
66+
67+
err = -ENOMEM;
68+
htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
69+
GFP_USER | __GFP_NOWARN);
70+
71+
if (!htab->buckets) {
72+
htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
73+
if (!htab->buckets)
74+
goto free_htab;
75+
}
76+
77+
for (i = 0; i < htab->n_buckets; i++)
78+
INIT_HLIST_HEAD(&htab->buckets[i]);
79+
80+
spin_lock_init(&htab->lock);
81+
htab->count = 0;
82+
83+
htab->elem_size = sizeof(struct htab_elem) +
84+
round_up(htab->map.key_size, 8) +
85+
htab->map.value_size;
86+
return &htab->map;
87+
88+
free_htab:
89+
kfree(htab);
90+
return ERR_PTR(err);
91+
}
92+
93+
/* Hash 'key_len' bytes at 'key' with jhash, using a fixed initval of 0. */
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	const u32 initval = 0;

	return jhash(key, key_len, initval);
}
97+
98+
/* Map a hash value to its bucket head. n_buckets is a power of two, so
 * masking with (n_buckets - 1) yields the bucket index.
 */
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	u32 mask = htab->n_buckets - 1;

	return &htab->buckets[hash & mask];
}
102+
103+
/* Walk one bucket and return the element whose cached hash and key bytes
 * both match, or NULL when the bucket holds no such element.
 */
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *elem;

	hlist_for_each_entry_rcu(elem, head, hash_node) {
		/* cheap reject before comparing the key bytes */
		if (elem->hash != hash)
			continue;
		if (memcmp(&elem->key, key, key_size) == 0)
			return elem;
	}

	return NULL;
}
114+
115+
/* Called from syscall or from eBPF program */
116+
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
117+
{
118+
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
119+
struct hlist_head *head;
120+
struct htab_elem *l;
121+
u32 hash, key_size;
122+
123+
/* Must be called with rcu_read_lock. */
124+
WARN_ON_ONCE(!rcu_read_lock_held());
125+
126+
key_size = map->key_size;
127+
128+
hash = htab_map_hash(key, key_size);
129+
130+
head = select_bucket(htab, hash);
131+
132+
l = lookup_elem_raw(head, hash, key, key_size);
133+
134+
if (l)
135+
return l->key + round_up(map->key_size, 8);
136+
137+
return NULL;
138+
}
139+
140+
/* Called from syscall */
141+
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
142+
{
143+
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
144+
struct hlist_head *head;
145+
struct htab_elem *l, *next_l;
146+
u32 hash, key_size;
147+
int i;
148+
149+
WARN_ON_ONCE(!rcu_read_lock_held());
150+
151+
key_size = map->key_size;
152+
153+
hash = htab_map_hash(key, key_size);
154+
155+
head = select_bucket(htab, hash);
156+
157+
/* lookup the key */
158+
l = lookup_elem_raw(head, hash, key, key_size);
159+
160+
if (!l) {
161+
i = 0;
162+
goto find_first_elem;
163+
}
164+
165+
/* key was found, get next key in the same bucket */
166+
next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
167+
struct htab_elem, hash_node);
168+
169+
if (next_l) {
170+
/* if next elem in this hash list is non-zero, just return it */
171+
memcpy(next_key, next_l->key, key_size);
172+
return 0;
173+
}
174+
175+
/* no more elements in this hash list, go to the next bucket */
176+
i = hash & (htab->n_buckets - 1);
177+
i++;
178+
179+
find_first_elem:
180+
/* iterate over buckets */
181+
for (; i < htab->n_buckets; i++) {
182+
head = select_bucket(htab, i);
183+
184+
/* pick first element in the bucket */
185+
next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
186+
struct htab_elem, hash_node);
187+
if (next_l) {
188+
/* if it's not empty, just return it */
189+
memcpy(next_key, next_l->key, key_size);
190+
return 0;
191+
}
192+
}
193+
194+
/* itereated over all buckets and all elements */
195+
return -ENOENT;
196+
}
197+
198+
/* Called from syscall or from eBPF program */
199+
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
200+
u64 map_flags)
201+
{
202+
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
203+
struct htab_elem *l_new, *l_old;
204+
struct hlist_head *head;
205+
unsigned long flags;
206+
u32 key_size;
207+
int ret;
208+
209+
if (map_flags > BPF_EXIST)
210+
/* unknown flags */
211+
return -EINVAL;
212+
213+
WARN_ON_ONCE(!rcu_read_lock_held());
214+
215+
/* allocate new element outside of lock */
216+
l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
217+
if (!l_new)
218+
return -ENOMEM;
219+
220+
key_size = map->key_size;
221+
222+
memcpy(l_new->key, key, key_size);
223+
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
224+
225+
l_new->hash = htab_map_hash(l_new->key, key_size);
226+
227+
/* bpf_map_update_elem() can be called in_irq() */
228+
spin_lock_irqsave(&htab->lock, flags);
229+
230+
head = select_bucket(htab, l_new->hash);
231+
232+
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
233+
234+
if (!l_old && unlikely(htab->count >= map->max_entries)) {
235+
/* if elem with this 'key' doesn't exist and we've reached
236+
* max_entries limit, fail insertion of new elem
237+
*/
238+
ret = -E2BIG;
239+
goto err;
240+
}
241+
242+
if (l_old && map_flags == BPF_NOEXIST) {
243+
/* elem already exists */
244+
ret = -EEXIST;
245+
goto err;
246+
}
247+
248+
if (!l_old && map_flags == BPF_EXIST) {
249+
/* elem doesn't exist, cannot update it */
250+
ret = -ENOENT;
251+
goto err;
252+
}
253+
254+
/* add new element to the head of the list, so that concurrent
255+
* search will find it before old elem
256+
*/
257+
hlist_add_head_rcu(&l_new->hash_node, head);
258+
if (l_old) {
259+
hlist_del_rcu(&l_old->hash_node);
260+
kfree_rcu(l_old, rcu);
261+
} else {
262+
htab->count++;
263+
}
264+
spin_unlock_irqrestore(&htab->lock, flags);
265+
266+
return 0;
267+
err:
268+
spin_unlock_irqrestore(&htab->lock, flags);
269+
kfree(l_new);
270+
return ret;
271+
}
272+
273+
/* Called from syscall or from eBPF program */
274+
static int htab_map_delete_elem(struct bpf_map *map, void *key)
275+
{
276+
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
277+
struct hlist_head *head;
278+
struct htab_elem *l;
279+
unsigned long flags;
280+
u32 hash, key_size;
281+
int ret = -ENOENT;
282+
283+
WARN_ON_ONCE(!rcu_read_lock_held());
284+
285+
key_size = map->key_size;
286+
287+
hash = htab_map_hash(key, key_size);
288+
289+
spin_lock_irqsave(&htab->lock, flags);
290+
291+
head = select_bucket(htab, hash);
292+
293+
l = lookup_elem_raw(head, hash, key, key_size);
294+
295+
if (l) {
296+
hlist_del_rcu(&l->hash_node);
297+
htab->count--;
298+
kfree_rcu(l, rcu);
299+
ret = 0;
300+
}
301+
302+
spin_unlock_irqrestore(&htab->lock, flags);
303+
return ret;
304+
}
305+
306+
static void delete_all_elements(struct bpf_htab *htab)
307+
{
308+
int i;
309+
310+
for (i = 0; i < htab->n_buckets; i++) {
311+
struct hlist_head *head = select_bucket(htab, i);
312+
struct hlist_node *n;
313+
struct htab_elem *l;
314+
315+
hlist_for_each_entry_safe(l, n, head, hash_node) {
316+
hlist_del_rcu(&l->hash_node);
317+
htab->count--;
318+
kfree(l);
319+
}
320+
}
321+
}
322+
323+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
324+
static void htab_map_free(struct bpf_map *map)
325+
{
326+
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
327+
328+
/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
329+
* so the programs (can be more than one that used this map) were
330+
* disconnected from events. Wait for outstanding critical sections in
331+
* these programs to complete
332+
*/
333+
synchronize_rcu();
334+
335+
/* some of kfree_rcu() callbacks for elements of this map may not have
336+
* executed. It's ok. Proceed to free residual elements and map itself
337+
*/
338+
delete_all_elements(htab);
339+
kvfree(htab->buckets);
340+
kfree(htab);
341+
}
342+
343+
static struct bpf_map_ops htab_ops = {
344+
.map_alloc = htab_map_alloc,
345+
.map_free = htab_map_free,
346+
.map_get_next_key = htab_map_get_next_key,
347+
.map_lookup_elem = htab_map_lookup_elem,
348+
.map_update_elem = htab_map_update_elem,
349+
.map_delete_elem = htab_map_delete_elem,
350+
};
351+
352+
static struct bpf_map_type_list tl = {
353+
.ops = &htab_ops,
354+
.type = BPF_MAP_TYPE_HASH,
355+
};
356+
357+
/* Initcall that registers the hash map type; always returns 0. */
static int __init register_htab_map(void)
{
	bpf_register_map_type(&tl);

	return 0;
}
late_initcall(register_htab_map);

0 commit comments

Comments
 (0)