Skip to content

Commit cb4d03a

Browse files
Brian Vazquez and Alexei Starovoitov
Brian Vazquez
authored and
Alexei Starovoitov
committed
bpf: Add generic support for lookup batch op
This commit introduces generic support for the bpf_map_lookup_batch. This implementation can be used by almost all the bpf maps since its core implementation is relying on the existing map_get_next_key and map_lookup_elem. The bpf syscall subcommand introduced is: BPF_MAP_LOOKUP_BATCH The UAPI attribute is: struct { /* struct used by BPF_MAP_*_BATCH commands */ __aligned_u64 in_batch; /* start batch, * NULL to start from beginning */ __aligned_u64 out_batch; /* output: next start batch */ __aligned_u64 keys; __aligned_u64 values; __u32 count; /* input/output: * input: # of key/value * elements * output: # of filled elements */ __u32 map_fd; __u64 elem_flags; __u64 flags; } batch; in_batch/out_batch are opaque values use to communicate between user/kernel space, in_batch/out_batch must be of key_size length. To start iterating from the beginning in_batch must be null, count is the # of key/value elements to retrieve. Note that the 'keys' buffer must be a buffer of key_size * count size and the 'values' buffer must be value_size * count, where value_size must be aligned to 8 bytes by userspace if it's dealing with percpu maps. 'count' will contain the number of keys/values successfully retrieved. Note that 'count' is an input/output variable and it can contain a lower value after a call. If there's no more entries to retrieve, ENOENT will be returned. If error is ENOENT, count might be > 0 in case it copied some values but there were no more entries to retrieve. Note that if the return code is an error and not -EFAULT, count indicates the number of elements successfully processed. Suggested-by: Stanislav Fomichev <[email protected]> Signed-off-by: Brian Vazquez <[email protected]> Signed-off-by: Yonghong Song <[email protected]> Signed-off-by: Alexei Starovoitov <[email protected]> Link: https://lore.kernel.org/bpf/[email protected]
1 parent 15c14a3 commit cb4d03a

File tree

3 files changed

+179
-4
lines changed

3 files changed

+179
-4
lines changed

Diff for: include/linux/bpf.h

+5
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ struct bpf_map_ops {
4444
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
4545
void (*map_release_uref)(struct bpf_map *map);
4646
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
47+
int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
48+
union bpf_attr __user *uattr);
4749

4850
/* funcs callable from userspace and from eBPF programs */
4951
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -982,6 +984,9 @@ void *bpf_map_area_alloc(u64 size, int numa_node);
982984
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
983985
void bpf_map_area_free(void *base);
984986
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
987+
int generic_map_lookup_batch(struct bpf_map *map,
988+
const union bpf_attr *attr,
989+
union bpf_attr __user *uattr);
985990

986991
extern int sysctl_unprivileged_bpf_disabled;
987992

Diff for: include/uapi/linux/bpf.h

+18
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ enum bpf_cmd {
107107
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
108108
BPF_MAP_FREEZE,
109109
BPF_BTF_GET_NEXT_ID,
110+
BPF_MAP_LOOKUP_BATCH,
110111
};
111112

112113
enum bpf_map_type {
@@ -420,6 +421,23 @@ union bpf_attr {
420421
__u64 flags;
421422
};
422423

424+
struct { /* struct used by BPF_MAP_*_BATCH commands */
425+
__aligned_u64 in_batch; /* start batch,
426+
* NULL to start from beginning
427+
*/
428+
__aligned_u64 out_batch; /* output: next start batch */
429+
__aligned_u64 keys;
430+
__aligned_u64 values;
431+
__u32 count; /* input/output:
432+
* input: # of key/value
433+
* elements
434+
* output: # of filled elements
435+
*/
436+
__u32 map_fd;
437+
__u64 elem_flags;
438+
__u64 flags;
439+
} batch;
440+
423441
struct { /* anonymous struct used by BPF_PROG_LOAD command */
424442
__u32 prog_type; /* one of enum bpf_prog_type */
425443
__u32 insn_cnt;

Diff for: kernel/bpf/syscall.c

+156-4
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
219219
void *ptr;
220220
int err;
221221

222-
if (bpf_map_is_dev_bound(map)) {
223-
err = bpf_map_offload_lookup_elem(map, key, value);
224-
return err;
225-
}
222+
if (bpf_map_is_dev_bound(map))
223+
return bpf_map_offload_lookup_elem(map, key, value);
226224

227225
preempt_disable();
228226
this_cpu_inc(bpf_prog_active);
@@ -1220,6 +1218,109 @@ static int map_get_next_key(union bpf_attr *attr)
12201218
return err;
12211219
}
12221220

1221+
#define MAP_LOOKUP_RETRIES 3
1222+
1223+
int generic_map_lookup_batch(struct bpf_map *map,
1224+
const union bpf_attr *attr,
1225+
union bpf_attr __user *uattr)
1226+
{
1227+
void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1228+
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1229+
void __user *values = u64_to_user_ptr(attr->batch.values);
1230+
void __user *keys = u64_to_user_ptr(attr->batch.keys);
1231+
void *buf, *buf_prevkey, *prev_key, *key, *value;
1232+
int err, retry = MAP_LOOKUP_RETRIES;
1233+
u32 value_size, cp, max_count;
1234+
bool first_key = false;
1235+
1236+
if (attr->batch.elem_flags & ~BPF_F_LOCK)
1237+
return -EINVAL;
1238+
1239+
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1240+
!map_value_has_spin_lock(map))
1241+
return -EINVAL;
1242+
1243+
value_size = bpf_map_value_size(map);
1244+
1245+
max_count = attr->batch.count;
1246+
if (!max_count)
1247+
return 0;
1248+
1249+
if (put_user(0, &uattr->batch.count))
1250+
return -EFAULT;
1251+
1252+
buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1253+
if (!buf_prevkey)
1254+
return -ENOMEM;
1255+
1256+
buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1257+
if (!buf) {
1258+
kvfree(buf_prevkey);
1259+
return -ENOMEM;
1260+
}
1261+
1262+
err = -EFAULT;
1263+
first_key = false;
1264+
prev_key = NULL;
1265+
if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1266+
goto free_buf;
1267+
key = buf;
1268+
value = key + map->key_size;
1269+
if (ubatch)
1270+
prev_key = buf_prevkey;
1271+
1272+
for (cp = 0; cp < max_count;) {
1273+
rcu_read_lock();
1274+
err = map->ops->map_get_next_key(map, prev_key, key);
1275+
rcu_read_unlock();
1276+
if (err)
1277+
break;
1278+
err = bpf_map_copy_value(map, key, value,
1279+
attr->batch.elem_flags);
1280+
1281+
if (err == -ENOENT) {
1282+
if (retry) {
1283+
retry--;
1284+
continue;
1285+
}
1286+
err = -EINTR;
1287+
break;
1288+
}
1289+
1290+
if (err)
1291+
goto free_buf;
1292+
1293+
if (copy_to_user(keys + cp * map->key_size, key,
1294+
map->key_size)) {
1295+
err = -EFAULT;
1296+
goto free_buf;
1297+
}
1298+
if (copy_to_user(values + cp * value_size, value, value_size)) {
1299+
err = -EFAULT;
1300+
goto free_buf;
1301+
}
1302+
1303+
if (!prev_key)
1304+
prev_key = buf_prevkey;
1305+
1306+
swap(prev_key, key);
1307+
retry = MAP_LOOKUP_RETRIES;
1308+
cp++;
1309+
}
1310+
1311+
if (err == -EFAULT)
1312+
goto free_buf;
1313+
1314+
if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1315+
(cp && copy_to_user(uobatch, prev_key, map->key_size))))
1316+
err = -EFAULT;
1317+
1318+
free_buf:
1319+
kfree(buf_prevkey);
1320+
kfree(buf);
1321+
return err;
1322+
}
1323+
12231324
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
12241325

12251326
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -3076,6 +3177,54 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
30763177
return err;
30773178
}
30783179

3180+
#define BPF_MAP_BATCH_LAST_FIELD batch.flags

/* Dispatch to a map's batch callback if the map implements it; otherwise
 * fail with -ENOTSUPP.  Multi-statement macro wrapped in do { } while (0)
 * so it expands safely inside unbraced if bodies.  Relies on 'err', 'map',
 * 'attr', 'uattr' and the 'err_put' label in the enclosing function.
 */
#define BPF_DO_BATCH(fn)			\
	do {					\
		if (!fn) {			\
			err = -ENOTSUPP;	\
			goto err_put;		\
		}				\
		err = fn(map, attr, uattr);	\
	} while (0)

/* Common entry point for BPF_MAP_*_BATCH syscall commands: validates the
 * attr, resolves the map fd, checks fd-mode permissions against the command
 * (lookup needs read; all other batch commands need write), then invokes the
 * map's per-command batch op.
 */
static int bpf_map_do_batch(const union bpf_attr *attr,
			    union bpf_attr __user *uattr,
			    int cmd)
{
	struct bpf_map *map;
	int err, ufd;
	struct fd f;

	/* Reject attrs with bits set past the last batch field. */
	if (CHECK_ATTR(BPF_MAP_BATCH))
		return -EINVAL;

	ufd = attr->batch.map_fd;
	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (cmd == BPF_MAP_LOOKUP_BATCH &&
	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
		err = -EPERM;
		goto err_put;
	}

	if (cmd != BPF_MAP_LOOKUP_BATCH &&
	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	if (cmd == BPF_MAP_LOOKUP_BATCH)
		BPF_DO_BATCH(map->ops->map_lookup_batch);

err_put:
	fdput(f);
	return err;
}
3227+
30793228
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
30803229
{
30813230
union bpf_attr attr = {};
@@ -3173,6 +3322,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
31733322
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
31743323
err = map_lookup_and_delete_elem(&attr);
31753324
break;
3325+
case BPF_MAP_LOOKUP_BATCH:
3326+
err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
3327+
break;
31763328
default:
31773329
err = -EINVAL;
31783330
break;

0 commit comments

Comments
 (0)