Skip to content

Commit e9ddbb7

Browse files
jsitnickiAlexei Starovoitov
authored and
Alexei Starovoitov
committed
bpf: Introduce SK_LOOKUP program type with a dedicated attach point
Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols.

When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are:

 (1) steer packets destined to an IP range, on fixed port to a socket

     192.0.2.0/24, port 80 -> NGINX socket

 (2) steer packets destined to an IP address, on any port to a socket

     198.51.100.1, any port -> L7 proxy socket

In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier.

To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup.

In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail.

This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks.

Suggested-by: Marek Majkowski <[email protected]>
Signed-off-by: Jakub Sitnicki <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent ce3aa9c commit e9ddbb7

File tree

10 files changed

+312
-4
lines changed

10 files changed

+312
-4
lines changed

Diff for: include/linux/bpf-netns.h

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
enum netns_bpf_attach_type {
99
NETNS_BPF_INVALID = -1,
1010
NETNS_BPF_FLOW_DISSECTOR = 0,
11+
NETNS_BPF_SK_LOOKUP,
1112
MAX_NETNS_BPF_ATTACH_TYPE
1213
};
1314

@@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
1718
switch (attach_type) {
1819
case BPF_FLOW_DISSECTOR:
1920
return NETNS_BPF_FLOW_DISSECTOR;
21+
case BPF_SK_LOOKUP:
22+
return NETNS_BPF_SK_LOOKUP;
2023
default:
2124
return NETNS_BPF_INVALID;
2225
}

Diff for: include/linux/bpf.h

+1
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ enum bpf_arg_type {
249249
ARG_PTR_TO_INT, /* pointer to int */
250250
ARG_PTR_TO_LONG, /* pointer to long */
251251
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
252+
ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
252253
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
253254
ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
254255
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */

Diff for: include/linux/bpf_types.h

+2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
6464
#ifdef CONFIG_INET
6565
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
6666
struct sk_reuseport_md, struct sk_reuseport_kern)
67+
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
68+
struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
6769
#endif
6870
#if defined(CONFIG_BPF_JIT)
6971
BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,

Diff for: include/linux/filter.h

+17
Original file line numberDiff line numberDiff line change
@@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern {
12781278
s32 retval;
12791279
};
12801280

1281+
struct bpf_sk_lookup_kern {
1282+
u16 family;
1283+
u16 protocol;
1284+
struct {
1285+
__be32 saddr;
1286+
__be32 daddr;
1287+
} v4;
1288+
struct {
1289+
const struct in6_addr *saddr;
1290+
const struct in6_addr *daddr;
1291+
} v6;
1292+
__be16 sport;
1293+
u16 dport;
1294+
struct sock *selected_sk;
1295+
bool no_reuseport;
1296+
};
1297+
12811298
#endif /* __LINUX_FILTER_H__ */

Diff for: include/uapi/linux/bpf.h

+77
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ enum bpf_prog_type {
189189
BPF_PROG_TYPE_STRUCT_OPS,
190190
BPF_PROG_TYPE_EXT,
191191
BPF_PROG_TYPE_LSM,
192+
BPF_PROG_TYPE_SK_LOOKUP,
192193
};
193194

194195
enum bpf_attach_type {
@@ -228,6 +229,7 @@ enum bpf_attach_type {
228229
BPF_XDP_DEVMAP,
229230
BPF_CGROUP_INET_SOCK_RELEASE,
230231
BPF_XDP_CPUMAP,
232+
BPF_SK_LOOKUP,
231233
__MAX_BPF_ATTACH_TYPE
232234
};
233235

@@ -3069,6 +3071,10 @@ union bpf_attr {
30693071
*
30703072
* long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
30713073
* Description
3074+
* Helper is overloaded depending on BPF program type. This
3075+
* description applies to **BPF_PROG_TYPE_SCHED_CLS** and
3076+
* **BPF_PROG_TYPE_SCHED_ACT** programs.
3077+
*
30723078
* Assign the *sk* to the *skb*. When combined with appropriate
30733079
* routing configuration to receive the packet towards the socket,
30743080
* will cause *skb* to be delivered to the specified socket.
@@ -3094,6 +3100,56 @@ union bpf_attr {
30943100
* **-ESOCKTNOSUPPORT** if the socket type is not supported
30953101
* (reuseport).
30963102
*
3103+
* long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
3104+
* Description
3105+
* Helper is overloaded depending on BPF program type. This
3106+
* description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
3107+
*
3108+
* Select the *sk* as a result of a socket lookup.
3109+
*
3110+
* For the operation to succeed, the passed socket must be compatible
3111+
* with the packet description provided by the *ctx* object.
3112+
*
3113+
* L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
3114+
* be an exact match. While IP family (**AF_INET** or
3115+
* **AF_INET6**) must be compatible, that is IPv6 sockets
3116+
* that are not v6-only can be selected for IPv4 packets.
3117+
*
3118+
* Only TCP listeners and UDP unconnected sockets can be
3119+
* selected. *sk* can also be NULL to reset any previous
3120+
* selection.
3121+
*
3122+
* *flags* argument can be a combination of the following values:
3123+
*
3124+
* * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
3125+
* socket selection, potentially done by a BPF program
3126+
* that ran before us.
3127+
*
3128+
* * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
3129+
* load-balancing within reuseport group for the socket
3130+
* being selected.
3131+
*
3132+
* On success *ctx->sk* will point to the selected socket.
3133+
*
3134+
* Return
3135+
* 0 on success, or a negative errno in case of failure.
3136+
*
3137+
* * **-EAFNOSUPPORT** if socket family (*sk->family*) is
3138+
* not compatible with packet family (*ctx->family*).
3139+
*
3140+
* * **-EEXIST** if socket has been already selected,
3141+
* potentially by another program, and
3142+
* **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
3143+
*
3144+
* * **-EINVAL** if unsupported flags were specified.
3145+
*
3146+
* * **-EPROTOTYPE** if socket L4 protocol
3147+
* (*sk->protocol*) doesn't match packet protocol
3148+
* (*ctx->protocol*).
3149+
*
3150+
* * **-ESOCKTNOSUPPORT** if socket is not in allowed
3151+
* state (TCP listening or UDP unconnected).
3152+
*
30973153
* u64 bpf_ktime_get_boot_ns(void)
30983154
* Description
30993155
* Return the time elapsed since system boot, in nanoseconds.
@@ -3607,6 +3663,12 @@ enum {
36073663
BPF_RINGBUF_HDR_SZ = 8,
36083664
};
36093665

3666+
/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
3667+
enum {
3668+
BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0),
3669+
BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1),
3670+
};
3671+
36103672
/* Mode for BPF_FUNC_skb_adjust_room helper. */
36113673
enum bpf_adj_room_mode {
36123674
BPF_ADJ_ROOM_NET,
@@ -4349,4 +4411,19 @@ struct bpf_pidns_info {
43494411
__u32 pid;
43504412
__u32 tgid;
43514413
};
4414+
4415+
/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
4416+
struct bpf_sk_lookup {
4417+
__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
4418+
4419+
__u32 family; /* Protocol family (AF_INET, AF_INET6) */
4420+
__u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
4421+
__u32 remote_ip4; /* Network byte order */
4422+
__u32 remote_ip6[4]; /* Network byte order */
4423+
__u32 remote_port; /* Network byte order */
4424+
__u32 local_ip4; /* Network byte order */
4425+
__u32 local_ip6[4]; /* Network byte order */
4426+
__u32 local_port; /* Host byte order */
4427+
};
4428+
43524429
#endif /* _UAPI__LINUX_BPF_H__ */

Diff for: kernel/bpf/net_namespace.c

+5
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
373373
switch (type) {
374374
case NETNS_BPF_FLOW_DISSECTOR:
375375
return 1;
376+
case NETNS_BPF_SK_LOOKUP:
377+
return 64;
376378
default:
377379
return 0;
378380
}
@@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
403405
case NETNS_BPF_FLOW_DISSECTOR:
404406
err = flow_dissector_bpf_prog_attach_check(net, link->prog);
405407
break;
408+
case NETNS_BPF_SK_LOOKUP:
409+
err = 0; /* nothing to check */
410+
break;
406411
default:
407412
err = -EINVAL;
408413
break;

Diff for: kernel/bpf/syscall.c

+9
Original file line numberDiff line numberDiff line change
@@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
20222022
default:
20232023
return -EINVAL;
20242024
}
2025+
case BPF_PROG_TYPE_SK_LOOKUP:
2026+
if (expected_attach_type == BPF_SK_LOOKUP)
2027+
return 0;
2028+
return -EINVAL;
20252029
case BPF_PROG_TYPE_EXT:
20262030
if (expected_attach_type)
20272031
return -EINVAL;
@@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
27562760
case BPF_PROG_TYPE_CGROUP_SOCK:
27572761
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
27582762
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2763+
case BPF_PROG_TYPE_SK_LOOKUP:
27592764
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
27602765
case BPF_PROG_TYPE_CGROUP_SKB:
27612766
if (!capable(CAP_NET_ADMIN))
@@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
28172822
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
28182823
case BPF_TRACE_ITER:
28192824
return BPF_PROG_TYPE_TRACING;
2825+
case BPF_SK_LOOKUP:
2826+
return BPF_PROG_TYPE_SK_LOOKUP;
28202827
default:
28212828
return BPF_PROG_TYPE_UNSPEC;
28222829
}
@@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
29532960
case BPF_LIRC_MODE2:
29542961
return lirc_prog_query(attr, uattr);
29552962
case BPF_FLOW_DISSECTOR:
2963+
case BPF_SK_LOOKUP:
29562964
return netns_bpf_prog_query(attr, uattr);
29572965
default:
29582966
return -EINVAL;
@@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr)
38913899
ret = tracing_bpf_link_attach(attr, prog);
38923900
break;
38933901
case BPF_PROG_TYPE_FLOW_DISSECTOR:
3902+
case BPF_PROG_TYPE_SK_LOOKUP:
38943903
ret = netns_bpf_link_create(attr, prog);
38953904
break;
38963905
default:

Diff for: kernel/bpf/verifier.c

+10-3
Original file line numberDiff line numberDiff line change
@@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
38783878
}
38793879
meta->ref_obj_id = reg->ref_obj_id;
38803880
}
3881-
} else if (arg_type == ARG_PTR_TO_SOCKET) {
3881+
} else if (arg_type == ARG_PTR_TO_SOCKET ||
3882+
arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
38823883
expected_type = PTR_TO_SOCKET;
3883-
if (type != expected_type)
3884-
goto err_type;
3884+
if (!(register_is_null(reg) &&
3885+
arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
3886+
if (type != expected_type)
3887+
goto err_type;
3888+
}
38853889
} else if (arg_type == ARG_PTR_TO_BTF_ID) {
38863890
expected_type = PTR_TO_BTF_ID;
38873891
if (type != expected_type)
@@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env)
73547358
return -ENOTSUPP;
73557359
}
73567360
break;
7361+
case BPF_PROG_TYPE_SK_LOOKUP:
7362+
range = tnum_range(SK_DROP, SK_PASS);
7363+
break;
73577364
case BPF_PROG_TYPE_EXT:
73587365
/* freplace program can return anything as its return value
73597366
* depends on the to-be-replaced kernel func or bpf program.

0 commit comments

Comments
 (0)