Skip to content

Commit 87f5fc7

Browse files
dsahern authored and borkmann committed
bpf: Provide helper to do forwarding lookups in kernel FIB table
Provide a helper for doing a FIB and neighbor lookup in the kernel tables from an XDP program. The helper provides a fastpath for forwarding packets. If the packet is a local delivery or for any reason is not a simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the neighbor is already known. If the neighbor does not exist, the first few packets go up the stack for neighbor resolution. Once resolved, the xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 are implemented in this patch. The API includes layer 4 parameters if the XDP program chooses to do deep packet inspection to allow comparison against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes straight to the table associated with the device (expert setting for those looking to maximize throughput)
- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969     7,074,156         7,415,333
IPv6   1,728,000     6,165,504         7,262,720

These numbers are for single CPU core forwarding on a Broadwell E5-1650 v4 @ 3.60GHz.

Signed-off-by: David Ahern <[email protected]>
Acked-by: Jesper Dangaard Brouer <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
1 parent 65a2022 commit 87f5fc7

File tree

2 files changed

+347
-1
lines changed

2 files changed

+347
-1
lines changed

Diff for: include/uapi/linux/bpf.h

+80-1
Original file line numberDiff line numberDiff line change
@@ -1828,6 +1828,33 @@ union bpf_attr {
18281828
* Return
18291829
* 0 on success, or a negative error in case of failure.
18301830
*
1831+
*
1832+
* int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
1833+
* Description
1834+
* Do FIB lookup in kernel tables using parameters in *params*.
1835+
* If lookup is successful and result shows packet is to be
1836+
* forwarded, the neighbor tables are searched for the nexthop.
1837+
* If successful (i.e., FIB lookup shows forwarding and nexthop
1838+
* is resolved), the nexthop address is returned in ipv4_dst,
1839+
* ipv6_dst or mpls_out based on family, smac is set to mac
1840+
* address of egress device, dmac is set to nexthop mac address,
1841+
* rt_metric is set to metric from route.
1842+
*
1843+
* *plen* argument is the size of the passed in struct.
1844+
* *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
1845+
*
1846+
* **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
1847+
* full lookup using FIB rules
1848+
* **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
1849+
* perspective (default is ingress)
1850+
*
1851+
* *ctx* is either **struct xdp_md** for XDP programs or
1852+
* **struct sk_buff** for tc cls_act programs.
1853+
*
1854+
* Return
1855+
* Egress device index on success, 0 if packet needs to continue
1856+
* up the stack for further processing or a negative error in case
1857+
* of failure.
18311858
*/
18321859
#define __BPF_FUNC_MAPPER(FN) \
18331860
FN(unspec), \
@@ -1898,7 +1925,8 @@ union bpf_attr {
18981925
FN(xdp_adjust_tail), \
18991926
FN(skb_get_xfrm_state), \
19001927
FN(get_stack), \
1901-
FN(skb_load_bytes_relative),
1928+
FN(skb_load_bytes_relative), \
1929+
FN(fib_lookup),
19021930

19031931
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
19041932
* function eBPF program intends to call
@@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args {
23212349
__u64 args[0];
23222350
};
23232351

2352+
/* Flags accepted by bpf_fib_lookup():
 * DIRECT: Skip the FIB rules and go to FIB table associated with device
 * OUTPUT: Do lookup from egress perspective; default is ingress
 *
 * The values are spelled as plain shifts rather than BIT(): this is a
 * UAPI header, and BIT() is a kernel-internal macro that programs
 * including this file from userspace do not get.
 */
#define BPF_FIB_LOOKUP_DIRECT	(1U << 0)
#define BPF_FIB_LOOKUP_OUTPUT	(1U << 1)
2357+
2358+
/* Parameter block passed to bpf_fib_lookup(); it carries both the lookup
 * inputs and, on a successful forwarding lookup, the results.
 *
 * Notes for callers, based on how the IPv4/IPv6 lookups in
 * net/core/filter.c use this struct:
 *  - tos/flowlabel share storage with rt_metric, so the input value is
 *    overwritten once the route metric is stored.
 *  - the destination union is rewritten with the gateway address when the
 *    selected nexthop has one.
 *  - h_vlan_proto and h_vlan_TCI are written as 0 together with smac/dmac.
 *  - NOTE(review): tot_len is described as being used for an MTU check,
 *    but neither lookup function reads it - confirm whether the check
 *    is still to be implemented.
 */
struct bpf_fib_lookup {
2359+
/* input */
2360+
__u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */
2361+
2362+
/* set if lookup is to consider L4 data - e.g., FIB rules */
2363+
__u8 l4_protocol;
2364+
__be16 sport;
2365+
__be16 dport;
2366+
2367+
/* total length of packet from network header - used for MTU check */
2368+
__u16 tot_len;
2369+
__u32 ifindex; /* L3 device index for lookup */
2370+
2371+
union {
2372+
/* inputs to lookup */
2373+
__u8 tos; /* AF_INET */
2374+
__be32 flowlabel; /* AF_INET6 */
2375+
2376+
/* output: metric of fib result */
2377+
__u32 rt_metric;
2378+
};
2379+
2380+
union {
2381+
__be32 mpls_in;
2382+
__be32 ipv4_src;
2383+
__u32 ipv6_src[4]; /* in6_addr; network order */
2384+
};
2385+
2386+
/* input to bpf_fib_lookup, *dst is destination address.
2387+
* output: bpf_fib_lookup sets to gateway address
2388+
*/
2389+
union {
2390+
/* return for MPLS lookups */
2391+
__be32 mpls_out[4]; /* support up to 4 labels */
2392+
__be32 ipv4_dst;
2393+
__u32 ipv6_dst[4]; /* in6_addr; network order */
2394+
};
2395+
2396+
/* output */
2397+
__be16 h_vlan_proto;
2398+
__be16 h_vlan_TCI;
2399+
__u8 smac[6]; /* ETH_ALEN */
2400+
__u8 dmac[6]; /* ETH_ALEN */
2401+
};
2402+
23242403
#endif /* _UAPI__LINUX_BPF_H__ */

Diff for: net/core/filter.c

+267
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@
6060
#include <net/xfrm.h>
6161
#include <linux/bpf_trace.h>
6262
#include <net/xdp_sock.h>
63+
#include <linux/inetdevice.h>
64+
#include <net/ip_fib.h>
65+
#include <net/flow.h>
66+
#include <net/arp.h>
6367

6468
/**
6569
* sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
40324036
};
40334037
#endif
40344038

4039+
#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
4040+
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
4041+
const struct neighbour *neigh,
4042+
const struct net_device *dev)
4043+
{
4044+
memcpy(params->dmac, neigh->ha, ETH_ALEN);
4045+
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
4046+
params->h_vlan_TCI = 0;
4047+
params->h_vlan_proto = 0;
4048+
4049+
return dev->ifindex;
4050+
}
4051+
#endif
4052+
4053+
#if IS_ENABLED(CONFIG_INET)
4054+
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4055+
u32 flags)
4056+
{
4057+
struct in_device *in_dev;
4058+
struct neighbour *neigh;
4059+
struct net_device *dev;
4060+
struct fib_result res;
4061+
struct fib_nh *nh;
4062+
struct flowi4 fl4;
4063+
int err;
4064+
4065+
dev = dev_get_by_index_rcu(net, params->ifindex);
4066+
if (unlikely(!dev))
4067+
return -ENODEV;
4068+
4069+
/* verify forwarding is enabled on this interface */
4070+
in_dev = __in_dev_get_rcu(dev);
4071+
if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
4072+
return 0;
4073+
4074+
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4075+
fl4.flowi4_iif = 1;
4076+
fl4.flowi4_oif = params->ifindex;
4077+
} else {
4078+
fl4.flowi4_iif = params->ifindex;
4079+
fl4.flowi4_oif = 0;
4080+
}
4081+
fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
4082+
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
4083+
fl4.flowi4_flags = 0;
4084+
4085+
fl4.flowi4_proto = params->l4_protocol;
4086+
fl4.daddr = params->ipv4_dst;
4087+
fl4.saddr = params->ipv4_src;
4088+
fl4.fl4_sport = params->sport;
4089+
fl4.fl4_dport = params->dport;
4090+
4091+
if (flags & BPF_FIB_LOOKUP_DIRECT) {
4092+
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4093+
struct fib_table *tb;
4094+
4095+
tb = fib_get_table(net, tbid);
4096+
if (unlikely(!tb))
4097+
return 0;
4098+
4099+
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
4100+
} else {
4101+
fl4.flowi4_mark = 0;
4102+
fl4.flowi4_secid = 0;
4103+
fl4.flowi4_tun_key.tun_id = 0;
4104+
fl4.flowi4_uid = sock_net_uid(net, NULL);
4105+
4106+
err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
4107+
}
4108+
4109+
if (err || res.type != RTN_UNICAST)
4110+
return 0;
4111+
4112+
if (res.fi->fib_nhs > 1)
4113+
fib_select_path(net, &res, &fl4, NULL);
4114+
4115+
nh = &res.fi->fib_nh[res.nh_sel];
4116+
4117+
/* do not handle lwt encaps right now */
4118+
if (nh->nh_lwtstate)
4119+
return 0;
4120+
4121+
dev = nh->nh_dev;
4122+
if (unlikely(!dev))
4123+
return 0;
4124+
4125+
if (nh->nh_gw)
4126+
params->ipv4_dst = nh->nh_gw;
4127+
4128+
params->rt_metric = res.fi->fib_priority;
4129+
4130+
/* xdp and cls_bpf programs are run in RCU-bh so
4131+
* rcu_read_lock_bh is not needed here
4132+
*/
4133+
neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
4134+
if (neigh)
4135+
return bpf_fib_set_fwd_params(params, neigh, dev);
4136+
4137+
return 0;
4138+
}
4139+
#endif
4140+
4141+
#if IS_ENABLED(CONFIG_IPV6)
4142+
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4143+
u32 flags)
4144+
{
4145+
struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
4146+
struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
4147+
struct neighbour *neigh;
4148+
struct net_device *dev;
4149+
struct inet6_dev *idev;
4150+
struct fib6_info *f6i;
4151+
struct flowi6 fl6;
4152+
int strict = 0;
4153+
int oif;
4154+
4155+
/* link local addresses are never forwarded */
4156+
if (rt6_need_strict(dst) || rt6_need_strict(src))
4157+
return 0;
4158+
4159+
dev = dev_get_by_index_rcu(net, params->ifindex);
4160+
if (unlikely(!dev))
4161+
return -ENODEV;
4162+
4163+
idev = __in6_dev_get_safely(dev);
4164+
if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
4165+
return 0;
4166+
4167+
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4168+
fl6.flowi6_iif = 1;
4169+
oif = fl6.flowi6_oif = params->ifindex;
4170+
} else {
4171+
oif = fl6.flowi6_iif = params->ifindex;
4172+
fl6.flowi6_oif = 0;
4173+
strict = RT6_LOOKUP_F_HAS_SADDR;
4174+
}
4175+
fl6.flowlabel = params->flowlabel;
4176+
fl6.flowi6_scope = 0;
4177+
fl6.flowi6_flags = 0;
4178+
fl6.mp_hash = 0;
4179+
4180+
fl6.flowi6_proto = params->l4_protocol;
4181+
fl6.daddr = *dst;
4182+
fl6.saddr = *src;
4183+
fl6.fl6_sport = params->sport;
4184+
fl6.fl6_dport = params->dport;
4185+
4186+
if (flags & BPF_FIB_LOOKUP_DIRECT) {
4187+
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4188+
struct fib6_table *tb;
4189+
4190+
tb = ipv6_stub->fib6_get_table(net, tbid);
4191+
if (unlikely(!tb))
4192+
return 0;
4193+
4194+
f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
4195+
} else {
4196+
fl6.flowi6_mark = 0;
4197+
fl6.flowi6_secid = 0;
4198+
fl6.flowi6_tun_key.tun_id = 0;
4199+
fl6.flowi6_uid = sock_net_uid(net, NULL);
4200+
4201+
f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
4202+
}
4203+
4204+
if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
4205+
return 0;
4206+
4207+
if (unlikely(f6i->fib6_flags & RTF_REJECT ||
4208+
f6i->fib6_type != RTN_UNICAST))
4209+
return 0;
4210+
4211+
if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
4212+
f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
4213+
fl6.flowi6_oif, NULL,
4214+
strict);
4215+
4216+
if (f6i->fib6_nh.nh_lwtstate)
4217+
return 0;
4218+
4219+
if (f6i->fib6_flags & RTF_GATEWAY)
4220+
*dst = f6i->fib6_nh.nh_gw;
4221+
4222+
dev = f6i->fib6_nh.nh_dev;
4223+
params->rt_metric = f6i->fib6_metric;
4224+
4225+
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
4226+
* not needed here. Can not use __ipv6_neigh_lookup_noref here
4227+
* because we need to get nd_tbl via the stub
4228+
*/
4229+
neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
4230+
ndisc_hashfn, dst, dev);
4231+
if (neigh)
4232+
return bpf_fib_set_fwd_params(params, neigh, dev);
4233+
4234+
return 0;
4235+
}
4236+
#endif
4237+
4238+
/* XDP entry point for the fib_lookup helper.
 *
 * Validates the size of the caller's parameter block and dispatches on
 * params->family, using the namespace of the receive queue's device.
 *
 * Return: -EINVAL when *plen* is smaller than struct bpf_fib_lookup;
 * -ENOTSUPP when params->family has no lookup compiled in, the same
 * answer bpf_skb_fib_lookup() gives, so that an unsupported family is
 * reported as an error rather than as "continue up the stack";
 * otherwise the result of the per-family lookup.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	if (plen < sizeof(*params))
		return -EINVAL;

	switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
	case AF_INET:
		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags);
#endif
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags);
#endif
	}
	return -ENOTSUPP;
}
4258+
4259+
/* Prototype descriptor for bpf_xdp_fib_lookup; xdp_func_proto() returns it
 * for BPF_FUNC_fib_lookup. Argument slots in order: context pointer,
 * memory pointer (params) followed by its constant size (plen), then
 * flags declared as ARG_ANYTHING. The helper returns an integer.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
4260+
.func = bpf_xdp_fib_lookup,
4261+
.gpl_only = true,
4262+
.ret_type = RET_INTEGER,
4263+
.arg1_type = ARG_PTR_TO_CTX,
4264+
.arg2_type = ARG_PTR_TO_MEM,
4265+
.arg3_type = ARG_CONST_SIZE,
4266+
.arg4_type = ARG_ANYTHING,
4267+
};
4268+
4269+
/* skb (tc cls_act) entry point for the fib_lookup helper.
 *
 * Checks that the caller's parameter block is large enough, then hands
 * the request to the lookup matching params->family, using the namespace
 * of skb->dev.
 *
 * Return: -EINVAL when *plen* is smaller than struct bpf_fib_lookup,
 * -ENOTSUPP when no lookup is compiled in for params->family, otherwise
 * the result of the per-family lookup.
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	if (plen < sizeof(*params))
		return -EINVAL;

#if IS_ENABLED(CONFIG_INET)
	if (params->family == AF_INET)
		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
#endif
#if IS_ENABLED(CONFIG_IPV6)
	if (params->family == AF_INET6)
		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
#endif

	return -ENOTSUPP;
}
4287+
4288+
/* Prototype descriptor for bpf_skb_fib_lookup; tc_cls_act_func_proto()
 * returns it for BPF_FUNC_fib_lookup. Argument slots in order: context
 * pointer, memory pointer (params) followed by its constant size (plen),
 * then flags declared as ARG_ANYTHING. The helper returns an integer.
 */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
4289+
.func = bpf_skb_fib_lookup,
4290+
.gpl_only = true,
4291+
.ret_type = RET_INTEGER,
4292+
.arg1_type = ARG_PTR_TO_CTX,
4293+
.arg2_type = ARG_PTR_TO_MEM,
4294+
.arg3_type = ARG_CONST_SIZE,
4295+
.arg4_type = ARG_ANYTHING,
4296+
};
4297+
40354298
static const struct bpf_func_proto *
40364299
bpf_base_func_proto(enum bpf_func_id func_id)
40374300
{
@@ -4181,6 +4444,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
41814444
case BPF_FUNC_skb_get_xfrm_state:
41824445
return &bpf_skb_get_xfrm_state_proto;
41834446
#endif
4447+
case BPF_FUNC_fib_lookup:
4448+
return &bpf_skb_fib_lookup_proto;
41844449
default:
41854450
return bpf_base_func_proto(func_id);
41864451
}
@@ -4206,6 +4471,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
42064471
return &bpf_xdp_redirect_map_proto;
42074472
case BPF_FUNC_xdp_adjust_tail:
42084473
return &bpf_xdp_adjust_tail_proto;
4474+
case BPF_FUNC_fib_lookup:
4475+
return &bpf_xdp_fib_lookup_proto;
42094476
default:
42104477
return bpf_base_func_proto(func_id);
42114478
}

0 commit comments

Comments
 (0)