Skip to content

Commit 9e2ee5c

Browse files
joamaki authored and borkmann committed
net, bonding: Add XDP support to the bonding driver
XDP is implemented in the bonding driver by transparently delegating the XDP program loading, removal and xmit operations to the bonding slave devices. The overall goal of this work is that XDP programs can be attached to a bond device *without* any further changes (or awareness) necessary to the program itself, meaning the same XDP program can be attached to a native device but also a bonding device. Semantics of XDP_TX when attached to a bond are equivalent in such a setting to the case when a tc/BPF program would be attached to the bond, meaning transmitting the packet out of the bond itself using one of the bond's configured xmit methods to select a slave device (rather than XDP_TX on the slave itself). Handling of XDP_TX to transmit using the configured bonding mechanism is therefore implemented by rewriting the BPF program return value in bpf_prog_run_xdp. To avoid performance impact this check is guarded by a static key, which is incremented when an XDP program is loaded onto a bond device. This approach was chosen to avoid changes to drivers implementing XDP. If the slave device does not match the receive device, then XDP_REDIRECT is transparently used to perform the redirection in order to have the network driver release the packet from its RX ring. The bonding driver hashing functions have been refactored to allow reuse with xdp_buff's to avoid code duplication. The motivation for this change is to enable use of bonding (and 802.3ad) in hairpinning L4 load-balancers such as [1] implemented with XDP and also to transparently support bond devices for projects that use XDP given most modern NICs have dual port adapters. An alternative to this approach would be to implement 802.3ad in user-space and implement the bonding load-balancing in the XDP program itself, but that is a rather cumbersome endeavor in terms of slave device management (e.g. by watching netlink) and requires separate programs for native vs bond cases for the orchestrator. 
A native in-kernel implementation overcomes these issues and provides more flexibility. Below are benchmark results done on two machines with 100Gbit Intel E810 (ice) NIC and with 32-core 3970X on the sending machine, and 16-core 3950X on the receiving machine. 64 byte packets were sent with pktgen-dpdk at full rate. Two issues [2, 3] were identified with the ice driver, so the tests were performed with iommu=off and patch [2] applied. Additionally the bonding round robin algorithm was modified to use per-cpu tx counters as high CPU load (50% vs 10%) and high rate of cache misses were caused by the shared rr_tx_counter (see patch 2/3). The statistics were collected using "sar -n dev -u 1 10". On top of that, for ice, further work is in progress on improving the XDP_TX numbers [4].

 -----------------------|    CPU    |--| rxpck/s |--| txpck/s |----
 without patch (1 dev):
   XDP_DROP:              3.15%      48.6Mpps
   XDP_TX:                3.12%      18.3Mpps     18.3Mpps
   XDP_DROP (RSS):        9.47%      116.5Mpps
   XDP_TX (RSS):          9.67%      25.3Mpps     24.2Mpps
 -----------------------
 with patch, bond (1 dev):
   XDP_DROP:              3.14%      46.7Mpps
   XDP_TX:                3.15%      13.9Mpps     13.9Mpps
   XDP_DROP (RSS):        10.33%     117.2Mpps
   XDP_TX (RSS):          10.64%     25.1Mpps     24.0Mpps
 -----------------------
 with patch, bond (2 devs):
   XDP_DROP:              6.27%      92.7Mpps
   XDP_TX:                6.26%      17.6Mpps     17.5Mpps
   XDP_DROP (RSS):        11.38%     117.2Mpps
   XDP_TX (RSS):          14.30%     28.7Mpps     27.4Mpps
 --------------------------------------------------------------

RSS: Receive Side Scaling, e.g. the packets were sent to a range of destination IPs.
[1]: https://cilium.io/blog/2021/05/20/cilium-110#standalonelb
[2]: https://lore.kernel.org/bpf/[email protected]/T/#t
[3]: https://lore.kernel.org/bpf/CAHn8xckNXci+X_Eb2WMv4uVYjO2331UWB2JLtXr_58z0Av8+8A@mail.gmail.com/
[4]: https://lore.kernel.org/bpf/[email protected]/T/#t

Signed-off-by: Jussi Maki <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Cc: Jay Vosburgh <[email protected]>
Cc: Veaceslav Falico <[email protected]>
Cc: Andy Gospodarek <[email protected]>
Cc: Maciej Fijalkowski <[email protected]>
Cc: Magnus Karlsson <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent 879af96 commit 9e2ee5c

File tree

2 files changed

+309
-1
lines changed

2 files changed

+309
-1
lines changed

drivers/net/bonding/bond_main.c

Lines changed: 308 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,19 @@ bool bond_sk_check(struct bonding *bond)
317317
}
318318
}
319319

320+
static bool bond_xdp_check(struct bonding *bond)
321+
{
322+
switch (BOND_MODE(bond)) {
323+
case BOND_MODE_ROUNDROBIN:
324+
case BOND_MODE_ACTIVEBACKUP:
325+
case BOND_MODE_8023AD:
326+
case BOND_MODE_XOR:
327+
return true;
328+
default:
329+
return false;
330+
}
331+
}
332+
320333
/*---------------------------------- VLAN -----------------------------------*/
321334

322335
/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
@@ -2133,6 +2146,41 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
21332146
bond_update_slave_arr(bond, NULL);
21342147

21352148

2149+
if (!slave_dev->netdev_ops->ndo_bpf ||
2150+
!slave_dev->netdev_ops->ndo_xdp_xmit) {
2151+
if (bond->xdp_prog) {
2152+
NL_SET_ERR_MSG(extack, "Slave does not support XDP");
2153+
slave_err(bond_dev, slave_dev, "Slave does not support XDP\n");
2154+
res = -EOPNOTSUPP;
2155+
goto err_sysfs_del;
2156+
}
2157+
} else {
2158+
struct netdev_bpf xdp = {
2159+
.command = XDP_SETUP_PROG,
2160+
.flags = 0,
2161+
.prog = bond->xdp_prog,
2162+
.extack = extack,
2163+
};
2164+
2165+
if (dev_xdp_prog_count(slave_dev) > 0) {
2166+
NL_SET_ERR_MSG(extack,
2167+
"Slave has XDP program loaded, please unload before enslaving");
2168+
slave_err(bond_dev, slave_dev,
2169+
"Slave has XDP program loaded, please unload before enslaving\n");
2170+
res = -EOPNOTSUPP;
2171+
goto err_sysfs_del;
2172+
}
2173+
2174+
res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
2175+
if (res < 0) {
2176+
/* ndo_bpf() sets extack error message */
2177+
slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
2178+
goto err_sysfs_del;
2179+
}
2180+
if (bond->xdp_prog)
2181+
bpf_prog_inc(bond->xdp_prog);
2182+
}
2183+
21362184
slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
21372185
bond_is_active_slave(new_slave) ? "an active" : "a backup",
21382186
new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
@@ -2252,6 +2300,17 @@ static int __bond_release_one(struct net_device *bond_dev,
22522300
/* recompute stats just before removing the slave */
22532301
bond_get_stats(bond->dev, &bond->bond_stats);
22542302

2303+
if (bond->xdp_prog) {
2304+
struct netdev_bpf xdp = {
2305+
.command = XDP_SETUP_PROG,
2306+
.flags = 0,
2307+
.prog = NULL,
2308+
.extack = NULL,
2309+
};
2310+
if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp))
2311+
slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
2312+
}
2313+
22552314
/* unregister rx_handler early so bond_handle_frame wouldn't be called
22562315
* for this slave anymore.
22572316
*/
@@ -3638,7 +3697,7 @@ static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff
36383697
return 0;
36393698

36403699
ep = (struct ethhdr *)(data + mhoff);
3641-
return ep->h_dest[5] ^ ep->h_source[5] ^ ep->h_proto;
3700+
return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
36423701
}
36433702

36443703
static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
@@ -3807,6 +3866,26 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
38073866
skb_headlen(skb));
38083867
}
38093868

3869+
/**
3870+
* bond_xmit_hash_xdp - generate a hash value based on the xmit policy
3871+
* @bond: bonding device
3872+
* @xdp: buffer to use for headers
3873+
*
3874+
* The XDP variant of bond_xmit_hash.
3875+
*/
3876+
static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
3877+
{
3878+
struct ethhdr *eth;
3879+
3880+
if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
3881+
return 0;
3882+
3883+
eth = (struct ethhdr *)xdp->data;
3884+
3885+
return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
3886+
sizeof(struct ethhdr), xdp->data_end - xdp->data);
3887+
}
3888+
38103889
/*-------------------------- Device entry points ----------------------------*/
38113890

38123891
void bond_work_init_all(struct bonding *bond)
@@ -4455,6 +4534,47 @@ static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
44554534
return NULL;
44564535
}
44574536

4537+
static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
4538+
struct xdp_buff *xdp)
4539+
{
4540+
struct slave *slave;
4541+
int slave_cnt;
4542+
u32 slave_id;
4543+
const struct ethhdr *eth;
4544+
void *data = xdp->data;
4545+
4546+
if (data + sizeof(struct ethhdr) > xdp->data_end)
4547+
goto non_igmp;
4548+
4549+
eth = (struct ethhdr *)data;
4550+
data += sizeof(struct ethhdr);
4551+
4552+
/* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
4553+
if (eth->h_proto == htons(ETH_P_IP)) {
4554+
const struct iphdr *iph;
4555+
4556+
if (data + sizeof(struct iphdr) > xdp->data_end)
4557+
goto non_igmp;
4558+
4559+
iph = (struct iphdr *)data;
4560+
4561+
if (iph->protocol == IPPROTO_IGMP) {
4562+
slave = rcu_dereference(bond->curr_active_slave);
4563+
if (slave)
4564+
return slave;
4565+
return bond_get_slave_by_id(bond, 0);
4566+
}
4567+
}
4568+
4569+
non_igmp:
4570+
slave_cnt = READ_ONCE(bond->slave_cnt);
4571+
if (likely(slave_cnt)) {
4572+
slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
4573+
return bond_get_slave_by_id(bond, slave_id);
4574+
}
4575+
return NULL;
4576+
}
4577+
44584578
static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
44594579
struct net_device *bond_dev)
44604580
{
@@ -4670,6 +4790,22 @@ static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
46704790
return slave;
46714791
}
46724792

4793+
static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
4794+
struct xdp_buff *xdp)
4795+
{
4796+
struct bond_up_slave *slaves;
4797+
unsigned int count;
4798+
u32 hash;
4799+
4800+
hash = bond_xmit_hash_xdp(bond, xdp);
4801+
slaves = rcu_dereference(bond->usable_slaves);
4802+
count = slaves ? READ_ONCE(slaves->count) : 0;
4803+
if (unlikely(!count))
4804+
return NULL;
4805+
4806+
return slaves->arr[hash % count];
4807+
}
4808+
46734809
/* Use this Xmit function for 3AD as well as XOR modes. The current
46744810
* usable slave array is formed in the control path. The xmit function
46754811
* just calculates hash and sends the packet out.
@@ -4954,6 +5090,174 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
49545090
return ret;
49555091
}
49565092

5093+
static struct net_device *
5094+
bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
5095+
{
5096+
struct bonding *bond = netdev_priv(bond_dev);
5097+
struct slave *slave;
5098+
5099+
/* Caller needs to hold rcu_read_lock() */
5100+
5101+
switch (BOND_MODE(bond)) {
5102+
case BOND_MODE_ROUNDROBIN:
5103+
slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
5104+
break;
5105+
5106+
case BOND_MODE_ACTIVEBACKUP:
5107+
slave = bond_xmit_activebackup_slave_get(bond);
5108+
break;
5109+
5110+
case BOND_MODE_8023AD:
5111+
case BOND_MODE_XOR:
5112+
slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
5113+
break;
5114+
5115+
default:
5116+
/* Should never happen. Mode guarded by bond_xdp_check() */
5117+
netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond));
5118+
WARN_ON_ONCE(1);
5119+
return NULL;
5120+
}
5121+
5122+
if (slave)
5123+
return slave->dev;
5124+
5125+
return NULL;
5126+
}
5127+
5128+
/* ndo_xdp_xmit for the bond: hand the frames, one at a time, to whichever
 * slave the bonding mode selects for each of them.
 *
 * Transmission stops at the first frame for which no slave is found or which
 * the slave's ndo_xdp_xmit() does not accept. Returns the number of frames
 * passed on successfully; if the very first frame hit an error, that negative
 * error is returned instead (-ENXIO when no slave was available or n is 0).
 */
static int bond_xdp_xmit(struct net_device *bond_dev,
			 int n, struct xdp_frame **frames, u32 flags)
{
	int sent = 0;
	int rc = -ENXIO;

	rcu_read_lock();

	while (sent < n) {
		struct xdp_frame *single[] = { frames[sent] };
		struct net_device *out_dev;
		struct xdp_buff buff;

		/* Slave selection works on an xdp_buff, not on a frame. */
		xdp_convert_frame_to_buff(single[0], &buff);

		out_dev = bond_xdp_get_xmit_slave(bond_dev, &buff);
		if (!out_dev) {
			rc = -ENXIO;
			break;
		}

		rc = out_dev->netdev_ops->ndo_xdp_xmit(out_dev, 1, single, flags);
		if (rc < 1)
			break;

		sent++;
	}

	rcu_read_unlock();

	/* An error is only passed up if nothing at all was transmitted;
	 * otherwise report how many frames made it out.
	 */
	if (rc < 0 && sent == 0)
		return rc;

	return sent;
}
5164+
5165+
static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
5166+
struct netlink_ext_ack *extack)
5167+
{
5168+
struct bonding *bond = netdev_priv(dev);
5169+
struct list_head *iter;
5170+
struct slave *slave, *rollback_slave;
5171+
struct bpf_prog *old_prog;
5172+
struct netdev_bpf xdp = {
5173+
.command = XDP_SETUP_PROG,
5174+
.flags = 0,
5175+
.prog = prog,
5176+
.extack = extack,
5177+
};
5178+
int err;
5179+
5180+
ASSERT_RTNL();
5181+
5182+
if (!bond_xdp_check(bond))
5183+
return -EOPNOTSUPP;
5184+
5185+
old_prog = bond->xdp_prog;
5186+
bond->xdp_prog = prog;
5187+
5188+
bond_for_each_slave(bond, slave, iter) {
5189+
struct net_device *slave_dev = slave->dev;
5190+
5191+
if (!slave_dev->netdev_ops->ndo_bpf ||
5192+
!slave_dev->netdev_ops->ndo_xdp_xmit) {
5193+
NL_SET_ERR_MSG(extack, "Slave device does not support XDP");
5194+
slave_err(dev, slave_dev, "Slave does not support XDP\n");
5195+
err = -EOPNOTSUPP;
5196+
goto err;
5197+
}
5198+
5199+
if (dev_xdp_prog_count(slave_dev) > 0) {
5200+
NL_SET_ERR_MSG(extack,
5201+
"Slave has XDP program loaded, please unload before enslaving");
5202+
slave_err(dev, slave_dev,
5203+
"Slave has XDP program loaded, please unload before enslaving\n");
5204+
err = -EOPNOTSUPP;
5205+
goto err;
5206+
}
5207+
5208+
err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
5209+
if (err < 0) {
5210+
/* ndo_bpf() sets extack error message */
5211+
slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
5212+
goto err;
5213+
}
5214+
if (prog)
5215+
bpf_prog_inc(prog);
5216+
}
5217+
5218+
if (old_prog)
5219+
bpf_prog_put(old_prog);
5220+
5221+
if (prog)
5222+
static_branch_inc(&bpf_master_redirect_enabled_key);
5223+
else
5224+
static_branch_dec(&bpf_master_redirect_enabled_key);
5225+
5226+
return 0;
5227+
5228+
err:
5229+
/* unwind the program changes */
5230+
bond->xdp_prog = old_prog;
5231+
xdp.prog = old_prog;
5232+
xdp.extack = NULL; /* do not overwrite original error */
5233+
5234+
bond_for_each_slave(bond, rollback_slave, iter) {
5235+
struct net_device *slave_dev = rollback_slave->dev;
5236+
int err_unwind;
5237+
5238+
if (slave == rollback_slave)
5239+
break;
5240+
5241+
err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
5242+
if (err_unwind < 0)
5243+
slave_err(dev, slave_dev,
5244+
"Error %d when unwinding XDP program change\n", err_unwind);
5245+
else if (xdp.prog)
5246+
bpf_prog_inc(xdp.prog);
5247+
}
5248+
return err;
5249+
}
5250+
5251+
static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
5252+
{
5253+
switch (xdp->command) {
5254+
case XDP_SETUP_PROG:
5255+
return bond_xdp_set(dev, xdp->prog, xdp->extack);
5256+
default:
5257+
return -EINVAL;
5258+
}
5259+
}
5260+
49575261
static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
49585262
{
49595263
if (speed == 0 || speed == SPEED_UNKNOWN)
@@ -5042,6 +5346,9 @@ static const struct net_device_ops bond_netdev_ops = {
50425346
.ndo_features_check = passthru_features_check,
50435347
.ndo_get_xmit_slave = bond_xmit_get_slave,
50445348
.ndo_sk_get_lower_dev = bond_sk_get_lower_dev,
5349+
.ndo_bpf = bond_xdp,
5350+
.ndo_xdp_xmit = bond_xdp_xmit,
5351+
.ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave,
50455352
};
50465353

50475354
static const struct device_type bond_type = {

include/net/bonding.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ struct bonding {
258258
/* protecting ipsec_list */
259259
spinlock_t ipsec_lock;
260260
#endif /* CONFIG_XFRM_OFFLOAD */
261+
struct bpf_prog *xdp_prog;
261262
};
262263

263264
#define bond_slave_get_rcu(dev) \

0 commit comments

Comments
 (0)