Commit cbf398d
Merge branch 'af-xdp-tx-batch'
Magnus Karlsson says:

====================

This patch set improves the performance of mainly the Tx processing of
AF_XDP sockets. Though, patch 3 also improves the Rx path. All in all,
this patch set improves the throughput of the l2fwd xdpsock application
by around 11%. If we just take a look at the Tx processing part, it is
improved by 35% to 40%.

Hopefully the new batched Tx interfaces should be of value to other
drivers implementing AF_XDP zero-copy support. But patch #3 is generic
and will improve performance of all drivers when using AF_XDP sockets
(under the premises explained in that patch).

@daniel. In patch 3, I apply all the padding required to hinder the
adjacency prefetcher to prefetch the wrong things. After this patch set,
I will submit another patch set that introduces
____cacheline_padding_in_smp in include/linux/cache.h according to your
suggestions. The last patch in that patch set will then convert the
explicit paddings that we have now to ____cacheline_padding_in_smp.

v2 -> v3:
* Fixed #pragma warning with clang and defined a loop_unrolled_for macro
  for easier readability [lkp, Nick]
* Simplified invalid descriptor handling in xskq_cons_read_desc_batch()

v1 -> v2:
* Removed added parameter in i40e_setup_tx_descriptors and adopted a
  simpler solution [Maciej]
* Added test for !xs in xsk_tx_peek_release_desc_batch() [John]
* Simplified return path in xsk_tx_peek_release_desc_batch() [John]
* Dropped patch #1 in v1 that introduced lazy completions. Hopefully
  this is not needed when we get busy poll [Jakub]
* Iterate over local variable in xskq_prod_reserve_addr_batch() for
  improved performance
* Fixed the fallback path in xsk_tx_peek_release_desc_batch() so that it
  also produces a batch of descriptors, albeit by using the slower (but
  more general) older code. This improves the performance of the case
  when multiple sockets are sharing the same device and queue id.

====================

Signed-off-by: Daniel Borkmann <[email protected]>
2 parents de91e63 + 3106c58 commit cbf398d
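
For orientation, here is a minimal sketch (not taken from the series itself) of how a zero-copy driver's NAPI Tx routine might consume the new batched interface. The my_ring type and the my_*() helpers are hypothetical stand-ins for driver-specific code; only xsk_tx_peek_release_desc_batch() is part of the kernel API added here.

/* Hedged sketch, not a real driver. my_ring and the my_*() helpers are
 * hypothetical; only xsk_tx_peek_release_desc_batch() comes from this series.
 */
static bool my_xmit_zc(struct my_ring *ring, unsigned int budget)
{
	struct xdp_desc *descs = ring->xsk_descs;	/* per-ring scratch array */
	unsigned int total_bytes = 0;
	u32 nb_pkts;

	/* One call hands back up to @budget descriptors instead of peeking
	 * and releasing them one at a time.
	 */
	nb_pkts = xsk_tx_peek_release_desc_batch(ring->xsk_pool, descs, budget);
	if (!nb_pkts)
		return false;

	my_fill_hw_ring(ring, descs, nb_pkts, &total_bytes);	/* program HW descriptors */
	my_request_irq_for_last_desc(ring);			/* one RS bit per batch */
	my_bump_tail(ring);					/* single doorbell write */

	return true;
}

The i40e changes below follow this shape: one peek call per NAPI poll, one RS bit, one tail bump.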

File tree: 8 files changed (+258, -56 lines)

drivers/net/ethernet/intel/i40e/i40e_txrx.c

Lines changed: 11 additions & 0 deletions
@@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
 	i40e_clean_tx_ring(tx_ring);
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 
 	if (tx_ring->desc) {
 		dma_free_coherent(tx_ring->dev, tx_ring->size,
@@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	if (!tx_ring->tx_bi)
 		goto err;
 
+	if (ring_is_xdp(tx_ring)) {
+		tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
+					     GFP_KERNEL);
+		if (!tx_ring->xsk_descs)
+			goto err;
+	}
+
 	u64_stats_init(&tx_ring->syncp);
 
 	/* round up to nearest 4K */
@@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	return 0;
 
 err:
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
 	return -ENOMEM;

drivers/net/ethernet/intel/i40e/i40e_txrx.h

Lines changed: 1 addition & 0 deletions
@@ -389,6 +389,7 @@ struct i40e_ring {
 	struct i40e_channel *ch;
 	struct xdp_rxq_info xdp_rxq;
 	struct xsk_buff_pool *xsk_pool;
+	struct xdp_desc *xsk_descs;      /* For storing descriptors in the AF_XDP ZC path */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)

drivers/net/ethernet/intel/i40e/i40e_xsk.c

Lines changed: 84 additions & 39 deletions
@@ -2,6 +2,7 @@
 /* Copyright(c) 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/stringify.h>
 #include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 
@@ -381,58 +382,102 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
-/**
- * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
- * @xdp_ring: XDP Tx ring
- * @budget: NAPI budget
- *
- * Returns true if the work is finished.
- **/
-static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
+static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+			  unsigned int *total_bytes)
 {
-	unsigned int sent_frames = 0, total_bytes = 0;
-	struct i40e_tx_desc *tx_desc = NULL;
-	struct i40e_tx_buffer *tx_bi;
-	struct xdp_desc desc;
+	struct i40e_tx_desc *tx_desc;
 	dma_addr_t dma;
 
-	while (budget-- > 0) {
-		if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-			break;
+	dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+	xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
 
-		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-						 desc.len);
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
+						  0, desc->len, 0);
 
-		tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
-		tx_bi->bytecount = desc.len;
+	*total_bytes += desc->len;
+}
 
-		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-		tx_desc->buffer_addr = cpu_to_le64(dma);
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(I40E_TX_DESC_CMD_ICRC
-				   | I40E_TX_DESC_CMD_EOP,
-				   0, desc.len, 0);
+static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+				unsigned int *total_bytes)
+{
+	u16 ntu = xdp_ring->next_to_use;
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+	u32 i;
 
-		sent_frames++;
-		total_bytes += tx_bi->bytecount;
+	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
 
-		xdp_ring->next_to_use++;
-		if (xdp_ring->next_to_use == xdp_ring->count)
-			xdp_ring->next_to_use = 0;
+		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
							  I40E_TX_DESC_CMD_EOP,
							  0, desc[i].len, 0);
+
+		*total_bytes += desc[i].len;
 	}
 
-	if (tx_desc) {
-		/* Request an interrupt for the last frame and bump tail ptr. */
-		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
-						 I40E_TXD_QW1_CMD_SHIFT);
-		i40e_xdp_ring_update_tail(xdp_ring);
+	xdp_ring->next_to_use = ntu;
+}
+
+static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
+				 unsigned int *total_bytes)
+{
+	u32 batched, leftover, i;
+
+	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
+	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+	for (i = 0; i < batched; i += PKTS_PER_BATCH)
+		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+	for (i = batched; i < batched + leftover; i++)
+		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+}
 
-	xsk_tx_release(xdp_ring->xsk_pool);
-	i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
+static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
+{
+	u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
+	struct i40e_tx_desc *tx_desc;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, ntu);
+	tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+}
+
+/**
+ * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
+ * @xdp_ring: XDP Tx ring
+ * @budget: NAPI budget
+ *
+ * Returns true if the work is finished.
+ **/
+static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
+{
+	struct xdp_desc *descs = xdp_ring->xsk_descs;
+	u32 nb_pkts, nb_processed = 0;
+	unsigned int total_bytes = 0;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+	if (!nb_pkts)
+		return false;
+
+	if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+		nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+		i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+		xdp_ring->next_to_use = 0;
 	}
 
-	return !!budget;
+	i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+			     &total_bytes);
+
+	/* Request an interrupt for the last frame and bump tail ptr. */
+	i40e_set_rs_bit(xdp_ring);
+	i40e_xdp_ring_update_tail(xdp_ring);
+
+	i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
+
+	return true;
 }
 
 /**
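
A worked example of the wrap-around handling in the new i40e_xmit_zc() above, with assumed numbers (the real ring size is whatever the driver configured):

/* Assumed numbers, for illustration only: a ring of 512 descriptors,
 * next_to_use = 510, and the batched peek returned nb_pkts = 5.
 *
 *   next_to_use + nb_pkts = 515 >= count (512), so the fill is split:
 *     nb_processed = 512 - 510 = 2   -> slots 510 and 511 are written
 *     next_to_use is reset to 0
 *     nb_pkts - nb_processed = 3     -> slots 0, 1 and 2 are written
 *
 * Handling the wrap once per batch is what lets i40e_xmit_pkt() and
 * i40e_xmit_pkt_batch() bump the ring index without a per-packet wrap check.
 */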

drivers/net/ethernet/intel/i40e/i40e_xsk.h

Lines changed: 16 additions & 0 deletions
@@ -4,6 +4,22 @@
 #ifndef _I40E_XSK_H_
 #define _I40E_XSK_H_
 
+/* This value should match the pragma in the loop_unrolled_for
+ * macro. Why 4? It is strictly empirical. It seems to be a good
+ * compromise between the advantage of having simultaneous outstanding
+ * reads to the DMA array that can hide each others latency and the
+ * disadvantage of having a larger code path.
+ */
+#define PKTS_PER_BATCH 4
+
+#ifdef __clang__
+#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
+#elif __GNUC__ >= 8
+#define loop_unrolled_for _Pragma("GCC unroll 4") for
+#else
+#define loop_unrolled_for for
+#endif
+
 struct i40e_vsi;
 struct xsk_buff_pool;
 struct zero_copy_allocator;
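
Because PKTS_PER_BATCH is a power of two, i40e_fill_tx_hw_ring() in i40e_xsk.c above can split any packet count into full batches plus a remainder with two bit masks; a small illustration with assumed numbers:

/* Assumed input, for illustration only: nb_pkts = 7, PKTS_PER_BATCH = 4.
 *
 *   batched  = nb_pkts & ~(PKTS_PER_BATCH - 1) = 7 & ~3 = 4
 *   leftover = nb_pkts &  (PKTS_PER_BATCH - 1) = 7 &  3 = 3
 *
 * -> one unrolled i40e_xmit_pkt_batch() call handles descriptors 0..3,
 *    then i40e_xmit_pkt() handles descriptors 4..6. The masking trick
 *    only works while PKTS_PER_BATCH remains a power of two.
 */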

include/net/xdp_sock_drv.h

Lines changed: 7 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
 void xsk_tx_release(struct xsk_buff_pool *pool);
 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id);
@@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
 	return false;
 }
 
+static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
+						 u32 max)
+{
+	return 0;
+}
+
 static inline void xsk_tx_release(struct xsk_buff_pool *pool)
 {
 }

net/xdp/xsk.c

Lines changed: 57 additions & 0 deletions
@@ -332,6 +332,63 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 }
 EXPORT_SYMBOL(xsk_tx_peek_desc);
 
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
+					u32 max_entries)
+{
+	u32 nb_pkts = 0;
+
+	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
+		nb_pkts++;
+
+	xsk_tx_release(pool);
+	return nb_pkts;
+}
+
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
+				   u32 max_entries)
+{
+	struct xdp_sock *xs;
+	u32 nb_pkts;
+
+	rcu_read_lock();
+	if (!list_is_singular(&pool->xsk_tx_list)) {
+		/* Fallback to the non-batched version */
+		rcu_read_unlock();
+		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
+	}
+
+	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
+	if (!xs) {
+		nb_pkts = 0;
+		goto out;
+	}
+
+	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
+	if (!nb_pkts) {
+		xs->tx->queue_empty_descs++;
+		goto out;
+	}
+
+	/* This is the backpressure mechanism for the Tx path. Try to
+	 * reserve space in the completion queue for all packets, but
+	 * if there are fewer slots available, just process that many
+	 * packets. This avoids having to implement any buffering in
+	 * the Tx path.
+	 */
+	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
+	if (!nb_pkts)
+		goto out;
+
+	xskq_cons_release_n(xs->tx, nb_pkts);
+	__xskq_cons_release(xs->tx);
+	xs->sk.sk_write_space(&xs->sk);
+
+out:
+	rcu_read_unlock();
+	return nb_pkts;
+}
+EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
+
 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 {
 	struct net_device *dev = xs->dev;
