@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@ do { \
 #define TUN_VNET_BE 0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE | IFF_NAPI)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,14 +1045,18 @@ static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 	struct tun_struct *tun = netdev_priv(dev);
 
 	if (tun->flags & IFF_NAPI) {
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front. len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
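For context, tun_napi_alloc_frags() above maps the user-supplied iovec directly onto the skb: the first segment becomes the linear area and each following segment is placed in one page fragment, with at most MAX_SKB_FRAGS + 1 segments and at most PAGE_SIZE per fragment. The sketch below is an illustrative user-space writev() shaped to satisfy those constraints; the helper name, file descriptor, and segment sizes are assumptions for illustration, not part of the patch.

/* Illustrative only: a user-space write shaped for tun_napi_alloc_frags().
 * tap_fd is assumed to be a TAP device opened with IFF_NAPI | IFF_NAPI_FRAGS.
 */
#include <sys/uio.h>
#include <unistd.h>

static ssize_t send_fragmented(int tap_fd,
			       void *hdrs, size_t hdr_len,    /* first segment -> skb linear area */
			       void *payload, size_t pay_len) /* later segments -> page frags */
{
	struct iovec iov[2] = {
		{ .iov_base = hdrs,    .iov_len = hdr_len },
		{ .iov_base = payload, .iov_len = pay_len },
	};

	/* Constraints imposed by the patch: at most MAX_SKB_FRAGS + 1 segments,
	 * every segment after the first must be non-empty and <= PAGE_SIZE,
	 * and the first segment must hold at least the Ethernet header, since
	 * tun_get_user() later checks eth_get_headlen() against ETH_HLEN.
	 */
	return writev(tap_fd, iov, 2);
}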
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			linear = tun16_to_cpu(tun, gso.hdr_len);
 	}
 
-	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+	if (frags) {
+		mutex_lock(&tfile->napi_mutex);
+		skb = tun_napi_alloc_frags(tfile, copylen, from);
+		/* tun_napi_alloc_frags() enforces a layout for the skb.
+		 * If zerocopy is enabled, then this layout will be
+		 * overwritten by zerocopy_sg_from_iter().
+		 */
+		zerocopy = false;
+	} else {
+		skb = tun_alloc_skb(tfile, align, copylen, linear,
+				    noblock);
+	}
+
 	if (IS_ERR(skb)) {
 		if (PTR_ERR(skb) != -EAGAIN)
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+		if (frags)
+			mutex_unlock(&tfile->napi_mutex);
 		return PTR_ERR(skb);
 	}
 
@@ -1571,13 +1657,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
 
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (tun->flags & IFF_NAPI) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		if (!(ifr->ifr_flags & IFF_NAPI) ||
+		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
+			return -EINVAL;
+	}
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
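The tun_set_iff() change above only accepts IFF_NAPI_FRAGS from CAP_NET_ADMIN callers, and only on IFF_TAP devices that also request IFF_NAPI. Below is an illustrative sketch of how a test harness might request the flag via TUNSETIFF; the helper name, device name, and the fallback flag values are assumptions (the flags are driver-internal in this patch and may not be exposed by the user-space header being built against).

/* Illustrative only: enabling NAPI frags mode on a TAP device. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

#ifndef IFF_NAPI
#define IFF_NAPI	0x0010	/* assumed to match the driver's value */
#endif
#ifndef IFF_NAPI_FRAGS
#define IFF_NAPI_FRAGS	0x0020	/* assumed to match the driver's value */
#endif

static int open_napi_frags_tap(void)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	/* IFF_NAPI_FRAGS is only valid together with IFF_TAP and IFF_NAPI. */
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS;
	strncpy(ifr.ifr_name, "tap-frags%d", IFNAMSIZ - 1);

	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		/* -EPERM without CAP_NET_ADMIN, -EINVAL on a bad flag mix */
		close(fd);
		return -1;
	}
	return fd;
}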