Skip to content

Commit 90e33d4

Browse files
petarpenkov authored and davem330 committed
tun: enable napi_gro_frags() for TUN/TAP driver
Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov <[email protected]> Cc: Eric Dumazet <[email protected]> Cc: Mahesh Bandewar <[email protected]> Cc: Willem de Bruijn <[email protected]> Cc: [email protected] Cc: [email protected] Acked-by: Mahesh Bandewar <maheshb@google,com> Signed-off-by: David S. Miller <[email protected]>
1 parent 9431709 commit 90e33d4

File tree

2 files changed

+129
-6
lines changed

2 files changed

+129
-6
lines changed

drivers/net/tun.c

+128-6
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
#include <linux/skb_array.h>
7676
#include <linux/bpf.h>
7777
#include <linux/bpf_trace.h>
78+
#include <linux/mutex.h>
7879

7980
#include <linux/uaccess.h>
8081

@@ -121,7 +122,8 @@ do { \
121122
#define TUN_VNET_BE 0x40000000
122123

123124
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
124-
IFF_MULTI_QUEUE | IFF_NAPI)
125+
IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
126+
125127
#define GOODCOPY_LEN 128
126128

127129
#define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
173175
unsigned int ifindex;
174176
};
175177
struct napi_struct napi;
178+
struct mutex napi_mutex; /* Protects access to the above napi */
176179
struct list_head next;
177180
struct tun_struct *detached;
178181
struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
277280
netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
278281
NAPI_POLL_WEIGHT);
279282
napi_enable(&tfile->napi);
283+
mutex_init(&tfile->napi_mutex);
280284
}
281285
}
282286

@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
292296
netif_napi_del(&tfile->napi);
293297
}
294298

299+
static bool tun_napi_frags_enabled(const struct tun_struct *tun)
300+
{
301+
return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
302+
}
303+
295304
#ifdef CONFIG_TUN_VNET_CROSS_LE
296305
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
297306
{
@@ -1036,14 +1045,18 @@ static void tun_poll_controller(struct net_device *dev)
10361045
* supports polling, which enables bridge devices in virt setups to
10371046
* still use netconsole
10381047
* If NAPI is enabled, however, we need to schedule polling for all
1039-
* queues.
1048+
* queues unless we are using napi_gro_frags(), which we call in
1049+
* process context and not in NAPI context.
10401050
*/
10411051
struct tun_struct *tun = netdev_priv(dev);
10421052

10431053
if (tun->flags & IFF_NAPI) {
10441054
struct tun_file *tfile;
10451055
int i;
10461056

1057+
if (tun_napi_frags_enabled(tun))
1058+
return;
1059+
10471060
rcu_read_lock();
10481061
for (i = 0; i < tun->numqueues; i++) {
10491062
tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
12661279
return mask;
12671280
}
12681281

1282+
static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1283+
size_t len,
1284+
const struct iov_iter *it)
1285+
{
1286+
struct sk_buff *skb;
1287+
size_t linear;
1288+
int err;
1289+
int i;
1290+
1291+
if (it->nr_segs > MAX_SKB_FRAGS + 1)
1292+
return ERR_PTR(-ENOMEM);
1293+
1294+
local_bh_disable();
1295+
skb = napi_get_frags(&tfile->napi);
1296+
local_bh_enable();
1297+
if (!skb)
1298+
return ERR_PTR(-ENOMEM);
1299+
1300+
linear = iov_iter_single_seg_count(it);
1301+
err = __skb_grow(skb, linear);
1302+
if (err)
1303+
goto free;
1304+
1305+
skb->len = len;
1306+
skb->data_len = len - linear;
1307+
skb->truesize += skb->data_len;
1308+
1309+
for (i = 1; i < it->nr_segs; i++) {
1310+
size_t fragsz = it->iov[i].iov_len;
1311+
unsigned long offset;
1312+
struct page *page;
1313+
void *data;
1314+
1315+
if (fragsz == 0 || fragsz > PAGE_SIZE) {
1316+
err = -EINVAL;
1317+
goto free;
1318+
}
1319+
1320+
local_bh_disable();
1321+
data = napi_alloc_frag(fragsz);
1322+
local_bh_enable();
1323+
if (!data) {
1324+
err = -ENOMEM;
1325+
goto free;
1326+
}
1327+
1328+
page = virt_to_head_page(data);
1329+
offset = data - page_address(page);
1330+
skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
1331+
}
1332+
1333+
return skb;
1334+
free:
1335+
/* frees skb and all frags allocated with napi_alloc_frag() */
1336+
napi_free_frags(&tfile->napi);
1337+
return ERR_PTR(err);
1338+
}
1339+
12691340
/* prepad is the amount to reserve at front. len is length after that.
12701341
* linear is a hint as to how much to copy (usually headers). */
12711342
static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
14781549
int err;
14791550
u32 rxhash;
14801551
int skb_xdp = 1;
1552+
bool frags = tun_napi_frags_enabled(tun);
14811553

14821554
if (!(tun->dev->flags & IFF_UP))
14831555
return -EIO;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15351607
zerocopy = true;
15361608
}
15371609

1538-
if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1610+
if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
15391611
/* For the packet that is not easy to be processed
15401612
* (e.g gso or jumbo packet), we will do it at after
15411613
* skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15561628
linear = tun16_to_cpu(tun, gso.hdr_len);
15571629
}
15581630

1559-
skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
1631+
if (frags) {
1632+
mutex_lock(&tfile->napi_mutex);
1633+
skb = tun_napi_alloc_frags(tfile, copylen, from);
1634+
/* tun_napi_alloc_frags() enforces a layout for the skb.
1635+
* If zerocopy is enabled, then this layout will be
1636+
* overwritten by zerocopy_sg_from_iter().
1637+
*/
1638+
zerocopy = false;
1639+
} else {
1640+
skb = tun_alloc_skb(tfile, align, copylen, linear,
1641+
noblock);
1642+
}
1643+
15601644
if (IS_ERR(skb)) {
15611645
if (PTR_ERR(skb) != -EAGAIN)
15621646
this_cpu_inc(tun->pcpu_stats->rx_dropped);
1647+
if (frags)
1648+
mutex_unlock(&tfile->napi_mutex);
15631649
return PTR_ERR(skb);
15641650
}
15651651

@@ -1571,13 +1657,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15711657
if (err) {
15721658
this_cpu_inc(tun->pcpu_stats->rx_dropped);
15731659
kfree_skb(skb);
1660+
if (frags) {
1661+
tfile->napi.skb = NULL;
1662+
mutex_unlock(&tfile->napi_mutex);
1663+
}
1664+
15741665
return -EFAULT;
15751666
}
15761667
}
15771668

15781669
if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
15791670
this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
15801671
kfree_skb(skb);
1672+
if (frags) {
1673+
tfile->napi.skb = NULL;
1674+
mutex_unlock(&tfile->napi_mutex);
1675+
}
1676+
15811677
return -EINVAL;
15821678
}
15831679

@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
16031699
skb->dev = tun->dev;
16041700
break;
16051701
case IFF_TAP:
1606-
skb->protocol = eth_type_trans(skb, tun->dev);
1702+
if (!frags)
1703+
skb->protocol = eth_type_trans(skb, tun->dev);
16071704
break;
16081705
}
16091706

@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
16381735

16391736
rxhash = __skb_get_hash_symmetric(skb);
16401737

1641-
if (tun->flags & IFF_NAPI) {
1738+
if (frags) {
1739+
/* Exercise flow dissector code path. */
1740+
u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
1741+
1742+
if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
1743+
this_cpu_inc(tun->pcpu_stats->rx_dropped);
1744+
napi_free_frags(&tfile->napi);
1745+
mutex_unlock(&tfile->napi_mutex);
1746+
WARN_ON(1);
1747+
return -ENOMEM;
1748+
}
1749+
1750+
local_bh_disable();
1751+
napi_gro_frags(&tfile->napi);
1752+
local_bh_enable();
1753+
mutex_unlock(&tfile->napi_mutex);
1754+
} else if (tun->flags & IFF_NAPI) {
16421755
struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
16431756
int queue_len;
16441757

@@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
20612174
if (tfile->detached)
20622175
return -EINVAL;
20632176

2177+
if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2178+
if (!capable(CAP_NET_ADMIN))
2179+
return -EPERM;
2180+
2181+
if (!(ifr->ifr_flags & IFF_NAPI) ||
2182+
(ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2183+
return -EINVAL;
2184+
}
2185+
20642186
dev = __dev_get_by_name(net, ifr->ifr_name);
20652187
if (dev) {
20662188
if (ifr->ifr_flags & IFF_TUN_EXCL)

include/uapi/linux/if_tun.h

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
#define IFF_TUN 0x0001
6262
#define IFF_TAP 0x0002
6363
#define IFF_NAPI 0x0010
64+
#define IFF_NAPI_FRAGS 0x0020
6465
#define IFF_NO_PI 0x1000
6566
/* This flag has no real effect */
6667
#define IFF_ONE_QUEUE 0x2000

0 commit comments

Comments
 (0)