[PATCH RFC v4 6/9] tun: Introduce virtio-net hash reporting feature
From: Akihiko Odaki
Date: Tue Sep 24 2024 - 05:04:09 EST
Allow the guest to reuse the hash value to make receive steering
consistent between the host and guest, and to save hash computation.
Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx>
---
Documentation/networking/tuntap.rst | 7 +++
drivers/net/Kconfig | 1 +
drivers/net/tun.c | 117 +++++++++++++++++++++++++++++++-----
include/uapi/linux/if_tun.h | 44 ++++++++++++++
4 files changed, 155 insertions(+), 14 deletions(-)
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
index 4d7087f727be..86b4ae8caa8a 100644
--- a/Documentation/networking/tuntap.rst
+++ b/Documentation/networking/tuntap.rst
@@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
}
+3.4 Reference
+-------------
+
+``linux/if_tun.h`` defines the interface described below:
+
+.. kernel-doc:: include/uapi/linux/if_tun.h
+
Universal TUN/TAP device driver Frequently Asked Question
=========================================================
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9920b3a68ed1..e2a7bd703550 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -395,6 +395,7 @@ config TUN
tristate "Universal TUN/TAP device driver support"
depends on INET
select CRC32
+ select SKB_EXTENSIONS
help
TUN/TAP provides packet reception and transmission for user space
programs. It can be viewed as a simple Point-to-Point or Ethernet
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9d93ab9ee58f..986e4a5bf04d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -173,6 +173,10 @@ struct tun_prog {
struct bpf_prog *prog;
};
+struct tun_vnet_hash_container {
+ struct tun_vnet_hash common;
+};
+
/* Since the socket were moved to tun_file, to preserve the behavior of persist
* device, socket filter, sndbuf and vnet header size were restore when the
* file were attached to a persist device.
@@ -210,6 +214,7 @@ struct tun_struct {
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
struct tun_prog __rcu *filter_prog;
+ struct tun_vnet_hash vnet_hash;
struct ethtool_link_ksettings link_ksettings;
/* init args */
struct file *file;
@@ -221,6 +226,11 @@ struct veth {
__be16 h_vlan_TCI;
};
+static const struct tun_vnet_hash tun_vnet_hash_cap = {
+ .flags = TUN_VNET_HASH_REPORT,
+ .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
+};
+
static void tun_flow_init(struct tun_struct *tun);
static void tun_flow_uninit(struct tun_struct *tun);
@@ -322,10 +332,15 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
if (get_user(be, argp))
return -EFAULT;
- if (be)
+ if (be) {
+ if (!(tun->flags & TUN_VNET_LE) &&
+ (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT))
+ return -EBUSY;
+
tun->flags |= TUN_VNET_BE;
- else
+ } else {
tun->flags &= ~TUN_VNET_BE;
+ }
return 0;
}
@@ -524,12 +539,17 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
*/
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
+ struct tun_vnet_hash_ext *ext;
+ struct flow_keys keys;
struct tun_flow_entry *e;
u32 txq, numqueues;
numqueues = READ_ONCE(tun->numqueues);
- txq = __skb_get_hash_symmetric(skb);
+ memset(&keys, 0, sizeof(keys));
+ skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
+
+ txq = flow_hash_from_keys(&keys);
e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
if (e) {
tun_flow_save_rps_rxhash(e, txq);
@@ -538,6 +558,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
txq = reciprocal_scale(txq, numqueues);
}
+ if (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT) {
+ ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
+ if (ext) {
+ u32 types = tun->vnet_hash.types;
+
+ ext->report = virtio_net_hash_report(types, keys.basic);
+ ext->value = skb->l4_hash ? skb->hash : txq;
+ }
+ }
+
return txq;
}
@@ -2120,33 +2150,58 @@ static ssize_t tun_put_user(struct tun_struct *tun,
}
if (vnet_hdr_sz) {
- struct virtio_net_hdr gso;
+ struct tun_vnet_hash_ext *ext;
+ size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
+ union {
+ struct virtio_net_hdr hdr;
+ struct virtio_net_hdr_v1_hash hdr_v1_hash;
+ } vnet_hdr;
+ int ret;
if (iov_iter_count(iter) < vnet_hdr_sz)
return -EINVAL;
- if (virtio_net_hdr_from_skb(skb, &gso,
- tun_is_little_endian(tun), true,
- vlan_hlen)) {
+ ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ?
+ NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
+
+ if (ext) {
+ vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash);
+ memset(&vnet_hdr, 0, vnet_hdr_content_sz);
+ vnet_hdr.hdr_v1_hash.hdr.num_buffers = __cpu_to_virtio16(true, 1);
+ vnet_hdr.hdr_v1_hash.hash_value = cpu_to_le32(ext->value);
+ vnet_hdr.hdr_v1_hash.hash_report = cpu_to_le16(ext->report);
+ } else {
+ vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
+ }
+
+ ret = virtio_net_hdr_from_skb(skb,
+ &vnet_hdr.hdr,
+ tun_is_little_endian(tun),
+ true,
+ vlan_hlen);
+
+ if (ret) {
struct skb_shared_info *sinfo = skb_shinfo(skb);
if (net_ratelimit()) {
netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
- sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
- tun16_to_cpu(tun, gso.hdr_len));
+ sinfo->gso_type,
+ tun16_to_cpu(tun, vnet_hdr.hdr.gso_size),
+ tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len));
print_hex_dump(KERN_ERR, "tun: ",
DUMP_PREFIX_NONE,
16, 1, skb->head,
- min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
+ min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64),
+ true);
}
WARN_ON_ONCE(1);
return -EINVAL;
}
- if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
+ if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz)
return -EFAULT;
- iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter);
+ iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter);
}
if (vlan_hlen) {
@@ -3094,6 +3149,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
int le;
int ret;
bool do_notify = false;
+ struct tun_vnet_hash vnet_hash;
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
(_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
@@ -3115,6 +3171,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
return open_related_ns(&net->ns, get_net_ns);
+ } else if (cmd == TUNGETVNETHASHCAP) {
+ return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ?
+ -EFAULT : 0;
}
rtnl_lock();
@@ -3314,6 +3373,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break;
}
+ if (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT &&
+ vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) {
+ ret = -EBUSY;
+ break;
+ }
+
tun->vnet_hdr_sz = vnet_hdr_sz;
break;
@@ -3328,10 +3393,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = -EFAULT;
break;
}
- if (le)
+ if (le) {
tun->flags |= TUN_VNET_LE;
- else
+ } else {
+ if (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT &&
+ !tun_legacy_is_little_endian(tun)) {
+ ret = -EBUSY;
+ break;
+ }
+
tun->flags &= ~TUN_VNET_LE;
+ }
break;
case TUNGETVNETBE:
@@ -3396,6 +3468,23 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = open_related_ns(&net->ns, get_net_ns);
break;
+ case TUNSETVNETHASH:
+ if (copy_from_user(&vnet_hash, argp, sizeof(vnet_hash))) {
+ ret = -EFAULT;
+ break;
+ }
+ argp = (struct tun_vnet_hash __user *)argp + 1;
+
+ if ((vnet_hash.flags & TUN_VNET_HASH_REPORT) &&
+ (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) ||
+ !tun_is_little_endian(tun))) {
+ ret = -EBUSY;
+ break;
+ }
+
+ tun->vnet_hash = vnet_hash;
+ break;
+
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 287cdc81c939..1561e8ce0a0a 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -62,6 +62,30 @@
#define TUNSETCARRIER _IOW('T', 226, int)
#define TUNGETDEVNETNS _IO('T', 227)
+/**
+ * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
+ *
+ * The argument is a pointer to &struct tun_vnet_hash which will store the
+ * maximal virtio_net hashing configuration.
+ */
+#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
+
+/**
+ * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
+ *
+ * The argument is a pointer to &struct tun_vnet_hash.
+ *
+ * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
+ * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
+ * %TUN_VNET_HASH_REPORT.
+ *
+ * The virtio_net header must be configured as little-endian before calling this
+ * ioctl with %TUN_VNET_HASH_REPORT.
+ *
+ * This ioctl currently has no effect on XDP packets.
+ */
+#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
+
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
#define IFF_TAP 0x0002
@@ -115,4 +139,24 @@ struct tun_filter {
__u8 addr[][ETH_ALEN];
};
+/**
+ * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
+ */
+#define TUN_VNET_HASH_REPORT 0x0001
+
+/**
+ * struct tun_vnet_hash - virtio_net hashing configuration
+ * @flags:
+ * Bitmask consists of %TUN_VNET_HASH_REPORT
+ * @pad:
+ * Should be filled with zero before passing to %TUNSETVNETHASH
+ * @types:
+ * Bitmask of allowed hash types
+ */
+struct tun_vnet_hash {
+ __u16 flags;
+ __u8 pad[2];
+ __u32 types;
+};
+
#endif /* _UAPI__IF_TUN_H */
--
2.46.0