[PATCH v3 17/55] ip6, udp6: Support MSG_SPLICE_PAGES

From: David Howells
Date: Fri Mar 31 2023 - 12:13:00 EST


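Make IPv6/UDPv6 sendmsg() support MSG_SPLICE_PAGES. This causes pages to be
spliced from the source iterator if possible, with the data being copied if
not.

To share code with IPv4, __ip_splice_alloc() and __ip_splice_pages() are made
non-static, declared in include/net/ip.h and exported so that
__ip6_append_data() can call them.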

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx>
cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
cc: Eric Dumazet <edumazet@xxxxxxxxxx>
cc: Jakub Kicinski <kuba@xxxxxxxxxx>
cc: Paolo Abeni <pabeni@xxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: netdev@xxxxxxxxxxxxxxx
---
include/net/ip.h | 4 ++++
net/ipv4/ip_output.c | 11 ++++++-----
net/ipv6/ip6_output.c | 28 +++++++++++++++++++++++++---
3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index c3fffaa92d6e..e27d2ceffcfa 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -211,6 +211,10 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos);
void ip_init(void);
+int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
+ unsigned int fragheaderlen, unsigned int maxfraglen,
+ unsigned int hh_len);
+int __ip_splice_pages(struct sock *sk, struct sk_buff *skb, void *from, int *pcopy);
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 41a954ac9e1a..fa2546d944bc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -959,9 +959,9 @@ csum_page(struct page *page, int offset, int copy)
/*
* Allocate a packet for MSG_SPLICE_PAGES.
*/
-static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
- unsigned int fragheaderlen, unsigned int maxfraglen,
- unsigned int hh_len)
+int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
+ unsigned int fragheaderlen, unsigned int maxfraglen,
+ unsigned int hh_len)
{
struct sk_buff *skb_prev = *pskb, *skb;
unsigned int fraggap = skb_prev->len - maxfraglen;
@@ -993,12 +993,12 @@ static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb,
*pskb = skb;
return 0;
}
+EXPORT_SYMBOL_GPL(__ip_splice_alloc);

/*
* Add (or copy) data pages for MSG_SPLICE_PAGES.
*/
-static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
- void *from, int *pcopy)
+int __ip_splice_pages(struct sock *sk, struct sk_buff *skb, void *from, int *pcopy)
{
struct msghdr *msg = from;
struct page *page = NULL, **pages = &page;
@@ -1047,6 +1047,7 @@ static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb,
*pcopy = copy;
return 0;
}
+EXPORT_SYMBOL_GPL(__ip_splice_pages);

static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c314fdde0097..c95d034cb45a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1486,7 +1486,7 @@ static int __ip6_append_data(struct sock *sk,
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
int csummode = CHECKSUM_NONE;
- unsigned int maxnonfragsize, headersize;
+ unsigned int maxnonfragsize, headersize, initial_length;
unsigned int wmem_alloc_delta = 0;
bool paged, extra_uref = false;

@@ -1559,6 +1559,7 @@ static int __ip6_append_data(struct sock *sk,
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;

+ initial_length = length;
if ((flags & MSG_ZEROCOPY) && length) {
struct msghdr *msg = from;

@@ -1589,6 +1590,14 @@ static int __ip6_append_data(struct sock *sk,
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
+ } else if ((flags & MSG_SPLICE_PAGES) && length) {
+ if (inet_sk(sk)->hdrincl)
+ return -EPERM;
+ if (rt->dst.dev->features & NETIF_F_SG)
+ /* We need an empty buffer to attach stuff to */
+ initial_length = transhdrlen;
+ else
+ flags &= ~MSG_SPLICE_PAGES;
}

/*
@@ -1624,6 +1633,15 @@ static int __ip6_append_data(struct sock *sk,
unsigned int fraggap;
unsigned int alloclen, alloc_extra;
unsigned int pagedlen;
+
+ if (unlikely(flags & MSG_SPLICE_PAGES)) {
+ err = __ip_splice_alloc(sk, &skb, fragheaderlen,
+ maxfraglen, hh_len);
+ if (err < 0)
+ goto error;
+ continue;
+ }
+ initial_length = length;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1642,7 +1660,7 @@ static int __ip6_append_data(struct sock *sk,
* If remaining data exceeds the mtu,
* we know we need more fragment(s).
*/
- datalen = length + fraggap;
+ datalen = initial_length + fraggap;

if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
@@ -1672,7 +1690,7 @@ static int __ip6_append_data(struct sock *sk,
}
alloclen += alloc_extra;

- if (datalen != length + fraggap) {
+ if (datalen != initial_length + fraggap) {
/*
* this is not the last fragment, the trailer
* space is regarded as data space.
@@ -1778,6 +1796,10 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
+ } else if (flags & MSG_SPLICE_PAGES) {
+ err = __ip_splice_pages(sk, skb, from, &copy);
+ if (err < 0)
+ goto error;
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;