[PATCH 1/2] udp: UDP socket send queue repair

From: Bui Quang Minh
Date: Tue Aug 10 2021 - 10:45:38 EST


In this patch, I implement UDP_REPAIR sockoption and a new path in
udp_recvmsg for dumping the corked packet in UDP socket's send queue.

A userspace program can use recvmsg syscall to get the packet's data and
the msg_name information of the packet. Currently, other related
information in inet_cork that are set in cmsg are not dumped.

While working on this, I was aware of Lese Doru Calin's patch and got some
ideas from it.

Link: https://lore.kernel.org/netdev/20200502082856.GA3152@white/
Signed-off-by: Bui Quang Minh <minhquangbui99@xxxxxxxxx>
---
include/linux/udp.h | 3 +-
include/net/udp.h | 2 +
include/uapi/linux/udp.h | 1 +
net/ipv4/udp.c | 94 +++++++++++++++++++++++++++++++++++++++-
net/ipv6/udp.c | 56 +++++++++++++++++++++++-
5 files changed, 151 insertions(+), 5 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index ae66dadd8543..63df0753966e 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -70,7 +70,8 @@ struct udp_sock {
#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */
__u8 pcflag; /* marks socket as UDP-Lite if > 0 */
- __u8 unused[3];
+ __u8 repair;
+ __u8 unused[2];
/*
* For encapsulation sockets.
*/
diff --git a/include/net/udp.h b/include/net/udp.h
index 360df454356c..4550e72b9f2a 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -331,6 +331,8 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport);
int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg,
+ size_t len);

/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
* possibly multiple cache miss on dequeue()
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 4828794efcf8..255d056403da 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -29,6 +29,7 @@ struct udphdr {

/* UDP socket options */
#define UDP_CORK 1 /* Never send partially complete segments */
+#define UDP_REPAIR 2 /* UDP sock is under repair right now */
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1a742b710e54..c91148956338 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1826,6 +1826,65 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
EXPORT_SYMBOL(udp_read_sock);

+static int udp_copy_addr(struct sock *sk, struct msghdr *msg, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (udp_sk(sk)->pending != AF_INET)
+ return -EAGAIN;
+
+ if (sin) {
+ fl4 = &inet->cork.fl.u.ip4;
+ sin->sin_family = AF_INET;
+ sin->sin_port = fl4->fl4_dport;
+ sin->sin_addr.s_addr = fl4->daddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ return 0;
+}
+
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0, peek_off, off, header_off, copy_len;
+
+ peek_off = READ_ONCE(sk->sk_peek_off);
+ if (peek_off < 0)
+ off = 0;
+ else
+ off = peek_off;
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ header_off = skb_transport_offset(skb) + sizeof(struct udphdr);
+ if (off > skb->len - header_off) {
+ off -= skb->len - header_off;
+ continue;
+ }
+
+ if (len > skb->len - off - header_off)
+ copy_len = skb->len - off - header_off;
+ else
+ copy_len = len;
+
+ err = skb_copy_datagram_msg(skb, off + header_off, msg, copy_len);
+ if (err)
+ return err;
+
+ copied += copy_len;
+ len -= copy_len;
+ off = 0;
+ }
+
+ if (peek_off >= 0)
+ sk_peek_offset_bwd(sk, -copied);
+ return copied;
+}
+EXPORT_SYMBOL(udp_peek_sndq);
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -1841,10 +1900,27 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
+ struct udp_sock *up = udp_sk(sk);

if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);

+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ lock_sock(sk);
+ err = udp_copy_addr(sk, msg, addr_len);
+ if (err) {
+ release_sock(sk);
+ return err;
+ }
+
+ err = udp_peek_sndq(sk, msg, len);
+ release_sock(sk);
+ return err;
+ }
+
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -1912,7 +1988,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
(struct sockaddr *)sin);
}

- if (udp_sk(sk)->gro_enabled)
+ if (up->gro_enabled)
udp_cmsg_recv(msg, sk, skb);

if (inet->cmsg_flags)
@@ -1926,7 +2002,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return err;

csum_copy_err:
- if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+ if (!__sk_queue_drop_skb(sk, &up->reader_queue, skb, flags,
udp_skb_destructor)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
@@ -2752,6 +2828,16 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->pcflag |= UDPLITE_RECV_CC;
break;

+ case UDP_REPAIR:
+ if (!sk_net_capable(sk, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+
+ up->repair = valbool;
+ sk->sk_peek_off = -1;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -2820,6 +2906,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->pcrlen;
break;

+ case UDP_REPAIR:
+ val = up->repair;
+ break;
+
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c5e15e94bb00..09b5a489829b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -313,6 +313,42 @@ static int udp6_skb_len(struct sk_buff *skb)
return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb);
}

+static int udp6_copy_addr(struct sock *sk, struct msghdr *msg, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4;
+ struct flowi6 *fl6;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ switch (udp_sk(sk)->pending) {
+ case AF_INET:
+ fl4 = &inet->cork.fl.u.ip4;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = fl4->fl4_dport;
+ ipv6_addr_set_v4mapped(fl4->daddr,
+ &sin6->sin6_addr);
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = 0;
+ *addr_len = sizeof(*sin6);
+ break;
+ case AF_INET6:
+ fl6 = &inet->cork.fl.u.ip6;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = fl6->fl6_dport;
+ sin6->sin6_addr = fl6->daddr;
+ sin6->sin6_flowinfo = fl6->flowlabel & IPV6_FLOWINFO_MASK;
+ sin6->sin6_scope_id = fl6->flowi6_oif;
+ *addr_len = sizeof(*sin6);
+ break;
+ default:
+ return -EAGAIN;
+ }
+ }
+
+ return 0;
+}
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -330,6 +366,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct udp_mib __percpu *mib;
bool checksum_valid = false;
int is_udp4;
+ struct udp_sock *up = udp_sk(sk);

if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
@@ -337,6 +374,21 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);

+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ lock_sock(sk);
+ err = udp6_copy_addr(sk, msg, addr_len);
+ if (err) {
+ release_sock(sk);
+ return err;
+ }
+
+ err = udp_peek_sndq(sk, msg, len);
+ release_sock(sk);
+ return err;
+ }
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -413,7 +465,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
(struct sockaddr *)sin6);
}

- if (udp_sk(sk)->gro_enabled)
+ if (up->gro_enabled)
udp_cmsg_recv(msg, sk, skb);

if (np->rxopt.all)
@@ -436,7 +488,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return err;

csum_copy_err:
- if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+ if (!__sk_queue_drop_skb(sk, &up->reader_queue, skb, flags,
udp_skb_destructor)) {
SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
--
2.17.1