[PATCH v4 bpf-next 05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

From: Kuniyuki Iwashima
Date: Mon Apr 26 2021 - 23:48:21 EST


When we call close() or shutdown() on a listening socket, each child socket
in its accept queue is freed in inet_csk_listen_stop(). If we can get a new
listener from reuseport_migrate_sock() and clone the request with
reqsk_clone(), we try to add the clone to the new listener's accept queue
with inet_csk_reqsk_queue_add(). If that fails, we have to call
__reqsk_free() to call sock_put() for its listener and free the cloned
request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can
still be non-NULL. So, we have to set ipv6_opt of the old request to NULL
to avoid a double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
Suggested-by: Martin KaFai Lau <kafai@xxxxxx>
Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxxxx>
---
include/net/request_sock.h | 2 ++
net/core/request_sock.c | 37 +++++++++++++++++++++++++++++++++
net/ipv4/inet_connection_sock.c | 31 ++++++++++++++++++++++++++-
3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..c6d6cfd3c93b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -190,6 +190,8 @@ void reqsk_queue_alloc(struct request_sock_queue *queue);
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset);

+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk);
+
static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
{
return READ_ONCE(queue->rskq_accept_head) == NULL;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index f35c2e998406..82cf9fbe2668 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -130,3 +130,40 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
out:
spin_unlock_bh(&fastopenq->lock);
}
+
+/* reqsk_clone - duplicate a request sock so it can be queued on another listener
+ * @req: request to copy; it is not modified
+ * @sk: listener the clone will point at via rsk_listener
+ *
+ * Allocates a new request from @req's own slab with GFP_ATOMIC and copies
+ * @req into it, skipping the sk_dontcopy_begin..sk_dontcopy_end window.
+ *
+ * Returns the clone, or NULL if the allocation fails. On failure the
+ * reference on @sk is dropped here, so the caller must not put it again.
+ * rsk_refcnt is not initialised here; inet_csk_listen_stop() sets it to 1
+ * after a successful clone.
+ */
+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk)
+{
+ struct sock *req_sk, *nreq_sk;
+ struct request_sock *nreq;
+
+ nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!nreq) {
+ /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+ sock_put(sk);
+ return NULL;
+ }
+
+ req_sk = req_to_sk(req);
+ nreq_sk = req_to_sk(nreq);
+
+ /* Copy the object in two pieces: everything before sk_dontcopy_begin
+ * and everything from sk_dontcopy_end up to obj_size.
+ */
+ memcpy(nreq_sk, req_sk,
+ offsetof(struct sock, sk_dontcopy_begin));
+ memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+ req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+ /* These are carried over one by one; presumably they sit inside the
+ * window skipped above -- confirm against struct sock_common.
+ */
+ nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+ nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+ nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+ nreq->rsk_listener = sk;
+
+ /* Only a TCP request is a struct tcp_request_sock. This helper is
+ * reached from inet_csk_listen_stop(), which is not TCP-specific, so
+ * check the protocol before looking at tfo_listener or touching
+ * tcp_sk().
+ *
+ * We need not acquire fastopenq->lock
+ * because the child socket is locked in inet_csk_listen_stop().
+ */
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+ rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+ return nreq;
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..851992405826 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,13 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

+static void reqsk_migrate_reset(struct request_sock *req)
+{ /* Clear @req's ipv6_opt pointer (IPv6-enabled builds only) without
+ * freeing what it pointed to; with IPv6 disabled this is a no-op.
+ * The original request and its clone start out with the same
+ * ipv6_opt value, so inet_csk_listen_stop() resets whichever of the
+ * two it is about to drop to keep the option from being freed twice.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock *req)
{
@@ -1036,14 +1043,36 @@ void inet_csk_listen_stop(struct sock *sk)
* of the variants now. --ANK
*/
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
- struct sock *child = req->sk;
+ struct sock *child = req->sk, *nsk;
+ struct request_sock *nreq;

local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);

+ nsk = reuseport_migrate_sock(sk, child, NULL);
+ if (nsk) {
+ nreq = reqsk_clone(req, nsk);
+ if (nreq) {
+ refcount_set(&nreq->rsk_refcnt, 1);
+
+ if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+ reqsk_migrate_reset(req);
+ } else {
+ reqsk_migrate_reset(nreq);
+ __reqsk_free(nreq);
+ }
+
+ /* inet_csk_reqsk_queue_add() has already
+ * called inet_child_forget() on failure case.
+ */
+ goto skip_child_forget;
+ }
+ }
+
inet_child_forget(sk, req, child);
+skip_child_forget:
reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
--
2.30.2