Re: [PATCH v2] tcp: fix race condition when creating child sockets from syncookies

From: Eric Dumazet
Date: Mon Nov 09 2020 - 11:29:47 EST


On Mon, Nov 9, 2020 at 5:11 PM Ricardo Dias <rdias@xxxxxxxxxxxxxxx> wrote:
>
> When the TCP stack is in SYN flood mode, the server child socket is
> created from the SYN cookie received in a TCP packet with the ACK flag
> set.
>
> The child socket is created when the server receives the first TCP
> packet with a valid SYN cookie from the client. Usually, this packet
> corresponds to the final step of the TCP 3-way handshake, the ACK
> packet. But it is also possible to receive a valid SYN cookie from the
> first TCP data packet sent by the client, and thus create a child socket
> from that SYN cookie.
>
> Since a client socket is ready to send data as soon as it receives the
> SYN+ACK packet from the server, the client can send the ACK packet (sent
> by the TCP stack code), and the first data packet (sent by the userspace
> program) almost at the same time, and thus the server will equally
> receive the two TCP packets with valid SYN cookies almost at the same
> instant.
>
> When such an event happens, the TCP stack code has a race condition that
> occurs between the moment a lookup is done to the established
> connections hashtable to check for the existence of a connection for the
> same client, and the moment that the child socket is added to the
> established connections hashtable. As a consequence, this race condition
> can lead to a situation where we add two child sockets to the
> established connections hashtable and deliver two sockets to the
> userspace program for the same client.
>
> This patch fixes the race condition by checking if an existing child
> socket exists for the same client when we are adding the second child
> socket to the established connections hashtable. If an existing child
> socket exists, we return that socket and use it to process the TCP
> packet received, and discard the second child socket for the same client.
>
> Signed-off-by: Ricardo Dias <rdias@xxxxxxxxxx>
> Reported-by: kernel test robot <lkp@xxxxxxxxx>

The kernel test robot reported a bug on your v1, you do not have to
claim the bot found this issue.

> ---
> v2 (2020-11-09):
> * Changed the author's email domain.
> * Removed the helper function inet_ehash_insert_chk_dup and moved the
> logic to the existing inet_ehash_insert.
> * Updated the callers of inet_ehash_nolisten to deal with the new
> logic.
>
>
> include/net/inet_hashtables.h | 6 ++--
> net/dccp/ipv4.c | 4 ++-
> net/dccp/ipv6.c | 4 ++-
> net/ipv4/inet_hashtables.c | 63 +++++++++++++++++++++++++++++------
> net/ipv4/syncookies.c | 5 ++-
> net/ipv4/tcp_ipv4.c | 12 ++++++-
> net/ipv6/tcp_ipv6.c | 19 ++++++++++-
> 7 files changed, 94 insertions(+), 19 deletions(-)
>
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 92560974ea67..dffa345d52a7 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -247,9 +247,9 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
> unsigned long high_limit);
> int inet_hashinfo2_init_mod(struct inet_hashinfo *h);
>
> -bool inet_ehash_insert(struct sock *sk, struct sock *osk);
> -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
> -int __inet_hash(struct sock *sk, struct sock *osk);
> +bool inet_ehash_insert(struct sock *sk, struct sock **osk);
> +bool inet_ehash_nolisten(struct sock *sk, struct sock **osk);
> +int __inet_hash(struct sock *sk, struct sock **osk);
> int inet_hash(struct sock *sk);
> void inet_unhash(struct sock *sk);
>
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 9c28c8251125..99bbba478991 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -400,6 +400,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
> struct inet_request_sock *ireq;
> struct inet_sock *newinet;
> struct sock *newsk;
> + struct sock *osk;
>
> if (sk_acceptq_is_full(sk))
> goto exit_overflow;
> @@ -427,7 +428,8 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
>
> if (__inet_inherit_port(sk, newsk) < 0)
> goto put_and_exit;
> - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
> + osk = req_to_sk(req_unhash);
> + *own_req = inet_ehash_nolisten(newsk, &osk);
> if (*own_req)
> ireq->ireq_opt = NULL;
> else
> diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
> index ef4ab28cfde0..91a825c00a97 100644
> --- a/net/dccp/ipv6.c
> +++ b/net/dccp/ipv6.c
> @@ -407,6 +407,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
> struct inet_sock *newinet;
> struct dccp6_sock *newdp6;
> struct sock *newsk;
> + struct sock *osk;
>
> if (skb->protocol == htons(ETH_P_IP)) {
> /*
> @@ -533,7 +534,8 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
> dccp_done(newsk);
> goto out;
> }
> - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
> + osk = req_to_sk(req_unhash);
> + *own_req = inet_ehash_nolisten(newsk, &osk);
> /* Clone pktoptions received with SYN, if we own the req */
> if (*own_req && ireq->pktopts) {
> newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 239e54474b65..8d62b22b9a95 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -510,17 +510,27 @@ static u32 inet_sk_port_offset(const struct sock *sk)
> inet->inet_dport);
> }
>
> -/* insert a socket into ehash, and eventually remove another one
> - * (The another one can be a SYN_RECV or TIMEWAIT
> +/* Insert a socket into ehash, and eventually remove another one
> + * (The another one can be a SYN_RECV or TIMEWAIT)
> + * If an existing socket already exists, it returns that socket
> + * through the osk parameter.
> */
> -bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> +bool inet_ehash_insert(struct sock *sk, struct sock **osk)
> {
> struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
> struct hlist_nulls_head *list;
> struct inet_ehash_bucket *head;
> - spinlock_t *lock;
> + const struct hlist_nulls_node *node;
> + struct sock *esk;
> + spinlock_t *lock; /* protects hashinfo socket entry */
> + struct net *net = sock_net(sk);
> + const int dif = sk->sk_bound_dev_if;
> + const int sdif = sk->sk_bound_dev_if;
> bool ret = true;
>
> + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
> + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
> +

This does not work for IPv6.
This function is used both for IPv4 and IPv6

Please test your changes for IPv6, thank you !

> WARN_ON_ONCE(!sk_unhashed(sk));
>
> sk->sk_hash = sk_ehashfn(sk);
> @@ -529,17 +539,48 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
>
> spin_lock(lock);
> - if (osk) {
> - WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
> - ret = sk_nulls_del_node_init_rcu(osk);
> + if (osk && *osk) {
> + WARN_ON_ONCE(sk->sk_hash != (*osk)->sk_hash);
> + ret = sk_nulls_del_node_init_rcu(*osk);
> + } else if (osk && !*osk) {
> +begin:
> + sk_nulls_for_each_rcu(esk, node, list) {
> + if (esk->sk_hash != sk->sk_hash)
> + continue;
> + if (likely(INET_MATCH(esk, net, acookie,
> + sk->sk_daddr,
> + sk->sk_rcv_saddr, ports,
> + dif, sdif))) {
> + if (unlikely(!refcount_inc_not_zero(&esk->sk_refcnt)))
> + goto out;
> + if (unlikely(!INET_MATCH(esk, net, acookie,
> + sk->sk_daddr,
> + sk->sk_rcv_saddr,
> + ports,
> + dif, sdif))) {

This can not happen, since you own the spinlock protecting the hash bucket.