Re: [PATCH v4] tcp: fix race condition when creating child sockets from syncookies

From: Eric Dumazet
Date: Mon Nov 16 2020 - 05:57:45 EST




On 11/13/20 8:09 PM, Ricardo Dias wrote:
> When the TCP stack is in SYN flood mode, the server child socket is
> created from the SYN cookie received in a TCP packet with the ACK flag
> set.
>
> The child socket is created when the server receives the first TCP
> packet with a valid SYN cookie from the client. Usually, this packet
> corresponds to the final step of the TCP 3-way handshake, the ACK
> packet. But is also possible to receive a valid SYN cookie from the
> first TCP data packet sent by the client, and thus create a child socket
> from that SYN cookie.
>
>


...

> Signed-off-by: Ricardo Dias <rdias@xxxxxxxxxxxxxxx>
> ---



> +#endif
> #include <net/secure_seq.h>
> #include <net/ip.h>
> #include <net/tcp.h>
> @@ -510,17 +513,27 @@ static u32 inet_sk_port_offset(const struct sock *sk)
> inet->inet_dport);
> }
>
> -/* insert a socket into ehash, and eventually remove another one
> - * (The another one can be a SYN_RECV or TIMEWAIT
> +/* Insert a socket into ehash, and eventually remove another one
> + * (The another one can be a SYN_RECV or TIMEWAIT)
> + * If an existing socket already exists, it returns that socket
> + * through the esk parameter.
> */
> -bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> +bool inet_ehash_insert(struct sock *sk, struct sock *osk, struct sock **esk)
> {
> struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
> struct hlist_nulls_head *list;
> struct inet_ehash_bucket *head;
> - spinlock_t *lock;
> + const struct hlist_nulls_node *node;
> + struct sock *_esk;

> + spinlock_t *lock; /* protects hashinfo socket entry */
Please do not add this comment, I find it confusing, and breaking reverse tree.

> + struct net *net = sock_net(sk);
> + const int dif = sk->sk_bound_dev_if;
> + const int sdif = sk->sk_bound_dev_if;
> bool ret = true;
>
> + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
> + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
> +
> WARN_ON_ONCE(!sk_unhashed(sk));
>
> sk->sk_hash = sk_ehashfn(sk);
> @@ -532,16 +545,49 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
> if (osk) {
> WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
> ret = sk_nulls_del_node_init_rcu(osk);
> + } else if (esk) {

You could move here all the needed new variables, and also
the INET_ADDR_COOKIE(...), const __portpair ports = ...;

This would allow to keep a nice reverse tree order, and keep scope small.

const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
const int sdif = sk->sk_bound_dev_if;
const int dif = sk->sk_bound_dev_if;
const struct hlist_nulls_node *node;
struct net *net = sock_net(sk);

Alternatively you could also move this code into a helper function.


} else if (esk) {
*esk = inet_ehash_lookup_by_sk(sk, node);
}

> + sk_nulls_for_each_rcu(_esk, node, list) {
> + if (_esk->sk_hash != sk->sk_hash)
> + continue;
> + if (sk->sk_family == AF_INET) {
> + if (unlikely(INET_MATCH(_esk, net, acookie,
> + sk->sk_daddr,
> + sk->sk_rcv_saddr,
> + ports, dif, sdif))) {
> + refcount_inc(&_esk->sk_refcnt);
> + goto found;
> + }
> + }
> +#if IS_ENABLED(CONFIG_IPV6)
> + else if (sk->sk_family == AF_INET6) {
> + if (unlikely(INET6_MATCH(_esk, net,
> + &sk->sk_v6_daddr,
> + &sk->sk_v6_rcv_saddr,
> + ports, dif, sdif))) {
> + refcount_inc(&_esk->sk_refcnt);
> + goto found;
> + }
> + }
> +#endif
> + }
> +
>