[PATCH 3.12 73/91] net: fix IP early demux races
From: Jiri Slaby
Date: Tue Jan 05 2016 - 12:54:38 EST
From: Eric Dumazet <edumazet@xxxxxxxxxx>
3.12-stable review patch. If anyone has any objections, please let me know.
===============
[ Upstream commit 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 ]
David Wilder reported crashes caused by dst reuse.
<quote David>
I am seeing a crash on a distro V4.2.3 kernel caused by a double
release of a dst_entry. In ipv4_dst_destroy() the call to
list_empty() finds a poisoned next pointer, indicating the dst_entry
has already been removed from the list and freed. The crash occurs
18 to 24 hours into a run of a network stress exerciser.
</quote>
Thanks to his detailed report and analysis, we were able to understand
the core issue.
IP early demux can associate a dst to skb, after a lookup in TCP/UDP
sockets.
When socket cache is not properly set, we want to store into
sk->sk_dst_cache the dst for future IP early demux lookups,
by acquiring a stable refcount on the dst.
Problem is this acquisition is simply using an atomic_inc(),
which works well, unless the dst was queued for destruction from
dst_release() noticing dst refcount went to zero, if DST_NOCACHE
was set on dst.
We need to make sure current refcount is not zero before incrementing
it, or risk double free as David reported.
This patch, being a stable candidate, adds two new helpers, and use
them only from IP early demux problematic paths.
It might be possible to merge in net-next skb_dst_force() and
skb_dst_force_safe(), but I prefer having the smallest patch for stable
kernels : Maybe some skb_dst_force() callers do not expect skb->dst
can suddenly be cleared.
Can probably be backported back to linux-3.6 kernels
Reported-by: David J. Wilder <dwilder@xxxxxxxxxx>
Tested-by: David J. Wilder <dwilder@xxxxxxxxxx>
Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx>
Signed-off-by: David S. Miller <davem@xxxxxxxxxxxxx>
Signed-off-by: Jiri Slaby <jslaby@xxxxxxx>
---
include/net/dst.h | 33 +++++++++++++++++++++++++++++++++
include/net/sock.h | 2 +-
net/ipv4/tcp_ipv4.c | 9 +++++----
net/ipv6/tcp_ipv6.c | 11 ++++++-----
4 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 30cd2f9cd1dd..d30afbdc1a59 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -306,6 +306,39 @@ static inline void skb_dst_force(struct sk_buff *skb)
}
}
+/**
+ * dst_hold_safe - Take a reference on a dst if possible
+ * @dst: pointer to dst entry
+ *
+ * This helper returns false if it could not safely
+ * take a reference on a dst.
+ */
+static inline bool dst_hold_safe(struct dst_entry *dst)
+{
+ if (dst->flags & DST_NOCACHE)
+ return atomic_inc_not_zero(&dst->__refcnt);
+ dst_hold(dst);
+ return true;
+}
+
+/**
+ * skb_dst_force_safe - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ */
+static inline void skb_dst_force_safe(struct sk_buff *skb)
+{
+ if (skb_dst_is_noref(skb)) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (!dst_hold_safe(dst))
+ dst = NULL;
+
+ skb->_skb_refdst = (unsigned long)dst;
+ }
+}
+
/**
* __skb_tunnel_rx - prepare skb for rx reinsert
diff --git a/include/net/sock.h b/include/net/sock.h
index 41d98f1d0459..6ed6df149bce 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -760,7 +760,7 @@ extern void sk_stream_write_space(struct sock *sk);
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
/* dont let skb dst not refcounted, we are going to leave rcu lock */
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
if (!sk->sk_backlog.tail)
sk->sk_backlog.head = skb;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 624ceca7ffd1..09451a2cbd6a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1905,7 +1905,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
skb_queue_len(&tp->ucopy.prequeue) == 0)
return false;
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
if (tp->ucopy.memory > sk->sk_rcvbuf) {
@@ -2098,9 +2098,10 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- dst_hold(dst);
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ if (dst_hold_safe(dst)) {
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ }
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 65c310d6e92a..90004c6e3bff 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -97,11 +97,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
const struct rt6_info *rt = (const struct rt6_info *)dst;
- dst_hold(dst);
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- if (rt->rt6i_node)
- inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ if (dst_hold_safe(dst)) {
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ if (rt->rt6i_node)
+ inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ }
}
static void tcp_v6_hash(struct sock *sk)
--
2.6.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/