[PATCH 4.7 007/184] vti: flush x-netns xfrm cache when vti interface is removed

From: Greg Kroah-Hartman
Date: Thu Sep 22 2016 - 14:42:47 EST


4.7-stable review patch. If anyone has any objections, please let me know.

------------------

From: Lance Richardson <lrichard@xxxxxxxxxx>


[ Upstream commit a5d0dc810abf3d6b241777467ee1d6efb02575fc ]

When executing the script included below, the netns delete operation
hangs with the following message (repeated at 10 second intervals):

kernel:unregister_netdevice: waiting for lo to become free. Usage count = 1

This occurs because a reference to the lo interface in the "secure" netns
is still held by a dst entry in the xfrm bundle cache in the init netns.

Address this problem by garbage collecting the tunnel netns flow cache
when a cross-namespace vti interface receives a NETDEV_DOWN notification.

A more detailed description of the problem scenario (referencing commands
in the script below):

(1) ip link add vti_test type vti local 1.1.1.1 remote 1.1.1.2 key 1

The vti_test interface is created in the init namespace. vti_tunnel_init()
attaches a struct ip_tunnel to the vti interface's netdev_priv(dev),
setting the tunnel net to &init_net.

(2) ip link set vti_test netns secure

The vti_test interface is moved to the "secure" netns. Note that
the associated struct ip_tunnel still has tunnel->net set to &init_net.

(3) ip netns exec secure ping -c 4 -i 0.02 -I 192.168.100.1 192.168.200.1

The first packet sent using the vti device causes xfrm_lookup() to be
called as follows:

dst = xfrm_lookup(tunnel->net, skb_dst(skb), fl, NULL, 0);

Note that tunnel->net is the init namespace, while skb_dst(skb) references
the vti_test interface in the "secure" namespace. The returned dst
references an interface in the init namespace.

Also note that the first parameter to xfrm_lookup() determines which flow
cache is used to store the computed xfrm bundle, so after xfrm_lookup()
returns there will be a cached bundle in the init namespace flow cache
with a dst referencing a device in the "secure" namespace.

(4) ip netns del secure

Kernel begins to delete the "secure" namespace. At some point the
vti_test interface is deleted, at which point dst_ifdown() changes
the dst->dev in the cached xfrm bundle flow from vti_test to lo (still
in the "secure" namespace however).
Since nothing has happened to cause the init namespace's flow cache
to be garbage collected, this dst remains attached to the flow cache,
so the kernel loops waiting for the last reference to lo to go away.

<Begin script>
ip link add br1 type bridge
ip link set dev br1 up
ip addr add dev br1 1.1.1.1/8

ip netns add secure
ip link add vti_test type vti local 1.1.1.1 remote 1.1.1.2 key 1
ip link set vti_test netns secure
ip netns exec secure ip link set vti_test up
ip netns exec secure ip link s lo up
ip netns exec secure ip addr add dev lo 192.168.100.1/24
ip netns exec secure ip route add 192.168.200.0/24 dev vti_test
ip xfrm policy flush
ip xfrm state flush
ip xfrm policy add dir out tmpl src 1.1.1.1 dst 1.1.1.2 \
proto esp mode tunnel mark 1
ip xfrm policy add dir in tmpl src 1.1.1.2 dst 1.1.1.1 \
proto esp mode tunnel mark 1
ip xfrm state add src 1.1.1.1 dst 1.1.1.2 proto esp spi 1 \
mode tunnel enc des3_ede 0x112233445566778811223344556677881122334455667788
ip xfrm state add src 1.1.1.2 dst 1.1.1.1 proto esp spi 1 \
mode tunnel enc des3_ede 0x112233445566778811223344556677881122334455667788

ip netns exec secure ping -c 4 -i 0.02 -I 192.168.100.1 192.168.200.1

ip netns del secure
<End script>

Reported-by: Hangbin Liu <haliu@xxxxxxxxxx>
Reported-by: Jan Tluka <jtluka@xxxxxxxxxx>
Signed-off-by: Lance Richardson <lrichard@xxxxxxxxxx>
Signed-off-by: David S. Miller <davem@xxxxxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
net/ipv4/ip_vti.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)

--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -557,6 +557,33 @@ static struct rtnl_link_ops vti_link_ops
.get_link_net = ip_tunnel_get_link_net,
};

+static bool is_vti_tunnel(const struct net_device *dev)
+{
+ return dev->netdev_ops == &vti_netdev_ops;
+}
+
+static int vti_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ if (!is_vti_tunnel(dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ if (!net_eq(tunnel->net, dev_net(dev)))
+ xfrm_garbage_collect(tunnel->net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vti_notifier_block __read_mostly = {
+ .notifier_call = vti_device_event,
+};
+
static int __init vti_init(void)
{
const char *msg;
@@ -564,6 +591,8 @@ static int __init vti_init(void)

pr_info("IPv4 over IPsec tunneling driver\n");

+ register_netdevice_notifier(&vti_notifier_block);
+
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
@@ -596,6 +625,7 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
+ unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
}
@@ -607,6 +637,7 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
+ unregister_netdevice_notifier(&vti_notifier_block);
}

module_init(vti_init);