[RFCv2 net-next 7/7] openvswitch: Support fragmented IPv4 packets for conntrack

From: Joe Stringer
Date: Mon Mar 02 2015 - 17:04:38 EST

Next message: Joe Stringer: "[RFCv2 net-next 6/7] net: Refactor ip_defrag() APIs"
Previous message: Tom Zanussi: "Re: [PATCH 07/15] mm: Add ___GFP_NOTRACE"
In reply to: Joe Stringer: "[RFCv2 net-next 3/7] openvswitch: Add conntrack action"
Next in thread: Joe Stringer: "[RFCv2 net-next 6/7] net: Refactor ip_defrag() APIs"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

From: Andy Zhou <azhou@xxxxxxxxxx>

The conntrack action now re-assembles fragmented IPv4 packets and only
sends a fully re-assembled IP packet to the nf_conntrack layer.

When a re-assembled IP frame hits the output action, the output action
re-fragments it into IP fragments based on the packet's incoming
fragment size.

Signed-off-by: Andy Zhou <azhou@xxxxxxxxxx>
---
include/uapi/linux/openvswitch.h | 5 ++-
net/openvswitch/actions.c | 78 ++++++++++++++++++++++++++++++++++----
net/openvswitch/conntrack.c | 43 ++++++++++++++++++++-
net/openvswitch/datapath.c | 40 ++++++++++++++++---
net/openvswitch/datapath.h | 6 +++
net/openvswitch/vport.c | 1 +
6 files changed, 157 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 30d70a3..b947544 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -162,7 +162,9 @@ enum ovs_packet_cmd {
* %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
* output port is actually a tunnel port. Contains the output tunnel key
* extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
- *
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action to specify the maximum received fragment
+ * size.
* These attributes follow the &struct ovs_header within the Generic Netlink
* payload for %OVS_PACKET_* commands.
*/
@@ -178,6 +180,7 @@ enum ovs_packet_attr {
OVS_PACKET_ATTR_UNUSED2,
OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe,
error logging should be suppressed. */
+ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */
__OVS_PACKET_ATTR_MAX
};

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9bd9f99..789e53a 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -53,6 +53,11 @@ struct deferred_action {
struct sw_flow_key pkt_key;
};

+struct vport_frag_output_info {
+ struct vport *vport;
+ struct sw_flow_key *key;
+};
+
#define DEFERRED_ACTION_FIFO_SIZE 10
struct action_fifo {
int head;
@@ -595,14 +600,67 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}

-static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+/* Given an IP frame, reconstruct its MAC header based on flow. */
+int ovs_setup_l2_header(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int err;
+
+ err = skb_ensure_writable(skb, ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ __skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ ether_addr_copy(eth_hdr(skb)->h_source, key->eth.src);
+ ether_addr_copy(eth_hdr(skb)->h_dest, key->eth.dst);
+ eth_hdr(skb)->h_proto = key->eth.type;
+
+ return 0;
+}
+
+static int ovs_vport_output(struct sk_buff *skb, void *output_arg)
+{
+ struct vport_frag_output_info *arg =
+ (struct vport_frag_output_info *)output_arg;
+ struct sw_flow_key *key = arg->key;
+ struct vport *vport = arg->vport;
+ int err;
+
+ err = ovs_setup_l2_header(skb, key);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+ ovs_vport_send(vport, skb);
+
+ return 0;
+}
+
+static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
+ struct sw_flow_key *key)
{
struct vport *vport = ovs_vport_rcu(dp, out_port);
+ unsigned int mru = OVS_CB(skb)->mru;

- if (likely(vport))
- ovs_vport_send(vport, skb);
- else
+ if (likely(vport)) {
+ if (!mru || (skb->len <= mru + ETH_HLEN)) {
+ ovs_vport_send(vport, skb);
+ } else if (key->eth.type == htons(ETH_P_IP)) {
+ struct vport_frag_output_info arg;
+ unsigned int mtu = mru;
+
+ arg.vport = vport;
+ arg.key = key;
+
+ skb_pull(skb, ETH_HLEN);
+
+ ip_fragment_mtu(skb, mtu, LL_MAX_HEADER, NULL, &arg,
+ ovs_vport_output);
+ }
+ } else {
kfree_skb(skb);
+ }
}

static int output_userspace(struct datapath *dp, struct sk_buff *skb,
@@ -617,6 +675,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
upcall.userdata = NULL;
upcall.portid = 0;
upcall.egress_tun_info = NULL;
+ upcall.mru = OVS_CB(skb)->mru;

for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
@@ -865,7 +924,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);

if (out_skb)
- do_output(dp, out_skb, prev_port);
+ do_output(dp, out_skb, prev_port, key);

prev_port = -1;
}
@@ -929,13 +988,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}

if (unlikely(err)) {
- kfree_skb(skb);
+ /* Hide stolen fragments from user space. */
+ if (err == -EINPROGRESS)
+ err = 0;
+ else
+ kfree_skb(skb);
+
return err;
}
}

if (prev_port != -1)
- do_output(dp, skb, prev_port);
+ do_output(dp, skb, prev_port, key);
else
consume_skb(skb);

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 93d76a5..793d489 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -178,21 +178,60 @@ static int ovs_ct_lookup(struct net *net, u16 zone, struct sw_flow_key *key,
return err;
}

+static int handle_fragments(struct net *net, u16 zone, struct sk_buff *skb,
+ struct sw_flow_key *key)
+{
+ if (key->eth.type == htons(ETH_P_IP)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
+ struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
+ int nh_ofs = skb_network_offset(skb);
+ enum ip_defrag_users user;
+ unsigned int mru;
+ int err;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ user = IP_DEFRAG_CONNTRACK_IN + zone;
+ skb_pull(skb, nh_ofs);
+ err = ip_defrag_net(net, skb, user, &mru);
+ if (err)
+ return err;
+
+ /* Got a reassembled IP frame */
+ skb_clear_hash(skb);
+ ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ err = ovs_setup_l2_header(skb, key);
+ if (err)
+ return err;
+
+ ovs_cb.mru = mru;
+ *OVS_CB(skb) = ovs_cb;
+ }
+ } /* XXX Handle IPv6 */
+
+ return 0;
+}
+
int ovs_ct_execute(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_conntrack_info *info)
{
struct net *net;
- int nh_ofs = skb_network_offset(skb);
struct nf_conn *tmpl = info->ct;
- int err = -EINVAL;
+ int nh_ofs, err;

net = ovs_get_net(skb);
if (IS_ERR(net))
return PTR_ERR(net);

+ err = handle_fragments(net, info->zone, skb, key);
+ if (err)
+ return err;
+
/* The conntrack module expects to be working at L3. */
+ nh_ofs = skb_network_offset(skb);
skb_pull(skb, nh_ofs);

+ err = -EINVAL;
if (ovs_ct_lookup__(net, tmpl, key, skb))
goto err_push_skb;

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 46f67ee..1340f21 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -277,6 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
upcall.userdata = NULL;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
upcall.egress_tun_info = NULL;
+ upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall);
if (unlikely(error))
kfree_skb(skb);
@@ -398,9 +399,23 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
if (upcall_info->egress_tun_info)
size += nla_total_size(ovs_tun_key_attr_size());

+ /* OVS_PACKET_ATTR_MRU */
+ if (upcall_info->mru)
+ size += nla_total_size(sizeof(unsigned int));
+
return size;
}

+static void pad_packet(struct datapath *dp, struct sk_buff *skb)
+{
+ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
+ size_t plen = NLA_ALIGN(skb->len) - skb->len;
+
+ if (plen > 0)
+ memset(skb_put(skb, plen), 0, plen);
+ }
+}
+
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
@@ -479,6 +494,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
nla_nest_end(user_skb, nla);
}

+ /* Add OVS_PACKET_ATTR_MRU */
+ if (upcall_info->mru) {
+ if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
+ upcall_info->mru)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ pad_packet(dp, user_skb);
+ }
+
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy() */
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
@@ -492,12 +517,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
goto out;

/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
- if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
- size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
-
- if (plen > 0)
- memset(skb_put(user_skb, plen), 0, plen);
- }
+ pad_packet(dp, user_skb);

((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

@@ -526,6 +546,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
+ unsigned int mru;

err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
@@ -552,6 +573,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
else
packet->protocol = htons(ETH_P_802_2);

+ /* Set packet's mru */
+ mru = 0;
+ if (a[OVS_PACKET_ATTR_MRU])
+ mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
+ OVS_CB(packet)->mru = mru;
+
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
@@ -612,6 +639,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
+ [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};

static const struct genl_ops dp_packet_genl_ops[] = {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 9661a01..cfbdda1 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -98,10 +98,13 @@ struct datapath {
* NULL if the packet is not being tunneled.
* @input_vport: The original vport packet came in on. This value is cached
* when a packet is received by OVS.
+ * @mru: The maximum received fragment size; 0 if the packet is not
+ * fragmented.
*/
struct ovs_skb_cb {
struct ovs_tunnel_info *egress_tun_info;
struct vport *input_vport;
+ unsigned int mru;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

@@ -114,12 +117,14 @@ struct ovs_skb_cb {
* then no packet is sent and the packet is accounted in the datapath's @n_lost
* counter.
* @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
+ * @mru: If not zero, Maximum received IP fragment size.
*/
struct dp_upcall_info {
const struct ovs_tunnel_info *egress_tun_info;
const struct nlattr *userdata;
u32 portid;
u8 cmd;
+ unsigned int mru;
};

/**
@@ -198,6 +203,7 @@ void ovs_dp_notify_wq(struct work_struct *work);

int action_fifos_init(void);
void action_fifos_exit(void);
+int ovs_setup_l2_header(struct sk_buff *skb, struct sw_flow_key *key);

/* 'KEY' must not have any bits set outside of the 'MASK' */
#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index ec2954f..184dd51 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -486,6 +486,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,

OVS_CB(skb)->input_vport = vport;
OVS_CB(skb)->egress_tun_info = NULL;
+ OVS_CB(skb)->mru = 0;
/* Extract flow from 'skb' into 'key'. */
error = ovs_flow_key_extract(tun_info, skb, &key);
if (unlikely(error)) {
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Joe Stringer: "[RFCv2 net-next 6/7] net: Refactor ip_defrag() APIs"
Previous message: Tom Zanussi: "Re: [PATCH 07/15] mm: Add ___GFP_NOTRACE"
In reply to: Joe Stringer: "[RFCv2 net-next 3/7] openvswitch: Add conntrack action"
Next in thread: Joe Stringer: "[RFCv2 net-next 6/7] net: Refactor ip_defrag() APIs"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]