Re: [patch v2, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP

From: Štefan Gula
Date: Tue Jan 17 2012 - 06:00:54 EST


Dňa 17. januára 2012 11:47, Eric Dumazet <eric.dumazet@xxxxxxxxx> napísal/a:
> Le mardi 17 janvier 2012 à 11:43 +0100, Štefan Gula a écrit :
>
>> OK, maybe I am getting it wrong, but I am a little bit stuck here. I
>> rechecked the original bridge code. The difference I noticed is that
>> in the bridge code the functions:
>> br_fdb_init() and br_fdb_fini()
>> are called from module init and module exit functions:
>> br_init and br_deinit
>>
>> in my code they are called from functions:
>> ipgre_init_net and ipgre_exit_net
>> instead of:
>> ipgre_init and ipgre_fini
>>
>> To be honest, I am not familiar enough with the kernel structure to
>> see the difference at first glance. But I think that with your help
>> it can be done easily. The main idea was to create a hash table that is
>> used to determine the destination IPv4 address (part of the entry
>> structure). That hash table should be separate for each gretap
>> interface - I think that's the reason why I put the init and fini
>> calls inside ipgre_init_net and ipgre_exit_net. Is the placement of
>> these calls correct? If not, where should those calls be
>> placed?
>>
>> On the other hand, I have no idea how to replace those two functions
>> with the kmalloc()/kfree() code that you are suggesting. I would be glad
>> if you could help me here by providing an example of how to replace
>> those two functions with kmalloc/kfree, for future use (I am more of a
>> learn-by-reverse-engineering person than a manual-reading one)
>
> Something like the following ?
>
>
> Note : I also put the "orig_source = iph->saddr;"
> _after_ "iph = ip_hdr(skb);"
>
> Â Â Â Â Â Â Â Â Â Â Â Âiph = ip_hdr(skb);
> #ifdef CONFIG_NET_IPGRE_BRIDGE
> Â Â Â Â Â Â Â Â Â Â Â Âorig_source = iph->saddr;
> #endif
>
>
>
> diff --git a/include/net/ipip.h b/include/net/ipip.h
> index a32654d..6a06fc2 100644
> --- a/include/net/ipip.h
> +++ b/include/net/ipip.h
> @@ -27,6 +27,14 @@ struct ip_tunnel {
> Â Â Â Â__u32 Â Â Â Â Â Â Â Â Â o_seqno; Â Â Â Â/* The last output seqno */
>    Âint           hlen;      /* Precalculated GRE header length */
>    Âint           mlink;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#define GRETAP_BR_HASH_BITS 8
> +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
> +    struct hlist_head    hash[GRETAP_BR_HASH_SIZE];
> +    spinlock_t       Âhash_lock;
> +    unsigned long      ageing_time;
> +    struct timer_list    gc_timer;
> +#endif
>
>    Âstruct ip_tunnel_parm  parms;
>
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 1a8f93b..5b320a3 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
> Â Â Â Â ÂNetwork), but can be distributed all over the Internet. If you want
> Â Â Â Â Âto do that, say Y here and to "IP multicast routing" below.
>
> +config NET_IPGRE_BRIDGE
> + Â Â Â bool "IP: Ethernet over multipoint GRE over IP"
> + Â Â Â depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
> + Â Â Â help
> + Â Â Â Â Allows you to use multipoint GRE VPN as virtual switch and interconnect
> + Â Â Â Â several L2 endpoints over L3 routed infrastructure. It is useful for
> + Â Â Â Â creating multipoint L2 VPNs which can be later used inside bridge
> + Â Â Â Â interfaces. If you want to use the GRE multipoint L2 VPN feature, say Y.
> +
> Âconfig IP_MROUTE
> Â Â Â Âbool "IP: multicast routing"
> Â Â Â Âdepends on IP_MULTICAST
> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> index 2b53a1f..df22565 100644
> --- a/net/ipv4/ip_gre.c
> +++ b/net/ipv4/ip_gre.c
> @@ -52,6 +52,11 @@
> Â#include <net/ip6_route.h>
> Â#endif
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#include <linux/jhash.h>
> +#include <asm/unaligned.h>
> +#endif
> +
> Â/*
> Â ÂProblems & solutions
> Â Â--------------------
> @@ -134,6 +139,172 @@ struct ipgre_net {
> Â Â Â Âstruct net_device *fb_tunnel_dev;
> Â};
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â /*
> + Â Â Â Â* This part of code includes codes to enable L2 ethernet
> + Â Â Â Â* switch virtualization over IP routed infrastructure with
> + Â Â Â Â* utilization of multicast capable endpoint using Ethernet
> + Â Â Â Â* over GRE
> + Â Â Â Â*
> + Â Â Â Â* Author: Stefan Gula
> + Â Â Â Â* Signed-off-by: Stefan Gula <steweg@xxxxxxxxx>
> + Â Â Â Â*/
> +struct ipgre_tap_bridge_entry {
> +    struct hlist_node    hlist;
> + Â Â Â __be32 Â Â Â Â Â Â Â Â Âraddr;
> +    unsigned char      addr[ETH_ALEN];
> +    unsigned long      updated;
> +    struct rcu_head     rcu;
> +};
> +
> +static u32 ipgre_salt __read_mostly;
> +
> +static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
> +{
> + Â Â Â u32 key = get_unaligned((u32 *)(mac + 2));
> +
> + Â Â Â return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
> +}
> +
> +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â const struct ipgre_tap_bridge_entry *entry)
> +{
> + Â Â Â return time_before_eq(entry->updated + tunnel->ageing_time,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â jiffies);
> +}
> +
> +static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry)
> +{
> + Â Â Â hlist_del_rcu(&entry->hlist);
> + Â Â Â kfree_rcu(entry, rcu);
> +}
> +
> +static void ipgre_tap_bridge_cleanup(unsigned long _data)
> +{
> + Â Â Â struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
> + Â Â Â unsigned long delay = tunnel->ageing_time;
> + Â Â Â unsigned long next_timer = jiffies + tunnel->ageing_time;
> + Â Â Â int i;
> +
> + Â Â Â spin_lock(&tunnel->hash_lock);
> + Â Â Â for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> + Â Â Â Â Â Â Â struct ipgre_tap_bridge_entry *entry;
> + Â Â Â Â Â Â Â struct hlist_node *h, *n;
> +
> + Â Â Â Â Â Â Â hlist_for_each_entry_safe(entry, h, n,
> + Â Â Â Â Â Â Â Â Â Â Â &tunnel->hash[i], hlist)
> + Â Â Â Â Â Â Â {
> + Â Â Â Â Â Â Â Â Â Â Â unsigned long this_timer;
> + Â Â Â Â Â Â Â Â Â Â Â this_timer = entry->updated + delay;
> + Â Â Â Â Â Â Â Â Â Â Â if (time_before_eq(this_timer, jiffies))
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ipgre_tap_bridge_delete(entry);
> + Â Â Â Â Â Â Â Â Â Â Â else if (time_before(this_timer, next_timer))
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â next_timer = this_timer;
> + Â Â Â Â Â Â Â }
> + Â Â Â }
> + Â Â Â spin_unlock(&tunnel->hash_lock);
> + Â Â Â mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));
> +}
> +
> +static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
> +{
> + Â Â Â int i;
> +
> + Â Â Â spin_lock_bh(&tunnel->hash_lock);
> + Â Â Â for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> + Â Â Â Â Â Â Â struct ipgre_tap_bridge_entry *entry;
> + Â Â Â Â Â Â Â struct hlist_node *h, *n;
> +
> + Â Â Â Â Â Â Â hlist_for_each_entry_safe(entry, h, n,
> + Â Â Â Â Â Â Â Â Â Â Â &tunnel->hash[i], hlist)
> + Â Â Â Â Â Â Â {
> + Â Â Â Â Â Â Â Â Â Â Â ipgre_tap_bridge_delete(entry);
> + Â Â Â Â Â Â Â }
> + Â Â Â }
> + Â Â Â spin_unlock_bh(&tunnel->hash_lock);
> +}
> +
> +static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(
> + Â Â Â struct ip_tunnel *tunnel, const unsigned char *addr)
> +{
> + Â Â Â struct hlist_node *h;
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> +
> + Â Â Â hlist_for_each_entry_rcu(entry, h,
> + Â Â Â Â Â Â Â Â Â Â Â &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) {
> + Â Â Â Â Â Â Â if (!compare_ether_addr(entry->addr, addr)) {
> + Â Â Â Â Â Â Â Â Â Â Â if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â entry)))
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â break;
> + Â Â Â Â Â Â Â Â Â Â Â return entry;
> + Â Â Â Â Â Â Â }
> + Â Â Â }
> +
> + Â Â Â return NULL;
> +}
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
> + Â Â Â struct hlist_head *head,
> + Â Â Â const unsigned char *addr)
> +{
> + Â Â Â struct hlist_node *h;
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> +
> + Â Â Â hlist_for_each_entry(entry, h, head, hlist) {
> + Â Â Â Â Â Â Â if (!compare_ether_addr(entry->addr, addr))
> + Â Â Â Â Â Â Â Â Â Â Â return entry;
> + Â Â Â }
> + Â Â Â return NULL;
> +}
> +
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
> + Â Â Â struct hlist_head *head,
> + Â Â Â const unsigned char *addr)
> +{
> + Â Â Â struct hlist_node *h;
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> +
> + Â Â Â hlist_for_each_entry_rcu(entry, h, head, hlist) {
> + Â Â Â Â Â Â Â if (!compare_ether_addr(entry->addr, addr))
> + Â Â Â Â Â Â Â Â Â Â Â return entry;
> + Â Â Â }
> + Â Â Â return NULL;
> +}
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
> + Â Â Â struct hlist_head *head,
> + Â Â Â u32 source,
> + Â Â Â const unsigned char *addr)
> +{
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> +
> + Â Â Â entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
> + Â Â Â if (entry) {
> + Â Â Â Â Â Â Â memcpy(entry->addr, addr, ETH_ALEN);
> + Â Â Â Â Â Â Â entry->raddr = source;
> + Â Â Â Â Â Â Â entry->updated = jiffies;
> + Â Â Â Â Â Â Â hlist_add_head_rcu(&entry->hlist, head);
> + Â Â Â }
> + Â Â Â return entry;
> +}
> +
> +static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> + Â Â Â const unsigned char *addr)
> +{
> + Â Â Â __be32 raddr = 0;
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> +
> + Â Â Â rcu_read_lock();
> + Â Â Â entry = __ipgre_tap_bridge_get(tunnel, addr);
> + Â Â Â if (entry)
> + Â Â Â Â Â Â Â raddr = entry->raddr;
> + Â Â Â rcu_read_unlock();
> +
> + Â Â Â return raddr;
> +}
> +
> +#endif
> Â/* Tunnel hash table */
>
> Â/*
> @@ -562,6 +733,12 @@ static int ipgre_rcv(struct sk_buff *skb)
> Â Â Â Âstruct ip_tunnel *tunnel;
>    Âint  Âoffset = 4;
> Â Â Â Â__be16 gre_proto;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â __be32 orig_source;
> + Â Â Â struct hlist_head *head;
> + Â Â Â struct ipgre_tap_bridge_entry *entry;
> + Â Â Â const struct ethhdr *tethhdr;
> +#endif
>
> Â Â Â Âif (!pskb_may_pull(skb, 16))
> Â Â Â Â Â Â Â Âgoto drop_nolock;
> @@ -659,10 +836,38 @@ static int ipgre_rcv(struct sk_buff *skb)
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âtunnel->dev->stats.rx_errors++;
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âgoto drop;
> Â Â Â Â Â Â Â Â Â Â Â Â}
> -
> Â Â Â Â Â Â Â Â Â Â Â Âiph = ip_hdr(skb);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â Â Â Â Â Â Â Â Â orig_source = iph->saddr;
> +#endif
> Â Â Â Â Â Â Â Â Â Â Â Âskb->protocol = eth_type_trans(skb, tunnel->dev);
> Â Â Â Â Â Â Â Â Â Â Â Âskb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â Â Â Â Â Â Â Â Â if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr = eth_hdr(skb);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â if (!is_multicast_ether_addr(
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr->h_source)) {
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â head = &tunnel->hash[
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ipgre_tap_bridge_hash(
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr->h_source)];
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â entry = ipgre_tap_bridge_find_rcu(head,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr->h_source);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â if (likely(entry)) {
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â entry->raddr = orig_source;
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â entry->updated = jiffies;
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â } else {
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â spin_lock(&tunnel->hash_lock);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â if (!ipgre_tap_bridge_find(head,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr->h_source))
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ipgre_tap_bridge_create(
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â head,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â orig_source,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â tethhdr->h_source);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â spin_unlock(&tunnel->hash_lock);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â }
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â }
> + Â Â Â Â Â Â Â Â Â Â Â }
> +#endif
> Â Â Â Â Â Â Â Â}
>
> Â Â Â Â Â Â Â Âtstats = this_cpu_ptr(tunnel->dev->tstats);
> @@ -702,7 +907,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
> Â Â Â Âstruct iphdr Â*iph; Â Â Â Â Â Â Â Â Â Â /* Our new IP header */
> Â Â Â Âunsigned int max_headroom; Â Â Â Â Â Â Â/* The extra header space needed */
>    Âint  Âgre_hlen;
> - Â Â Â __be32 dst;
> + Â Â Â __be32 dst = 0;
>    Âint  Âmtu;
>
> Â Â Â Âif (dev->type == ARPHRD_ETHER)
> @@ -716,7 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
> Â Â Â Â Â Â Â Âtiph = &tunnel->parms.iph;
> Â Â Â Â}
>
> - Â Â Â if ((dst = tiph->daddr) == 0) {
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â if ((dev->type == ARPHRD_ETHER) &&
> + Â Â Â Â Â Â Â ipv4_is_multicast(tunnel->parms.iph.daddr))
> + Â Â Â Â Â Â Â dst = ipgre_tap_bridge_get_raddr(tunnel,
> + Â Â Â Â Â Â Â Â Â Â Â ((struct ethhdr *)skb->data)->h_dest);
> +#endif
> + Â Â Â if (dst == 0)
> + Â Â Â Â Â Â Â dst = tiph->daddr;
> + Â Â Â if (dst == 0) {
> Â Â Â Â Â Â Â Â/* NBMA tunnel */
>
> Â Â Â Â Â Â Â Âif (skb_dst(skb) == NULL) {
> @@ -1211,6 +1424,16 @@ static int ipgre_open(struct net_device *dev)
> Â Â Â Â Â Â Â Â Â Â Â Âreturn -EADDRNOTAVAIL;
> Â Â Â Â Â Â Â Ât->mlink = dev->ifindex;
> Â Â Â Â Â Â Â Âip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â Â Â Â Â if (t->dev->type == ARPHRD_ETHER) {
> + Â Â Â Â Â Â Â Â Â Â Â INIT_HLIST_HEAD(t->hash);
> + Â Â Â Â Â Â Â Â Â Â Â spin_lock_init(&t->hash_lock);
> + Â Â Â Â Â Â Â Â Â Â Â t->ageing_time = 300 * HZ;
> + Â Â Â Â Â Â Â Â Â Â Â setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â (unsigned long) t);
> + Â Â Â Â Â Â Â Â Â Â Â mod_timer(&t->gc_timer, jiffies + t->ageing_time);
> + Â Â Â Â Â Â Â }
> +#endif
> Â Â Â Â}
> Â Â Â Âreturn 0;
> Â}
> @@ -1221,6 +1444,12 @@ static int ipgre_close(struct net_device *dev)
>
> Â Â Â Âif (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
> Â Â Â Â Â Â Â Âstruct in_device *in_dev;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â Â Â Â Â if (t->dev->type == ARPHRD_ETHER) {
> + Â Â Â Â Â Â Â Â Â Â Â ipgre_tap_bridge_flush(t);
> + Â Â Â Â Â Â Â Â Â Â Â del_timer_sync(&t->gc_timer);
> + Â Â Â Â Â Â Â }
> +#endif
> Â Â Â Â Â Â Â Âin_dev = inetdev_by_index(dev_net(dev), t->mlink);
> Â Â Â Â Â Â Â Âif (in_dev)
> Â Â Â Â Â Â Â Â Â Â Â Âip_mc_dec_group(in_dev, t->parms.iph.daddr);
> @@ -1707,6 +1936,9 @@ static int __init ipgre_init(void)
>
> Â Â Â Âprintk(KERN_INFO "GRE over IPv4 tunneling driver\n");
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + Â Â Â get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
> +#endif
> Â Â Â Âerr = register_pernet_device(&ipgre_net_ops);
> Â Â Â Âif (err < 0)
> Â Â Â Â Â Â Â Âreturn err;
>
>
Looks good... I am just wondering about my previous question regarding
the placement of the calls to ipgre_tap_bridge_init and
ipgre_tap_bridge_fini. Would it also be possible to have this
done/fixed if I move those calls into ipgre_init and ipgre_fini?
I would much rather keep these parts of the code identical to the standard
bridge code, just in case somebody starts generalizing the
bridge code so that it can be reused anywhere inside kernel space
- that would make a later migration simpler.
N‹§²æìr¸›yúèšØb²X¬¶ÇvØ^–)Þ{.nÇ+‰·¥Š{±‘êçzX§¶›¡Ü}©ž²ÆzÚ&j:+v‰¨¾«‘êçzZ+€Ê+zf£¢·hšˆ§~†­†Ûiÿûàz¹®w¥¢¸?™¨è­Ú&¢)ßf”ù^jÇy§m…á@A«a¶Úÿ 0¶ìh®å’i