[PATCH] Optimize lookup of /0 xfrm policies

From: Yannick Brosseau
Date: Thu Aug 30 2018 - 15:35:38 EST


Currently, all the xfrm policies that are not /32 end up in
the inexact policies linked list which take a long time to lookup.

We can optimize the case where we have a /0 prefix in the policy, which
means we can match any address to that part.
We do this by putting those policies in the direct hash table after
zeroing the address part.
At lookup time, we do an additional lookup with the packet address
and either the destination or source address zeroed out.
We still call xfrm_policy_match to validate that the packet match the
selector.

In our tests, with this optimization we reduce softirq cpu utilisation
from about 40% to 7% with 3k policies.

Signed-off-by: Yannick Brosseau <scientist@xxxxxx>
---
net/xfrm/xfrm_hash.h | 10 +++++
net/xfrm/xfrm_policy.c | 88 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 61be810389d8..40997fb5336d 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -145,6 +145,16 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
const xfrm_address_t *saddr = &sel->saddr;
unsigned int h = 0;

+ /* A selector with a prefixlen of zero can basically be ignored in
+ * the matching. To speed up the lookup, let's hash it without those
+ * component. In the lookup, we'll do an additional check for a zero
+ * daddr and a zero saddr.
+ */
+ if (sel->prefixlen_d == 0)
+ dbits = 0;
+ if (sel->prefixlen_s == 0)
+ sbits = 0;
+
switch (family) {
case AF_INET:
if (sel->prefixlen_d < dbits ||
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3110c3fbee20..7c2259f140d5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1096,8 +1096,10 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
int err;
struct xfrm_policy *pol, *ret;
const xfrm_address_t *daddr, *saddr;
+ static const xfrm_address_t zero_addr = {0};
+
struct hlist_head *chain;
- unsigned int sequence;
+ unsigned int sequence, first_sequence;
u32 priority;

daddr = xfrm_flowi_daddr(fl, family);
@@ -1112,6 +1114,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
chain = policy_hash_direct(net, daddr, saddr, family, dir);
} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));

+ first_sequence = sequence;
priority = ~0U;
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -1129,6 +1132,87 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
break;
}
}
+
+ /* XXX FB NOT UPSTREAM YET T12762593 */
+ /* Do an additional lookup for saddr == 0, since we stored source
+ * selector with a prefix len of 0 that way in the bydst hash
+ */
+ do {
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ chain = policy_hash_direct(net, daddr, &zero_addr, family, dir);
+ } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ if ((pol->priority >= priority) && ret)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir);
+ if (err) {
+ if (err == -ESRCH)
+ continue;
+ else {
+ ret = ERR_PTR(err);
+ goto fail;
+ }
+ } else {
+ ret = pol;
+ priority = ret->priority;
+ break;
+ }
+ }
+
+ /* Do an additional lookup for daddr == 0, since we stored dest
+ * selector with a prefix len of 0 that way in the bydst hash
+ */
+ do {
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ chain = policy_hash_direct(net, &zero_addr, saddr, family, dir);
+ } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ if ((pol->priority >= priority) && ret)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir);
+ if (err) {
+ if (err == -ESRCH)
+ continue;
+ else {
+ ret = ERR_PTR(err);
+ goto fail;
+ }
+ } else {
+ ret = pol;
+ priority = ret->priority;
+ break;
+ }
+ }
+
+ /* Do an additional lookup for both saddr and daddr == 0 */
+ do {
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ chain = policy_hash_direct(net, &zero_addr, &zero_addr, family, dir);
+ } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ if ((pol->priority >= priority) && ret)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir);
+ if (err) {
+ if (err == -ESRCH)
+ continue;
+ else {
+ ret = ERR_PTR(err);
+ goto fail;
+ }
+ } else {
+ ret = pol;
+ priority = ret->priority;
+ break;
+ }
+ }
+
chain = &net->xfrm.policy_inexact[dir];
hlist_for_each_entry_rcu(pol, chain, bydst) {
if ((pol->priority >= priority) && ret)
@@ -1148,7 +1232,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
}

- if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+ if (read_seqcount_retry(&xfrm_policy_hash_generation, first_sequence))
goto retry;

if (ret && !xfrm_pol_hold_rcu(ret))
--
2.18.0