diff options
author | Florian Westphal <fw@strlen.de> | 2017-09-06 08:39:51 -0400 |
---|---|---|
committer | Pablo Neira Ayuso <pablo@netfilter.org> | 2017-09-08 12:55:50 -0400 |
commit | e1bf1687740ce1a3598a1c5e452b852ff2190682 (patch) | |
tree | 38df06856971c709b8becc0b505f7a1660764742 | |
parent | a5d7a714569199f909cd60ff7074107bf15c7db4 (diff) |
netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
It was not a good idea. The custom hash table was a much better
fit for this purpose.
A fast lookup is not essential; in fact, for most cases there is no lookup
at all, because the original tuple is not taken and can be used as-is.
What needs to be fast is insertion and deletion.
rhlist removal however requires a rhlist walk.
We can have thousands of entries in such a list if source ports/addresses
are reused for multiple flows. If this happens, removal requests are so
expensive that deletions of a few thousand flows can take several
seconds(!).
The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks
1) would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource
table are not a problem compared to current deletion cost.
2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this
isn't doable without changing semantics. rhltable_remove_fast will
check that the to-be-deleted object is part of the table and that
requires a list walk that we want to avoid.
Furthermore, using hlist_node increases size of struct rhlist_head, which
in turn increases nf_conn size.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
-rw-r--r-- | include/net/netfilter/nf_conntrack.h | 3 | ||||
-rw-r--r-- | include/net/netfilter/nf_nat.h | 1 | ||||
-rw-r--r-- | net/netfilter/nf_nat_core.c | 130 |
3 files changed, 54 insertions, 80 deletions
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index fdc9c64a1c94..8f3bd30511de 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/compiler.h> | 18 | #include <linux/compiler.h> |
19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
20 | #include <linux/rhashtable.h> | ||
21 | 20 | ||
22 | #include <linux/netfilter/nf_conntrack_tcp.h> | 21 | #include <linux/netfilter/nf_conntrack_tcp.h> |
23 | #include <linux/netfilter/nf_conntrack_dccp.h> | 22 | #include <linux/netfilter/nf_conntrack_dccp.h> |
@@ -77,7 +76,7 @@ struct nf_conn { | |||
77 | possible_net_t ct_net; | 76 | possible_net_t ct_net; |
78 | 77 | ||
79 | #if IS_ENABLED(CONFIG_NF_NAT) | 78 | #if IS_ENABLED(CONFIG_NF_NAT) |
80 | struct rhlist_head nat_bysource; | 79 | struct hlist_node nat_bysource; |
81 | #endif | 80 | #endif |
82 | /* all members below initialized via memset */ | 81 | /* all members below initialized via memset */ |
83 | u8 __nfct_init_offset[0]; | 82 | u8 __nfct_init_offset[0]; |
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 05c82a1a4267..b71701302e61 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h | |||
@@ -1,6 +1,5 @@ | |||
1 | #ifndef _NF_NAT_H | 1 | #ifndef _NF_NAT_H |
2 | #define _NF_NAT_H | 2 | #define _NF_NAT_H |
3 | #include <linux/rhashtable.h> | ||
4 | #include <linux/netfilter_ipv4.h> | 3 | #include <linux/netfilter_ipv4.h> |
5 | #include <linux/netfilter/nf_nat.h> | 4 | #include <linux/netfilter/nf_nat.h> |
6 | #include <net/netfilter/nf_conntrack_tuple.h> | 5 | #include <net/netfilter/nf_conntrack_tuple.h> |
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dc3519cc7209..f090419f5f97 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c | |||
@@ -30,19 +30,17 @@ | |||
30 | #include <net/netfilter/nf_conntrack_zones.h> | 30 | #include <net/netfilter/nf_conntrack_zones.h> |
31 | #include <linux/netfilter/nf_nat.h> | 31 | #include <linux/netfilter/nf_nat.h> |
32 | 32 | ||
33 | static DEFINE_SPINLOCK(nf_nat_lock); | ||
34 | |||
33 | static DEFINE_MUTEX(nf_nat_proto_mutex); | 35 | static DEFINE_MUTEX(nf_nat_proto_mutex); |
34 | static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] | 36 | static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] |
35 | __read_mostly; | 37 | __read_mostly; |
36 | static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] | 38 | static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] |
37 | __read_mostly; | 39 | __read_mostly; |
38 | 40 | ||
39 | struct nf_nat_conn_key { | 41 | static struct hlist_head *nf_nat_bysource __read_mostly; |
40 | const struct net *net; | 42 | static unsigned int nf_nat_htable_size __read_mostly; |
41 | const struct nf_conntrack_tuple *tuple; | 43 | static unsigned int nf_nat_hash_rnd __read_mostly; |
42 | const struct nf_conntrack_zone *zone; | ||
43 | }; | ||
44 | |||
45 | static struct rhltable nf_nat_bysource_table; | ||
46 | 44 | ||
47 | inline const struct nf_nat_l3proto * | 45 | inline const struct nf_nat_l3proto * |
48 | __nf_nat_l3proto_find(u8 family) | 46 | __nf_nat_l3proto_find(u8 family) |
@@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) | |||
118 | EXPORT_SYMBOL(nf_xfrm_me_harder); | 116 | EXPORT_SYMBOL(nf_xfrm_me_harder); |
119 | #endif /* CONFIG_XFRM */ | 117 | #endif /* CONFIG_XFRM */ |
120 | 118 | ||
121 | static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) | 119 | /* We keep an extra hash for each conntrack, for fast searching. */ |
120 | static unsigned int | ||
121 | hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) | ||
122 | { | 122 | { |
123 | const struct nf_conntrack_tuple *t; | 123 | unsigned int hash; |
124 | const struct nf_conn *ct = data; | 124 | |
125 | get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); | ||
125 | 126 | ||
126 | t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | ||
127 | /* Original src, to ensure we map it consistently if poss. */ | 127 | /* Original src, to ensure we map it consistently if poss. */ |
128 | hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), | ||
129 | tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); | ||
128 | 130 | ||
129 | seed ^= net_hash_mix(nf_ct_net(ct)); | 131 | return reciprocal_scale(hash, nf_nat_htable_size); |
130 | return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), | ||
131 | t->dst.protonum ^ seed); | ||
132 | } | 132 | } |
133 | 133 | ||
134 | /* Is this tuple already taken? (not by us) */ | 134 | /* Is this tuple already taken? (not by us) */ |
@@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, | |||
184 | t->src.u.all == tuple->src.u.all); | 184 | t->src.u.all == tuple->src.u.all); |
185 | } | 185 | } |
186 | 186 | ||
187 | static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, | ||
188 | const void *obj) | ||
189 | { | ||
190 | const struct nf_nat_conn_key *key = arg->key; | ||
191 | const struct nf_conn *ct = obj; | ||
192 | |||
193 | if (!same_src(ct, key->tuple) || | ||
194 | !net_eq(nf_ct_net(ct), key->net) || | ||
195 | !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) | ||
196 | return 1; | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static struct rhashtable_params nf_nat_bysource_params = { | ||
202 | .head_offset = offsetof(struct nf_conn, nat_bysource), | ||
203 | .obj_hashfn = nf_nat_bysource_hash, | ||
204 | .obj_cmpfn = nf_nat_bysource_cmp, | ||
205 | .nelem_hint = 256, | ||
206 | .min_size = 1024, | ||
207 | }; | ||
208 | |||
209 | /* Only called for SRC manip */ | 187 | /* Only called for SRC manip */ |
210 | static int | 188 | static int |
211 | find_appropriate_src(struct net *net, | 189 | find_appropriate_src(struct net *net, |
@@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, | |||
216 | struct nf_conntrack_tuple *result, | 194 | struct nf_conntrack_tuple *result, |
217 | const struct nf_nat_range *range) | 195 | const struct nf_nat_range *range) |
218 | { | 196 | { |
197 | unsigned int h = hash_by_src(net, tuple); | ||
219 | const struct nf_conn *ct; | 198 | const struct nf_conn *ct; |
220 | struct nf_nat_conn_key key = { | ||
221 | .net = net, | ||
222 | .tuple = tuple, | ||
223 | .zone = zone | ||
224 | }; | ||
225 | struct rhlist_head *hl, *h; | ||
226 | |||
227 | hl = rhltable_lookup(&nf_nat_bysource_table, &key, | ||
228 | nf_nat_bysource_params); | ||
229 | 199 | ||
230 | rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { | 200 | hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { |
231 | nf_ct_invert_tuplepr(result, | 201 | if (same_src(ct, tuple) && |
232 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple); | 202 | net_eq(net, nf_ct_net(ct)) && |
233 | result->dst = tuple->dst; | 203 | nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { |
234 | 204 | /* Copy source part from reply tuple. */ | |
235 | if (in_range(l3proto, l4proto, result, range)) | 205 | nf_ct_invert_tuplepr(result, |
236 | return 1; | 206 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple); |
207 | result->dst = tuple->dst; | ||
208 | |||
209 | if (in_range(l3proto, l4proto, result, range)) | ||
210 | return 1; | ||
211 | } | ||
237 | } | 212 | } |
238 | |||
239 | return 0; | 213 | return 0; |
240 | } | 214 | } |
241 | 215 | ||
@@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
408 | const struct nf_nat_range *range, | 382 | const struct nf_nat_range *range, |
409 | enum nf_nat_manip_type maniptype) | 383 | enum nf_nat_manip_type maniptype) |
410 | { | 384 | { |
385 | struct net *net = nf_ct_net(ct); | ||
411 | struct nf_conntrack_tuple curr_tuple, new_tuple; | 386 | struct nf_conntrack_tuple curr_tuple, new_tuple; |
412 | 387 | ||
413 | /* Can't setup nat info for confirmed ct. */ | 388 | /* Can't setup nat info for confirmed ct. */ |
@@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
449 | } | 424 | } |
450 | 425 | ||
451 | if (maniptype == NF_NAT_MANIP_SRC) { | 426 | if (maniptype == NF_NAT_MANIP_SRC) { |
452 | struct nf_nat_conn_key key = { | 427 | unsigned int srchash; |
453 | .net = nf_ct_net(ct), | 428 | |
454 | .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, | 429 | srchash = hash_by_src(net, |
455 | .zone = nf_ct_zone(ct), | 430 | &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
456 | }; | 431 | spin_lock_bh(&nf_nat_lock); |
457 | int err; | 432 | hlist_add_head_rcu(&ct->nat_bysource, |
458 | 433 | &nf_nat_bysource[srchash]); | |
459 | err = rhltable_insert_key(&nf_nat_bysource_table, | 434 | spin_unlock_bh(&nf_nat_lock); |
460 | &key, | ||
461 | &ct->nat_bysource, | ||
462 | nf_nat_bysource_params); | ||
463 | if (err) | ||
464 | return NF_DROP; | ||
465 | } | 435 | } |
466 | 436 | ||
467 | /* It's done. */ | 437 | /* It's done. */ |
@@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) | |||
570 | * will delete entry from already-freed table. | 540 | * will delete entry from already-freed table. |
571 | */ | 541 | */ |
572 | clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); | 542 | clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); |
573 | rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, | 543 | spin_lock_bh(&nf_nat_lock); |
574 | nf_nat_bysource_params); | 544 | hlist_del_rcu(&ct->nat_bysource); |
545 | spin_unlock_bh(&nf_nat_lock); | ||
575 | 546 | ||
576 | /* don't delete conntrack. Although that would make things a lot | 547 | /* don't delete conntrack. Although that would make things a lot |
577 | * simpler, we'd end up flushing all conntracks on nat rmmod. | 548 | * simpler, we'd end up flushing all conntracks on nat rmmod. |
@@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); | |||
699 | /* No one using conntrack by the time this called. */ | 670 | /* No one using conntrack by the time this called. */ |
700 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) | 671 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) |
701 | { | 672 | { |
702 | if (ct->status & IPS_SRC_NAT_DONE) | 673 | if (ct->status & IPS_SRC_NAT_DONE) { |
703 | rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, | 674 | spin_lock_bh(&nf_nat_lock); |
704 | nf_nat_bysource_params); | 675 | hlist_del_rcu(&ct->nat_bysource); |
676 | spin_unlock_bh(&nf_nat_lock); | ||
677 | } | ||
705 | } | 678 | } |
706 | 679 | ||
707 | static struct nf_ct_ext_type nat_extend __read_mostly = { | 680 | static struct nf_ct_ext_type nat_extend __read_mostly = { |
@@ -825,13 +798,16 @@ static int __init nf_nat_init(void) | |||
825 | { | 798 | { |
826 | int ret; | 799 | int ret; |
827 | 800 | ||
828 | ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); | 801 | /* Leave them the same for the moment. */ |
829 | if (ret) | 802 | nf_nat_htable_size = nf_conntrack_htable_size; |
830 | return ret; | 803 | |
804 | nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); | ||
805 | if (!nf_nat_bysource) | ||
806 | return -ENOMEM; | ||
831 | 807 | ||
832 | ret = nf_ct_extend_register(&nat_extend); | 808 | ret = nf_ct_extend_register(&nat_extend); |
833 | if (ret < 0) { | 809 | if (ret < 0) { |
834 | rhltable_destroy(&nf_nat_bysource_table); | 810 | nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); |
835 | printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); | 811 | printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); |
836 | return ret; | 812 | return ret; |
837 | } | 813 | } |
@@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void) | |||
865 | 841 | ||
866 | for (i = 0; i < NFPROTO_NUMPROTO; i++) | 842 | for (i = 0; i < NFPROTO_NUMPROTO; i++) |
867 | kfree(nf_nat_l4protos[i]); | 843 | kfree(nf_nat_l4protos[i]); |
868 | 844 | synchronize_net(); | |
869 | rhltable_destroy(&nf_nat_bysource_table); | 845 | nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); |
870 | } | 846 | } |
871 | 847 | ||
872 | MODULE_LICENSE("GPL"); | 848 | MODULE_LICENSE("GPL"); |