aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFlorian Westphal <fw@strlen.de>2017-09-06 08:39:51 -0400
committerPablo Neira Ayuso <pablo@netfilter.org>2017-09-08 12:55:50 -0400
commite1bf1687740ce1a3598a1c5e452b852ff2190682 (patch)
tree38df06856971c709b8becc0b505f7a1660764742
parenta5d7a714569199f909cd60ff7074107bf15c7db4 (diff)
netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.

It was not a good idea. The custom hash table was a much better fit for this purpose.

A fast lookup is not essential; in fact, for most cases there is no lookup at all, because the original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion.

rhlist removal, however, requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows; if this happens, removal requests are so expensive that deletions of a few thousand flows can take several seconds(!).

The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks

1) would be nice to have, but it is not essential, as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to a custom hash table.

I tried to add hlist_node to rhlist to speed up rhltable_remove, but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table, and that requires a list walk that we want to avoid. Furthermore, using hlist_node increases the size of struct rhlist_head, which in turn increases nf_conn size.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
-rw-r--r--include/net/netfilter/nf_conntrack.h3
-rw-r--r--include/net/netfilter/nf_nat.h1
-rw-r--r--net/netfilter/nf_nat_core.c130
3 files changed, 54 insertions, 80 deletions
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index fdc9c64a1c94..8f3bd30511de 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,7 +17,6 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/compiler.h> 18#include <linux/compiler.h>
19#include <linux/atomic.h> 19#include <linux/atomic.h>
20#include <linux/rhashtable.h>
21 20
22#include <linux/netfilter/nf_conntrack_tcp.h> 21#include <linux/netfilter/nf_conntrack_tcp.h>
23#include <linux/netfilter/nf_conntrack_dccp.h> 22#include <linux/netfilter/nf_conntrack_dccp.h>
@@ -77,7 +76,7 @@ struct nf_conn {
77 possible_net_t ct_net; 76 possible_net_t ct_net;
78 77
79#if IS_ENABLED(CONFIG_NF_NAT) 78#if IS_ENABLED(CONFIG_NF_NAT)
80 struct rhlist_head nat_bysource; 79 struct hlist_node nat_bysource;
81#endif 80#endif
82 /* all members below initialized via memset */ 81 /* all members below initialized via memset */
83 u8 __nfct_init_offset[0]; 82 u8 __nfct_init_offset[0];
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 05c82a1a4267..b71701302e61 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,6 +1,5 @@
1#ifndef _NF_NAT_H 1#ifndef _NF_NAT_H
2#define _NF_NAT_H 2#define _NF_NAT_H
3#include <linux/rhashtable.h>
4#include <linux/netfilter_ipv4.h> 3#include <linux/netfilter_ipv4.h>
5#include <linux/netfilter/nf_nat.h> 4#include <linux/netfilter/nf_nat.h>
6#include <net/netfilter/nf_conntrack_tuple.h> 5#include <net/netfilter/nf_conntrack_tuple.h>
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index dc3519cc7209..f090419f5f97 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
30#include <net/netfilter/nf_conntrack_zones.h> 30#include <net/netfilter/nf_conntrack_zones.h>
31#include <linux/netfilter/nf_nat.h> 31#include <linux/netfilter/nf_nat.h>
32 32
33static DEFINE_SPINLOCK(nf_nat_lock);
34
33static DEFINE_MUTEX(nf_nat_proto_mutex); 35static DEFINE_MUTEX(nf_nat_proto_mutex);
34static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] 36static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
35 __read_mostly; 37 __read_mostly;
36static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] 38static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
37 __read_mostly; 39 __read_mostly;
38 40
39struct nf_nat_conn_key { 41static struct hlist_head *nf_nat_bysource __read_mostly;
40 const struct net *net; 42static unsigned int nf_nat_htable_size __read_mostly;
41 const struct nf_conntrack_tuple *tuple; 43static unsigned int nf_nat_hash_rnd __read_mostly;
42 const struct nf_conntrack_zone *zone;
43};
44
45static struct rhltable nf_nat_bysource_table;
46 44
47inline const struct nf_nat_l3proto * 45inline const struct nf_nat_l3proto *
48__nf_nat_l3proto_find(u8 family) 46__nf_nat_l3proto_find(u8 family)
@@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
118EXPORT_SYMBOL(nf_xfrm_me_harder); 116EXPORT_SYMBOL(nf_xfrm_me_harder);
119#endif /* CONFIG_XFRM */ 117#endif /* CONFIG_XFRM */
120 118
121static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) 119/* We keep an extra hash for each conntrack, for fast searching. */
120static unsigned int
121hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
122{ 122{
123 const struct nf_conntrack_tuple *t; 123 unsigned int hash;
124 const struct nf_conn *ct = data; 124
125 get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
125 126
126 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
127 /* Original src, to ensure we map it consistently if poss. */ 127 /* Original src, to ensure we map it consistently if poss. */
128 hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
129 tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
128 130
129 seed ^= net_hash_mix(nf_ct_net(ct)); 131 return reciprocal_scale(hash, nf_nat_htable_size);
130 return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
131 t->dst.protonum ^ seed);
132} 132}
133 133
134/* Is this tuple already taken? (not by us) */ 134/* Is this tuple already taken? (not by us) */
@@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct,
184 t->src.u.all == tuple->src.u.all); 184 t->src.u.all == tuple->src.u.all);
185} 185}
186 186
187static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
188 const void *obj)
189{
190 const struct nf_nat_conn_key *key = arg->key;
191 const struct nf_conn *ct = obj;
192
193 if (!same_src(ct, key->tuple) ||
194 !net_eq(nf_ct_net(ct), key->net) ||
195 !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
196 return 1;
197
198 return 0;
199}
200
201static struct rhashtable_params nf_nat_bysource_params = {
202 .head_offset = offsetof(struct nf_conn, nat_bysource),
203 .obj_hashfn = nf_nat_bysource_hash,
204 .obj_cmpfn = nf_nat_bysource_cmp,
205 .nelem_hint = 256,
206 .min_size = 1024,
207};
208
209/* Only called for SRC manip */ 187/* Only called for SRC manip */
210static int 188static int
211find_appropriate_src(struct net *net, 189find_appropriate_src(struct net *net,
@@ -216,26 +194,22 @@ find_appropriate_src(struct net *net,
216 struct nf_conntrack_tuple *result, 194 struct nf_conntrack_tuple *result,
217 const struct nf_nat_range *range) 195 const struct nf_nat_range *range)
218{ 196{
197 unsigned int h = hash_by_src(net, tuple);
219 const struct nf_conn *ct; 198 const struct nf_conn *ct;
220 struct nf_nat_conn_key key = {
221 .net = net,
222 .tuple = tuple,
223 .zone = zone
224 };
225 struct rhlist_head *hl, *h;
226
227 hl = rhltable_lookup(&nf_nat_bysource_table, &key,
228 nf_nat_bysource_params);
229 199
230 rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { 200 hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
231 nf_ct_invert_tuplepr(result, 201 if (same_src(ct, tuple) &&
232 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 202 net_eq(net, nf_ct_net(ct)) &&
233 result->dst = tuple->dst; 203 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
234 204 /* Copy source part from reply tuple. */
235 if (in_range(l3proto, l4proto, result, range)) 205 nf_ct_invert_tuplepr(result,
236 return 1; 206 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
207 result->dst = tuple->dst;
208
209 if (in_range(l3proto, l4proto, result, range))
210 return 1;
211 }
237 } 212 }
238
239 return 0; 213 return 0;
240} 214}
241 215
@@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct,
408 const struct nf_nat_range *range, 382 const struct nf_nat_range *range,
409 enum nf_nat_manip_type maniptype) 383 enum nf_nat_manip_type maniptype)
410{ 384{
385 struct net *net = nf_ct_net(ct);
411 struct nf_conntrack_tuple curr_tuple, new_tuple; 386 struct nf_conntrack_tuple curr_tuple, new_tuple;
412 387
413 /* Can't setup nat info for confirmed ct. */ 388 /* Can't setup nat info for confirmed ct. */
@@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct,
449 } 424 }
450 425
451 if (maniptype == NF_NAT_MANIP_SRC) { 426 if (maniptype == NF_NAT_MANIP_SRC) {
452 struct nf_nat_conn_key key = { 427 unsigned int srchash;
453 .net = nf_ct_net(ct), 428
454 .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 429 srchash = hash_by_src(net,
455 .zone = nf_ct_zone(ct), 430 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
456 }; 431 spin_lock_bh(&nf_nat_lock);
457 int err; 432 hlist_add_head_rcu(&ct->nat_bysource,
458 433 &nf_nat_bysource[srchash]);
459 err = rhltable_insert_key(&nf_nat_bysource_table, 434 spin_unlock_bh(&nf_nat_lock);
460 &key,
461 &ct->nat_bysource,
462 nf_nat_bysource_params);
463 if (err)
464 return NF_DROP;
465 } 435 }
466 436
467 /* It's done. */ 437 /* It's done. */
@@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
570 * will delete entry from already-freed table. 540 * will delete entry from already-freed table.
571 */ 541 */
572 clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 542 clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
573 rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, 543 spin_lock_bh(&nf_nat_lock);
574 nf_nat_bysource_params); 544 hlist_del_rcu(&ct->nat_bysource);
545 spin_unlock_bh(&nf_nat_lock);
575 546
576 /* don't delete conntrack. Although that would make things a lot 547 /* don't delete conntrack. Although that would make things a lot
577 * simpler, we'd end up flushing all conntracks on nat rmmod. 548 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
699/* No one using conntrack by the time this called. */ 670/* No one using conntrack by the time this called. */
700static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 671static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
701{ 672{
702 if (ct->status & IPS_SRC_NAT_DONE) 673 if (ct->status & IPS_SRC_NAT_DONE) {
703 rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, 674 spin_lock_bh(&nf_nat_lock);
704 nf_nat_bysource_params); 675 hlist_del_rcu(&ct->nat_bysource);
676 spin_unlock_bh(&nf_nat_lock);
677 }
705} 678}
706 679
707static struct nf_ct_ext_type nat_extend __read_mostly = { 680static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -825,13 +798,16 @@ static int __init nf_nat_init(void)
825{ 798{
826 int ret; 799 int ret;
827 800
828 ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); 801 /* Leave them the same for the moment. */
829 if (ret) 802 nf_nat_htable_size = nf_conntrack_htable_size;
830 return ret; 803
804 nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
805 if (!nf_nat_bysource)
806 return -ENOMEM;
831 807
832 ret = nf_ct_extend_register(&nat_extend); 808 ret = nf_ct_extend_register(&nat_extend);
833 if (ret < 0) { 809 if (ret < 0) {
834 rhltable_destroy(&nf_nat_bysource_table); 810 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
835 printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); 811 printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
836 return ret; 812 return ret;
837 } 813 }
@@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void)
865 841
866 for (i = 0; i < NFPROTO_NUMPROTO; i++) 842 for (i = 0; i < NFPROTO_NUMPROTO; i++)
867 kfree(nf_nat_l4protos[i]); 843 kfree(nf_nat_l4protos[i]);
868 844 synchronize_net();
869 rhltable_destroy(&nf_nat_bysource_table); 845 nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
870} 846}
871 847
872MODULE_LICENSE("GPL"); 848MODULE_LICENSE("GPL");