author     Jesper Dangaard Brouer <brouer@redhat.com>   2014-03-03 08:46:13 -0500
committer  Pablo Neira Ayuso <pablo@netfilter.org>      2014-03-07 05:41:13 -0500
commit     93bb0ceb75be2fdfa9fc0dd1fb522d9ada515d9c (patch)
tree       e47e7b701d8aa47683816a2d913ad7d005c25939 /net/netfilter
parent     ca7433df3a672efc88e08222cfa4b3aa965ca324 (diff)
netfilter: conntrack: remove central spinlock nf_conntrack_lock
nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more cores/threads).
The lock contention is clear in perf on the base kernel:
- 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh
- _raw_spin_lock_bh
+ 25.33% init_conntrack
+ 24.86% nf_ct_delete_from_lists
+ 24.62% __nf_conntrack_confirm
+ 24.38% destroy_conntrack
+ 0.70% tcp_packet
+ 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup
+ 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free
+ 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer
+ 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete
+ 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table
This patch changes the conntrack locking and provides a huge performance
improvement. A SYN-flood attack was tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (using the tool trafgen):
Base kernel: 810,405 new conntrack/sec
After patch: 2,233,876 new conntrack/sec
Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
# iptables -A INPUT -m state --state INVALID -j DROP
# sysctl -w net/netfilter/nf_conntrack_tcp_loose=0
Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good
results, at minimal cost (4KB of memory). Due to the lockdep maximum
lock depth, 1024 becomes 8 if CONFIG_LOCKDEP=y.
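To make the locking scheme concrete outside the diff, here is a minimal user-space sketch of the idea, assuming pthreads; NR_LOCKS, bucket_locks, buckets_lock_init() and lock_two_buckets() are illustrative names, not part of the patch:

```c
/*
 * Sketch only (not kernel code): a fixed pool of locks protects the hash
 * table, and the lock for a bucket is picked by "bucket % pool size".
 */
#include <pthread.h>

#define NR_LOCKS 1024	/* the patch uses CONNTRACK_LOCKS; 8 under lockdep */

static pthread_spinlock_t bucket_locks[NR_LOCKS];

static void buckets_lock_init(void)
{
	for (int i = 0; i < NR_LOCKS; i++)
		pthread_spin_init(&bucket_locks[i], PTHREAD_PROCESS_PRIVATE);
}

/*
 * An insertion/deletion touches two buckets (original and reply direction),
 * so both locks are taken in ascending index order to avoid ABBA deadlock,
 * mirroring what nf_conntrack_double_lock() does in the patch.
 */
static void lock_two_buckets(unsigned int h1, unsigned int h2)
{
	h1 %= NR_LOCKS;
	h2 %= NR_LOCKS;
	if (h1 > h2) {
		unsigned int tmp = h1;
		h1 = h2;
		h2 = tmp;
	}
	pthread_spin_lock(&bucket_locks[h1]);
	if (h1 != h2)		/* both tuples may map to the same lock */
		pthread_spin_lock(&bucket_locks[h2]);
}

static void unlock_two_buckets(unsigned int h1, unsigned int h2)
{
	h1 %= NR_LOCKS;
	h2 %= NR_LOCKS;
	pthread_spin_unlock(&bucket_locks[h1]);
	if (h1 != h2)
		pthread_spin_unlock(&bucket_locks[h2]);
}
```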
The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.
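The retry loop the patch builds around read_seqcount_begin()/read_seqcount_retry() can be illustrated the same way; the sketch below is a simplified user-space analogue under the same assumptions as above (table_generation, resize_table() and lock_entry() are hypothetical, and a plain atomic counter stands in for the kernel's seqcount_t):

```c
/*
 * Sketch only: a resize takes every bucket lock and bumps a generation
 * counter; anyone who hashed a tuple against the old table size notices
 * the bump after acquiring its bucket locks and recomputes.
 */
#include <stdatomic.h>

extern void lock_two_buckets(unsigned int h1, unsigned int h2);	/* see sketch above */
extern void unlock_two_buckets(unsigned int h1, unsigned int h2);
extern void lock_all_buckets(void);	/* takes all NR_LOCKS in index order */
extern void unlock_all_buckets(void);

static _Atomic unsigned int table_generation;
static _Atomic unsigned int table_size = 4096;	/* example initial size */

/* Writer side: swap in a new table while holding every bucket lock. */
static void resize_table(unsigned int new_size)
{
	lock_all_buckets();
	atomic_fetch_add(&table_generation, 1);
	/* ...rehash all entries into the new table here... */
	atomic_store(&table_size, new_size);
	unlock_all_buckets();
}

/* Reader/inserter side: recompute bucket indexes until no resize raced us. */
static void lock_entry(unsigned int tuple_hash, unsigned int reply_hash,
		       unsigned int *b1, unsigned int *b2)
{
	unsigned int seq;

	do {
		seq = atomic_load(&table_generation);
		*b1 = tuple_hash % atomic_load(&table_size);
		*b2 = reply_hash % atomic_load(&table_size);
		lock_two_buckets(*b1, *b2);
		if (atomic_load(&table_generation) == seq)
			return;	/* indexes are valid for the current table */
		unlock_two_buckets(*b1, *b2);	/* a resize won the race; retry */
	} while (1);
}
```

Because everyone takes bucket locks in ascending index order, a locker racing with a resize either blocks until the resize finishes (and then retries) or holds its buckets before the resize can start; either way it never operates on stale bucket indexes.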
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/netfilter')
 -rw-r--r--  net/netfilter/nf_conntrack_core.c    | 219
 -rw-r--r--  net/netfilter/nf_conntrack_helper.c  |  12
 -rw-r--r--  net/netfilter/nf_conntrack_netlink.c |  15

3 files changed, 180 insertions, 66 deletions
```diff
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4cdf1ade1530..5d1e7d126ebd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -60,12 +60,60 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 				      const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	spin_unlock(&nf_conntrack_locks[h1]);
+	if (h1 != h2)
+		spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+				     unsigned int h2, unsigned int sequence)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	if (h1 <= h2) {
+		spin_lock(&nf_conntrack_locks[h1]);
+		if (h1 != h2)
+			spin_lock_nested(&nf_conntrack_locks[h2],
+					 SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&nf_conntrack_locks[h2]);
+		spin_lock_nested(&nf_conntrack_locks[h1],
+				 SINGLE_DEPTH_NESTING);
+	}
+	if (read_seqcount_retry(&net->ct.generation, sequence)) {
+		nf_conntrack_double_unlock(h1, h2);
+		return true;
+	}
+	return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_unlock(&nf_conntrack_locks[i]);
+}
+
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
@@ -280,15 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
+	unsigned int hash, reply_hash;
+	u16 zone = nf_ct_zone(ct);
+	unsigned int sequence;
 
 	nf_ct_helper_destroy(ct);
-	spin_lock_bh(&nf_conntrack_lock);
-	/* Inside lock so preempt is disabled on module removal path.
-	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(net, delete_list);
+
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
 	clean_from_lists(ct);
+	nf_conntrack_double_unlock(hash, reply_hash);
+
 	nf_ct_add_to_dying_list(ct);
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	NF_CT_STAT_INC(net, delete_list);
+	local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -372,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -467,14 +526,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	u16 zone;
+	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone,
-			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	reply_hash = hash_conntrack(net, zone,
-				    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -493,14 +556,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
-
+	local_bh_enable();
 	return 0;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -540,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
 	u16 zone;
+	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	net = nf_ct_net(ct);
@@ -552,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	/* reuse the hash saved before */
-	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-	hash = hash_bucket(hash, net);
-	reply_hash = hash_conntrack(net, zone,
-				    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	local_bh_disable();
+
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		/* reuse the hash saved before */
+		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+		hash = hash_bucket(hash, net);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* We're not in hash table, and we refuse to set up related
-	   connections for unconfirmed conns. But packet copies and
-	   REJECT will give spurious warnings here. */
+	 * connections for unconfirmed conns. But packet copies and
+	 * REJECT will give spurious warnings here.
+	 */
 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
 	/* No external references means no one else could have
-	   confirmed us. */
+	 * confirmed us.
+	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-
-	spin_lock_bh(&nf_conntrack_lock);
-
 	/* We have to check the DYING flag inside the lock to prevent
 	   a race against nf_ct_get_next_corpse() possibly called from
 	   user context, else we insert an already 'dead' hash, blocking
 	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		nf_conntrack_double_unlock(hash, reply_hash);
+		local_bh_enable();
 		return NF_ACCEPT;
 	}
 
@@ -618,8 +689,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * stores are visible.
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -630,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	return NF_ACCEPT;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -674,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection. Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct = NULL, *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, cnt = 0;
+	unsigned int i = 0, cnt = 0;
 	int dropped = 0;
+	unsigned int hash, sequence;
+	spinlock_t *lockp;
 
-	rcu_read_lock();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	local_bh_disable();
+restart:
+	sequence = read_seqcount_begin(&net->ct.generation);
+	hash = hash_bucket(_hash, net);
+	for (; i < net->ct.htable_size; i++) {
+		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (read_seqcount_retry(&net->ct.generation, sequence)) {
+			spin_unlock(lockp);
+			goto restart;
+		}
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
-			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+			    !nf_ct_is_dying(tmp) &&
+			    atomic_inc_not_zero(&tmp->ct_general.use)) {
 				ct = tmp;
+				break;
+			}
 			cnt++;
 		}
 
-		if (ct != NULL) {
-			if (likely(!nf_ct_is_dying(ct) &&
-				   atomic_inc_not_zero(&ct->ct_general.use)))
-				break;
-			else
-				ct = NULL;
-		}
+		hash = (hash + 1) % net->ct.htable_size;
+		spin_unlock(lockp);
 
-		if (cnt >= NF_CT_EVICTION_RANGE)
+		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
 
-		hash = (hash + 1) % net->ct.htable_size;
 	}
-	rcu_read_unlock();
+	local_bh_enable();
 
 	if (!ct)
 		return dropped;
@@ -755,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		if (!early_drop(net, hash_bucket(hash, net))) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 			return ERR_PTR(-ENOMEM);
@@ -1304,18 +1386,24 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
 	int cpu;
+	spinlock_t *lockp;
 
-	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-				continue;
-			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (iter(ct, data))
-				goto found;
+		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+		local_bh_disable();
+		spin_lock(lockp);
+		if (*bucket < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+					continue;
+				ct = nf_ct_tuplehash_to_ctrack(h);
+				if (iter(ct, data))
+					goto found;
+			}
 		}
+		spin_unlock(lockp);
+		local_bh_enable();
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
 
 	for_each_possible_cpu(cpu) {
 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
@@ -1331,7 +1419,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock(lockp);
+	local_bh_enable();
 	return ct;
 }
 
@@ -1532,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	local_bh_disable();
+	nf_conntrack_all_lock();
+	write_seqcount_begin(&init_net.ct.generation);
+
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
-	 * though since that required taking the lock.
+	 * though since that required taking the locks.
 	 */
-	spin_lock_bh(&nf_conntrack_lock);
+
 	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1554,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 
 	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash = hash;
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	write_seqcount_end(&init_net.ct.generation);
+	nf_conntrack_all_unlock();
+	local_bh_enable();
 
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
@@ -1576,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int ret, cpu;
+	int i, ret, cpu;
+
+	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
+		spin_lock_init(&nf_conntrack_locks[i]);
 
 	/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
```
```diff
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 29bd704edb85..5b3eae7d4c9a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 			unhelp(h, me);
 		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	for (i = 0; i < net->ct.htable_size; i++) {
-		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
-			unhelp(h, me);
+		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		if (i < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+				unhelp(h, me);
+		}
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
```
```diff
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index be4d1b0bbb6a..8d778a9fd063 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	const struct ctnetlink_dump_filter *filter = cb->data;
 #endif
 
-	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
+
+	local_bh_disable();
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (cb->args[0] >= net->ct.htable_size) {
+			spin_unlock(lockp);
+			goto out;
+		}
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
 					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ restart:
 			if (res < 0) {
 				nf_conntrack_get(&ct->ct_general);
 				cb->args[1] = (unsigned long)ct;
+				spin_unlock(lockp);
 				goto out;
 			}
 		}
+		spin_unlock(lockp);
 		if (cb->args[1]) {
 			cb->args[1] = 0;
 			goto restart;
 		}
 	}
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	if (last)
 		nf_ct_put(last);
 
```