author    Jesper Dangaard Brouer <brouer@redhat.com>   2014-03-03 08:46:13 -0500
committer Pablo Neira Ayuso <pablo@netfilter.org>      2014-03-07 05:41:13 -0500
commit    93bb0ceb75be2fdfa9fc0dd1fb522d9ada515d9c (patch)
tree      e47e7b701d8aa47683816a2d913ad7d005c25939 /net/netfilter
parent    ca7433df3a672efc88e08222cfa4b3aa965ca324 (diff)
netfilter: conntrack: remove central spinlock nf_conntrack_lock
nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more cores/threads).

Perf shows the lock congestion clearly on the base kernel:

-  72.56%  ksoftirqd/6  [kernel.kallsyms]  [k] _raw_spin_lock_bh
   - _raw_spin_lock_bh
      + 25.33% init_conntrack
      + 24.86% nf_ct_delete_from_lists
      + 24.62% __nf_conntrack_confirm
      + 24.38% destroy_conntrack
      +  0.70% tcp_packet
+   2.21%  ksoftirqd/6  [kernel.kallsyms]  [k] fib_table_lookup
+   1.15%  ksoftirqd/6  [kernel.kallsyms]  [k] __slab_free
+   0.77%  ksoftirqd/6  [kernel.kallsyms]  [k] inet_getpeer
+   0.70%  ksoftirqd/6  [nf_conntrack]     [k] nf_ct_delete
+   0.55%  ksoftirqd/6  [ip_tables]        [k] ipt_do_table

This patch changes the conntrack locking and provides a huge performance
improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with the tool trafgen):

 Base kernel:   810.405 new conntrack/sec
 After patch: 2.233.876 new conntrack/sec

Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:

 # iptables -A INPUT -m state --state INVALID -j DROP
 # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0

Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good results,
at minimal cost (4KB of memory). Due to the lockdep maximum lock depth,
1024 becomes 8 if CONFIG_LOCKDEP=y.

The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
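The scheme described above reduces to two cooperating pieces: writers take one lock out of a fixed-size hashed lock array for the bucket(s) they touch, while the resize path takes every lock and bumps a seqcount so that writers who hashed against the old table size notice and retry. The sketch below shows that shape on a toy hash table; it is a simplified illustration under assumed names (toy_*, TOY_LOCKS), not code from this patch -- the real implementation is nf_conntrack_double_lock()/nf_conntrack_all_lock() in the diff that follows.

/*
 * Illustrative sketch only (hypothetical toy_* names): hashed bucket
 * locks plus a seqcount-guarded resize, the pattern this patch applies
 * to the conntrack hash table.
 */
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/list.h>
#include <linux/jhash.h>

#define TOY_LOCKS 1024			/* patch uses 1024, or 8 with lockdep */

static spinlock_t toy_locks[TOY_LOCKS];
static seqcount_t toy_generation;	/* bumped by every resize */
static struct hlist_head *toy_table;
static unsigned int toy_size;

static void toy_locks_init(void)
{
	int i;

	seqcount_init(&toy_generation);
	for (i = 0; i < TOY_LOCKS; i++)
		spin_lock_init(&toy_locks[i]);
}

static unsigned int toy_hash(u32 key)
{
	return jhash_1word(key, 0) % toy_size;	/* depends on current size */
}

/* Writer side: hash inside a seqcount read section, take the bucket
 * lock, and retry if a resize slipped in between hashing and locking. */
static void toy_insert(struct hlist_node *node, u32 key)
{
	unsigned int hash, seq;
	spinlock_t *lockp;

	local_bh_disable();
	do {
		seq   = read_seqcount_begin(&toy_generation);
		hash  = toy_hash(key);
		lockp = &toy_locks[hash % TOY_LOCKS];
		spin_lock(lockp);
		if (!read_seqcount_retry(&toy_generation, seq))
			break;
		spin_unlock(lockp);	/* hash is stale, recompute */
	} while (1);

	hlist_add_head(node, &toy_table[hash]);
	spin_unlock(lockp);
	local_bh_enable();
}

/* Resize side: take every bucket lock (nested subclasses keep lockdep
 * happy) and publish the new table inside a seqcount write section. */
static void toy_resize(struct hlist_head *new_table, unsigned int new_size)
{
	int i;

	local_bh_disable();
	for (i = 0; i < TOY_LOCKS; i++)
		spin_lock_nested(&toy_locks[i], i);
	write_seqcount_begin(&toy_generation);

	/* ... rehash entries from toy_table into new_table ... */
	toy_table = new_table;
	toy_size  = new_size;

	write_seqcount_end(&toy_generation);
	for (i = 0; i < TOY_LOCKS; i++)
		spin_unlock(&toy_locks[i]);
	local_bh_enable();
}

Both pieces are needed: the per-bucket lock serializes writers that agree on the table size, and the seqcount catches the window where a writer computed its bucket against a table that was swapped out before the lock was acquired.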
Diffstat (limited to 'net/netfilter')
-rw-r--r--   net/netfilter/nf_conntrack_core.c     219
-rw-r--r--   net/netfilter/nf_conntrack_helper.c    12
-rw-r--r--   net/netfilter/nf_conntrack_netlink.c   15
3 files changed, 180 insertions, 66 deletions
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4cdf1ade1530..5d1e7d126ebd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -60,12 +60,60 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 				      const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	spin_unlock(&nf_conntrack_locks[h1]);
+	if (h1 != h2)
+		spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+				     unsigned int h2, unsigned int sequence)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	if (h1 <= h2) {
+		spin_lock(&nf_conntrack_locks[h1]);
+		if (h1 != h2)
+			spin_lock_nested(&nf_conntrack_locks[h2],
+					 SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&nf_conntrack_locks[h2]);
+		spin_lock_nested(&nf_conntrack_locks[h1],
+				 SINGLE_DEPTH_NESTING);
+	}
+	if (read_seqcount_retry(&net->ct.generation, sequence)) {
+		nf_conntrack_double_unlock(h1, h2);
+		return true;
+	}
+	return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_unlock(&nf_conntrack_locks[i]);
+}
+
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
@@ -280,15 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
+	unsigned int hash, reply_hash;
+	u16 zone = nf_ct_zone(ct);
+	unsigned int sequence;
 
 	nf_ct_helper_destroy(ct);
-	spin_lock_bh(&nf_conntrack_lock);
-	/* Inside lock so preempt is disabled on module removal path.
-	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(net, delete_list);
+
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
 	clean_from_lists(ct);
+	nf_conntrack_double_unlock(hash, reply_hash);
+
 	nf_ct_add_to_dying_list(ct);
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	NF_CT_STAT_INC(net, delete_list);
+	local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -372,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -467,14 +526,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	u16 zone;
+	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone,
-			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	reply_hash = hash_conntrack(net, zone,
-			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -493,14 +556,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
-
+	local_bh_enable();
 	return 0;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -540,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
 	u16 zone;
+	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	net = nf_ct_net(ct);
@@ -552,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	/* reuse the hash saved before */
-	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-	hash = hash_bucket(hash, net);
-	reply_hash = hash_conntrack(net, zone,
-			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	local_bh_disable();
+
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		/* reuse the hash saved before */
+		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+		hash = hash_bucket(hash, net);
+		reply_hash = hash_conntrack(net, zone,
+					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* We're not in hash table, and we refuse to set up related
-	   connections for unconfirmed conns.  But packet copies and
-	   REJECT will give spurious warnings here. */
+	 * connections for unconfirmed conns.  But packet copies and
+	 * REJECT will give spurious warnings here.
+	 */
 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
 	/* No external references means no one else could have
-	   confirmed us. */
+	 * confirmed us.
+	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-
-	spin_lock_bh(&nf_conntrack_lock);
-
 	/* We have to check the DYING flag inside the lock to prevent
 	   a race against nf_ct_get_next_corpse() possibly called from
 	   user context, else we insert an already 'dead' hash, blocking
 	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		nf_conntrack_double_unlock(hash, reply_hash);
+		local_bh_enable();
 		return NF_ACCEPT;
 	}
 
@@ -618,8 +689,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * stores are visible.
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -630,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	return NF_ACCEPT;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -674,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct = NULL, *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, cnt = 0;
+	unsigned int i = 0, cnt = 0;
 	int dropped = 0;
+	unsigned int hash, sequence;
+	spinlock_t *lockp;
 
-	rcu_read_lock();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	local_bh_disable();
+restart:
+	sequence = read_seqcount_begin(&net->ct.generation);
+	hash = hash_bucket(_hash, net);
+	for (; i < net->ct.htable_size; i++) {
+		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (read_seqcount_retry(&net->ct.generation, sequence)) {
+			spin_unlock(lockp);
+			goto restart;
+		}
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
-			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+			    !nf_ct_is_dying(tmp) &&
+			    atomic_inc_not_zero(&tmp->ct_general.use)) {
 				ct = tmp;
+				break;
+			}
 			cnt++;
 		}
 
-		if (ct != NULL) {
-			if (likely(!nf_ct_is_dying(ct) &&
-				   atomic_inc_not_zero(&ct->ct_general.use)))
-				break;
-			else
-				ct = NULL;
-		}
+		hash = (hash + 1) % net->ct.htable_size;
+		spin_unlock(lockp);
 
-		if (cnt >= NF_CT_EVICTION_RANGE)
+		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
 
-		hash = (hash + 1) % net->ct.htable_size;
 	}
-	rcu_read_unlock();
+	local_bh_enable();
 
 	if (!ct)
 		return dropped;
@@ -755,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		if (!early_drop(net, hash_bucket(hash, net))) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 			return ERR_PTR(-ENOMEM);
@@ -1304,18 +1386,24 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
 	int cpu;
+	spinlock_t *lockp;
 
-	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-				continue;
-			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (iter(ct, data))
-				goto found;
+		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+		local_bh_disable();
+		spin_lock(lockp);
+		if (*bucket < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+					continue;
+				ct = nf_ct_tuplehash_to_ctrack(h);
+				if (iter(ct, data))
+					goto found;
+			}
 		}
+		spin_unlock(lockp);
+		local_bh_enable();
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
 
 	for_each_possible_cpu(cpu) {
 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
@@ -1331,7 +1419,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock(lockp);
+	local_bh_enable();
 	return ct;
 }
 
@@ -1532,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	local_bh_disable();
+	nf_conntrack_all_lock();
+	write_seqcount_begin(&init_net.ct.generation);
+
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
-	 * though since that required taking the lock.
+	 * though since that required taking the locks.
 	 */
-	spin_lock_bh(&nf_conntrack_lock);
+
 	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1554,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 
 	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash = hash;
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	write_seqcount_end(&init_net.ct.generation);
+	nf_conntrack_all_unlock();
+	local_bh_enable();
 
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
@@ -1576,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int ret, cpu;
+	int i, ret, cpu;
+
+	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
+		spin_lock_init(&nf_conntrack_locks[i]);
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 29bd704edb85..5b3eae7d4c9a 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 		unhelp(h, me);
 		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	for (i = 0; i < net->ct.htable_size; i++) {
-		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
-			unhelp(h, me);
+		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		if (i < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+				unhelp(h, me);
+		}
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index be4d1b0bbb6a..8d778a9fd063 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	const struct ctnetlink_dump_filter *filter = cb->data;
 #endif
 
-	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
+
+	local_bh_disable();
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (cb->args[0] >= net->ct.htable_size) {
+			spin_unlock(lockp);
+			goto out;
+		}
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
 					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ restart:
 			if (res < 0) {
 				nf_conntrack_get(&ct->ct_general);
 				cb->args[1] = (unsigned long)ct;
+				spin_unlock(lockp);
 				goto out;
 			}
 		}
+		spin_unlock(lockp);
 		if (cb->args[1]) {
 			cb->args[1] = 0;
 			goto restart;
 		}
 	}
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	if (last)
 		nf_ct_put(last);
 