diff options
| author | Jesper Dangaard Brouer <brouer@redhat.com> | 2014-03-03 08:46:13 -0500 |
|---|---|---|
| committer | Pablo Neira Ayuso <pablo@netfilter.org> | 2014-03-07 05:41:13 -0500 |
| commit | 93bb0ceb75be2fdfa9fc0dd1fb522d9ada515d9c (patch) | |
| tree | e47e7b701d8aa47683816a2d913ad7d005c25939 /include/net | |
| parent | ca7433df3a672efc88e08222cfa4b3aa965ca324 (diff) | |
netfilter: conntrack: remove central spinlock nf_conntrack_lock
nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more core/threads).
Lock contention is clearly visible in the perf output on the base kernel:
- 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh
- _raw_spin_lock_bh
+ 25.33% init_conntrack
+ 24.86% nf_ct_delete_from_lists
+ 24.62% __nf_conntrack_confirm
+ 24.38% destroy_conntrack
+ 0.70% tcp_packet
+ 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup
+ 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free
+ 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer
+ 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete
+ 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table
This patch changes conntrack locking and provides a huge performance
improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with tool trafgen):
Base kernel: 810.405 new conntrack/sec
After patch: 2.233.876 new conntrack/sec
Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
# iptables -A INPUT -m state --state INVALID -j DROP
# sysctl -w net/netfilter/nf_conntrack_tcp_loose=0
Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good
results, at minimal cost (4KB memory). Due to lockdep's maximum lock depth,
1024 becomes 8 if CONFIG_LOCKDEP=y.
The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'include/net')
| -rw-r--r-- | include/net/netfilter/nf_conntrack_core.h | 7 | ||||
| -rw-r--r-- | include/net/netns/conntrack.h | 2 |
2 files changed, 8 insertions, 1 deletions
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index d12a631d0415..cc0c18827602 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h | |||
| @@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, | |||
| 77 | const struct nf_conntrack_l3proto *l3proto, | 77 | const struct nf_conntrack_l3proto *l3proto, |
| 78 | const struct nf_conntrack_l4proto *proto); | 78 | const struct nf_conntrack_l4proto *proto); |
| 79 | 79 | ||
| 80 | extern spinlock_t nf_conntrack_lock ; | 80 | #ifdef CONFIG_LOCKDEP |
| 81 | # define CONNTRACK_LOCKS 8 | ||
| 82 | #else | ||
| 83 | # define CONNTRACK_LOCKS 1024 | ||
| 84 | #endif | ||
| 85 | extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; | ||
| 81 | 86 | ||
| 82 | extern spinlock_t nf_conntrack_expect_lock; | 87 | extern spinlock_t nf_conntrack_expect_lock; |
| 83 | 88 | ||
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index c6a8994e9922..773cce308bc6 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | #include <linux/list_nulls.h> | 5 | #include <linux/list_nulls.h> |
| 6 | #include <linux/atomic.h> | 6 | #include <linux/atomic.h> |
| 7 | #include <linux/netfilter/nf_conntrack_tcp.h> | 7 | #include <linux/netfilter/nf_conntrack_tcp.h> |
| 8 | #include <linux/seqlock.h> | ||
| 8 | 9 | ||
| 9 | struct ctl_table_header; | 10 | struct ctl_table_header; |
| 10 | struct nf_conntrack_ecache; | 11 | struct nf_conntrack_ecache; |
| @@ -90,6 +91,7 @@ struct netns_ct { | |||
| 90 | int sysctl_checksum; | 91 | int sysctl_checksum; |
| 91 | 92 | ||
| 92 | unsigned int htable_size; | 93 | unsigned int htable_size; |
| 94 | seqcount_t generation; | ||
| 93 | struct kmem_cache *nf_conntrack_cachep; | 95 | struct kmem_cache *nf_conntrack_cachep; |
| 94 | struct hlist_nulls_head *hash; | 96 | struct hlist_nulls_head *hash; |
| 95 | struct hlist_head *expect_hash; | 97 | struct hlist_head *expect_hash; |
