diff options
author | Jesper Dangaard Brouer <brouer@redhat.com> | 2013-04-03 19:38:16 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-04-04 17:37:05 -0400 |
commit | 19952cc4f8f572493293a8caed27c4be89c5fc9d (patch) | |
tree | a1b42a559ddc6d68ec7debcb7231cc00dbccc8ac | |
parent | d66248326410ed0d3e813ebe974b3e6638df0717 (diff) |
net: frag queue per hash bucket locking
This patch implements per hash bucket locking for the frag queue
hash. This removes two write locks, and the only remaining write
lock is for protecting hash rebuild. This essentially reduces the
readers-writer lock to a rebuild lock.
This patch is part of "net: frag performance followup"
http://thread.gmane.org/gmane.linux.network/263644
of which two patches have already been accepted:
Same test setup as previous:
(http://thread.gmane.org/gmane.linux.network/257155)
Two 10G interfaces, on separate NUMA nodes, are under-test, and use
Ethernet flow-control. A third interface is used for generating the
DoS attack (with trafgen).
Notice, I have changed the frag DoS generator script to be more
efficient/deadly. Before it would only hit one RX queue, now it's
sending packets causing multi-queue RX, due to "better" RX hashing.
Test types summary (netperf UDP_STREAM):
Test-20G64K == 2x10G with 65K fragments
Test-20G3F == 2x10G with 3x fragments (3*1472 bytes)
Test-20G64K+DoS == Same as 20G64K with frag DoS
Test-20G3F+DoS == Same as 20G3F with frag DoS
Test-20G64K+MQ == Same as 20G64K with Multi-Queue frag DoS
Test-20G3F+MQ == Same as 20G3F with Multi-Queue frag DoS
When I rebased this-patch(03) (on top of net-next commit a210576c) and
removed the _bh spinlock, I saw a performance regression. BUT this
was caused by some unrelated change in-between. See tests below.
Test (A) is what I reported before for patch-02, accepted in commit 1b5ab0de.
Test (B) verifying-retest of commit 1b5ab0de corresponds to patch-02.
Test (C) is what I reported before for this-patch
Test (D) is net-next master HEAD (commit a210576c), which reveals some
(unknown) performance regression (compared against test (B)).
Test (D) function as a new base-test.
Performance table summary (in Mbit/s):
(#) Test-type: 20G64K 20G3F 20G64K+DoS 20G3F+DoS 20G64K+MQ 20G3F+MQ
---------- ------- ------- ---------- --------- -------- -------
(A) Patch-02 : 18848.7 13230.1 4103.04 5310.36 130.0 440.2
(B) 1b5ab0de : 18841.5 13156.8 4101.08 5314.57 129.0 424.2
(C) Patch-03v1: 18838.0 13490.5 4405.11 6814.72 196.6 461.6
(D) a210576c : 18321.5 11250.4 3635.34 5160.13 119.1 405.2
(E) with _bh : 17247.3 11492.6 3994.74 6405.29 166.7 413.6
(F) without bh: 17471.3 11298.7 3818.05 6102.11 165.7 406.3
Test (E) and (F) is this-patch(03), with(V1) and without(V2) the _bh spinlocks.
I cannot explain the slow down for 20G64K (but it's an artificial
"lab-test" so I'm not worried). But the other results do show
improvements. And test (E) "with _bh" version is slightly better.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
----
V2:
- By analysis from Hannes Frederic Sowa and Eric Dumazet, we don't
need the spinlock _bh versions, as Netfilter currently does a
local_bh_disable() before entering inet_fragment.
- Fold-in desc from cover-mail
V3:
- Drop the chain_len counter per hash bucket.
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet_frag.h | 8 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 57 |
2 files changed, 51 insertions, 14 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 7cac9c5789b5..6f41b45e819e 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h | |||
@@ -50,10 +50,16 @@ struct inet_frag_queue { | |||
50 | */ | 50 | */ |
51 | #define INETFRAGS_MAXDEPTH 128 | 51 | #define INETFRAGS_MAXDEPTH 128 |
52 | 52 | ||
53 | struct inet_frag_bucket { | ||
54 | struct hlist_head chain; | ||
55 | spinlock_t chain_lock; | ||
56 | }; | ||
57 | |||
53 | struct inet_frags { | 58 | struct inet_frags { |
54 | struct hlist_head hash[INETFRAGS_HASHSZ]; | 59 | struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; |
55 | /* This rwlock is a global lock (seperate per IPv4, IPv6 and | 60 | /* This rwlock is a global lock (seperate per IPv4, IPv6 and |
56 | * netfilter). Important to keep this on a seperate cacheline. | 61 | * netfilter). Important to keep this on a seperate cacheline. |
62 | * Its primarily a rebuild protection rwlock. | ||
57 | */ | 63 | */ |
58 | rwlock_t lock ____cacheline_aligned_in_smp; | 64 | rwlock_t lock ____cacheline_aligned_in_smp; |
59 | int secret_interval; | 65 | int secret_interval; |
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 1206ca64b0ea..e97d66a1fdde 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c | |||
@@ -52,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy) | |||
52 | unsigned long now = jiffies; | 52 | unsigned long now = jiffies; |
53 | int i; | 53 | int i; |
54 | 54 | ||
55 | /* Per bucket lock NOT needed here, due to write lock protection */ | ||
55 | write_lock(&f->lock); | 56 | write_lock(&f->lock); |
57 | |||
56 | get_random_bytes(&f->rnd, sizeof(u32)); | 58 | get_random_bytes(&f->rnd, sizeof(u32)); |
57 | for (i = 0; i < INETFRAGS_HASHSZ; i++) { | 59 | for (i = 0; i < INETFRAGS_HASHSZ; i++) { |
60 | struct inet_frag_bucket *hb; | ||
58 | struct inet_frag_queue *q; | 61 | struct inet_frag_queue *q; |
59 | struct hlist_node *n; | 62 | struct hlist_node *n; |
60 | 63 | ||
61 | hlist_for_each_entry_safe(q, n, &f->hash[i], list) { | 64 | hb = &f->hash[i]; |
65 | hlist_for_each_entry_safe(q, n, &hb->chain, list) { | ||
62 | unsigned int hval = f->hashfn(q); | 66 | unsigned int hval = f->hashfn(q); |
63 | 67 | ||
64 | if (hval != i) { | 68 | if (hval != i) { |
69 | struct inet_frag_bucket *hb_dest; | ||
70 | |||
65 | hlist_del(&q->list); | 71 | hlist_del(&q->list); |
66 | 72 | ||
67 | /* Relink to new hash chain. */ | 73 | /* Relink to new hash chain. */ |
68 | hlist_add_head(&q->list, &f->hash[hval]); | 74 | hb_dest = &f->hash[hval]; |
75 | hlist_add_head(&q->list, &hb_dest->chain); | ||
69 | } | 76 | } |
70 | } | 77 | } |
71 | } | 78 | } |
@@ -78,9 +85,12 @@ void inet_frags_init(struct inet_frags *f) | |||
78 | { | 85 | { |
79 | int i; | 86 | int i; |
80 | 87 | ||
81 | for (i = 0; i < INETFRAGS_HASHSZ; i++) | 88 | for (i = 0; i < INETFRAGS_HASHSZ; i++) { |
82 | INIT_HLIST_HEAD(&f->hash[i]); | 89 | struct inet_frag_bucket *hb = &f->hash[i]; |
83 | 90 | ||
91 | spin_lock_init(&hb->chain_lock); | ||
92 | INIT_HLIST_HEAD(&hb->chain); | ||
93 | } | ||
84 | rwlock_init(&f->lock); | 94 | rwlock_init(&f->lock); |
85 | 95 | ||
86 | f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ | 96 | f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ |
@@ -122,9 +132,18 @@ EXPORT_SYMBOL(inet_frags_exit_net); | |||
122 | 132 | ||
123 | static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) | 133 | static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) |
124 | { | 134 | { |
125 | write_lock(&f->lock); | 135 | struct inet_frag_bucket *hb; |
136 | unsigned int hash; | ||
137 | |||
138 | read_lock(&f->lock); | ||
139 | hash = f->hashfn(fq); | ||
140 | hb = &f->hash[hash]; | ||
141 | |||
142 | spin_lock(&hb->chain_lock); | ||
126 | hlist_del(&fq->list); | 143 | hlist_del(&fq->list); |
127 | write_unlock(&f->lock); | 144 | spin_unlock(&hb->chain_lock); |
145 | |||
146 | read_unlock(&f->lock); | ||
128 | inet_frag_lru_del(fq); | 147 | inet_frag_lru_del(fq); |
129 | } | 148 | } |
130 | 149 | ||
@@ -226,27 +245,32 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, | |||
226 | struct inet_frag_queue *qp_in, struct inet_frags *f, | 245 | struct inet_frag_queue *qp_in, struct inet_frags *f, |
227 | void *arg) | 246 | void *arg) |
228 | { | 247 | { |
248 | struct inet_frag_bucket *hb; | ||
229 | struct inet_frag_queue *qp; | 249 | struct inet_frag_queue *qp; |
230 | #ifdef CONFIG_SMP | 250 | #ifdef CONFIG_SMP |
231 | #endif | 251 | #endif |
232 | unsigned int hash; | 252 | unsigned int hash; |
233 | 253 | ||
234 | write_lock(&f->lock); | 254 | read_lock(&f->lock); /* Protects against hash rebuild */ |
235 | /* | 255 | /* |
236 | * While we stayed w/o the lock other CPU could update | 256 | * While we stayed w/o the lock other CPU could update |
237 | * the rnd seed, so we need to re-calculate the hash | 257 | * the rnd seed, so we need to re-calculate the hash |
238 | * chain. Fortunatelly the qp_in can be used to get one. | 258 | * chain. Fortunatelly the qp_in can be used to get one. |
239 | */ | 259 | */ |
240 | hash = f->hashfn(qp_in); | 260 | hash = f->hashfn(qp_in); |
261 | hb = &f->hash[hash]; | ||
262 | spin_lock(&hb->chain_lock); | ||
263 | |||
241 | #ifdef CONFIG_SMP | 264 | #ifdef CONFIG_SMP |
242 | /* With SMP race we have to recheck hash table, because | 265 | /* With SMP race we have to recheck hash table, because |
243 | * such entry could be created on other cpu, while we | 266 | * such entry could be created on other cpu, while we |
244 | * promoted read lock to write lock. | 267 | * released the hash bucket lock. |
245 | */ | 268 | */ |
246 | hlist_for_each_entry(qp, &f->hash[hash], list) { | 269 | hlist_for_each_entry(qp, &hb->chain, list) { |
247 | if (qp->net == nf && f->match(qp, arg)) { | 270 | if (qp->net == nf && f->match(qp, arg)) { |
248 | atomic_inc(&qp->refcnt); | 271 | atomic_inc(&qp->refcnt); |
249 | write_unlock(&f->lock); | 272 | spin_unlock(&hb->chain_lock); |
273 | read_unlock(&f->lock); | ||
250 | qp_in->last_in |= INET_FRAG_COMPLETE; | 274 | qp_in->last_in |= INET_FRAG_COMPLETE; |
251 | inet_frag_put(qp_in, f); | 275 | inet_frag_put(qp_in, f); |
252 | return qp; | 276 | return qp; |
@@ -258,8 +282,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, | |||
258 | atomic_inc(&qp->refcnt); | 282 | atomic_inc(&qp->refcnt); |
259 | 283 | ||
260 | atomic_inc(&qp->refcnt); | 284 | atomic_inc(&qp->refcnt); |
261 | hlist_add_head(&qp->list, &f->hash[hash]); | 285 | hlist_add_head(&qp->list, &hb->chain); |
262 | write_unlock(&f->lock); | 286 | spin_unlock(&hb->chain_lock); |
287 | read_unlock(&f->lock); | ||
263 | inet_frag_lru_add(nf, qp); | 288 | inet_frag_lru_add(nf, qp); |
264 | return qp; | 289 | return qp; |
265 | } | 290 | } |
@@ -300,17 +325,23 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, | |||
300 | struct inet_frags *f, void *key, unsigned int hash) | 325 | struct inet_frags *f, void *key, unsigned int hash) |
301 | __releases(&f->lock) | 326 | __releases(&f->lock) |
302 | { | 327 | { |
328 | struct inet_frag_bucket *hb; | ||
303 | struct inet_frag_queue *q; | 329 | struct inet_frag_queue *q; |
304 | int depth = 0; | 330 | int depth = 0; |
305 | 331 | ||
306 | hlist_for_each_entry(q, &f->hash[hash], list) { | 332 | hb = &f->hash[hash]; |
333 | |||
334 | spin_lock(&hb->chain_lock); | ||
335 | hlist_for_each_entry(q, &hb->chain, list) { | ||
307 | if (q->net == nf && f->match(q, key)) { | 336 | if (q->net == nf && f->match(q, key)) { |
308 | atomic_inc(&q->refcnt); | 337 | atomic_inc(&q->refcnt); |
338 | spin_unlock(&hb->chain_lock); | ||
309 | read_unlock(&f->lock); | 339 | read_unlock(&f->lock); |
310 | return q; | 340 | return q; |
311 | } | 341 | } |
312 | depth++; | 342 | depth++; |
313 | } | 343 | } |
344 | spin_unlock(&hb->chain_lock); | ||
314 | read_unlock(&f->lock); | 345 | read_unlock(&f->lock); |
315 | 346 | ||
316 | if (depth <= INETFRAGS_MAXDEPTH) | 347 | if (depth <= INETFRAGS_MAXDEPTH) |