aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJesper Dangaard Brouer <brouer@redhat.com>2013-04-03 19:38:16 -0400
committerDavid S. Miller <davem@davemloft.net>2013-04-04 17:37:05 -0400
commit19952cc4f8f572493293a8caed27c4be89c5fc9d (patch)
treea1b42a559ddc6d68ec7debcb7231cc00dbccc8ac
parentd66248326410ed0d3e813ebe974b3e6638df0717 (diff)
net: frag queue per hash bucket locking
This patch implements per hash bucket locking for the frag queue hash. This removes two write locks, and the only remaining write lock is for protecting hash rebuild. This essentially reduces the readers-writer lock to a rebuild lock. This patch is part of "net: frag performance followup" http://thread.gmane.org/gmane.linux.network/263644 of which two patches have already been accepted: Same test setup as previous: (http://thread.gmane.org/gmane.linux.network/257155) Two 10G interfaces, on separate NUMA nodes, are under test and use Ethernet flow-control. A third interface is used for generating the DoS attack (with trafgen). Notice, I have changed the frag DoS generator script to be more efficient/deadly. Before, it would only hit one RX queue; now it's sending packets causing multi-queue RX, due to "better" RX hashing. Test types summary (netperf UDP_STREAM): Test-20G64K == 2x10G with 65K fragments Test-20G3F == 2x10G with 3x fragments (3*1472 bytes) Test-20G64K+DoS == Same as 20G64K with frag DoS Test-20G3F+DoS == Same as 20G3F with frag DoS Test-20G64K+MQ == Same as 20G64K with Multi-Queue frag DoS Test-20G3F+MQ == Same as 20G3F with Multi-Queue frag DoS When I rebased this-patch(03) (on top of net-next commit a210576c) and removed the _bh spinlock, I saw a performance regression. BUT this was caused by some unrelated change in-between. See tests below. Test (A) is what I reported before for patch-02, accepted in commit 1b5ab0de. Test (B), a verifying retest of commit 1b5ab0de, corresponds to patch-02. Test (C) is what I reported before for this-patch. Test (D) is net-next master HEAD (commit a210576c), which reveals some (unknown) performance regression (compared against test (B)). Test (D) functions as a new base-test.
Performance table summary (in Mbit/s):

(#) Test-type:  20G64K   20G3F    20G64K+DoS  20G3F+DoS  20G64K+MQ  20G3F+MQ
----------      -------  -------  ----------  ---------  ---------  --------
(A) Patch-02  : 18848.7  13230.1  4103.04     5310.36    130.0      440.2
(B) 1b5ab0de  : 18841.5  13156.8  4101.08     5314.57    129.0      424.2
(C) Patch-03v1: 18838.0  13490.5  4405.11     6814.72    196.6      461.6
(D) a210576c  : 18321.5  11250.4  3635.34     5160.13    119.1      405.2
(E) with _bh  : 17247.3  11492.6  3994.74     6405.29    166.7      413.6
(F) without bh: 17471.3  11298.7  3818.05     6102.11    165.7      406.3

Tests (E) and (F) are this-patch(03), with (V1) and without (V2) the _bh spinlocks. I cannot explain the slowdown for 20G64K (but it's an artificial "lab-test" so I'm not worried). But the other results do show improvements. And test (E), the "with _bh" version, is slightly better.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
----
V2:
- By analysis from Hannes Frederic Sowa and Eric Dumazet, we don't need the spinlock _bh versions, as Netfilter currently does a local_bh_disable() before entering inet_fragment.
- Fold-in desc from cover-mail
V3:
- Drop the chain_len counter per hash bucket.

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/inet_frag.h8
-rw-r--r--net/ipv4/inet_fragment.c57
2 files changed, 51 insertions, 14 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 7cac9c5789b5..6f41b45e819e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -50,10 +50,16 @@ struct inet_frag_queue {
50 */ 50 */
51#define INETFRAGS_MAXDEPTH 128 51#define INETFRAGS_MAXDEPTH 128
52 52
53struct inet_frag_bucket {
54 struct hlist_head chain;
55 spinlock_t chain_lock;
56};
57
53struct inet_frags { 58struct inet_frags {
54 struct hlist_head hash[INETFRAGS_HASHSZ]; 59 struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
55 /* This rwlock is a global lock (seperate per IPv4, IPv6 and 60 /* This rwlock is a global lock (seperate per IPv4, IPv6 and
56 * netfilter). Important to keep this on a seperate cacheline. 61 * netfilter). Important to keep this on a seperate cacheline.
62 * Its primarily a rebuild protection rwlock.
57 */ 63 */
58 rwlock_t lock ____cacheline_aligned_in_smp; 64 rwlock_t lock ____cacheline_aligned_in_smp;
59 int secret_interval; 65 int secret_interval;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1206ca64b0ea..e97d66a1fdde 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -52,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
52 unsigned long now = jiffies; 52 unsigned long now = jiffies;
53 int i; 53 int i;
54 54
55 /* Per bucket lock NOT needed here, due to write lock protection */
55 write_lock(&f->lock); 56 write_lock(&f->lock);
57
56 get_random_bytes(&f->rnd, sizeof(u32)); 58 get_random_bytes(&f->rnd, sizeof(u32));
57 for (i = 0; i < INETFRAGS_HASHSZ; i++) { 59 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
60 struct inet_frag_bucket *hb;
58 struct inet_frag_queue *q; 61 struct inet_frag_queue *q;
59 struct hlist_node *n; 62 struct hlist_node *n;
60 63
61 hlist_for_each_entry_safe(q, n, &f->hash[i], list) { 64 hb = &f->hash[i];
65 hlist_for_each_entry_safe(q, n, &hb->chain, list) {
62 unsigned int hval = f->hashfn(q); 66 unsigned int hval = f->hashfn(q);
63 67
64 if (hval != i) { 68 if (hval != i) {
69 struct inet_frag_bucket *hb_dest;
70
65 hlist_del(&q->list); 71 hlist_del(&q->list);
66 72
67 /* Relink to new hash chain. */ 73 /* Relink to new hash chain. */
68 hlist_add_head(&q->list, &f->hash[hval]); 74 hb_dest = &f->hash[hval];
75 hlist_add_head(&q->list, &hb_dest->chain);
69 } 76 }
70 } 77 }
71 } 78 }
@@ -78,9 +85,12 @@ void inet_frags_init(struct inet_frags *f)
78{ 85{
79 int i; 86 int i;
80 87
81 for (i = 0; i < INETFRAGS_HASHSZ; i++) 88 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
82 INIT_HLIST_HEAD(&f->hash[i]); 89 struct inet_frag_bucket *hb = &f->hash[i];
83 90
91 spin_lock_init(&hb->chain_lock);
92 INIT_HLIST_HEAD(&hb->chain);
93 }
84 rwlock_init(&f->lock); 94 rwlock_init(&f->lock);
85 95
86 f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ 96 f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -122,9 +132,18 @@ EXPORT_SYMBOL(inet_frags_exit_net);
122 132
123static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) 133static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
124{ 134{
125 write_lock(&f->lock); 135 struct inet_frag_bucket *hb;
136 unsigned int hash;
137
138 read_lock(&f->lock);
139 hash = f->hashfn(fq);
140 hb = &f->hash[hash];
141
142 spin_lock(&hb->chain_lock);
126 hlist_del(&fq->list); 143 hlist_del(&fq->list);
127 write_unlock(&f->lock); 144 spin_unlock(&hb->chain_lock);
145
146 read_unlock(&f->lock);
128 inet_frag_lru_del(fq); 147 inet_frag_lru_del(fq);
129} 148}
130 149
@@ -226,27 +245,32 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
226 struct inet_frag_queue *qp_in, struct inet_frags *f, 245 struct inet_frag_queue *qp_in, struct inet_frags *f,
227 void *arg) 246 void *arg)
228{ 247{
248 struct inet_frag_bucket *hb;
229 struct inet_frag_queue *qp; 249 struct inet_frag_queue *qp;
230#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
231#endif 251#endif
232 unsigned int hash; 252 unsigned int hash;
233 253
234 write_lock(&f->lock); 254 read_lock(&f->lock); /* Protects against hash rebuild */
235 /* 255 /*
236 * While we stayed w/o the lock other CPU could update 256 * While we stayed w/o the lock other CPU could update
237 * the rnd seed, so we need to re-calculate the hash 257 * the rnd seed, so we need to re-calculate the hash
238 * chain. Fortunatelly the qp_in can be used to get one. 258 * chain. Fortunatelly the qp_in can be used to get one.
239 */ 259 */
240 hash = f->hashfn(qp_in); 260 hash = f->hashfn(qp_in);
261 hb = &f->hash[hash];
262 spin_lock(&hb->chain_lock);
263
241#ifdef CONFIG_SMP 264#ifdef CONFIG_SMP
242 /* With SMP race we have to recheck hash table, because 265 /* With SMP race we have to recheck hash table, because
243 * such entry could be created on other cpu, while we 266 * such entry could be created on other cpu, while we
244 * promoted read lock to write lock. 267 * released the hash bucket lock.
245 */ 268 */
246 hlist_for_each_entry(qp, &f->hash[hash], list) { 269 hlist_for_each_entry(qp, &hb->chain, list) {
247 if (qp->net == nf && f->match(qp, arg)) { 270 if (qp->net == nf && f->match(qp, arg)) {
248 atomic_inc(&qp->refcnt); 271 atomic_inc(&qp->refcnt);
249 write_unlock(&f->lock); 272 spin_unlock(&hb->chain_lock);
273 read_unlock(&f->lock);
250 qp_in->last_in |= INET_FRAG_COMPLETE; 274 qp_in->last_in |= INET_FRAG_COMPLETE;
251 inet_frag_put(qp_in, f); 275 inet_frag_put(qp_in, f);
252 return qp; 276 return qp;
@@ -258,8 +282,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
258 atomic_inc(&qp->refcnt); 282 atomic_inc(&qp->refcnt);
259 283
260 atomic_inc(&qp->refcnt); 284 atomic_inc(&qp->refcnt);
261 hlist_add_head(&qp->list, &f->hash[hash]); 285 hlist_add_head(&qp->list, &hb->chain);
262 write_unlock(&f->lock); 286 spin_unlock(&hb->chain_lock);
287 read_unlock(&f->lock);
263 inet_frag_lru_add(nf, qp); 288 inet_frag_lru_add(nf, qp);
264 return qp; 289 return qp;
265} 290}
@@ -300,17 +325,23 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
300 struct inet_frags *f, void *key, unsigned int hash) 325 struct inet_frags *f, void *key, unsigned int hash)
301 __releases(&f->lock) 326 __releases(&f->lock)
302{ 327{
328 struct inet_frag_bucket *hb;
303 struct inet_frag_queue *q; 329 struct inet_frag_queue *q;
304 int depth = 0; 330 int depth = 0;
305 331
306 hlist_for_each_entry(q, &f->hash[hash], list) { 332 hb = &f->hash[hash];
333
334 spin_lock(&hb->chain_lock);
335 hlist_for_each_entry(q, &hb->chain, list) {
307 if (q->net == nf && f->match(q, key)) { 336 if (q->net == nf && f->match(q, key)) {
308 atomic_inc(&q->refcnt); 337 atomic_inc(&q->refcnt);
338 spin_unlock(&hb->chain_lock);
309 read_unlock(&f->lock); 339 read_unlock(&f->lock);
310 return q; 340 return q;
311 } 341 }
312 depth++; 342 depth++;
313 } 343 }
344 spin_unlock(&hb->chain_lock);
314 read_unlock(&f->lock); 345 read_unlock(&f->lock);
315 346
316 if (depth <= INETFRAGS_MAXDEPTH) 347 if (depth <= INETFRAGS_MAXDEPTH)