diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2011-03-04 17:33:59 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-03-04 17:33:59 -0500 |
commit | 65e8354ec13a45414045084166cb340c0d7ffe8a (patch) | |
tree | 3a7d8b63a3765343c07d6f878b4ef389041da6f0 | |
parent | d72751ede1b9bf993d7bd3377305c8e9e36a3cc4 (diff) |
inetpeer: seqlock optimization
David noticed :
------------------
Eric, I was profiling the non-routing-cache case and something that
stuck out is the case of calling inet_getpeer() with create==0.
If an entry is not found, we have to redo the lookup under a spinlock
to make certain that a concurrent writer rebalancing the tree does
not "hide" an existing entry from us.
This makes the case of a create==0 lookup for a not-present entry
really expensive. It is on the order of 600 cpu cycles on my
Niagara2.
I added a hack to not do the relookup under the lock when create==0
and it now costs less than 300 cycles.
This is now a pretty common operation with the way we handle COW'd
metrics, so I think it's definitely worth optimizing.
-----------------
One solution is to use a seqlock instead of a spinlock to protect struct
inet_peer_base.
After a failed avl tree lookup, we can easily detect if a writer did
some changes during our lookup. Taking the lock and redo the lookup is
only necessary in this case.
Note: Add one private rcu_deref_locked() macro to place in one spot the
access to spinlock included in seqlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/inetpeer.c | 75 |
1 files changed, 35 insertions, 40 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f8d4592ccd..f604ffdbea27 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = { | |||
81 | 81 | ||
82 | struct inet_peer_base { | 82 | struct inet_peer_base { |
83 | struct inet_peer __rcu *root; | 83 | struct inet_peer __rcu *root; |
84 | spinlock_t lock; | 84 | seqlock_t lock; |
85 | int total; | 85 | int total; |
86 | }; | 86 | }; |
87 | 87 | ||
88 | static struct inet_peer_base v4_peers = { | 88 | static struct inet_peer_base v4_peers = { |
89 | .root = peer_avl_empty_rcu, | 89 | .root = peer_avl_empty_rcu, |
90 | .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), | 90 | .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), |
91 | .total = 0, | 91 | .total = 0, |
92 | }; | 92 | }; |
93 | 93 | ||
94 | static struct inet_peer_base v6_peers = { | 94 | static struct inet_peer_base v6_peers = { |
95 | .root = peer_avl_empty_rcu, | 95 | .root = peer_avl_empty_rcu, |
96 | .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), | 96 | .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), |
97 | .total = 0, | 97 | .total = 0, |
98 | }; | 98 | }; |
99 | 99 | ||
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a, | |||
177 | return 0; | 177 | return 0; |
178 | } | 178 | } |
179 | 179 | ||
180 | #define rcu_deref_locked(X, BASE) \ | ||
181 | rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) | ||
182 | |||
180 | /* | 183 | /* |
181 | * Called with local BH disabled and the pool lock held. | 184 | * Called with local BH disabled and the pool lock held. |
182 | */ | 185 | */ |
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a, | |||
187 | \ | 190 | \ |
188 | stackptr = _stack; \ | 191 | stackptr = _stack; \ |
189 | *stackptr++ = &_base->root; \ | 192 | *stackptr++ = &_base->root; \ |
190 | for (u = rcu_dereference_protected(_base->root, \ | 193 | for (u = rcu_deref_locked(_base->root, _base); \ |
191 | lockdep_is_held(&_base->lock)); \ | ||
192 | u != peer_avl_empty; ) { \ | 194 | u != peer_avl_empty; ) { \ |
193 | int cmp = addr_compare(_daddr, &u->daddr); \ | 195 | int cmp = addr_compare(_daddr, &u->daddr); \ |
194 | if (cmp == 0) \ | 196 | if (cmp == 0) \ |
@@ -198,8 +200,7 @@ static int addr_compare(const struct inetpeer_addr *a, | |||
198 | else \ | 200 | else \ |
199 | v = &u->avl_right; \ | 201 | v = &u->avl_right; \ |
200 | *stackptr++ = v; \ | 202 | *stackptr++ = v; \ |
201 | u = rcu_dereference_protected(*v, \ | 203 | u = rcu_deref_locked(*v, _base); \ |
202 | lockdep_is_held(&_base->lock)); \ | ||
203 | } \ | 204 | } \ |
204 | u; \ | 205 | u; \ |
205 | }) | 206 | }) |
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, | |||
246 | struct inet_peer __rcu **v; \ | 247 | struct inet_peer __rcu **v; \ |
247 | *stackptr++ = &start->avl_left; \ | 248 | *stackptr++ = &start->avl_left; \ |
248 | v = &start->avl_left; \ | 249 | v = &start->avl_left; \ |
249 | for (u = rcu_dereference_protected(*v, \ | 250 | for (u = rcu_deref_locked(*v, base); \ |
250 | lockdep_is_held(&base->lock)); \ | ||
251 | u->avl_right != peer_avl_empty_rcu; ) { \ | 251 | u->avl_right != peer_avl_empty_rcu; ) { \ |
252 | v = &u->avl_right; \ | 252 | v = &u->avl_right; \ |
253 | *stackptr++ = v; \ | 253 | *stackptr++ = v; \ |
254 | u = rcu_dereference_protected(*v, \ | 254 | u = rcu_deref_locked(*v, base); \ |
255 | lockdep_is_held(&base->lock)); \ | ||
256 | } \ | 255 | } \ |
257 | u; \ | 256 | u; \ |
258 | }) | 257 | }) |
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], | |||
271 | 270 | ||
272 | while (stackend > stack) { | 271 | while (stackend > stack) { |
273 | nodep = *--stackend; | 272 | nodep = *--stackend; |
274 | node = rcu_dereference_protected(*nodep, | 273 | node = rcu_deref_locked(*nodep, base); |
275 | lockdep_is_held(&base->lock)); | 274 | l = rcu_deref_locked(node->avl_left, base); |
276 | l = rcu_dereference_protected(node->avl_left, | 275 | r = rcu_deref_locked(node->avl_right, base); |
277 | lockdep_is_held(&base->lock)); | ||
278 | r = rcu_dereference_protected(node->avl_right, | ||
279 | lockdep_is_held(&base->lock)); | ||
280 | lh = node_height(l); | 276 | lh = node_height(l); |
281 | rh = node_height(r); | 277 | rh = node_height(r); |
282 | if (lh > rh + 1) { /* l: RH+2 */ | 278 | if (lh > rh + 1) { /* l: RH+2 */ |
283 | struct inet_peer *ll, *lr, *lrl, *lrr; | 279 | struct inet_peer *ll, *lr, *lrl, *lrr; |
284 | int lrh; | 280 | int lrh; |
285 | ll = rcu_dereference_protected(l->avl_left, | 281 | ll = rcu_deref_locked(l->avl_left, base); |
286 | lockdep_is_held(&base->lock)); | 282 | lr = rcu_deref_locked(l->avl_right, base); |
287 | lr = rcu_dereference_protected(l->avl_right, | ||
288 | lockdep_is_held(&base->lock)); | ||
289 | lrh = node_height(lr); | 283 | lrh = node_height(lr); |
290 | if (lrh <= node_height(ll)) { /* ll: RH+1 */ | 284 | if (lrh <= node_height(ll)) { /* ll: RH+1 */ |
291 | RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ | 285 | RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ |
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], | |||
296 | l->avl_height = node->avl_height + 1; | 290 | l->avl_height = node->avl_height + 1; |
297 | RCU_INIT_POINTER(*nodep, l); | 291 | RCU_INIT_POINTER(*nodep, l); |
298 | } else { /* ll: RH, lr: RH+1 */ | 292 | } else { /* ll: RH, lr: RH+1 */ |
299 | lrl = rcu_dereference_protected(lr->avl_left, | 293 | lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ |
300 | lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ | 294 | lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ |
301 | lrr = rcu_dereference_protected(lr->avl_right, | ||
302 | lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */ | ||
303 | RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ | 295 | RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ |
304 | RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ | 296 | RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ |
305 | node->avl_height = rh + 1; /* node: RH+1 */ | 297 | node->avl_height = rh + 1; /* node: RH+1 */ |
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], | |||
314 | } else if (rh > lh + 1) { /* r: LH+2 */ | 306 | } else if (rh > lh + 1) { /* r: LH+2 */ |
315 | struct inet_peer *rr, *rl, *rlr, *rll; | 307 | struct inet_peer *rr, *rl, *rlr, *rll; |
316 | int rlh; | 308 | int rlh; |
317 | rr = rcu_dereference_protected(r->avl_right, | 309 | rr = rcu_deref_locked(r->avl_right, base); |
318 | lockdep_is_held(&base->lock)); | 310 | rl = rcu_deref_locked(r->avl_left, base); |
319 | rl = rcu_dereference_protected(r->avl_left, | ||
320 | lockdep_is_held(&base->lock)); | ||
321 | rlh = node_height(rl); | 311 | rlh = node_height(rl); |
322 | if (rlh <= node_height(rr)) { /* rr: LH+1 */ | 312 | if (rlh <= node_height(rr)) { /* rr: LH+1 */ |
323 | RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ | 313 | RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ |
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], | |||
328 | r->avl_height = node->avl_height + 1; | 318 | r->avl_height = node->avl_height + 1; |
329 | RCU_INIT_POINTER(*nodep, r); | 319 | RCU_INIT_POINTER(*nodep, r); |
330 | } else { /* rr: RH, rl: RH+1 */ | 320 | } else { /* rr: RH, rl: RH+1 */ |
331 | rlr = rcu_dereference_protected(rl->avl_right, | 321 | rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ |
332 | lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ | 322 | rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ |
333 | rll = rcu_dereference_protected(rl->avl_left, | ||
334 | lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */ | ||
335 | RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ | 323 | RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ |
336 | RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ | 324 | RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ |
337 | node->avl_height = lh + 1; /* node: LH+1 */ | 325 | node->avl_height = lh + 1; /* node: LH+1 */ |
@@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) | |||
372 | 360 | ||
373 | do_free = 0; | 361 | do_free = 0; |
374 | 362 | ||
375 | spin_lock_bh(&base->lock); | 363 | write_seqlock_bh(&base->lock); |
376 | /* Check the reference counter. It was artificially incremented by 1 | 364 | /* Check the reference counter. It was artificially incremented by 1 |
377 | * in cleanup() function to prevent sudden disappearing. If we can | 365 | * in cleanup() function to prevent sudden disappearing. If we can |
378 | * atomically (because of lockless readers) take this last reference, | 366 | * atomically (because of lockless readers) take this last reference, |
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) | |||
392 | /* look for a node to insert instead of p */ | 380 | /* look for a node to insert instead of p */ |
393 | struct inet_peer *t; | 381 | struct inet_peer *t; |
394 | t = lookup_rightempty(p, base); | 382 | t = lookup_rightempty(p, base); |
395 | BUG_ON(rcu_dereference_protected(*stackptr[-1], | 383 | BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); |
396 | lockdep_is_held(&base->lock)) != t); | ||
397 | **--stackptr = t->avl_left; | 384 | **--stackptr = t->avl_left; |
398 | /* t is removed, t->daddr > x->daddr for any | 385 | /* t is removed, t->daddr > x->daddr for any |
399 | * x in p->avl_left subtree. | 386 | * x in p->avl_left subtree. |
@@ -409,7 +396,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) | |||
409 | base->total--; | 396 | base->total--; |
410 | do_free = 1; | 397 | do_free = 1; |
411 | } | 398 | } |
412 | spin_unlock_bh(&base->lock); | 399 | write_sequnlock_bh(&base->lock); |
413 | 400 | ||
414 | if (do_free) | 401 | if (do_free) |
415 | call_rcu_bh(&p->rcu, inetpeer_free_rcu); | 402 | call_rcu_bh(&p->rcu, inetpeer_free_rcu); |
@@ -477,12 +464,16 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) | |||
477 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; | 464 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; |
478 | struct inet_peer_base *base = family_to_base(daddr->family); | 465 | struct inet_peer_base *base = family_to_base(daddr->family); |
479 | struct inet_peer *p; | 466 | struct inet_peer *p; |
467 | unsigned int sequence; | ||
468 | int invalidated; | ||
480 | 469 | ||
481 | /* Look up for the address quickly, lockless. | 470 | /* Look up for the address quickly, lockless. |
482 | * Because of a concurrent writer, we might not find an existing entry. | 471 | * Because of a concurrent writer, we might not find an existing entry. |
483 | */ | 472 | */ |
484 | rcu_read_lock_bh(); | 473 | rcu_read_lock_bh(); |
474 | sequence = read_seqbegin(&base->lock); | ||
485 | p = lookup_rcu_bh(daddr, base); | 475 | p = lookup_rcu_bh(daddr, base); |
476 | invalidated = read_seqretry(&base->lock, sequence); | ||
486 | rcu_read_unlock_bh(); | 477 | rcu_read_unlock_bh(); |
487 | 478 | ||
488 | if (p) { | 479 | if (p) { |
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) | |||
493 | return p; | 484 | return p; |
494 | } | 485 | } |
495 | 486 | ||
487 | /* If no writer did a change during our lookup, we can return early. */ | ||
488 | if (!create && !invalidated) | ||
489 | return NULL; | ||
490 | |||
496 | /* retry an exact lookup, taking the lock before. | 491 | /* retry an exact lookup, taking the lock before. |
497 | * At least, nodes should be hot in our cache. | 492 | * At least, nodes should be hot in our cache. |
498 | */ | 493 | */ |
499 | spin_lock_bh(&base->lock); | 494 | write_seqlock_bh(&base->lock); |
500 | p = lookup(daddr, stack, base); | 495 | p = lookup(daddr, stack, base); |
501 | if (p != peer_avl_empty) { | 496 | if (p != peer_avl_empty) { |
502 | atomic_inc(&p->refcnt); | 497 | atomic_inc(&p->refcnt); |
503 | spin_unlock_bh(&base->lock); | 498 | write_sequnlock_bh(&base->lock); |
504 | /* Remove the entry from unused list if it was there. */ | 499 | /* Remove the entry from unused list if it was there. */ |
505 | unlink_from_unused(p); | 500 | unlink_from_unused(p); |
506 | return p; | 501 | return p; |
@@ -524,7 +519,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) | |||
524 | link_to_pool(p, base); | 519 | link_to_pool(p, base); |
525 | base->total++; | 520 | base->total++; |
526 | } | 521 | } |
527 | spin_unlock_bh(&base->lock); | 522 | write_sequnlock_bh(&base->lock); |
528 | 523 | ||
529 | if (base->total >= inet_peer_threshold) | 524 | if (base->total >= inet_peer_threshold) |
530 | /* Remove one less-recently-used entry. */ | 525 | /* Remove one less-recently-used entry. */ |