aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <eric.dumazet@gmail.com>2011-03-04 17:33:59 -0500
committerDavid S. Miller <davem@davemloft.net>2011-03-04 17:33:59 -0500
commit65e8354ec13a45414045084166cb340c0d7ffe8a (patch)
tree3a7d8b63a3765343c07d6f878b4ef389041da6f0 /net
parentd72751ede1b9bf993d7bd3377305c8e9e36a3cc4 (diff)
inetpeer: seqlock optimization
David noticed : ------------------ Eric, I was profiling the non-routing-cache case and something that stuck out is the case of calling inet_getpeer() with create==0. If an entry is not found, we have to redo the lookup under a spinlock to make certain that a concurrent writer rebalancing the tree does not "hide" an existing entry from us. This makes the case of a create==0 lookup for a not-present entry really expensive. It is on the order of 600 cpu cycles on my Niagara2. I added a hack to not do the relookup under the lock when create==0 and it now costs less than 300 cycles. This is now a pretty common operation with the way we handle COW'd metrics, so I think it's definitely worth optimizing. ----------------- One solution is to use a seqlock instead of a spinlock to protect struct inet_peer_base. After a failed avl tree lookup, we can easily detect if a writer did some changes during our lookup. Taking the lock and redo the lookup is only necessary in this case. Note: Add one private rcu_deref_locked() macro to place in one spot the access to spinlock included in seqlock. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/inetpeer.c75
1 files changed, 35 insertions, 40 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 48f8d4592ccd..f604ffdbea27 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
81 81
82struct inet_peer_base { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 seqlock_t lock;
85 int total; 85 int total;
86}; 86};
87 87
88static struct inet_peer_base v4_peers = { 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu, 89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), 90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0, 91 .total = 0,
92}; 92};
93 93
94static struct inet_peer_base v6_peers = { 94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), 96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
97 .total = 0, 97 .total = 0,
98}; 98};
99 99
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
177 return 0; 177 return 0;
178} 178}
179 179
180#define rcu_deref_locked(X, BASE) \
181 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
182
180/* 183/*
181 * Called with local BH disabled and the pool lock held. 184 * Called with local BH disabled and the pool lock held.
182 */ 185 */
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
187 \ 190 \
188 stackptr = _stack; \ 191 stackptr = _stack; \
189 *stackptr++ = &_base->root; \ 192 *stackptr++ = &_base->root; \
190 for (u = rcu_dereference_protected(_base->root, \ 193 for (u = rcu_deref_locked(_base->root, _base); \
191 lockdep_is_held(&_base->lock)); \
192 u != peer_avl_empty; ) { \ 194 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \ 195 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \ 196 if (cmp == 0) \
@@ -198,8 +200,7 @@ static int addr_compare(const struct inetpeer_addr *a,
198 else \ 200 else \
199 v = &u->avl_right; \ 201 v = &u->avl_right; \
200 *stackptr++ = v; \ 202 *stackptr++ = v; \
201 u = rcu_dereference_protected(*v, \ 203 u = rcu_deref_locked(*v, _base); \
202 lockdep_is_held(&_base->lock)); \
203 } \ 204 } \
204 u; \ 205 u; \
205}) 206})
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
246 struct inet_peer __rcu **v; \ 247 struct inet_peer __rcu **v; \
247 *stackptr++ = &start->avl_left; \ 248 *stackptr++ = &start->avl_left; \
248 v = &start->avl_left; \ 249 v = &start->avl_left; \
249 for (u = rcu_dereference_protected(*v, \ 250 for (u = rcu_deref_locked(*v, base); \
250 lockdep_is_held(&base->lock)); \
251 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
252 v = &u->avl_right; \ 252 v = &u->avl_right; \
253 *stackptr++ = v; \ 253 *stackptr++ = v; \
254 u = rcu_dereference_protected(*v, \ 254 u = rcu_deref_locked(*v, base); \
255 lockdep_is_held(&base->lock)); \
256 } \ 255 } \
257 u; \ 256 u; \
258}) 257})
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
271 270
272 while (stackend > stack) { 271 while (stackend > stack) {
273 nodep = *--stackend; 272 nodep = *--stackend;
274 node = rcu_dereference_protected(*nodep, 273 node = rcu_deref_locked(*nodep, base);
275 lockdep_is_held(&base->lock)); 274 l = rcu_deref_locked(node->avl_left, base);
276 l = rcu_dereference_protected(node->avl_left, 275 r = rcu_deref_locked(node->avl_right, base);
277 lockdep_is_held(&base->lock));
278 r = rcu_dereference_protected(node->avl_right,
279 lockdep_is_held(&base->lock));
280 lh = node_height(l); 276 lh = node_height(l);
281 rh = node_height(r); 277 rh = node_height(r);
282 if (lh > rh + 1) { /* l: RH+2 */ 278 if (lh > rh + 1) { /* l: RH+2 */
283 struct inet_peer *ll, *lr, *lrl, *lrr; 279 struct inet_peer *ll, *lr, *lrl, *lrr;
284 int lrh; 280 int lrh;
285 ll = rcu_dereference_protected(l->avl_left, 281 ll = rcu_deref_locked(l->avl_left, base);
286 lockdep_is_held(&base->lock)); 282 lr = rcu_deref_locked(l->avl_right, base);
287 lr = rcu_dereference_protected(l->avl_right,
288 lockdep_is_held(&base->lock));
289 lrh = node_height(lr); 283 lrh = node_height(lr);
290 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 284 if (lrh <= node_height(ll)) { /* ll: RH+1 */
291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 285 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
296 l->avl_height = node->avl_height + 1; 290 l->avl_height = node->avl_height + 1;
297 RCU_INIT_POINTER(*nodep, l); 291 RCU_INIT_POINTER(*nodep, l);
298 } else { /* ll: RH, lr: RH+1 */ 292 } else { /* ll: RH, lr: RH+1 */
299 lrl = rcu_dereference_protected(lr->avl_left, 293 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ 294 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
301 lrr = rcu_dereference_protected(lr->avl_right,
302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 295 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 296 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
305 node->avl_height = rh + 1; /* node: RH+1 */ 297 node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
314 } else if (rh > lh + 1) { /* r: LH+2 */ 306 } else if (rh > lh + 1) { /* r: LH+2 */
315 struct inet_peer *rr, *rl, *rlr, *rll; 307 struct inet_peer *rr, *rl, *rlr, *rll;
316 int rlh; 308 int rlh;
317 rr = rcu_dereference_protected(r->avl_right, 309 rr = rcu_deref_locked(r->avl_right, base);
318 lockdep_is_held(&base->lock)); 310 rl = rcu_deref_locked(r->avl_left, base);
319 rl = rcu_dereference_protected(r->avl_left,
320 lockdep_is_held(&base->lock));
321 rlh = node_height(rl); 311 rlh = node_height(rl);
322 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 312 if (rlh <= node_height(rr)) { /* rr: LH+1 */
323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 313 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
328 r->avl_height = node->avl_height + 1; 318 r->avl_height = node->avl_height + 1;
329 RCU_INIT_POINTER(*nodep, r); 319 RCU_INIT_POINTER(*nodep, r);
330 } else { /* rr: RH, rl: RH+1 */ 320 } else { /* rr: RH, rl: RH+1 */
331 rlr = rcu_dereference_protected(rl->avl_right, 321 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ 322 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
333 rll = rcu_dereference_protected(rl->avl_left,
334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 323 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 324 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
337 node->avl_height = lh + 1; /* node: LH+1 */ 325 node->avl_height = lh + 1; /* node: LH+1 */
@@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
372 360
373 do_free = 0; 361 do_free = 0;
374 362
375 spin_lock_bh(&base->lock); 363 write_seqlock_bh(&base->lock);
376 /* Check the reference counter. It was artificially incremented by 1 364 /* Check the reference counter. It was artificially incremented by 1
377 * in cleanup() function to prevent sudden disappearing. If we can 365 * in cleanup() function to prevent sudden disappearing. If we can
378 * atomically (because of lockless readers) take this last reference, 366 * atomically (because of lockless readers) take this last reference,
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
392 /* look for a node to insert instead of p */ 380 /* look for a node to insert instead of p */
393 struct inet_peer *t; 381 struct inet_peer *t;
394 t = lookup_rightempty(p, base); 382 t = lookup_rightempty(p, base);
395 BUG_ON(rcu_dereference_protected(*stackptr[-1], 383 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
396 lockdep_is_held(&base->lock)) != t);
397 **--stackptr = t->avl_left; 384 **--stackptr = t->avl_left;
398 /* t is removed, t->daddr > x->daddr for any 385 /* t is removed, t->daddr > x->daddr for any
399 * x in p->avl_left subtree. 386 * x in p->avl_left subtree.
@@ -409,7 +396,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
409 base->total--; 396 base->total--;
410 do_free = 1; 397 do_free = 1;
411 } 398 }
412 spin_unlock_bh(&base->lock); 399 write_sequnlock_bh(&base->lock);
413 400
414 if (do_free) 401 if (do_free)
415 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 402 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
@@ -477,12 +464,16 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 464 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(daddr->family); 465 struct inet_peer_base *base = family_to_base(daddr->family);
479 struct inet_peer *p; 466 struct inet_peer *p;
467 unsigned int sequence;
468 int invalidated;
480 469
481 /* Look up for the address quickly, lockless. 470 /* Look up for the address quickly, lockless.
482 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
483 */ 472 */
484 rcu_read_lock_bh(); 473 rcu_read_lock_bh();
474 sequence = read_seqbegin(&base->lock);
485 p = lookup_rcu_bh(daddr, base); 475 p = lookup_rcu_bh(daddr, base);
476 invalidated = read_seqretry(&base->lock, sequence);
486 rcu_read_unlock_bh(); 477 rcu_read_unlock_bh();
487 478
488 if (p) { 479 if (p) {
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
493 return p; 484 return p;
494 } 485 }
495 486
487 /* If no writer did a change during our lookup, we can return early. */
488 if (!create && !invalidated)
489 return NULL;
490
496 /* retry an exact lookup, taking the lock before. 491 /* retry an exact lookup, taking the lock before.
497 * At least, nodes should be hot in our cache. 492 * At least, nodes should be hot in our cache.
498 */ 493 */
499 spin_lock_bh(&base->lock); 494 write_seqlock_bh(&base->lock);
500 p = lookup(daddr, stack, base); 495 p = lookup(daddr, stack, base);
501 if (p != peer_avl_empty) { 496 if (p != peer_avl_empty) {
502 atomic_inc(&p->refcnt); 497 atomic_inc(&p->refcnt);
503 spin_unlock_bh(&base->lock); 498 write_sequnlock_bh(&base->lock);
504 /* Remove the entry from unused list if it was there. */ 499 /* Remove the entry from unused list if it was there. */
505 unlink_from_unused(p); 500 unlink_from_unused(p);
506 return p; 501 return p;
@@ -524,7 +519,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
524 link_to_pool(p, base); 519 link_to_pool(p, base);
525 base->total++; 520 base->total++;
526 } 521 }
527 spin_unlock_bh(&base->lock); 522 write_sequnlock_bh(&base->lock);
528 523
529 if (base->total >= inet_peer_threshold) 524 if (base->total >= inet_peer_threshold)
530 /* Remove one less-recently-used entry. */ 525 /* Remove one less-recently-used entry. */