aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@sunset.davemloft.net>2007-03-24 23:36:25 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2007-03-25 21:48:05 -0400
commitf11e6659ce9058928d73ff440f9b40a818d628ab (patch)
tree00b7b33eec4c8e5ade0be1d7a6fc8a8f74b383da
parenta979101106f549f4ed80d6dcbc35077be34d4346 (diff)
[IPV6]: Fix routing round-robin locking.
As per RFC2461, section 6.3.6, item #2, when no routers on the matching list are known to be reachable or probably reachable we do round robin on those available routes so that we make sure to probe as many of them as possible to detect when one becomes reachable faster. Each routing table has a rwlock protecting the tree and the linked list of routes at each leaf. The round robin code executes during lookup and thus with the rwlock taken as a reader. A small local spinlock tries to provide protection but this does not work at all for two reasons: 1) The round-robin list manipulation, as coded, goes like this (with read lock held): walk routes finding head and tail spin_lock(); rotate list using head and tail spin_unlock(); While one thread is rotating the list, another thread can end up with stale values of head and tail and then proceed to corrupt the list when it gets the lock. This ends up causing the OOPS in fib6_add() later onthat many people have been hitting. 2) All the other code paths that run with the rwlock held as a reader do not expect the list to change on them, they expect it to remain completely fixed while they hold the lock in that way. So, simply stated, it is impossible to implement this correctly using a manipulation of the list without violating the rwlock locking semantics. Reimplement using a per-fib6_node round-robin pointer. This way we don't need to manipulate the list at all, and since the round-robin pointer can only ever point to real existing entries we don't need to perform any locking on the changing of the round-robin pointer itself. We only need to reset the round-robin pointer to NULL when the entry it is pointing to is removed. The idea is from Thomas Graf and it is very similar to how this was implemented before the advanced router selection code when in. Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/ip6_fib.h1
-rw-r--r--net/ipv6/ip6_fib.c8
-rw-r--r--net/ipv6/route.c97
3 files changed, 68 insertions, 38 deletions
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9eda572a2a65..cf355a3c2ad5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -58,6 +58,7 @@ struct fib6_node
58 __u16 fn_bit; /* bit key */ 58 __u16 fn_bit; /* bit key */
59 __u16 fn_flags; 59 __u16 fn_flags;
60 __u32 fn_sernum; 60 __u32 fn_sernum;
61 struct rt6_info *rr_ptr;
61}; 62};
62 63
63#ifndef CONFIG_IPV6_SUBTREES 64#ifndef CONFIG_IPV6_SUBTREES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f4d7be77eb0f..268f476ef3db 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -658,6 +658,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
658 ins = &iter->u.dst.rt6_next; 658 ins = &iter->u.dst.rt6_next;
659 } 659 }
660 660
661 /* Reset round-robin state, if necessary */
662 if (ins == &fn->leaf)
663 fn->rr_ptr = NULL;
664
661 /* 665 /*
662 * insert node 666 * insert node
663 */ 667 */
@@ -1109,6 +1113,10 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1109 rt6_stats.fib_rt_entries--; 1113 rt6_stats.fib_rt_entries--;
1110 rt6_stats.fib_discarded_routes++; 1114 rt6_stats.fib_discarded_routes++;
1111 1115
1116 /* Reset round-robin state, if necessary */
1117 if (fn->rr_ptr == rt)
1118 fn->rr_ptr = NULL;
1119
1112 /* Adjust walkers */ 1120 /* Adjust walkers */
1113 read_lock(&fib6_walker_lock); 1121 read_lock(&fib6_walker_lock);
1114 FOR_WALKERS(w) { 1122 FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a6b3117df546..3931b33b25e8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -363,55 +363,76 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
363 return m; 363 return m;
364} 364}
365 365
366static struct rt6_info *rt6_select(struct rt6_info **head, int oif, 366static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
367 int strict) 367 int *mpri, struct rt6_info *match)
368{ 368{
369 struct rt6_info *match = NULL, *last = NULL; 369 int m;
370 struct rt6_info *rt, *rt0 = *head; 370
371 u32 metric; 371 if (rt6_check_expired(rt))
372 goto out;
373
374 m = rt6_score_route(rt, oif, strict);
375 if (m < 0)
376 goto out;
377
378 if (m > *mpri) {
379 if (strict & RT6_LOOKUP_F_REACHABLE)
380 rt6_probe(match);
381 *mpri = m;
382 match = rt;
383 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
384 rt6_probe(rt);
385 }
386
387out:
388 return match;
389}
390
391static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
392 struct rt6_info *rr_head,
393 u32 metric, int oif, int strict)
394{
395 struct rt6_info *rt, *match;
372 int mpri = -1; 396 int mpri = -1;
373 397
374 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n", 398 match = NULL;
375 __FUNCTION__, head, head ? *head : NULL, oif); 399 for (rt = rr_head; rt && rt->rt6i_metric == metric;
400 rt = rt->u.dst.rt6_next)
401 match = find_match(rt, oif, strict, &mpri, match);
402 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
403 rt = rt->u.dst.rt6_next)
404 match = find_match(rt, oif, strict, &mpri, match);
376 405
377 for (rt = rt0, metric = rt0->rt6i_metric; 406 return match;
378 rt && rt->rt6i_metric == metric && (!last || rt != rt0); 407}
379 rt = rt->u.dst.rt6_next) {
380 int m;
381 408
382 if (rt6_check_expired(rt)) 409static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
383 continue; 410{
411 struct rt6_info *match, *rt0;
384 412
385 last = rt; 413 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
414 __FUNCTION__, fn->leaf, oif);
386 415
387 m = rt6_score_route(rt, oif, strict); 416 rt0 = fn->rr_ptr;
388 if (m < 0) 417 if (!rt0)
389 continue; 418 fn->rr_ptr = rt0 = fn->leaf;
390 419
391 if (m > mpri) { 420 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
392 if (strict & RT6_LOOKUP_F_REACHABLE)
393 rt6_probe(match);
394 match = rt;
395 mpri = m;
396 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
397 rt6_probe(rt);
398 }
399 }
400 421
401 if (!match && 422 if (!match &&
402 (strict & RT6_LOOKUP_F_REACHABLE) && 423 (strict & RT6_LOOKUP_F_REACHABLE)) {
403 last && last != rt0) { 424 struct rt6_info *next = rt0->u.dst.rt6_next;
425
404 /* no entries matched; do round-robin */ 426 /* no entries matched; do round-robin */
405 static DEFINE_SPINLOCK(lock); 427 if (!next || next->rt6i_metric != rt0->rt6i_metric)
406 spin_lock(&lock); 428 next = fn->leaf;
407 *head = rt0->u.dst.rt6_next; 429
408 rt0->u.dst.rt6_next = last->u.dst.rt6_next; 430 if (next != rt0)
409 last->u.dst.rt6_next = rt0; 431 fn->rr_ptr = next;
410 spin_unlock(&lock);
411 } 432 }
412 433
413 RT6_TRACE("%s() => %p, score=%d\n", 434 RT6_TRACE("%s() => %p\n",
414 __FUNCTION__, match, mpri); 435 __FUNCTION__, match);
415 436
416 return (match ? match : &ip6_null_entry); 437 return (match ? match : &ip6_null_entry);
417} 438}
@@ -657,7 +678,7 @@ restart_2:
657 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); 678 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
658 679
659restart: 680restart:
660 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable); 681 rt = rt6_select(fn, fl->iif, strict | reachable);
661 BACKTRACK(&fl->fl6_src); 682 BACKTRACK(&fl->fl6_src);
662 if (rt == &ip6_null_entry || 683 if (rt == &ip6_null_entry ||
663 rt->rt6i_flags & RTF_CACHE) 684 rt->rt6i_flags & RTF_CACHE)
@@ -752,7 +773,7 @@ restart_2:
752 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); 773 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
753 774
754restart: 775restart:
755 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable); 776 rt = rt6_select(fn, fl->oif, strict | reachable);
756 BACKTRACK(&fl->fl6_src); 777 BACKTRACK(&fl->fl6_src);
757 if (rt == &ip6_null_entry || 778 if (rt == &ip6_null_entry ||
758 rt->rt6i_flags & RTF_CACHE) 779 rt->rt6i_flags & RTF_CACHE)