aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv6
diff options
context:
space:
mode:
authorNicolas Dichtel <nicolas.dichtel@6wind.com>2012-10-21 23:42:09 -0400
committerDavid S. Miller <davem@davemloft.net>2012-10-23 02:38:32 -0400
commit51ebd3181572af8d5076808dab2682d800f6da5d (patch)
tree41bce53f5ed82791ea5975f6ce708f54729659b4 /net/ipv6
parentd94ce9b283736a876b2e6dec665c68e5e8b5d55e (diff)
ipv6: add support of equal cost multipath (ECMP)
Each nexthop is added as a single route in the routing table. All routes that have the same metric/weight and destination but different gateways are considered ECMP routes. They are linked together through a list called rt6i_siblings. ECMP routes can be added in one shot, with the RTA_MULTIPATH attribute, or one after the other (in both cases, the flag NLM_F_EXCL should not be set). The patch is based on previous work from Luc Saillard <luc.saillard@6wind.com>. Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6')
-rw-r--r--net/ipv6/ip6_fib.c57
-rw-r--r--net/ipv6/route.c136
2 files changed, 190 insertions, 3 deletions
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 24995a93ef8c..710cafd2e1a9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -672,6 +672,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
672 iter->rt6i_idev == rt->rt6i_idev && 672 iter->rt6i_idev == rt->rt6i_idev &&
673 ipv6_addr_equal(&iter->rt6i_gateway, 673 ipv6_addr_equal(&iter->rt6i_gateway,
674 &rt->rt6i_gateway)) { 674 &rt->rt6i_gateway)) {
675 if (rt->rt6i_nsiblings)
676 rt->rt6i_nsiblings = 0;
675 if (!(iter->rt6i_flags & RTF_EXPIRES)) 677 if (!(iter->rt6i_flags & RTF_EXPIRES))
676 return -EEXIST; 678 return -EEXIST;
677 if (!(rt->rt6i_flags & RTF_EXPIRES)) 679 if (!(rt->rt6i_flags & RTF_EXPIRES))
@@ -680,6 +682,21 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
680 rt6_set_expires(iter, rt->dst.expires); 682 rt6_set_expires(iter, rt->dst.expires);
681 return -EEXIST; 683 return -EEXIST;
682 } 684 }
685 /* If we have the same destination and the same metric,
686 * but not the same gateway, then the route we try to
687 * add is sibling to this route, increment our counter
688 * of siblings, and later we will add our route to the
689 * list.
690 * Only static routes (which don't have flag
691 * RTF_EXPIRES) are used for ECMPv6.
692 *
693 * To avoid long list, we only had siblings if the
694 * route have a gateway.
695 */
696 if (rt->rt6i_flags & RTF_GATEWAY &&
697 !(rt->rt6i_flags & RTF_EXPIRES) &&
698 !(iter->rt6i_flags & RTF_EXPIRES))
699 rt->rt6i_nsiblings++;
683 } 700 }
684 701
685 if (iter->rt6i_metric > rt->rt6i_metric) 702 if (iter->rt6i_metric > rt->rt6i_metric)
@@ -692,6 +709,35 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
692 if (ins == &fn->leaf) 709 if (ins == &fn->leaf)
693 fn->rr_ptr = NULL; 710 fn->rr_ptr = NULL;
694 711
712 /* Link this route to others same route. */
713 if (rt->rt6i_nsiblings) {
714 unsigned int rt6i_nsiblings;
715 struct rt6_info *sibling, *temp_sibling;
716
717 /* Find the first route that have the same metric */
718 sibling = fn->leaf;
719 while (sibling) {
720 if (sibling->rt6i_metric == rt->rt6i_metric) {
721 list_add_tail(&rt->rt6i_siblings,
722 &sibling->rt6i_siblings);
723 break;
724 }
725 sibling = sibling->dst.rt6_next;
726 }
727 /* For each sibling in the list, increment the counter of
728 * siblings. BUG() if counters does not match, list of siblings
729 * is broken!
730 */
731 rt6i_nsiblings = 0;
732 list_for_each_entry_safe(sibling, temp_sibling,
733 &rt->rt6i_siblings, rt6i_siblings) {
734 sibling->rt6i_nsiblings++;
735 BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
736 rt6i_nsiblings++;
737 }
738 BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
739 }
740
695 /* 741 /*
696 * insert node 742 * insert node
697 */ 743 */
@@ -1193,6 +1239,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1193 if (fn->rr_ptr == rt) 1239 if (fn->rr_ptr == rt)
1194 fn->rr_ptr = NULL; 1240 fn->rr_ptr = NULL;
1195 1241
1242 /* Remove this entry from other siblings */
1243 if (rt->rt6i_nsiblings) {
1244 struct rt6_info *sibling, *next_sibling;
1245
1246 list_for_each_entry_safe(sibling, next_sibling,
1247 &rt->rt6i_siblings, rt6i_siblings)
1248 sibling->rt6i_nsiblings--;
1249 rt->rt6i_nsiblings = 0;
1250 list_del_init(&rt->rt6i_siblings);
1251 }
1252
1196 /* Adjust walkers */ 1253 /* Adjust walkers */
1197 read_lock(&fib6_walker_lock); 1254 read_lock(&fib6_walker_lock);
1198 FOR_WALKERS(w) { 1255 FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c7e963260e1..126da562d3eb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -57,6 +57,7 @@
57#include <net/xfrm.h> 57#include <net/xfrm.h>
58#include <net/netevent.h> 58#include <net/netevent.h>
59#include <net/netlink.h> 59#include <net/netlink.h>
60#include <net/nexthop.h>
60 61
61#include <asm/uaccess.h> 62#include <asm/uaccess.h>
62 63
@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
289 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); 290 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
290 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); 291 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
291 rt->rt6i_genid = rt_genid(net); 292 rt->rt6i_genid = rt_genid(net);
293 INIT_LIST_HEAD(&rt->rt6i_siblings);
294 rt->rt6i_nsiblings = 0;
292 } 295 }
293 return rt; 296 return rt;
294} 297}
@@ -385,6 +388,69 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
385 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); 388 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
386} 389}
387 390
391/* Multipath route selection:
392 * Hash based function using packet header and flowlabel.
393 * Adapted from fib_info_hashfn()
394 */
395static int rt6_info_hash_nhsfn(unsigned int candidate_count,
396 const struct flowi6 *fl6)
397{
398 unsigned int val = fl6->flowi6_proto;
399
400 val ^= fl6->daddr.s6_addr32[0];
401 val ^= fl6->daddr.s6_addr32[1];
402 val ^= fl6->daddr.s6_addr32[2];
403 val ^= fl6->daddr.s6_addr32[3];
404
405 val ^= fl6->saddr.s6_addr32[0];
406 val ^= fl6->saddr.s6_addr32[1];
407 val ^= fl6->saddr.s6_addr32[2];
408 val ^= fl6->saddr.s6_addr32[3];
409
410 /* Work only if this not encapsulated */
411 switch (fl6->flowi6_proto) {
412 case IPPROTO_UDP:
413 case IPPROTO_TCP:
414 case IPPROTO_SCTP:
415 val ^= fl6->fl6_sport;
416 val ^= fl6->fl6_dport;
417 break;
418
419 case IPPROTO_ICMPV6:
420 val ^= fl6->fl6_icmp_type;
421 val ^= fl6->fl6_icmp_code;
422 break;
423 }
424 /* RFC6438 recommands to use flowlabel */
425 val ^= fl6->flowlabel;
426
427 /* Perhaps, we need to tune, this function? */
428 val = val ^ (val >> 7) ^ (val >> 12);
429 return val % candidate_count;
430}
431
432static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
433 struct flowi6 *fl6)
434{
435 struct rt6_info *sibling, *next_sibling;
436 int route_choosen;
437
438 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
439 /* Don't change the route, if route_choosen == 0
440 * (siblings does not include ourself)
441 */
442 if (route_choosen)
443 list_for_each_entry_safe(sibling, next_sibling,
444 &match->rt6i_siblings, rt6i_siblings) {
445 route_choosen--;
446 if (route_choosen == 0) {
447 match = sibling;
448 break;
449 }
450 }
451 return match;
452}
453
388/* 454/*
389 * Route lookup. Any table->tb6_lock is implied. 455 * Route lookup. Any table->tb6_lock is implied.
390 */ 456 */
@@ -702,6 +768,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
702restart: 768restart:
703 rt = fn->leaf; 769 rt = fn->leaf;
704 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); 770 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
771 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
772 rt = rt6_multipath_select(rt, fl6);
705 BACKTRACK(net, &fl6->saddr); 773 BACKTRACK(net, &fl6->saddr);
706out: 774out:
707 dst_use(&rt->dst, jiffies); 775 dst_use(&rt->dst, jiffies);
@@ -863,7 +931,8 @@ restart_2:
863 931
864restart: 932restart:
865 rt = rt6_select(fn, oif, strict | reachable); 933 rt = rt6_select(fn, oif, strict | reachable);
866 934 if (rt->rt6i_nsiblings && oif == 0)
935 rt = rt6_multipath_select(rt, fl6);
867 BACKTRACK(net, &fl6->saddr); 936 BACKTRACK(net, &fl6->saddr);
868 if (rt == net->ipv6.ip6_null_entry || 937 if (rt == net->ipv6.ip6_null_entry ||
869 rt->rt6i_flags & RTF_CACHE) 938 rt->rt6i_flags & RTF_CACHE)
@@ -2249,6 +2318,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2249 [RTA_IIF] = { .type = NLA_U32 }, 2318 [RTA_IIF] = { .type = NLA_U32 },
2250 [RTA_PRIORITY] = { .type = NLA_U32 }, 2319 [RTA_PRIORITY] = { .type = NLA_U32 },
2251 [RTA_METRICS] = { .type = NLA_NESTED }, 2320 [RTA_METRICS] = { .type = NLA_NESTED },
2321 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2252}; 2322};
2253 2323
2254static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2324static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2326,11 +2396,65 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2326 if (tb[RTA_TABLE]) 2396 if (tb[RTA_TABLE])
2327 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 2397 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2328 2398
2399 if (tb[RTA_MULTIPATH]) {
2400 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2401 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2402 }
2403
2329 err = 0; 2404 err = 0;
2330errout: 2405errout:
2331 return err; 2406 return err;
2332} 2407}
2333 2408
2409static int ip6_route_multipath(struct fib6_config *cfg, int add)
2410{
2411 struct fib6_config r_cfg;
2412 struct rtnexthop *rtnh;
2413 int remaining;
2414 int attrlen;
2415 int err = 0, last_err = 0;
2416
2417beginning:
2418 rtnh = (struct rtnexthop *)cfg->fc_mp;
2419 remaining = cfg->fc_mp_len;
2420
2421 /* Parse a Multipath Entry */
2422 while (rtnh_ok(rtnh, remaining)) {
2423 memcpy(&r_cfg, cfg, sizeof(*cfg));
2424 if (rtnh->rtnh_ifindex)
2425 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2426
2427 attrlen = rtnh_attrlen(rtnh);
2428 if (attrlen > 0) {
2429 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2430
2431 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2432 if (nla) {
2433 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2434 r_cfg.fc_flags |= RTF_GATEWAY;
2435 }
2436 }
2437 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2438 if (err) {
2439 last_err = err;
2440 /* If we are trying to remove a route, do not stop the
2441 * loop when ip6_route_del() fails (because next hop is
2442 * already gone), we should try to remove all next hops.
2443 */
2444 if (add) {
2445 /* If add fails, we should try to delete all
2446 * next hops that have been already added.
2447 */
2448 add = 0;
2449 goto beginning;
2450 }
2451 }
2452 rtnh = rtnh_next(rtnh, &remaining);
2453 }
2454
2455 return last_err;
2456}
2457
2334static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 2458static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2335{ 2459{
2336 struct fib6_config cfg; 2460 struct fib6_config cfg;
@@ -2340,7 +2464,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
2340 if (err < 0) 2464 if (err < 0)
2341 return err; 2465 return err;
2342 2466
2343 return ip6_route_del(&cfg); 2467 if (cfg.fc_mp)
2468 return ip6_route_multipath(&cfg, 0);
2469 else
2470 return ip6_route_del(&cfg);
2344} 2471}
2345 2472
2346static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 2473static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -2352,7 +2479,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
2352 if (err < 0) 2479 if (err < 0)
2353 return err; 2480 return err;
2354 2481
2355 return ip6_route_add(&cfg); 2482 if (cfg.fc_mp)
2483 return ip6_route_multipath(&cfg, 1);
2484 else
2485 return ip6_route_add(&cfg);
2356} 2486}
2357 2487
2358static inline size_t rt6_nlmsg_size(void) 2488static inline size_t rt6_nlmsg_size(void)