diff options
author | Nicolas Dichtel <nicolas.dichtel@6wind.com> | 2012-10-21 23:42:09 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-10-23 02:38:32 -0400 |
commit | 51ebd3181572af8d5076808dab2682d800f6da5d (patch) | |
tree | 41bce53f5ed82791ea5975f6ce708f54729659b4 | |
parent | d94ce9b283736a876b2e6dec665c68e5e8b5d55e (diff) |
ipv6: add support of equal cost multipath (ECMP)
Each nexthop is added as a single route in the routing table. All routes
that have the same metric/weight and destination but not the same gateway
are considered ECMP routes. They are linked together through a list called
rt6i_siblings.
ECMP routes can be added in one shot, with the RTA_MULTIPATH attribute, or one after
the other (in both cases, the flag NLM_F_EXCL should not be set).
The patch is based on previous work by
Luc Saillard <luc.saillard@6wind.com>.
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/ip6_fib.h | 10 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 57 | ||||
-rw-r--r-- | net/ipv6/route.c | 136 |
3 files changed, 200 insertions, 3 deletions
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 8a2a203eb15d..20210d79e36a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h | |||
@@ -47,6 +47,8 @@ struct fib6_config { | |||
47 | unsigned long fc_expires; | 47 | unsigned long fc_expires; |
48 | struct nlattr *fc_mx; | 48 | struct nlattr *fc_mx; |
49 | int fc_mx_len; | 49 | int fc_mx_len; |
50 | int fc_mp_len; | ||
51 | struct nlattr *fc_mp; | ||
50 | 52 | ||
51 | struct nl_info fc_nlinfo; | 53 | struct nl_info fc_nlinfo; |
52 | }; | 54 | }; |
@@ -99,6 +101,14 @@ struct rt6_info { | |||
99 | 101 | ||
100 | struct in6_addr rt6i_gateway; | 102 | struct in6_addr rt6i_gateway; |
101 | 103 | ||
104 | /* Multipath routes: | ||
105 | * siblings is a list of rt6_info that have the same metric/weight, | ||
106 | * destination, but not the same gateway. nsiblings is just a cache | ||
107 | * to speed up lookup. | ||
108 | */ | ||
109 | struct list_head rt6i_siblings; | ||
110 | unsigned int rt6i_nsiblings; | ||
111 | |||
102 | atomic_t rt6i_ref; | 112 | atomic_t rt6i_ref; |
103 | 113 | ||
104 | /* These are in a separate cache line. */ | 114 | /* These are in a separate cache line. */ |
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 24995a93ef8c..710cafd2e1a9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c | |||
@@ -672,6 +672,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, | |||
672 | iter->rt6i_idev == rt->rt6i_idev && | 672 | iter->rt6i_idev == rt->rt6i_idev && |
673 | ipv6_addr_equal(&iter->rt6i_gateway, | 673 | ipv6_addr_equal(&iter->rt6i_gateway, |
674 | &rt->rt6i_gateway)) { | 674 | &rt->rt6i_gateway)) { |
675 | if (rt->rt6i_nsiblings) | ||
676 | rt->rt6i_nsiblings = 0; | ||
675 | if (!(iter->rt6i_flags & RTF_EXPIRES)) | 677 | if (!(iter->rt6i_flags & RTF_EXPIRES)) |
676 | return -EEXIST; | 678 | return -EEXIST; |
677 | if (!(rt->rt6i_flags & RTF_EXPIRES)) | 679 | if (!(rt->rt6i_flags & RTF_EXPIRES)) |
@@ -680,6 +682,21 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, | |||
680 | rt6_set_expires(iter, rt->dst.expires); | 682 | rt6_set_expires(iter, rt->dst.expires); |
681 | return -EEXIST; | 683 | return -EEXIST; |
682 | } | 684 | } |
685 | /* If we have the same destination and the same metric, | ||
686 | * but not the same gateway, then the route we try to | ||
687 | * add is sibling to this route, increment our counter | ||
688 | * of siblings, and later we will add our route to the | ||
689 | * list. | ||
690 | * Only static routes (which don't have flag | ||
691 | * RTF_EXPIRES) are used for ECMPv6. | ||
692 | * | ||
693 | * To avoid long lists, we only add siblings if the | ||
694 | * route has a gateway. | ||
695 | */ | ||
696 | if (rt->rt6i_flags & RTF_GATEWAY && | ||
697 | !(rt->rt6i_flags & RTF_EXPIRES) && | ||
698 | !(iter->rt6i_flags & RTF_EXPIRES)) | ||
699 | rt->rt6i_nsiblings++; | ||
683 | } | 700 | } |
684 | 701 | ||
685 | if (iter->rt6i_metric > rt->rt6i_metric) | 702 | if (iter->rt6i_metric > rt->rt6i_metric) |
@@ -692,6 +709,35 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, | |||
692 | if (ins == &fn->leaf) | 709 | if (ins == &fn->leaf) |
693 | fn->rr_ptr = NULL; | 710 | fn->rr_ptr = NULL; |
694 | 711 | ||
712 | /* Link this route to its sibling routes. */ | ||
713 | if (rt->rt6i_nsiblings) { | ||
714 | unsigned int rt6i_nsiblings; | ||
715 | struct rt6_info *sibling, *temp_sibling; | ||
716 | |||
717 | /* Find the first route that has the same metric */ | ||
718 | sibling = fn->leaf; | ||
719 | while (sibling) { | ||
720 | if (sibling->rt6i_metric == rt->rt6i_metric) { | ||
721 | list_add_tail(&rt->rt6i_siblings, | ||
722 | &sibling->rt6i_siblings); | ||
723 | break; | ||
724 | } | ||
725 | sibling = sibling->dst.rt6_next; | ||
726 | } | ||
727 | /* For each sibling in the list, increment the counter of | ||
728 | * siblings. BUG() if the counters do not match: the list of siblings | ||
729 | * is broken! | ||
730 | */ | ||
731 | rt6i_nsiblings = 0; | ||
732 | list_for_each_entry_safe(sibling, temp_sibling, | ||
733 | &rt->rt6i_siblings, rt6i_siblings) { | ||
734 | sibling->rt6i_nsiblings++; | ||
735 | BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); | ||
736 | rt6i_nsiblings++; | ||
737 | } | ||
738 | BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); | ||
739 | } | ||
740 | |||
695 | /* | 741 | /* |
696 | * insert node | 742 | * insert node |
697 | */ | 743 | */ |
@@ -1193,6 +1239,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, | |||
1193 | if (fn->rr_ptr == rt) | 1239 | if (fn->rr_ptr == rt) |
1194 | fn->rr_ptr = NULL; | 1240 | fn->rr_ptr = NULL; |
1195 | 1241 | ||
1242 | /* Remove this entry from other siblings */ | ||
1243 | if (rt->rt6i_nsiblings) { | ||
1244 | struct rt6_info *sibling, *next_sibling; | ||
1245 | |||
1246 | list_for_each_entry_safe(sibling, next_sibling, | ||
1247 | &rt->rt6i_siblings, rt6i_siblings) | ||
1248 | sibling->rt6i_nsiblings--; | ||
1249 | rt->rt6i_nsiblings = 0; | ||
1250 | list_del_init(&rt->rt6i_siblings); | ||
1251 | } | ||
1252 | |||
1196 | /* Adjust walkers */ | 1253 | /* Adjust walkers */ |
1197 | read_lock(&fib6_walker_lock); | 1254 | read_lock(&fib6_walker_lock); |
1198 | FOR_WALKERS(w) { | 1255 | FOR_WALKERS(w) { |
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7c7e963260e1..126da562d3eb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <net/xfrm.h> | 57 | #include <net/xfrm.h> |
58 | #include <net/netevent.h> | 58 | #include <net/netevent.h> |
59 | #include <net/netlink.h> | 59 | #include <net/netlink.h> |
60 | #include <net/nexthop.h> | ||
60 | 61 | ||
61 | #include <asm/uaccess.h> | 62 | #include <asm/uaccess.h> |
62 | 63 | ||
@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, | |||
289 | memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); | 290 | memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); |
290 | rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); | 291 | rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); |
291 | rt->rt6i_genid = rt_genid(net); | 292 | rt->rt6i_genid = rt_genid(net); |
293 | INIT_LIST_HEAD(&rt->rt6i_siblings); | ||
294 | rt->rt6i_nsiblings = 0; | ||
292 | } | 295 | } |
293 | return rt; | 296 | return rt; |
294 | } | 297 | } |
@@ -385,6 +388,69 @@ static bool rt6_need_strict(const struct in6_addr *daddr) | |||
385 | (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); | 388 | (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); |
386 | } | 389 | } |
387 | 390 | ||
391 | /* Multipath route selection: | ||
392 | * Hash based function using packet header and flowlabel. | ||
393 | * Adapted from fib_info_hashfn() | ||
394 | */ | ||
395 | static int rt6_info_hash_nhsfn(unsigned int candidate_count, | ||
396 | const struct flowi6 *fl6) | ||
397 | { | ||
398 | unsigned int val = fl6->flowi6_proto; | ||
399 | |||
400 | val ^= fl6->daddr.s6_addr32[0]; | ||
401 | val ^= fl6->daddr.s6_addr32[1]; | ||
402 | val ^= fl6->daddr.s6_addr32[2]; | ||
403 | val ^= fl6->daddr.s6_addr32[3]; | ||
404 | |||
405 | val ^= fl6->saddr.s6_addr32[0]; | ||
406 | val ^= fl6->saddr.s6_addr32[1]; | ||
407 | val ^= fl6->saddr.s6_addr32[2]; | ||
408 | val ^= fl6->saddr.s6_addr32[3]; | ||
409 | |||
410 | /* Works only if the packet is not encapsulated */ | ||
411 | switch (fl6->flowi6_proto) { | ||
412 | case IPPROTO_UDP: | ||
413 | case IPPROTO_TCP: | ||
414 | case IPPROTO_SCTP: | ||
415 | val ^= fl6->fl6_sport; | ||
416 | val ^= fl6->fl6_dport; | ||
417 | break; | ||
418 | |||
419 | case IPPROTO_ICMPV6: | ||
420 | val ^= fl6->fl6_icmp_type; | ||
421 | val ^= fl6->fl6_icmp_code; | ||
422 | break; | ||
423 | } | ||
424 | /* RFC 6438 recommends using the flow label */ | ||
425 | val ^= fl6->flowlabel; | ||
426 | |||
427 | /* Perhaps we need to tune this function? */ | ||
428 | val = val ^ (val >> 7) ^ (val >> 12); | ||
429 | return val % candidate_count; | ||
430 | } | ||
431 | |||
432 | static struct rt6_info *rt6_multipath_select(struct rt6_info *match, | ||
433 | struct flowi6 *fl6) | ||
434 | { | ||
435 | struct rt6_info *sibling, *next_sibling; | ||
436 | int route_choosen; | ||
437 | |||
438 | route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); | ||
439 | /* Don't change the route if route_choosen == 0 | ||
440 | * (the sibling list does not include ourselves) | ||
441 | */ | ||
442 | if (route_choosen) | ||
443 | list_for_each_entry_safe(sibling, next_sibling, | ||
444 | &match->rt6i_siblings, rt6i_siblings) { | ||
445 | route_choosen--; | ||
446 | if (route_choosen == 0) { | ||
447 | match = sibling; | ||
448 | break; | ||
449 | } | ||
450 | } | ||
451 | return match; | ||
452 | } | ||
453 | |||
388 | /* | 454 | /* |
389 | * Route lookup. Any table->tb6_lock is implied. | 455 | * Route lookup. Any table->tb6_lock is implied. |
390 | */ | 456 | */ |
@@ -702,6 +768,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, | |||
702 | restart: | 768 | restart: |
703 | rt = fn->leaf; | 769 | rt = fn->leaf; |
704 | rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); | 770 | rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); |
771 | if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) | ||
772 | rt = rt6_multipath_select(rt, fl6); | ||
705 | BACKTRACK(net, &fl6->saddr); | 773 | BACKTRACK(net, &fl6->saddr); |
706 | out: | 774 | out: |
707 | dst_use(&rt->dst, jiffies); | 775 | dst_use(&rt->dst, jiffies); |
@@ -863,7 +931,8 @@ restart_2: | |||
863 | 931 | ||
864 | restart: | 932 | restart: |
865 | rt = rt6_select(fn, oif, strict | reachable); | 933 | rt = rt6_select(fn, oif, strict | reachable); |
866 | 934 | if (rt->rt6i_nsiblings && oif == 0) | |
935 | rt = rt6_multipath_select(rt, fl6); | ||
867 | BACKTRACK(net, &fl6->saddr); | 936 | BACKTRACK(net, &fl6->saddr); |
868 | if (rt == net->ipv6.ip6_null_entry || | 937 | if (rt == net->ipv6.ip6_null_entry || |
869 | rt->rt6i_flags & RTF_CACHE) | 938 | rt->rt6i_flags & RTF_CACHE) |
@@ -2249,6 +2318,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { | |||
2249 | [RTA_IIF] = { .type = NLA_U32 }, | 2318 | [RTA_IIF] = { .type = NLA_U32 }, |
2250 | [RTA_PRIORITY] = { .type = NLA_U32 }, | 2319 | [RTA_PRIORITY] = { .type = NLA_U32 }, |
2251 | [RTA_METRICS] = { .type = NLA_NESTED }, | 2320 | [RTA_METRICS] = { .type = NLA_NESTED }, |
2321 | [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, | ||
2252 | }; | 2322 | }; |
2253 | 2323 | ||
2254 | static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, | 2324 | static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, |
@@ -2326,11 +2396,65 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, | |||
2326 | if (tb[RTA_TABLE]) | 2396 | if (tb[RTA_TABLE]) |
2327 | cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); | 2397 | cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); |
2328 | 2398 | ||
2399 | if (tb[RTA_MULTIPATH]) { | ||
2400 | cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); | ||
2401 | cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); | ||
2402 | } | ||
2403 | |||
2329 | err = 0; | 2404 | err = 0; |
2330 | errout: | 2405 | errout: |
2331 | return err; | 2406 | return err; |
2332 | } | 2407 | } |
2333 | 2408 | ||
2409 | static int ip6_route_multipath(struct fib6_config *cfg, int add) | ||
2410 | { | ||
2411 | struct fib6_config r_cfg; | ||
2412 | struct rtnexthop *rtnh; | ||
2413 | int remaining; | ||
2414 | int attrlen; | ||
2415 | int err = 0, last_err = 0; | ||
2416 | |||
2417 | beginning: | ||
2418 | rtnh = (struct rtnexthop *)cfg->fc_mp; | ||
2419 | remaining = cfg->fc_mp_len; | ||
2420 | |||
2421 | /* Parse a Multipath Entry */ | ||
2422 | while (rtnh_ok(rtnh, remaining)) { | ||
2423 | memcpy(&r_cfg, cfg, sizeof(*cfg)); | ||
2424 | if (rtnh->rtnh_ifindex) | ||
2425 | r_cfg.fc_ifindex = rtnh->rtnh_ifindex; | ||
2426 | |||
2427 | attrlen = rtnh_attrlen(rtnh); | ||
2428 | if (attrlen > 0) { | ||
2429 | struct nlattr *nla, *attrs = rtnh_attrs(rtnh); | ||
2430 | |||
2431 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); | ||
2432 | if (nla) { | ||
2433 | nla_memcpy(&r_cfg.fc_gateway, nla, 16); | ||
2434 | r_cfg.fc_flags |= RTF_GATEWAY; | ||
2435 | } | ||
2436 | } | ||
2437 | err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); | ||
2438 | if (err) { | ||
2439 | last_err = err; | ||
2440 | /* If we are trying to remove a route, do not stop the | ||
2441 | * loop when ip6_route_del() fails (because next hop is | ||
2442 | * already gone), we should try to remove all next hops. | ||
2443 | */ | ||
2444 | if (add) { | ||
2445 | /* If add fails, we should try to delete all | ||
2446 | * next hops that have been already added. | ||
2447 | */ | ||
2448 | add = 0; | ||
2449 | goto beginning; | ||
2450 | } | ||
2451 | } | ||
2452 | rtnh = rtnh_next(rtnh, &remaining); | ||
2453 | } | ||
2454 | |||
2455 | return last_err; | ||
2456 | } | ||
2457 | |||
2334 | static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) | 2458 | static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) |
2335 | { | 2459 | { |
2336 | struct fib6_config cfg; | 2460 | struct fib6_config cfg; |
@@ -2340,7 +2464,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a | |||
2340 | if (err < 0) | 2464 | if (err < 0) |
2341 | return err; | 2465 | return err; |
2342 | 2466 | ||
2343 | return ip6_route_del(&cfg); | 2467 | if (cfg.fc_mp) |
2468 | return ip6_route_multipath(&cfg, 0); | ||
2469 | else | ||
2470 | return ip6_route_del(&cfg); | ||
2344 | } | 2471 | } |
2345 | 2472 | ||
2346 | static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) | 2473 | static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) |
@@ -2352,7 +2479,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a | |||
2352 | if (err < 0) | 2479 | if (err < 0) |
2353 | return err; | 2480 | return err; |
2354 | 2481 | ||
2355 | return ip6_route_add(&cfg); | 2482 | if (cfg.fc_mp) |
2483 | return ip6_route_multipath(&cfg, 1); | ||
2484 | else | ||
2485 | return ip6_route_add(&cfg); | ||
2356 | } | 2486 | } |
2357 | 2487 | ||
2358 | static inline size_t rt6_nlmsg_size(void) | 2488 | static inline size_t rt6_nlmsg_size(void) |