diff options
author | Nicolas Dichtel <nicolas.dichtel@6wind.com> | 2012-10-21 23:42:09 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-10-23 02:38:32 -0400 |
commit | 51ebd3181572af8d5076808dab2682d800f6da5d (patch) | |
tree | 41bce53f5ed82791ea5975f6ce708f54729659b4 /net/ipv6/route.c | |
parent | d94ce9b283736a876b2e6dec665c68e5e8b5d55e (diff) |
ipv6: add support of equal cost multipath (ECMP)
Each nexthop is added like a single route in the routing table. All routes
that have the same metric/weight and destination but not the same gateway
are considered ECMP routes. They are linked together through a list called
rt6i_siblings.
ECMP routes can be added in one shot, with the RTA_MULTIPATH attribute, or one
after the other (in both cases, the flag NLM_F_EXCL should not be set).
The patch is based on previous work by
Luc Saillard <luc.saillard@6wind.com>.
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6/route.c')
-rw-r--r-- | net/ipv6/route.c | 136 |
1 files changed, 133 insertions, 3 deletions
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7c7e963260e1..126da562d3eb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <net/xfrm.h> | 57 | #include <net/xfrm.h> |
58 | #include <net/netevent.h> | 58 | #include <net/netevent.h> |
59 | #include <net/netlink.h> | 59 | #include <net/netlink.h> |
60 | #include <net/nexthop.h> | ||
60 | 61 | ||
61 | #include <asm/uaccess.h> | 62 | #include <asm/uaccess.h> |
62 | 63 | ||
@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, | |||
289 | memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); | 290 | memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); |
290 | rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); | 291 | rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); |
291 | rt->rt6i_genid = rt_genid(net); | 292 | rt->rt6i_genid = rt_genid(net); |
293 | INIT_LIST_HEAD(&rt->rt6i_siblings); | ||
294 | rt->rt6i_nsiblings = 0; | ||
292 | } | 295 | } |
293 | return rt; | 296 | return rt; |
294 | } | 297 | } |
@@ -385,6 +388,69 @@ static bool rt6_need_strict(const struct in6_addr *daddr) | |||
385 | (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); | 388 | (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); |
386 | } | 389 | } |
387 | 390 | ||
391 | /* Multipath route selection: | ||
392 | * Hash based function using packet header and flowlabel. | ||
393 | * Adapted from fib_info_hashfn() | ||
394 | */ | ||
395 | static int rt6_info_hash_nhsfn(unsigned int candidate_count, | ||
396 | const struct flowi6 *fl6) | ||
397 | { | ||
398 | unsigned int val = fl6->flowi6_proto; | ||
399 | |||
400 | val ^= fl6->daddr.s6_addr32[0]; | ||
401 | val ^= fl6->daddr.s6_addr32[1]; | ||
402 | val ^= fl6->daddr.s6_addr32[2]; | ||
403 | val ^= fl6->daddr.s6_addr32[3]; | ||
404 | |||
405 | val ^= fl6->saddr.s6_addr32[0]; | ||
406 | val ^= fl6->saddr.s6_addr32[1]; | ||
407 | val ^= fl6->saddr.s6_addr32[2]; | ||
408 | val ^= fl6->saddr.s6_addr32[3]; | ||
409 | |||
410 | /* Work only if this not encapsulated */ | ||
411 | switch (fl6->flowi6_proto) { | ||
412 | case IPPROTO_UDP: | ||
413 | case IPPROTO_TCP: | ||
414 | case IPPROTO_SCTP: | ||
415 | val ^= fl6->fl6_sport; | ||
416 | val ^= fl6->fl6_dport; | ||
417 | break; | ||
418 | |||
419 | case IPPROTO_ICMPV6: | ||
420 | val ^= fl6->fl6_icmp_type; | ||
421 | val ^= fl6->fl6_icmp_code; | ||
422 | break; | ||
423 | } | ||
424 | /* RFC6438 recommands to use flowlabel */ | ||
425 | val ^= fl6->flowlabel; | ||
426 | |||
427 | /* Perhaps, we need to tune, this function? */ | ||
428 | val = val ^ (val >> 7) ^ (val >> 12); | ||
429 | return val % candidate_count; | ||
430 | } | ||
431 | |||
432 | static struct rt6_info *rt6_multipath_select(struct rt6_info *match, | ||
433 | struct flowi6 *fl6) | ||
434 | { | ||
435 | struct rt6_info *sibling, *next_sibling; | ||
436 | int route_choosen; | ||
437 | |||
438 | route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); | ||
439 | /* Don't change the route, if route_choosen == 0 | ||
440 | * (siblings does not include ourself) | ||
441 | */ | ||
442 | if (route_choosen) | ||
443 | list_for_each_entry_safe(sibling, next_sibling, | ||
444 | &match->rt6i_siblings, rt6i_siblings) { | ||
445 | route_choosen--; | ||
446 | if (route_choosen == 0) { | ||
447 | match = sibling; | ||
448 | break; | ||
449 | } | ||
450 | } | ||
451 | return match; | ||
452 | } | ||
453 | |||
388 | /* | 454 | /* |
389 | * Route lookup. Any table->tb6_lock is implied. | 455 | * Route lookup. Any table->tb6_lock is implied. |
390 | */ | 456 | */ |
@@ -702,6 +768,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, | |||
702 | restart: | 768 | restart: |
703 | rt = fn->leaf; | 769 | rt = fn->leaf; |
704 | rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); | 770 | rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); |
771 | if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) | ||
772 | rt = rt6_multipath_select(rt, fl6); | ||
705 | BACKTRACK(net, &fl6->saddr); | 773 | BACKTRACK(net, &fl6->saddr); |
706 | out: | 774 | out: |
707 | dst_use(&rt->dst, jiffies); | 775 | dst_use(&rt->dst, jiffies); |
@@ -863,7 +931,8 @@ restart_2: | |||
863 | 931 | ||
864 | restart: | 932 | restart: |
865 | rt = rt6_select(fn, oif, strict | reachable); | 933 | rt = rt6_select(fn, oif, strict | reachable); |
866 | 934 | if (rt->rt6i_nsiblings && oif == 0) | |
935 | rt = rt6_multipath_select(rt, fl6); | ||
867 | BACKTRACK(net, &fl6->saddr); | 936 | BACKTRACK(net, &fl6->saddr); |
868 | if (rt == net->ipv6.ip6_null_entry || | 937 | if (rt == net->ipv6.ip6_null_entry || |
869 | rt->rt6i_flags & RTF_CACHE) | 938 | rt->rt6i_flags & RTF_CACHE) |
@@ -2249,6 +2318,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { | |||
2249 | [RTA_IIF] = { .type = NLA_U32 }, | 2318 | [RTA_IIF] = { .type = NLA_U32 }, |
2250 | [RTA_PRIORITY] = { .type = NLA_U32 }, | 2319 | [RTA_PRIORITY] = { .type = NLA_U32 }, |
2251 | [RTA_METRICS] = { .type = NLA_NESTED }, | 2320 | [RTA_METRICS] = { .type = NLA_NESTED }, |
2321 | [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, | ||
2252 | }; | 2322 | }; |
2253 | 2323 | ||
2254 | static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, | 2324 | static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, |
@@ -2326,11 +2396,65 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, | |||
2326 | if (tb[RTA_TABLE]) | 2396 | if (tb[RTA_TABLE]) |
2327 | cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); | 2397 | cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); |
2328 | 2398 | ||
2399 | if (tb[RTA_MULTIPATH]) { | ||
2400 | cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); | ||
2401 | cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); | ||
2402 | } | ||
2403 | |||
2329 | err = 0; | 2404 | err = 0; |
2330 | errout: | 2405 | errout: |
2331 | return err; | 2406 | return err; |
2332 | } | 2407 | } |
2333 | 2408 | ||
2409 | static int ip6_route_multipath(struct fib6_config *cfg, int add) | ||
2410 | { | ||
2411 | struct fib6_config r_cfg; | ||
2412 | struct rtnexthop *rtnh; | ||
2413 | int remaining; | ||
2414 | int attrlen; | ||
2415 | int err = 0, last_err = 0; | ||
2416 | |||
2417 | beginning: | ||
2418 | rtnh = (struct rtnexthop *)cfg->fc_mp; | ||
2419 | remaining = cfg->fc_mp_len; | ||
2420 | |||
2421 | /* Parse a Multipath Entry */ | ||
2422 | while (rtnh_ok(rtnh, remaining)) { | ||
2423 | memcpy(&r_cfg, cfg, sizeof(*cfg)); | ||
2424 | if (rtnh->rtnh_ifindex) | ||
2425 | r_cfg.fc_ifindex = rtnh->rtnh_ifindex; | ||
2426 | |||
2427 | attrlen = rtnh_attrlen(rtnh); | ||
2428 | if (attrlen > 0) { | ||
2429 | struct nlattr *nla, *attrs = rtnh_attrs(rtnh); | ||
2430 | |||
2431 | nla = nla_find(attrs, attrlen, RTA_GATEWAY); | ||
2432 | if (nla) { | ||
2433 | nla_memcpy(&r_cfg.fc_gateway, nla, 16); | ||
2434 | r_cfg.fc_flags |= RTF_GATEWAY; | ||
2435 | } | ||
2436 | } | ||
2437 | err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); | ||
2438 | if (err) { | ||
2439 | last_err = err; | ||
2440 | /* If we are trying to remove a route, do not stop the | ||
2441 | * loop when ip6_route_del() fails (because next hop is | ||
2442 | * already gone), we should try to remove all next hops. | ||
2443 | */ | ||
2444 | if (add) { | ||
2445 | /* If add fails, we should try to delete all | ||
2446 | * next hops that have been already added. | ||
2447 | */ | ||
2448 | add = 0; | ||
2449 | goto beginning; | ||
2450 | } | ||
2451 | } | ||
2452 | rtnh = rtnh_next(rtnh, &remaining); | ||
2453 | } | ||
2454 | |||
2455 | return last_err; | ||
2456 | } | ||
2457 | |||
2334 | static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) | 2458 | static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) |
2335 | { | 2459 | { |
2336 | struct fib6_config cfg; | 2460 | struct fib6_config cfg; |
@@ -2340,7 +2464,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a | |||
2340 | if (err < 0) | 2464 | if (err < 0) |
2341 | return err; | 2465 | return err; |
2342 | 2466 | ||
2343 | return ip6_route_del(&cfg); | 2467 | if (cfg.fc_mp) |
2468 | return ip6_route_multipath(&cfg, 0); | ||
2469 | else | ||
2470 | return ip6_route_del(&cfg); | ||
2344 | } | 2471 | } |
2345 | 2472 | ||
2346 | static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) | 2473 | static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) |
@@ -2352,7 +2479,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a | |||
2352 | if (err < 0) | 2479 | if (err < 0) |
2353 | return err; | 2480 | return err; |
2354 | 2481 | ||
2355 | return ip6_route_add(&cfg); | 2482 | if (cfg.fc_mp) |
2483 | return ip6_route_multipath(&cfg, 1); | ||
2484 | else | ||
2485 | return ip6_route_add(&cfg); | ||
2356 | } | 2486 | } |
2357 | 2487 | ||
2358 | static inline size_t rt6_nlmsg_size(void) | 2488 | static inline size_t rt6_nlmsg_size(void) |