summary | refs | log | tree | commit | diff | stats
path: root/net
diff options
context:
space:
mode:
author: David Ahern <dsa@cumulusnetworks.com> 2016-04-07 10:21:00 -0400
committer: David S. Miller <davem@davemloft.net> 2016-04-11 15:16:13 -0400
commit: a6db4494d218c2e559173661ee972e048dc04fdd (patch)
tree: b3035e7332e6e3335ddc8d1dbd6e35a8893ae947 /net
parent: 0b0e30c650e4345a50c417c397c7ecb63206a611 (diff)
net: ipv4: Consider failed nexthops in multipath routes
Multipath route lookups should consider knowledge about next hops and not
select a hop that is known to be failed.

Example:

            [h2]                   [h3]   15.0.0.5
             |                      |
            3|                     3|
           [SP1]                  [SP2]--+
            1  2                   1     2
            |  |     /-------------+     |
            |   \   /                    |
            |     X                      |
            |    / \                     |
            |   /   \---------------\    |
            1  2                     1   2
12.0.0.2  [TOR1] 3-----------------3 [TOR2] 12.0.0.3
            4                         4
             \                       /
               \                    /
                \                  /
                 -------|   |-----/
                        1   2
                       [TOR3]
                         3|
                          |
                         [h1]  12.0.0.1

host h1 with IP 12.0.0.1 has 2 paths to host h3 at 15.0.0.5:

    root@h1:~# ip ro ls
    ...
    12.0.0.0/24 dev swp1 proto kernel scope link src 12.0.0.1
    15.0.0.0/16
            nexthop via 12.0.0.2 dev swp1 weight 1
            nexthop via 12.0.0.3 dev swp1 weight 1
    ...

If the link between tor3 and tor1 is down and the link between tor1 and
tor2 is down as well, then tor1 is effectively cut off from h1. Yet the
route lookups in h1 are alternating between the 2 routes: ping 15.0.0.5
gets one and ssh 15.0.0.5 gets the other. Connections that attempt to use
the 12.0.0.2 nexthop fail since that neighbor is not reachable:

    root@h1:~# ip neigh show
    ...
    12.0.0.3 dev swp1 lladdr 00:02:00:00:00:1b REACHABLE
    12.0.0.2 dev swp1 FAILED
    ...

The failed path can be avoided by considering known neighbor information
when selecting next hops. If the neighbor lookup fails we have no
knowledge about the nexthop, so give it a shot. If there is an entry,
then only select the nexthop if the state is sane. This is similar to
what fib_detect_death does.

To maintain backward compatibility, use of the neighbor information is
based on a new sysctl, fib_multipath_use_neigh.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--  net/ipv4/fib_semantics.c    34
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  11
2 files changed, 40 insertions, 5 deletions
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d97268e8ff10..ab64d9f2eef9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1559,21 +1559,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
1559} 1559}
1560 1560
1561#ifdef CONFIG_IP_ROUTE_MULTIPATH 1561#ifdef CONFIG_IP_ROUTE_MULTIPATH
1562static bool fib_good_nh(const struct fib_nh *nh)
1563{
1564 int state = NUD_REACHABLE;
1565
1566 if (nh->nh_scope == RT_SCOPE_LINK) {
1567 struct neighbour *n;
1568
1569 rcu_read_lock_bh();
1570
1571 n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
1572 if (n)
1573 state = n->nud_state;
1574
1575 rcu_read_unlock_bh();
1576 }
1577
1578 return !!(state & NUD_VALID);
1579}
1562 1580
1563void fib_select_multipath(struct fib_result *res, int hash) 1581void fib_select_multipath(struct fib_result *res, int hash)
1564{ 1582{
1565 struct fib_info *fi = res->fi; 1583 struct fib_info *fi = res->fi;
1584 struct net *net = fi->fib_net;
1585 bool first = false;
1566 1586
1567 for_nexthops(fi) { 1587 for_nexthops(fi) {
1568 if (hash > atomic_read(&nh->nh_upper_bound)) 1588 if (hash > atomic_read(&nh->nh_upper_bound))
1569 continue; 1589 continue;
1570 1590
1571 res->nh_sel = nhsel; 1591 if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
1572 return; 1592 fib_good_nh(nh)) {
1593 res->nh_sel = nhsel;
1594 return;
1595 }
1596 if (!first) {
1597 res->nh_sel = nhsel;
1598 first = true;
1599 }
1573 } endfor_nexthops(fi); 1600 } endfor_nexthops(fi);
1574
1575 /* Race condition: route has just become dead. */
1576 res->nh_sel = 0;
1577} 1601}
1578#endif 1602#endif
1579 1603
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1e1fe6086dd9..bb0419582b8d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
960 .mode = 0644, 960 .mode = 0644,
961 .proc_handler = proc_dointvec, 961 .proc_handler = proc_dointvec,
962 }, 962 },
963#ifdef CONFIG_IP_ROUTE_MULTIPATH
964 {
965 .procname = "fib_multipath_use_neigh",
966 .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
967 .maxlen = sizeof(int),
968 .mode = 0644,
969 .proc_handler = proc_dointvec_minmax,
970 .extra1 = &zero,
971 .extra2 = &one,
972 },
973#endif
963 { } 974 { }
964}; 975};
965 976