diff options
author | David Ahern <dsahern@gmail.com> | 2018-03-02 11:32:18 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-03-04 13:04:23 -0500 |
commit | b4bac172e90ce4a93df8adf44eb70d91b9d611eb (patch) | |
tree | 875d572b15ee7f2ea7a06dcf5be4faa6c0865967 | |
parent | b75cc8f90f07342467b3bd51dbc0054f185032c9 (diff) |
net/ipv6: Add support for path selection using hash of 5-tuple
Some operators prefer IPv6 path selection to use a standard 5-tuple
hash rather than just an L3 hash with the flow label. To that end,
add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb
("net: ipv4: add support for ECMP hash policy choice"). The default
is still L3, which covers source and destination addresses along with
the flow label and IPv6 protocol.
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 7 | ||||
-rw-r--r-- | include/net/ip6_route.h | 4 | ||||
-rw-r--r-- | include/net/netevent.h | 1 | ||||
-rw-r--r-- | include/net/netns/ipv6.h | 1 | ||||
-rw-r--r-- | net/ipv6/icmp.c | 2 | ||||
-rw-r--r-- | net/ipv6/route.c | 68 | ||||
-rw-r--r-- | net/ipv6/sysctl_net_ipv6.c | 27 |
7 files changed, 91 insertions, 19 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a553d4e4a0fb..783675a730e5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN | |||
1363 | FALSE: disabled | 1363 | FALSE: disabled |
1364 | Default: FALSE | 1364 | Default: FALSE |
1365 | 1365 | ||
1366 | fib_multipath_hash_policy - INTEGER | ||
1367 | Controls which hash policy to use for multipath routes. | ||
1368 | Default: 0 (Layer 3) | ||
1369 | Possible values: | ||
1370 | 0 - Layer 3 (source and destination addresses plus flow label) | ||
1371 | 1 - Layer 4 (standard 5-tuple) | ||
1372 | |||
1366 | anycast_src_echo_reply - BOOLEAN | 1373 | anycast_src_echo_reply - BOOLEAN |
1367 | Controls the use of anycast addresses as source addresses for ICMPv6 | 1374 | Controls the use of anycast addresses as source addresses for ICMPv6 |
1368 | echo reply | 1375 | echo reply |
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9594f9317952..ce2abc0ff102 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h | |||
@@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, | |||
130 | struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, | 130 | struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, |
131 | const struct in6_addr *saddr, int oif, | 131 | const struct in6_addr *saddr, int oif, |
132 | const struct sk_buff *skb, int flags); | 132 | const struct sk_buff *skb, int flags); |
133 | u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, | 133 | u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, |
134 | struct flow_keys *hkeys); | 134 | const struct sk_buff *skb, struct flow_keys *hkeys); |
135 | 135 | ||
136 | struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); | 136 | struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); |
137 | 137 | ||
diff --git a/include/net/netevent.h b/include/net/netevent.h index baee605a94ab..d9918261701c 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h | |||
@@ -27,6 +27,7 @@ enum netevent_notif_type { | |||
27 | NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ | 27 | NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ |
28 | NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ | 28 | NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ |
29 | NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ | 29 | NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ |
30 | NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ | ||
30 | }; | 31 | }; |
31 | 32 | ||
32 | int register_netevent_notifier(struct notifier_block *nb); | 33 | int register_netevent_notifier(struct notifier_block *nb); |
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e286fda09fcf..5b51110435fc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h | |||
@@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { | |||
28 | int ip6_rt_gc_elasticity; | 28 | int ip6_rt_gc_elasticity; |
29 | int ip6_rt_mtu_expires; | 29 | int ip6_rt_mtu_expires; |
30 | int ip6_rt_min_advmss; | 30 | int ip6_rt_min_advmss; |
31 | int multipath_hash_policy; | ||
31 | int flowlabel_consistency; | 32 | int flowlabel_consistency; |
32 | int auto_flowlabels; | 33 | int auto_flowlabels; |
33 | int icmpv6_time; | 34 | int icmpv6_time; |
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a5d929223820..6f84668be6ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c | |||
@@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, | |||
522 | fl6.fl6_icmp_type = type; | 522 | fl6.fl6_icmp_type = type; |
523 | fl6.fl6_icmp_code = code; | 523 | fl6.fl6_icmp_code = code; |
524 | fl6.flowi6_uid = sock_net_uid(net, NULL); | 524 | fl6.flowi6_uid = sock_net_uid(net, NULL); |
525 | fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); | 525 | fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); |
526 | security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); | 526 | security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); |
527 | 527 | ||
528 | sk = icmpv6_xmit_lock(net); | 528 | sk = icmpv6_xmit_lock(net); |
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2b8368663cb..f0ae58424c45 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) | |||
450 | return false; | 450 | return false; |
451 | } | 451 | } |
452 | 452 | ||
453 | static struct rt6_info *rt6_multipath_select(struct rt6_info *match, | 453 | static struct rt6_info *rt6_multipath_select(const struct net *net, |
454 | struct rt6_info *match, | ||
454 | struct flowi6 *fl6, int oif, | 455 | struct flowi6 *fl6, int oif, |
455 | const struct sk_buff *skb, | 456 | const struct sk_buff *skb, |
456 | int strict) | 457 | int strict) |
@@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, | |||
461 | * case it will always be non-zero. Otherwise now is the time to do it. | 462 | * case it will always be non-zero. Otherwise now is the time to do it. |
462 | */ | 463 | */ |
463 | if (!fl6->mp_hash) | 464 | if (!fl6->mp_hash) |
464 | fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); | 465 | fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); |
465 | 466 | ||
466 | if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) | 467 | if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) |
467 | return match; | 468 | return match; |
@@ -932,7 +933,7 @@ restart: | |||
932 | rt = rt6_device_match(net, rt, &fl6->saddr, | 933 | rt = rt6_device_match(net, rt, &fl6->saddr, |
933 | fl6->flowi6_oif, flags); | 934 | fl6->flowi6_oif, flags); |
934 | if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) | 935 | if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) |
935 | rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, | 936 | rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, |
936 | skb, flags); | 937 | skb, flags); |
937 | } | 938 | } |
938 | if (rt == net->ipv6.ip6_null_entry) { | 939 | if (rt == net->ipv6.ip6_null_entry) { |
@@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, | |||
1674 | redo_rt6_select: | 1675 | redo_rt6_select: |
1675 | rt = rt6_select(net, fn, oif, strict); | 1676 | rt = rt6_select(net, fn, oif, strict); |
1676 | if (rt->rt6i_nsiblings) | 1677 | if (rt->rt6i_nsiblings) |
1677 | rt = rt6_multipath_select(rt, fl6, oif, skb, strict); | 1678 | rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); |
1678 | if (rt == net->ipv6.ip6_null_entry) { | 1679 | if (rt == net->ipv6.ip6_null_entry) { |
1679 | fn = fib6_backtrack(fn, &fl6->saddr); | 1680 | fn = fib6_backtrack(fn, &fl6->saddr); |
1680 | if (fn) | 1681 | if (fn) |
@@ -1839,21 +1840,56 @@ out: | |||
1839 | } | 1840 | } |
1840 | 1841 | ||
1841 | /* if skb is set it will be used and fl6 can be NULL */ | 1842 | /* if skb is set it will be used and fl6 can be NULL */ |
1842 | u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, | 1843 | u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, |
1843 | struct flow_keys *flkeys) | 1844 | const struct sk_buff *skb, struct flow_keys *flkeys) |
1844 | { | 1845 | { |
1845 | struct flow_keys hash_keys; | 1846 | struct flow_keys hash_keys; |
1846 | u32 mhash; | 1847 | u32 mhash; |
1847 | 1848 | ||
1848 | memset(&hash_keys, 0, sizeof(hash_keys)); | 1849 | switch (net->ipv6.sysctl.multipath_hash_policy) { |
1849 | hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; | 1850 | case 0: |
1850 | if (skb) { | 1851 | memset(&hash_keys, 0, sizeof(hash_keys)); |
1851 | ip6_multipath_l3_keys(skb, &hash_keys, flkeys); | 1852 | hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; |
1852 | } else { | 1853 | if (skb) { |
1853 | hash_keys.addrs.v6addrs.src = fl6->saddr; | 1854 | ip6_multipath_l3_keys(skb, &hash_keys, flkeys); |
1854 | hash_keys.addrs.v6addrs.dst = fl6->daddr; | 1855 | } else { |
1855 | hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; | 1856 | hash_keys.addrs.v6addrs.src = fl6->saddr; |
1856 | hash_keys.basic.ip_proto = fl6->flowi6_proto; | 1857 | hash_keys.addrs.v6addrs.dst = fl6->daddr; |
1858 | hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; | ||
1859 | hash_keys.basic.ip_proto = fl6->flowi6_proto; | ||
1860 | } | ||
1861 | break; | ||
1862 | case 1: | ||
1863 | if (skb) { | ||
1864 | unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; | ||
1865 | struct flow_keys keys; | ||
1866 | |||
1867 | /* short-circuit if we already have L4 hash present */ | ||
1868 | if (skb->l4_hash) | ||
1869 | return skb_get_hash_raw(skb) >> 1; | ||
1870 | |||
1871 | memset(&hash_keys, 0, sizeof(hash_keys)); | ||
1872 | |||
1873 | if (!flkeys) { | ||
1874 | skb_flow_dissect_flow_keys(skb, &keys, flag); | ||
1875 | flkeys = &keys; | ||
1876 | } | ||
1877 | hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; | ||
1878 | hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; | ||
1879 | hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; | ||
1880 | hash_keys.ports.src = flkeys->ports.src; | ||
1881 | hash_keys.ports.dst = flkeys->ports.dst; | ||
1882 | hash_keys.basic.ip_proto = flkeys->basic.ip_proto; | ||
1883 | } else { | ||
1884 | memset(&hash_keys, 0, sizeof(hash_keys)); | ||
1885 | hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; | ||
1886 | hash_keys.addrs.v6addrs.src = fl6->saddr; | ||
1887 | hash_keys.addrs.v6addrs.dst = fl6->daddr; | ||
1888 | hash_keys.ports.src = fl6->fl6_sport; | ||
1889 | hash_keys.ports.dst = fl6->fl6_dport; | ||
1890 | hash_keys.basic.ip_proto = fl6->flowi6_proto; | ||
1891 | } | ||
1892 | break; | ||
1857 | } | 1893 | } |
1858 | mhash = flow_hash_from_keys(&hash_keys); | 1894 | mhash = flow_hash_from_keys(&hash_keys); |
1859 | 1895 | ||
@@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb) | |||
1884 | flkeys = &_flkeys; | 1920 | flkeys = &_flkeys; |
1885 | 1921 | ||
1886 | if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) | 1922 | if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) |
1887 | fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); | 1923 | fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); |
1888 | skb_dst_drop(skb); | 1924 | skb_dst_drop(skb); |
1889 | skb_dst_set(skb, | 1925 | skb_dst_set(skb, |
1890 | ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); | 1926 | ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); |
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 262f791f1b9b..966c42af92f4 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c | |||
@@ -16,14 +16,31 @@ | |||
16 | #include <net/ipv6.h> | 16 | #include <net/ipv6.h> |
17 | #include <net/addrconf.h> | 17 | #include <net/addrconf.h> |
18 | #include <net/inet_frag.h> | 18 | #include <net/inet_frag.h> |
19 | #include <net/netevent.h> | ||
19 | #ifdef CONFIG_NETLABEL | 20 | #ifdef CONFIG_NETLABEL |
20 | #include <net/calipso.h> | 21 | #include <net/calipso.h> |
21 | #endif | 22 | #endif |
22 | 23 | ||
24 | static int zero; | ||
23 | static int one = 1; | 25 | static int one = 1; |
24 | static int auto_flowlabels_min; | 26 | static int auto_flowlabels_min; |
25 | static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; | 27 | static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; |
26 | 28 | ||
29 | static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, | ||
30 | void __user *buffer, size_t *lenp, | ||
31 | loff_t *ppos) | ||
32 | { | ||
33 | struct net *net; | ||
34 | int ret; | ||
35 | |||
36 | net = container_of(table->data, struct net, | ||
37 | ipv6.sysctl.multipath_hash_policy); | ||
38 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
39 | if (write && ret == 0) | ||
40 | call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); | ||
41 | |||
42 | return ret; | ||
43 | } | ||
27 | 44 | ||
28 | static struct ctl_table ipv6_table_template[] = { | 45 | static struct ctl_table ipv6_table_template[] = { |
29 | { | 46 | { |
@@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { | |||
126 | .mode = 0644, | 143 | .mode = 0644, |
127 | .proc_handler = proc_dointvec | 144 | .proc_handler = proc_dointvec |
128 | }, | 145 | }, |
146 | { | ||
147 | .procname = "fib_multipath_hash_policy", | ||
148 | .data = &init_net.ipv6.sysctl.multipath_hash_policy, | ||
149 | .maxlen = sizeof(int), | ||
150 | .mode = 0644, | ||
151 | .proc_handler = proc_rt6_multipath_hash_policy, | ||
152 | .extra1 = &zero, | ||
153 | .extra2 = &one, | ||
154 | }, | ||
129 | { } | 155 | { } |
130 | }; | 156 | }; |
131 | 157 | ||
@@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) | |||
190 | ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; | 216 | ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; |
191 | ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; | 217 | ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; |
192 | ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; | 218 | ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; |
219 | ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, | ||
193 | 220 | ||
194 | ipv6_route_table = ipv6_route_sysctl_init(net); | 221 | ipv6_route_table = ipv6_route_sysctl_init(net); |
195 | if (!ipv6_route_table) | 222 | if (!ipv6_route_table) |