summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Nikolay Aleksandrov <nikolay@cumulusnetworks.com> 2017-03-16 09:28:00 -0400
committer David S. Miller <davem@davemloft.net> 2017-03-21 18:27:19 -0400
commit bf4e0a3db97eb882368fd82980b3b1fa0b5b9778 (patch)
tree f1e5a3fd90d22fc603bc5a6780fc1c716684ad94
parent 88997e4208aea117627898e5f6f9801cf3cd42d2 (diff)
net: ipv4: add support for ECMP hash policy choice
This patch adds support for ECMP hash policy choice via a new sysctl called fib_multipath_hash_policy and also adds support for L4 hashes. The current values for fib_multipath_hash_policy are:
 0 - layer 3 (default)
 1 - layer 4
If there's an skb hash already set and it matches the chosen policy, then it will be used instead of being calculated (currently only for L4). In L3 mode we always calculate the hash due to the ICMP error special case. The flow dissector's field consistentification should handle the address order, thus we can remove the address reversals. If the skb is provided we always use it for the hash calculation; otherwise we fall back to fl4, that is, if skb is NULL then fl4 has to be set.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- Documentation/networking/ip-sysctl.txt 8
-rw-r--r-- include/net/ip_fib.h 14
-rw-r--r-- include/net/netns/ipv4.h 1
-rw-r--r-- include/net/route.h 6
-rw-r--r-- net/ipv4/fib_semantics.c 11
-rw-r--r-- net/ipv4/icmp.c 19
-rw-r--r-- net/ipv4/route.c 92
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 9
8 files changed, 100 insertions, 60 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ed3d0791eb27..b57308e76b1d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
73 0 - disabled 73 0 - disabled
74 1 - enabled 74 1 - enabled
75 75
76fib_multipath_hash_policy - INTEGER
77 Controls which hash policy to use for multipath routes. Only valid
78 for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
79 Default: 0 (Layer 3)
80 Possible values:
81 0 - Layer 3
82 1 - Layer 4
83
76route/max_size - INTEGER 84route/max_size - INTEGER
77 Maximum number of routes allowed in the kernel. Increase 85 Maximum number of routes allowed in the kernel. Increase
78 this when using large numbers of interfaces and/or routes. 86 this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 272e62e139e0..6692c5758b33 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -395,17 +395,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
395int fib_sync_down_addr(struct net_device *dev, __be32 local); 395int fib_sync_down_addr(struct net_device *dev, __be32 local);
396int fib_sync_up(struct net_device *dev, unsigned int nh_flags); 396int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
397 397
398extern u32 fib_multipath_secret __read_mostly; 398#ifdef CONFIG_IP_ROUTE_MULTIPATH
399 399int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
400static inline int fib_multipath_hash(__be32 saddr, __be32 daddr) 400 const struct sk_buff *skb);
401{ 401#endif
402 return jhash_2words((__force u32)saddr, (__force u32)daddr,
403 fib_multipath_secret) >> 1;
404}
405
406void fib_select_multipath(struct fib_result *res, int hash); 402void fib_select_multipath(struct fib_result *res, int hash);
407void fib_select_path(struct net *net, struct fib_result *res, 403void fib_select_path(struct net *net, struct fib_result *res,
408 struct flowi4 *fl4, int mp_hash); 404 struct flowi4 *fl4, const struct sk_buff *skb);
409 405
410/* Exported by fib_trie.c */ 406/* Exported by fib_trie.c */
411void fib_trie_init(void); 407void fib_trie_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2e9d649ba169..a0e89190a3e9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -151,6 +151,7 @@ struct netns_ipv4 {
151#endif 151#endif
152#ifdef CONFIG_IP_ROUTE_MULTIPATH 152#ifdef CONFIG_IP_ROUTE_MULTIPATH
153 int sysctl_fib_multipath_use_neigh; 153 int sysctl_fib_multipath_use_neigh;
154 int sysctl_fib_multipath_hash_policy;
154#endif 155#endif
155 156
156 unsigned int fib_seq; /* protected by rtnl_mutex */ 157 unsigned int fib_seq; /* protected by rtnl_mutex */
diff --git a/include/net/route.h b/include/net/route.h
index c0874c87c173..2cc0e14c6359 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -113,13 +113,13 @@ struct in_device;
113int ip_rt_init(void); 113int ip_rt_init(void);
114void rt_cache_flush(struct net *net); 114void rt_cache_flush(struct net *net);
115void rt_flush_dev(struct net_device *dev); 115void rt_flush_dev(struct net_device *dev);
116struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp, 116struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
117 int mp_hash); 117 const struct sk_buff *skb);
118 118
119static inline struct rtable *__ip_route_output_key(struct net *net, 119static inline struct rtable *__ip_route_output_key(struct net *net,
120 struct flowi4 *flp) 120 struct flowi4 *flp)
121{ 121{
122 return __ip_route_output_key_hash(net, flp, -1); 122 return __ip_route_output_key_hash(net, flp, NULL);
123} 123}
124 124
125struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, 125struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..da449ddb8cc1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
57static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; 57static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
58 58
59#ifdef CONFIG_IP_ROUTE_MULTIPATH 59#ifdef CONFIG_IP_ROUTE_MULTIPATH
60u32 fib_multipath_secret __read_mostly;
61 60
62#define for_nexthops(fi) { \ 61#define for_nexthops(fi) { \
63 int nhsel; const struct fib_nh *nh; \ 62 int nhsel; const struct fib_nh *nh; \
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
576 575
577 atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); 576 atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
578 } endfor_nexthops(fi); 577 } endfor_nexthops(fi);
579
580 net_get_random_once(&fib_multipath_secret,
581 sizeof(fib_multipath_secret));
582} 578}
583 579
584static inline void fib_add_weight(struct fib_info *fi, 580static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
1641#endif 1637#endif
1642 1638
1643void fib_select_path(struct net *net, struct fib_result *res, 1639void fib_select_path(struct net *net, struct fib_result *res,
1644 struct flowi4 *fl4, int mp_hash) 1640 struct flowi4 *fl4, const struct sk_buff *skb)
1645{ 1641{
1646 bool oif_check; 1642 bool oif_check;
1647 1643
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
1650 1646
1651#ifdef CONFIG_IP_ROUTE_MULTIPATH 1647#ifdef CONFIG_IP_ROUTE_MULTIPATH
1652 if (res->fi->fib_nhs > 1 && oif_check) { 1648 if (res->fi->fib_nhs > 1 && oif_check) {
1653 if (mp_hash < 0) 1649 int h = fib_multipath_hash(res->fi, fl4, skb);
1654 mp_hash = get_hash_from_flowi4(fl4) >> 1;
1655 1650
1656 fib_select_multipath(res, mp_hash); 1651 fib_select_multipath(res, h);
1657 } 1652 }
1658 else 1653 else
1659#endif 1654#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..43318b5f5647 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ out_bh_enable:
464 local_bh_enable(); 464 local_bh_enable();
465} 465}
466 466
467#ifdef CONFIG_IP_ROUTE_MULTIPATH
468
469/* Source and destination is swapped. See ip_multipath_icmp_hash */
470static int icmp_multipath_hash_skb(const struct sk_buff *skb)
471{
472 const struct iphdr *iph = ip_hdr(skb);
473
474 return fib_multipath_hash(iph->daddr, iph->saddr);
475}
476
477#else
478
479#define icmp_multipath_hash_skb(skb) (-1)
480
481#endif
482
483static struct rtable *icmp_route_lookup(struct net *net, 467static struct rtable *icmp_route_lookup(struct net *net,
484 struct flowi4 *fl4, 468 struct flowi4 *fl4,
485 struct sk_buff *skb_in, 469 struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
505 fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); 489 fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
506 490
507 security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); 491 security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
508 rt = __ip_route_output_key_hash(net, fl4, 492 rt = __ip_route_output_key_hash(net, fl4, skb_in);
509 icmp_multipath_hash_skb(skb_in));
510 if (IS_ERR(rt)) 493 if (IS_ERR(rt))
511 return rt; 494 return rt;
512 495
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8471dd116771..5dda1ef81c7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,45 +1734,97 @@ out:
1734} 1734}
1735 1735
1736#ifdef CONFIG_IP_ROUTE_MULTIPATH 1736#ifdef CONFIG_IP_ROUTE_MULTIPATH
1737
1738/* To make ICMP packets follow the right flow, the multipath hash is 1737/* To make ICMP packets follow the right flow, the multipath hash is
1739 * calculated from the inner IP addresses in reverse order. 1738 * calculated from the inner IP addresses.
1740 */ 1739 */
1741static int ip_multipath_icmp_hash(struct sk_buff *skb) 1740static void ip_multipath_l3_keys(const struct sk_buff *skb,
1741 struct flow_keys *hash_keys)
1742{ 1742{
1743 const struct iphdr *outer_iph = ip_hdr(skb); 1743 const struct iphdr *outer_iph = ip_hdr(skb);
1744 struct icmphdr _icmph; 1744 const struct iphdr *inner_iph;
1745 const struct icmphdr *icmph; 1745 const struct icmphdr *icmph;
1746 struct iphdr _inner_iph; 1746 struct iphdr _inner_iph;
1747 const struct iphdr *inner_iph; 1747 struct icmphdr _icmph;
1748
1749 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1750 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1751 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1752 return;
1748 1753
1749 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) 1754 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1750 goto standard_hash; 1755 return;
1751 1756
1752 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), 1757 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1753 &_icmph); 1758 &_icmph);
1754 if (!icmph) 1759 if (!icmph)
1755 goto standard_hash; 1760 return;
1756 1761
1757 if (icmph->type != ICMP_DEST_UNREACH && 1762 if (icmph->type != ICMP_DEST_UNREACH &&
1758 icmph->type != ICMP_REDIRECT && 1763 icmph->type != ICMP_REDIRECT &&
1759 icmph->type != ICMP_TIME_EXCEEDED && 1764 icmph->type != ICMP_TIME_EXCEEDED &&
1760 icmph->type != ICMP_PARAMETERPROB) { 1765 icmph->type != ICMP_PARAMETERPROB)
1761 goto standard_hash; 1766 return;
1762 }
1763 1767
1764 inner_iph = skb_header_pointer(skb, 1768 inner_iph = skb_header_pointer(skb,
1765 outer_iph->ihl * 4 + sizeof(_icmph), 1769 outer_iph->ihl * 4 + sizeof(_icmph),
1766 sizeof(_inner_iph), &_inner_iph); 1770 sizeof(_inner_iph), &_inner_iph);
1767 if (!inner_iph) 1771 if (!inner_iph)
1768 goto standard_hash; 1772 return;
1773 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1774 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1775}
1769 1776
1770 return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); 1777/* if skb is set it will be used and fl4 can be NULL */
1778int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1779 const struct sk_buff *skb)
1780{
1781 struct net *net = fi->fib_net;
1782 struct flow_keys hash_keys;
1783 u32 mhash;
1771 1784
1772standard_hash: 1785 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1773 return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); 1786 case 0:
1774} 1787 memset(&hash_keys, 0, sizeof(hash_keys));
1788 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1789 if (skb) {
1790 ip_multipath_l3_keys(skb, &hash_keys);
1791 } else {
1792 hash_keys.addrs.v4addrs.src = fl4->saddr;
1793 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1794 }
1795 break;
1796 case 1:
1797 /* skb is currently provided only when forwarding */
1798 if (skb) {
1799 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1800 struct flow_keys keys;
1801
1802 /* short-circuit if we already have L4 hash present */
1803 if (skb->l4_hash)
1804 return skb_get_hash_raw(skb) >> 1;
1805 memset(&hash_keys, 0, sizeof(hash_keys));
1806 skb_flow_dissect_flow_keys(skb, &keys, flag);
1807 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1808 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1809 hash_keys.ports.src = keys.ports.src;
1810 hash_keys.ports.dst = keys.ports.dst;
1811 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1812 } else {
1813 memset(&hash_keys, 0, sizeof(hash_keys));
1814 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1815 hash_keys.addrs.v4addrs.src = fl4->saddr;
1816 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1817 hash_keys.ports.src = fl4->fl4_sport;
1818 hash_keys.ports.dst = fl4->fl4_dport;
1819 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1820 }
1821 break;
1822 }
1823 mhash = flow_hash_from_keys(&hash_keys);
1775 1824
1825 return mhash >> 1;
1826}
1827EXPORT_SYMBOL_GPL(fib_multipath_hash);
1776#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 1828#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1777 1829
1778static int ip_mkroute_input(struct sk_buff *skb, 1830static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1834,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
1782{ 1834{
1783#ifdef CONFIG_IP_ROUTE_MULTIPATH 1835#ifdef CONFIG_IP_ROUTE_MULTIPATH
1784 if (res->fi && res->fi->fib_nhs > 1) { 1836 if (res->fi && res->fi->fib_nhs > 1) {
1785 int h; 1837 int h = fib_multipath_hash(res->fi, NULL, skb);
1786 1838
1787 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1788 h = ip_multipath_icmp_hash(skb);
1789 else
1790 h = fib_multipath_hash(saddr, daddr);
1791 fib_select_multipath(res, h); 1839 fib_select_multipath(res, h);
1792 } 1840 }
1793#endif 1841#endif
@@ -2203,7 +2251,7 @@ add:
2203 */ 2251 */
2204 2252
2205struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2253struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2206 int mp_hash) 2254 const struct sk_buff *skb)
2207{ 2255{
2208 struct net_device *dev_out = NULL; 2256 struct net_device *dev_out = NULL;
2209 __u8 tos = RT_FL_TOS(fl4); 2257 __u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2365 goto make_route; 2413 goto make_route;
2366 } 2414 }
2367 2415
2368 fib_select_path(net, &res, fl4, mp_hash); 2416 fib_select_path(net, &res, fl4, skb);
2369 2417
2370 dev_out = FIB_RES_DEV(res); 2418 dev_out = FIB_RES_DEV(res);
2371 fl4->flowi4_oif = dev_out->ifindex; 2419 fl4->flowi4_oif = dev_out->ifindex;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 11aaef0939b2..711c3e2e17b1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -997,6 +997,15 @@ static struct ctl_table ipv4_net_table[] = {
997 .extra1 = &zero, 997 .extra1 = &zero,
998 .extra2 = &one, 998 .extra2 = &one,
999 }, 999 },
1000 {
1001 .procname = "fib_multipath_hash_policy",
1002 .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
1003 .maxlen = sizeof(int),
1004 .mode = 0644,
1005 .proc_handler = proc_dointvec_minmax,
1006 .extra1 = &zero,
1007 .extra2 = &one,
1008 },
1000#endif 1009#endif
1001 { 1010 {
1002 .procname = "ip_unprivileged_port_start", 1011 .procname = "ip_unprivileged_port_start",