aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTom Herbert <tom@herbertland.com>2015-04-29 18:33:21 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-03 21:58:01 -0400
commit82a584b7cd366511a22e37675b029cf2fb58e291 (patch)
treeef5907021414a1ab539ca398ab7c43810dacfc75
parent7035870d1219f5cd86128edcb4c3517def632ad3 (diff)
ipv6: Flow label state ranges
This patch divides the IPv6 flow label space into two ranges: 0-7ffff is reserved for the flow label manager, 80000-fffff will be used for creating auto flow labels (per RFC6438). This only affects how labels are set on transmit; it does not affect receive. This range split can be disabled by sysctl. Background: IPv6 flow labels have been an unmitigated disappointment thus far in the lifetime of IPv6. Support in HW devices to use them for ECMP is lacking, and OSes don't turn them on by default. If we had these we could get much better hashing in IPv6 networks without resorting to DPI, possibly eliminating some of the motivations to define new encaps in UDP just for getting ECMP. Unfortunately, the initial specifications of IPv6 did not clarify how they are to be used. There has always been a vague concept that these can be used for ECMP, flow hashing, etc. and we do now have a good standard for how to do this in RFC6438. The problem is that flow labels can be either stateful or stateless (as in RFC6438), and we are presented with the possibility that a stateless label may collide with a stateful one. Attempts to split the flow label space were rejected in IETF. When we added support in Linux for RFC6438, we could not turn on flow labels by default due to this conflict. This patch splits the flow label space and should give us a path to enabling auto flow labels by default for all IPv6 packets. This is an API change so we need to consider compatibility with existing deployments. The stateful range is chosen to be the lower values in hopes that most uses would have chosen small numbers. Once we resolve the stateless/stateful issue, we can proceed to look at enabling RFC6438 flow labels by default (starting with scaled testing). Signed-off-by: Tom Herbert <tom@herbertland.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt8
-rw-r--r--include/net/ipv6.h9
-rw-r--r--include/net/netns/ipv6.h1
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/ip6_flowlabel.c4
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
6 files changed, 29 insertions, 2 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 071fb18dc57c..5095c63e50ed 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1213,6 +1213,14 @@ auto_flowlabels - BOOLEAN
1213 FALSE: disabled 1213 FALSE: disabled
1214 Default: false 1214 Default: false
1215 1215
1216flowlabel_state_ranges - BOOLEAN
1217 Split the flow label number space into two ranges. 0-0x7FFFF is
1218 reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF
1219 is reserved for stateless flow labels as described in RFC6437.
1220 TRUE: enabled
1221 FALSE: disabled
1222 Default: true
1223
1216anycast_src_echo_reply - BOOLEAN 1224anycast_src_echo_reply - BOOLEAN
1217 Controls the use of anycast addresses as source addresses for ICMPv6 1225 Controls the use of anycast addresses as source addresses for ICMPv6
1218 echo reply 1226 echo reply
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index eec8ad3c9843..53d25ef1699a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -239,8 +239,10 @@ struct ip6_flowlabel {
239 struct net *fl_net; 239 struct net *fl_net;
240}; 240};
241 241
242#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) 242#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
243#define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) 243#define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF)
244#define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000)
245
244#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) 246#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
245#define IPV6_TCLASS_SHIFT 20 247#define IPV6_TCLASS_SHIFT 20
246 248
@@ -719,6 +721,9 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
719 hash ^= hash >> 12; 721 hash ^= hash >> 12;
720 722
721 flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; 723 flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
724
725 if (net->ipv6.sysctl.flowlabel_state_ranges)
726 flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;
722 } 727 }
723 728
724 return flowlabel; 729 return flowlabel;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index d2527bf81142..8d93544a2d2b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 {
34 int fwmark_reflect; 34 int fwmark_reflect;
35 int idgen_retries; 35 int idgen_retries;
36 int idgen_delay; 36 int idgen_delay;
37 int flowlabel_state_ranges;
37}; 38};
38 39
39struct netns_ipv6 { 40struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index eef63b394c5a..4632afa57e05 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -768,6 +768,7 @@ static int __net_init inet6_net_init(struct net *net)
768 net->ipv6.sysctl.auto_flowlabels = 0; 768 net->ipv6.sysctl.auto_flowlabels = 0;
769 net->ipv6.sysctl.idgen_retries = 3; 769 net->ipv6.sysctl.idgen_retries = 3;
770 net->ipv6.sysctl.idgen_delay = 1 * HZ; 770 net->ipv6.sysctl.idgen_delay = 1 * HZ;
771 net->ipv6.sysctl.flowlabel_state_ranges = 1;
771 atomic_set(&net->ipv6.fib6_sernum, 1); 772 atomic_set(&net->ipv6.fib6_sernum, 1);
772 773
773 err = ipv6_init_mibs(net); 774 err = ipv6_init_mibs(net);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index d491125011c4..1f9ebe3cbb4a 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -595,6 +595,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
595 if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) 595 if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
596 return -EINVAL; 596 return -EINVAL;
597 597
598 if (net->ipv6.sysctl.flowlabel_state_ranges &&
599 (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
600 return -ERANGE;
601
598 fl = fl_create(net, sk, &freq, optval, optlen, &err); 602 fl = fl_create(net, sk, &freq, optval, optlen, &err);
599 if (!fl) 603 if (!fl)
600 return err; 604 return err;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index abcc79f649b3..4e705add4f18 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -68,6 +68,13 @@ static struct ctl_table ipv6_table_template[] = {
68 .mode = 0644, 68 .mode = 0644,
69 .proc_handler = proc_dointvec_jiffies, 69 .proc_handler = proc_dointvec_jiffies,
70 }, 70 },
71 {
72 .procname = "flowlabel_state_ranges",
73 .data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
74 .maxlen = sizeof(int),
75 .mode = 0644,
76 .proc_handler = proc_dointvec
77 },
71 { } 78 { }
72}; 79};
73 80
@@ -109,6 +116,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
109 ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect; 116 ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
110 ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; 117 ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
111 ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; 118 ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
119 ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
112 120
113 ipv6_route_table = ipv6_route_sysctl_init(net); 121 ipv6_route_table = ipv6_route_sysctl_init(net);
114 if (!ipv6_route_table) 122 if (!ipv6_route_table)