aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Fan Du <fan.du@intel.com> 2015-02-09 20:53:16 -0500
committer David S. Miller <davem@davemloft.net> 2015-02-09 21:45:00 -0500
commit b0f9ca53cbb103e9240a29a974e0b6085e58f9f7 (patch)
tree 219e761b507aa959e6fd38c79adf3f9bc4be4b4d
parent f217d6ca4a8cde473358637aa29daaaa3d0b57a9 (diff)
ipv4: Namespecify TCP PMTU mechanism
Packetization Layer Path MTU Discovery works separately alongside Path MTU Discovery at the IP level. Different net namespaces have different requirements on which one to choose; e.g., a virtualized container instance would require TCP PMTU to probe a usable effective MTU for the underlying tunnel, while the host would employ classical ICMP-based PMTU to function. Hence, make the TCP PMTU mechanism per net namespace to decouple the two functionalities. Furthermore, the probe base MSS should also be configured separately for each namespace. Signed-off-by: Fan Du <fan.du@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/netns/ipv4.h 2
-rw-r--r-- include/net/tcp.h 2
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 28
-rw-r--r-- net/ipv4/tcp_ipv4.c 1
-rw-r--r-- net/ipv4/tcp_output.c 8
-rw-r--r-- net/ipv4/tcp_timer.c 7
6 files changed, 25 insertions, 23 deletions
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e0bdcb147326..dbe225478adb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -82,6 +82,8 @@ struct netns_ipv4 {
82 82
83 int sysctl_fwmark_reflect; 83 int sysctl_fwmark_reflect;
84 int sysctl_tcp_fwmark_accept; 84 int sysctl_tcp_fwmark_accept;
85 int sysctl_tcp_mtu_probing;
86 int sysctl_tcp_base_mss;
85 87
86 struct ping_group_range ping_group_range; 88 struct ping_group_range ping_group_range;
87 89
diff --git a/include/net/tcp.h b/include/net/tcp.h
index da4196fb78db..8d6b983d5099 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,8 +262,6 @@ extern int sysctl_tcp_low_latency;
262extern int sysctl_tcp_nometrics_save; 262extern int sysctl_tcp_nometrics_save;
263extern int sysctl_tcp_moderate_rcvbuf; 263extern int sysctl_tcp_moderate_rcvbuf;
264extern int sysctl_tcp_tso_win_divisor; 264extern int sysctl_tcp_tso_win_divisor;
265extern int sysctl_tcp_mtu_probing;
266extern int sysctl_tcp_base_mss;
267extern int sysctl_tcp_workaround_signed_windows; 265extern int sysctl_tcp_workaround_signed_windows;
268extern int sysctl_tcp_slow_start_after_idle; 266extern int sysctl_tcp_slow_start_after_idle;
269extern int sysctl_tcp_thin_linear_timeouts; 267extern int sysctl_tcp_thin_linear_timeouts;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 82601a68cf90..d151539da8e6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -604,20 +604,6 @@ static struct ctl_table ipv4_table[] = {
604 .proc_handler = proc_tcp_congestion_control, 604 .proc_handler = proc_tcp_congestion_control,
605 }, 605 },
606 { 606 {
607 .procname = "tcp_mtu_probing",
608 .data = &sysctl_tcp_mtu_probing,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = proc_dointvec,
612 },
613 {
614 .procname = "tcp_base_mss",
615 .data = &sysctl_tcp_base_mss,
616 .maxlen = sizeof(int),
617 .mode = 0644,
618 .proc_handler = proc_dointvec,
619 },
620 {
621 .procname = "tcp_workaround_signed_windows", 607 .procname = "tcp_workaround_signed_windows",
622 .data = &sysctl_tcp_workaround_signed_windows, 608 .data = &sysctl_tcp_workaround_signed_windows,
623 .maxlen = sizeof(int), 609 .maxlen = sizeof(int),
@@ -883,6 +869,20 @@ static struct ctl_table ipv4_net_table[] = {
883 .mode = 0644, 869 .mode = 0644,
884 .proc_handler = proc_dointvec, 870 .proc_handler = proc_dointvec,
885 }, 871 },
872 {
873 .procname = "tcp_mtu_probing",
874 .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
875 .maxlen = sizeof(int),
876 .mode = 0644,
877 .proc_handler = proc_dointvec,
878 },
879 {
880 .procname = "tcp_base_mss",
881 .data = &init_net.ipv4.sysctl_tcp_base_mss,
882 .maxlen = sizeof(int),
883 .mode = 0644,
884 .proc_handler = proc_dointvec,
885 },
886 { } 886 { }
887}; 887};
888 888
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 67bc95fb5d9e..5a2dfed4783b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2459,6 +2459,7 @@ static int __net_init tcp_sk_init(struct net *net)
2459 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2459 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2460 } 2460 }
2461 net->ipv4.sysctl_tcp_ecn = 2; 2461 net->ipv4.sysctl_tcp_ecn = 2;
2462 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2462 return 0; 2463 return 0;
2463 2464
2464fail: 2465fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4fcc9a768849..a2a796c5536b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,9 +59,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
59 */ 59 */
60int sysctl_tcp_tso_win_divisor __read_mostly = 3; 60int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61 61
62int sysctl_tcp_mtu_probing __read_mostly = 0;
63int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
64
65/* By default, RFC2861 behavior. */ 62/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 63int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 64
@@ -1350,11 +1347,12 @@ void tcp_mtup_init(struct sock *sk)
1350{ 1347{
1351 struct tcp_sock *tp = tcp_sk(sk); 1348 struct tcp_sock *tp = tcp_sk(sk);
1352 struct inet_connection_sock *icsk = inet_csk(sk); 1349 struct inet_connection_sock *icsk = inet_csk(sk);
1350 struct net *net = sock_net(sk);
1353 1351
1354 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; 1352 icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1355 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + 1353 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1356 icsk->icsk_af_ops->net_header_len; 1354 icsk->icsk_af_ops->net_header_len;
1357 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); 1355 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1358 icsk->icsk_mtup.probe_size = 0; 1356 icsk->icsk_mtup.probe_size = 0;
1359} 1357}
1360EXPORT_SYMBOL(tcp_mtup_init); 1358EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1829c7fbc77e..0732b787904e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -101,17 +101,20 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
101 101
102static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) 102static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
103{ 103{
104 struct net *net = sock_net(sk);
105
104 /* Black hole detection */ 106 /* Black hole detection */
105 if (sysctl_tcp_mtu_probing) { 107 if (net->ipv4.sysctl_tcp_mtu_probing) {
106 if (!icsk->icsk_mtup.enabled) { 108 if (!icsk->icsk_mtup.enabled) {
107 icsk->icsk_mtup.enabled = 1; 109 icsk->icsk_mtup.enabled = 1;
108 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 110 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
109 } else { 111 } else {
112 struct net *net = sock_net(sk);
110 struct tcp_sock *tp = tcp_sk(sk); 113 struct tcp_sock *tp = tcp_sk(sk);
111 int mss; 114 int mss;
112 115
113 mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; 116 mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
114 mss = min(sysctl_tcp_base_mss, mss); 117 mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
115 mss = max(mss, 68 - tp->tcp_header_len); 118 mss = max(mss, 68 - tp->tcp_header_len);
116 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); 119 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
117 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 120 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);