diff options
author | John Heffner <jheffner@psc.edu> | 2006-03-20 20:53:41 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2006-03-20 20:53:41 -0500 |
commit | 5d424d5a674f782d0659a3b66d951f412901faee (patch) | |
tree | 579871172044e02e626a90388d19ec55cf2d1fc4 | |
parent | 1d60290f27e7dc4bce2c43922d0bfa9abd246fc9 (diff) |
[TCP]: MTU probing
Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.
Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/sysctl.h | 2 | ||||
-rw-r--r-- | include/net/inet_connection_sock.h | 13 | ||||
-rw-r--r-- | include/net/tcp.h | 9 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 16 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 49 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 236 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 36 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
9 files changed, 326 insertions, 37 deletions
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8ad4beab2888..6e8880ea49e7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h | |||
@@ -397,6 +397,8 @@ enum | |||
397 | NET_TCP_CONG_CONTROL=110, | 397 | NET_TCP_CONG_CONTROL=110, |
398 | NET_TCP_ABC=111, | 398 | NET_TCP_ABC=111, |
399 | NET_IPV4_IPFRAG_MAX_DIST=112, | 399 | NET_IPV4_IPFRAG_MAX_DIST=112, |
400 | NET_TCP_MTU_PROBING=113, | ||
401 | NET_TCP_BASE_MSS=114, | ||
400 | }; | 402 | }; |
401 | 403 | ||
402 | enum { | 404 | enum { |
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fa587c94e9d0..b3abe33f4e5f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h | |||
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops { | |||
72 | * @icsk_probes_out: unanswered 0 window probes | 72 | * @icsk_probes_out: unanswered 0 window probes |
73 | * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) | 73 | * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) |
74 | * @icsk_ack: Delayed ACK control data | 74 | * @icsk_ack: Delayed ACK control data |
75 | * @icsk_mtup: MTU probing control data | ||
75 | */ | 76 | */ |
76 | struct inet_connection_sock { | 77 | struct inet_connection_sock { |
77 | /* inet_sock has to be the first member! */ | 78 | /* inet_sock has to be the first member! */ |
@@ -104,6 +105,18 @@ struct inet_connection_sock { | |||
104 | __u16 last_seg_size; /* Size of last incoming segment */ | 105 | __u16 last_seg_size; /* Size of last incoming segment */ |
105 | __u16 rcv_mss; /* MSS used for delayed ACK decisions */ | 106 | __u16 rcv_mss; /* MSS used for delayed ACK decisions */ |
106 | } icsk_ack; | 107 | } icsk_ack; |
108 | struct { | ||
109 | int enabled; | ||
110 | |||
111 | /* Range of MTUs to search */ | ||
112 | int search_high; | ||
113 | int search_low; | ||
114 | |||
115 | /* Information on the current probe. */ | ||
116 | int probe_size; | ||
117 | __u32 probe_seq_start; | ||
118 | __u32 probe_seq_end; | ||
119 | } icsk_mtup; | ||
107 | u32 icsk_ca_priv[16]; | 120 | u32 icsk_ca_priv[16]; |
108 | #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) | 121 | #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) |
109 | }; | 122 | }; |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 77f21c65bbca..16879fa560de 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); | |||
60 | /* Minimal RCV_MSS. */ | 60 | /* Minimal RCV_MSS. */ |
61 | #define TCP_MIN_RCVMSS 536U | 61 | #define TCP_MIN_RCVMSS 536U |
62 | 62 | ||
63 | /* The least MTU to use for probing */ | ||
64 | #define TCP_BASE_MSS 512 | ||
65 | |||
63 | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ | 66 | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ |
64 | #define TCP_FASTRETRANS_THRESH 3 | 67 | #define TCP_FASTRETRANS_THRESH 3 |
65 | 68 | ||
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save; | |||
219 | extern int sysctl_tcp_moderate_rcvbuf; | 222 | extern int sysctl_tcp_moderate_rcvbuf; |
220 | extern int sysctl_tcp_tso_win_divisor; | 223 | extern int sysctl_tcp_tso_win_divisor; |
221 | extern int sysctl_tcp_abc; | 224 | extern int sysctl_tcp_abc; |
225 | extern int sysctl_tcp_mtu_probing; | ||
226 | extern int sysctl_tcp_base_mss; | ||
222 | 227 | ||
223 | extern atomic_t tcp_memory_allocated; | 228 | extern atomic_t tcp_memory_allocated; |
224 | extern atomic_t tcp_sockets_allocated; | 229 | extern atomic_t tcp_sockets_allocated; |
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
447 | 452 | ||
448 | extern void tcp_initialize_rcv_mss(struct sock *sk); | 453 | extern void tcp_initialize_rcv_mss(struct sock *sk); |
449 | 454 | ||
455 | extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); | ||
456 | extern int tcp_mss_to_mtu(struct sock *sk, int mss); | ||
457 | extern void tcp_mtup_init(struct sock *sk); | ||
458 | |||
450 | static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) | 459 | static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) |
451 | { | 460 | { |
452 | tp->pred_flags = htonl((tp->tcp_header_len << 26) | | 461 | tp->pred_flags = htonl((tp->tcp_header_len << 26) | |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 16984d4a8a06..ebf2e0b363c4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = { | |||
664 | .mode = 0644, | 664 | .mode = 0644, |
665 | .proc_handler = &proc_dointvec, | 665 | .proc_handler = &proc_dointvec, |
666 | }, | 666 | }, |
667 | { | ||
668 | .ctl_name = NET_TCP_MTU_PROBING, | ||
669 | .procname = "tcp_mtu_probing", | ||
670 | .data = &sysctl_tcp_mtu_probing, | ||
671 | .maxlen = sizeof(int), | ||
672 | .mode = 0644, | ||
673 | .proc_handler = &proc_dointvec, | ||
674 | }, | ||
675 | { | ||
676 | .ctl_name = NET_TCP_BASE_MSS, | ||
677 | .procname = "tcp_base_mss", | ||
678 | .data = &sysctl_tcp_base_mss, | ||
679 | .maxlen = sizeof(int), | ||
680 | .mode = 0644, | ||
681 | .proc_handler = &proc_dointvec, | ||
682 | }, | ||
667 | 683 | ||
668 | { .ctl_name = 0 } | 684 | { .ctl_name = 0 } |
669 | }; | 685 | }; |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e9a54ae7d690..0ac388e3d01d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | |||
1891 | } | 1891 | } |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | static void tcp_mtup_probe_failed(struct sock *sk) | ||
1895 | { | ||
1896 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1897 | |||
1898 | icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; | ||
1899 | icsk->icsk_mtup.probe_size = 0; | ||
1900 | } | ||
1901 | |||
1902 | static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) | ||
1903 | { | ||
1904 | struct tcp_sock *tp = tcp_sk(sk); | ||
1905 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1906 | |||
1907 | /* FIXME: breaks with very large cwnd */ | ||
1908 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
1909 | tp->snd_cwnd = tp->snd_cwnd * | ||
1910 | tcp_mss_to_mtu(sk, tp->mss_cache) / | ||
1911 | icsk->icsk_mtup.probe_size; | ||
1912 | tp->snd_cwnd_cnt = 0; | ||
1913 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
1914 | tp->rcv_ssthresh = tcp_current_ssthresh(sk); | ||
1915 | |||
1916 | icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; | ||
1917 | icsk->icsk_mtup.probe_size = 0; | ||
1918 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | ||
1919 | } | ||
1920 | |||
1921 | |||
1894 | /* Process an event, which can update packets-in-flight not trivially. | 1922 | /* Process an event, which can update packets-in-flight not trivially. |
1895 | * Main goal of this function is to calculate new estimate for left_out, | 1923 | * Main goal of this function is to calculate new estimate for left_out, |
1896 | * taking into account both packets sitting in receiver's buffer and | 1924 | * taking into account both packets sitting in receiver's buffer and |
@@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
2023 | return; | 2051 | return; |
2024 | } | 2052 | } |
2025 | 2053 | ||
2054 | /* MTU probe failure: don't reduce cwnd */ | ||
2055 | if (icsk->icsk_ca_state < TCP_CA_CWR && | ||
2056 | icsk->icsk_mtup.probe_size && | ||
2057 | tp->snd_una == icsk->icsk_mtup.probe_seq_start) { | ||
2058 | tcp_mtup_probe_failed(sk); | ||
2059 | /* Restores the reduction we did in tcp_mtup_probe() */ | ||
2060 | tp->snd_cwnd++; | ||
2061 | tcp_simple_retransmit(sk); | ||
2062 | return; | ||
2063 | } | ||
2064 | |||
2026 | /* Otherwise enter Recovery state */ | 2065 | /* Otherwise enter Recovery state */ |
2027 | 2066 | ||
2028 | if (IsReno(tp)) | 2067 | if (IsReno(tp)) |
@@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2243 | tp->retrans_stamp = 0; | 2282 | tp->retrans_stamp = 0; |
2244 | } | 2283 | } |
2245 | 2284 | ||
2285 | /* MTU probing checks */ | ||
2286 | if (icsk->icsk_mtup.probe_size) { | ||
2287 | if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { | ||
2288 | tcp_mtup_probe_success(sk, skb); | ||
2289 | } | ||
2290 | } | ||
2291 | |||
2246 | if (sacked) { | 2292 | if (sacked) { |
2247 | if (sacked & TCPCB_RETRANS) { | 2293 | if (sacked & TCPCB_RETRANS) { |
2248 | if(sacked & TCPCB_SACKED_RETRANS) | 2294 | if(sacked & TCPCB_SACKED_RETRANS) |
@@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4101 | if (tp->rx_opt.sack_ok && sysctl_tcp_fack) | 4147 | if (tp->rx_opt.sack_ok && sysctl_tcp_fack) |
4102 | tp->rx_opt.sack_ok |= 2; | 4148 | tp->rx_opt.sack_ok |= 2; |
4103 | 4149 | ||
4150 | tcp_mtup_init(sk); | ||
4104 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 4151 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4105 | tcp_initialize_rcv_mss(sk); | 4152 | tcp_initialize_rcv_mss(sk); |
4106 | 4153 | ||
@@ -4211,6 +4258,7 @@ discard: | |||
4211 | if (tp->ecn_flags&TCP_ECN_OK) | 4258 | if (tp->ecn_flags&TCP_ECN_OK) |
4212 | sock_set_flag(sk, SOCK_NO_LARGESEND); | 4259 | sock_set_flag(sk, SOCK_NO_LARGESEND); |
4213 | 4260 | ||
4261 | tcp_mtup_init(sk); | ||
4214 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 4262 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4215 | tcp_initialize_rcv_mss(sk); | 4263 | tcp_initialize_rcv_mss(sk); |
4216 | 4264 | ||
@@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4399 | */ | 4447 | */ |
4400 | tp->lsndtime = tcp_time_stamp; | 4448 | tp->lsndtime = tcp_time_stamp; |
4401 | 4449 | ||
4450 | tcp_mtup_init(sk); | ||
4402 | tcp_initialize_rcv_mss(sk); | 4451 | tcp_initialize_rcv_mss(sk); |
4403 | tcp_init_buffer_space(sk); | 4452 | tcp_init_buffer_space(sk); |
4404 | tcp_fast_path_on(tp); | 4453 | tcp_fast_path_on(tp); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 233bdf259965..57e7a26e8213 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
900 | inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; | 900 | inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; |
901 | newinet->id = newtp->write_seq ^ jiffies; | 901 | newinet->id = newtp->write_seq ^ jiffies; |
902 | 902 | ||
903 | tcp_mtup_init(newsk); | ||
903 | tcp_sync_mss(newsk, dst_mtu(dst)); | 904 | tcp_sync_mss(newsk, dst_mtu(dst)); |
904 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 905 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
905 | tcp_initialize_rcv_mss(newsk); | 906 | tcp_initialize_rcv_mss(newsk); |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f498a6c8895..8197b5e12f1f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1; | |||
51 | */ | 51 | */ |
52 | int sysctl_tcp_tso_win_divisor = 3; | 52 | int sysctl_tcp_tso_win_divisor = 3; |
53 | 53 | ||
54 | int sysctl_tcp_mtu_probing = 0; | ||
55 | int sysctl_tcp_base_mss = 512; | ||
56 | |||
57 | EXPORT_SYMBOL(sysctl_tcp_mtu_probing); | ||
58 | EXPORT_SYMBOL(sysctl_tcp_base_mss); | ||
59 | |||
54 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, | 60 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
55 | struct sk_buff *skb) | 61 | struct sk_buff *skb) |
56 | { | 62 | { |
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
681 | return 0; | 687 | return 0; |
682 | } | 688 | } |
683 | 689 | ||
690 | /* Not accounting for SACKs here. */ | ||
691 | int tcp_mtu_to_mss(struct sock *sk, int pmtu) | ||
692 | { | ||
693 | struct tcp_sock *tp = tcp_sk(sk); | ||
694 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
695 | int mss_now; | ||
696 | |||
697 | /* Calculate base mss without TCP options: | ||
698 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
699 | */ | ||
700 | mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); | ||
701 | |||
702 | /* Clamp it (mss_clamp does not include tcp options) */ | ||
703 | if (mss_now > tp->rx_opt.mss_clamp) | ||
704 | mss_now = tp->rx_opt.mss_clamp; | ||
705 | |||
706 | /* Now subtract optional transport overhead */ | ||
707 | mss_now -= icsk->icsk_ext_hdr_len; | ||
708 | |||
709 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | ||
710 | if (mss_now < 48) | ||
711 | mss_now = 48; | ||
712 | |||
713 | /* Now subtract TCP options size, not including SACKs */ | ||
714 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
715 | |||
716 | return mss_now; | ||
717 | } | ||
718 | |||
719 | /* Inverse of above */ | ||
720 | int tcp_mss_to_mtu(struct sock *sk, int mss) | ||
721 | { | ||
722 | struct tcp_sock *tp = tcp_sk(sk); | ||
723 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
724 | int mtu; | ||
725 | |||
726 | mtu = mss + | ||
727 | tp->tcp_header_len + | ||
728 | icsk->icsk_ext_hdr_len + | ||
729 | icsk->icsk_af_ops->net_header_len; | ||
730 | |||
731 | return mtu; | ||
732 | } | ||
733 | |||
734 | void tcp_mtup_init(struct sock *sk) | ||
735 | { | ||
736 | struct tcp_sock *tp = tcp_sk(sk); | ||
737 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
738 | |||
739 | icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; | ||
740 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + | ||
741 | icsk->icsk_af_ops->net_header_len; | ||
742 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); | ||
743 | icsk->icsk_mtup.probe_size = 0; | ||
744 | } | ||
745 | |||
684 | /* This function synchronizes snd mss to current pmtu/exthdr set. | 746 | /* This function synchronizes snd mss to current pmtu/exthdr set. |
685 | 747 | ||
686 | tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count | 748 | tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count |
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
708 | { | 770 | { |
709 | struct tcp_sock *tp = tcp_sk(sk); | 771 | struct tcp_sock *tp = tcp_sk(sk); |
710 | struct inet_connection_sock *icsk = inet_csk(sk); | 772 | struct inet_connection_sock *icsk = inet_csk(sk); |
711 | /* Calculate base mss without TCP options: | 773 | int mss_now; |
712 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
713 | */ | ||
714 | int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - | ||
715 | sizeof(struct tcphdr)); | ||
716 | 774 | ||
717 | /* Clamp it (mss_clamp does not include tcp options) */ | 775 | if (icsk->icsk_mtup.search_high > pmtu) |
718 | if (mss_now > tp->rx_opt.mss_clamp) | 776 | icsk->icsk_mtup.search_high = pmtu; |
719 | mss_now = tp->rx_opt.mss_clamp; | ||
720 | 777 | ||
721 | /* Now subtract optional transport overhead */ | 778 | mss_now = tcp_mtu_to_mss(sk, pmtu); |
722 | mss_now -= icsk->icsk_ext_hdr_len; | ||
723 | |||
724 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | ||
725 | if (mss_now < 48) | ||
726 | mss_now = 48; | ||
727 | |||
728 | /* Now subtract TCP options size, not including SACKs */ | ||
729 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
730 | 779 | ||
731 | /* Bound mss with half of window */ | 780 | /* Bound mss with half of window */ |
732 | if (tp->max_window && mss_now > (tp->max_window>>1)) | 781 | if (tp->max_window && mss_now > (tp->max_window>>1)) |
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
734 | 783 | ||
735 | /* And store cached results */ | 784 | /* And store cached results */ |
736 | icsk->icsk_pmtu_cookie = pmtu; | 785 | icsk->icsk_pmtu_cookie = pmtu; |
786 | if (icsk->icsk_mtup.enabled) | ||
787 | mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); | ||
737 | tp->mss_cache = mss_now; | 788 | tp->mss_cache = mss_now; |
738 | 789 | ||
739 | return mss_now; | 790 | return mss_now; |
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
1063 | return 1; | 1114 | return 1; |
1064 | } | 1115 | } |
1065 | 1116 | ||
1117 | /* Create a new MTU probe if we are ready. | ||
1118 | * Returns 0 if we should wait to probe (no cwnd available), | ||
1119 | * 1 if a probe was sent, | ||
1120 | * -1 otherwise */ | ||
1121 | static int tcp_mtu_probe(struct sock *sk) | ||
1122 | { | ||
1123 | struct tcp_sock *tp = tcp_sk(sk); | ||
1124 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1125 | struct sk_buff *skb, *nskb, *next; | ||
1126 | int len; | ||
1127 | int probe_size; | ||
1128 | unsigned int pif; | ||
1129 | int copy; | ||
1130 | int mss_now; | ||
1131 | |||
1132 | /* Not currently probing/verifying, | ||
1133 | * not in recovery, | ||
1134 | * have enough cwnd, and | ||
1135 | * not SACKing (the variable headers throw things off) */ | ||
1136 | if (!icsk->icsk_mtup.enabled || | ||
1137 | icsk->icsk_mtup.probe_size || | ||
1138 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open || | ||
1139 | tp->snd_cwnd < 11 || | ||
1140 | tp->rx_opt.eff_sacks) | ||
1141 | return -1; | ||
1142 | |||
1143 | /* Very simple search strategy: just double the MSS. */ | ||
1144 | mss_now = tcp_current_mss(sk, 0); | ||
1145 | probe_size = 2*tp->mss_cache; | ||
1146 | if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { | ||
1147 | /* TODO: set timer for probe_converge_event */ | ||
1148 | return -1; | ||
1149 | } | ||
1150 | |||
1151 | /* Have enough data in the send queue to probe? */ | ||
1152 | len = 0; | ||
1153 | if ((skb = sk->sk_send_head) == NULL) | ||
1154 | return -1; | ||
1155 | while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) | ||
1156 | skb = skb->next; | ||
1157 | if (len < probe_size) | ||
1158 | return -1; | ||
1159 | |||
1160 | /* Receive window check. */ | ||
1161 | if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { | ||
1162 | if (tp->snd_wnd < probe_size) | ||
1163 | return -1; | ||
1164 | else | ||
1165 | return 0; | ||
1166 | } | ||
1167 | |||
1168 | /* Do we need to wait to drain cwnd? */ | ||
1169 | pif = tcp_packets_in_flight(tp); | ||
1170 | if (pif + 2 > tp->snd_cwnd) { | ||
1171 | /* With no packets in flight, don't stall. */ | ||
1172 | if (pif == 0) | ||
1173 | return -1; | ||
1174 | else | ||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | /* We're allowed to probe. Build it now. */ | ||
1179 | if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) | ||
1180 | return -1; | ||
1181 | sk_charge_skb(sk, nskb); | ||
1182 | |||
1183 | skb = sk->sk_send_head; | ||
1184 | __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); | ||
1185 | sk->sk_send_head = nskb; | ||
1186 | |||
1187 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; | ||
1188 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; | ||
1189 | TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; | ||
1190 | TCP_SKB_CB(nskb)->sacked = 0; | ||
1191 | nskb->csum = 0; | ||
1192 | if (skb->ip_summed == CHECKSUM_HW) | ||
1193 | nskb->ip_summed = CHECKSUM_HW; | ||
1194 | |||
1195 | len = 0; | ||
1196 | while (len < probe_size) { | ||
1197 | next = skb->next; | ||
1198 | |||
1199 | copy = min_t(int, skb->len, probe_size - len); | ||
1200 | if (nskb->ip_summed) | ||
1201 | skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); | ||
1202 | else | ||
1203 | nskb->csum = skb_copy_and_csum_bits(skb, 0, | ||
1204 | skb_put(nskb, copy), copy, nskb->csum); | ||
1205 | |||
1206 | if (skb->len <= copy) { | ||
1207 | /* We've eaten all the data from this skb. | ||
1208 | * Throw it away. */ | ||
1209 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; | ||
1210 | __skb_unlink(skb, &sk->sk_write_queue); | ||
1211 | sk_stream_free_skb(sk, skb); | ||
1212 | } else { | ||
1213 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & | ||
1214 | ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); | ||
1215 | if (!skb_shinfo(skb)->nr_frags) { | ||
1216 | skb_pull(skb, copy); | ||
1217 | if (skb->ip_summed != CHECKSUM_HW) | ||
1218 | skb->csum = csum_partial(skb->data, skb->len, 0); | ||
1219 | } else { | ||
1220 | __pskb_trim_head(skb, copy); | ||
1221 | tcp_set_skb_tso_segs(sk, skb, mss_now); | ||
1222 | } | ||
1223 | TCP_SKB_CB(skb)->seq += copy; | ||
1224 | } | ||
1225 | |||
1226 | len += copy; | ||
1227 | skb = next; | ||
1228 | } | ||
1229 | tcp_init_tso_segs(sk, nskb, nskb->len); | ||
1230 | |||
1231 | /* We're ready to send. If this fails, the probe will | ||
1232 | * be resegmented into mss-sized pieces by tcp_write_xmit(). */ | ||
1233 | TCP_SKB_CB(nskb)->when = tcp_time_stamp; | ||
1234 | if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { | ||
1235 | /* Decrement cwnd here because we are sending | ||
1236 | * effectively two packets. */ | ||
1237 | tp->snd_cwnd--; | ||
1238 | update_send_head(sk, tp, nskb); | ||
1239 | |||
1240 | icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); | ||
1241 | icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq; | ||
1242 | icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; | ||
1243 | |||
1244 | return 1; | ||
1245 | } | ||
1246 | |||
1247 | return -1; | ||
1248 | } | ||
1249 | |||
1250 | |||
1066 | /* This routine writes packets to the network. It advances the | 1251 | /* This routine writes packets to the network. It advances the |
1067 | * send_head. This happens as incoming acks open up the remote | 1252 | * send_head. This happens as incoming acks open up the remote |
1068 | * window for us. | 1253 | * window for us. |
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1076 | struct sk_buff *skb; | 1261 | struct sk_buff *skb; |
1077 | unsigned int tso_segs, sent_pkts; | 1262 | unsigned int tso_segs, sent_pkts; |
1078 | int cwnd_quota; | 1263 | int cwnd_quota; |
1264 | int result; | ||
1079 | 1265 | ||
1080 | /* If we are closed, the bytes will have to remain here. | 1266 | /* If we are closed, the bytes will have to remain here. |
1081 | * In time closedown will finish, we empty the write queue and all | 1267 | * In time closedown will finish, we empty the write queue and all |
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1085 | return 0; | 1271 | return 0; |
1086 | 1272 | ||
1087 | sent_pkts = 0; | 1273 | sent_pkts = 0; |
1274 | |||
1275 | /* Do MTU probing. */ | ||
1276 | if ((result = tcp_mtu_probe(sk)) == 0) { | ||
1277 | return 0; | ||
1278 | } else if (result > 0) { | ||
1279 | sent_pkts = 1; | ||
1280 | } | ||
1281 | |||
1088 | while ((skb = sk->sk_send_head)) { | 1282 | while ((skb = sk->sk_send_head)) { |
1089 | unsigned int limit; | 1283 | unsigned int limit; |
1090 | 1284 | ||
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1455 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | 1649 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) |
1456 | { | 1650 | { |
1457 | struct tcp_sock *tp = tcp_sk(sk); | 1651 | struct tcp_sock *tp = tcp_sk(sk); |
1652 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1458 | unsigned int cur_mss = tcp_current_mss(sk, 0); | 1653 | unsigned int cur_mss = tcp_current_mss(sk, 0); |
1459 | int err; | 1654 | int err; |
1460 | 1655 | ||
1656 | /* Inconclusive MTU probe */ | ||
1657 | if (icsk->icsk_mtup.probe_size) { | ||
1658 | icsk->icsk_mtup.probe_size = 0; | ||
1659 | } | ||
1660 | |||
1461 | /* Do not send more than we queued. 1/4 is reserved for possible | 1661 | /* Do not send more than we queued. 1/4 is reserved for possible |
1462 | * copying overhead: fragmentation, tunneling, mangling etc. | 1662 | * copying overhead: fragmentation, tunneling, mangling etc. |
1463 | */ | 1663 | */ |
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk) | |||
1883 | if (tp->rx_opt.user_mss) | 2083 | if (tp->rx_opt.user_mss) |
1884 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | 2084 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; |
1885 | tp->max_window = 0; | 2085 | tp->max_window = 0; |
2086 | tcp_mtup_init(sk); | ||
1886 | tcp_sync_mss(sk, dst_mtu(dst)); | 2087 | tcp_sync_mss(sk, dst_mtu(dst)); |
1887 | 2088 | ||
1888 | if (!tp->window_clamp) | 2089 | if (!tp->window_clamp) |
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack); | |||
2180 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2381 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2181 | EXPORT_SYMBOL(tcp_sync_mss); | 2382 | EXPORT_SYMBOL(tcp_sync_mss); |
2182 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | 2383 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); |
2384 | EXPORT_SYMBOL(tcp_mtup_init); | ||
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e1880959614a..7c1bde3cd6cb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive) | |||
119 | /* A write timeout has occurred. Process the after effects. */ | 119 | /* A write timeout has occurred. Process the after effects. */ |
120 | static int tcp_write_timeout(struct sock *sk) | 120 | static int tcp_write_timeout(struct sock *sk) |
121 | { | 121 | { |
122 | const struct inet_connection_sock *icsk = inet_csk(sk); | 122 | struct inet_connection_sock *icsk = inet_csk(sk); |
123 | struct tcp_sock *tp = tcp_sk(sk); | ||
123 | int retry_until; | 124 | int retry_until; |
125 | int mss; | ||
124 | 126 | ||
125 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 127 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
126 | if (icsk->icsk_retransmits) | 128 | if (icsk->icsk_retransmits) |
@@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk) | |||
128 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 130 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
129 | } else { | 131 | } else { |
130 | if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { | 132 | if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { |
131 | /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black | 133 | /* Black hole detection */ |
132 | hole detection. :-( | 134 | if (sysctl_tcp_mtu_probing) { |
133 | 135 | if (!icsk->icsk_mtup.enabled) { | |
134 | It is place to make it. It is not made. I do not want | 136 | icsk->icsk_mtup.enabled = 1; |
135 | to make it. It is disgusting. It does not work in any | 137 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
136 | case. Let me to cite the same draft, which requires for | 138 | } else { |
137 | us to implement this: | 139 | mss = min(sysctl_tcp_base_mss, |
138 | 140 | tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); | |
139 | "The one security concern raised by this memo is that ICMP black holes | 141 | mss = max(mss, 68 - tp->tcp_header_len); |
140 | are often caused by over-zealous security administrators who block | 142 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); |
141 | all ICMP messages. It is vitally important that those who design and | 143 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
142 | deploy security systems understand the impact of strict filtering on | 144 | } |
143 | upper-layer protocols. The safest web site in the world is worthless | 145 | } |
144 | if most TCP implementations cannot transfer data from it. It would | ||
145 | be far nicer to have all of the black holes fixed rather than fixing | ||
146 | all of the TCP implementations." | ||
147 | |||
148 | Golden words :-). | ||
149 | */ | ||
150 | 146 | ||
151 | dst_negative_advice(&sk->sk_dst_cache); | 147 | dst_negative_advice(&sk->sk_dst_cache); |
152 | } | 148 | } |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ca9cf6853755..14de50380f4e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
987 | inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + | 987 | inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + |
988 | newnp->opt->opt_flen); | 988 | newnp->opt->opt_flen); |
989 | 989 | ||
990 | tcp_mtup_init(newsk); | ||
990 | tcp_sync_mss(newsk, dst_mtu(dst)); | 991 | tcp_sync_mss(newsk, dst_mtu(dst)); |
991 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 992 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
992 | tcp_initialize_rcv_mss(newsk); | 993 | tcp_initialize_rcv_mss(newsk); |