author     John Heffner <jheffner@psc.edu>        2006-03-20 20:53:41 -0500
committer  David S. Miller <davem@davemloft.net>  2006-03-20 20:53:41 -0500
commit     5d424d5a674f782d0659a3b66d951f412901faee (patch)
tree       579871172044e02e626a90388d19ec55cf2d1fc4
parent     1d60290f27e7dc4bce2c43922d0bfa9abd246fc9 (diff)
[TCP]: MTU probing
Implementation of packetization layer path MTU discovery for TCP, based
on the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
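As a rough illustration of the control data this patch adds in icsk_mtup:
each connection keeps a [search_low, search_high] MTU range, raises
search_low when a probe is acked (tcp_mtup_probe_success) and lowers
search_high when a probe is lost (tcp_mtup_probe_failed). The standalone
user-space C sketch below walks that logic against an assumed path; the
1400-byte black-hole bottleneck and the starting values (552 = the default
tcp_base_mss of 512 plus 40 bytes of IPv4+TCP headers, cf. tcp_mss_to_mtu;
1500 = local interface MTU) are illustrative, not part of the patch.

#include <stdio.h>

struct mtu_search {
	int search_low;		/* largest MTU known to work */
	int search_high;	/* smallest MTU not (yet) known to work */
};

/* Mirrors tcp_mtup_probe_success(): an acked probe raises the floor. */
static void probe_success(struct mtu_search *s, int probe_mtu)
{
	s->search_low = probe_mtu;
}

/* Mirrors tcp_mtup_probe_failed(): a lost probe caps the ceiling. */
static void probe_failed(struct mtu_search *s, int probe_mtu)
{
	s->search_high = probe_mtu - 1;
}

int main(void)
{
	/* Assumed example path: 1500-byte local MTU, but a 1400-byte
	 * bottleneck that silently drops bigger packets (ICMP black hole). */
	const int path_mtu = 1400;
	struct mtu_search s = { .search_low = 552, .search_high = 1500 };

	/* "Very simple search strategy: just double the MSS", per
	 * tcp_mtu_probe(); stop once doubling would overshoot the range. */
	while (2 * s.search_low <= s.search_high) {
		int probe = 2 * s.search_low;

		if (probe <= path_mtu) {
			probe_success(&s, probe);
			printf("probe %4d acked, search_low -> %d\n",
			       probe, s.search_low);
		} else {
			probe_failed(&s, probe);
			printf("probe %4d lost,  search_high -> %d\n",
			       probe, s.search_high);
		}
	}
	printf("converged: path MTU is within [%d, %d]\n",
	       s.search_low, s.search_high);
	return 0;
}

The doubling search converges coarsely by design; finer convergence is left
as a TODO (probe_converge_event) in tcp_mtu_probe(). Probing is off by
default: tcp_mtu_probing = 0 disables it, 1 arms it only after the black
hole heuristic in tcp_write_timeout() fires, and values above 1 enable it
from connection setup.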
-rw-r--r--  include/linux/sysctl.h               |   2
-rw-r--r--  include/net/inet_connection_sock.h   |  13
-rw-r--r--  include/net/tcp.h                    |   9
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c           |  16
-rw-r--r--  net/ipv4/tcp_input.c                 |  49
-rw-r--r--  net/ipv4/tcp_ipv4.c                  |   1
-rw-r--r--  net/ipv4/tcp_output.c                | 236
-rw-r--r--  net/ipv4/tcp_timer.c                 |  36
-rw-r--r--  net/ipv6/tcp_ipv6.c                  |   1
9 files changed, 326 insertions(+), 37 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 8ad4beab2888..6e8880ea49e7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -397,6 +397,8 @@ enum
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
 	NET_IPV4_IPFRAG_MAX_DIST=112,
+	NET_TCP_MTU_PROBING=113,
+	NET_TCP_BASE_MSS=114,
 };
 
 enum {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index fa587c94e9d0..b3abe33f4e5f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_probes_out:	   unanswered 0 window probes
  * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
+ * @icsk_mtup:		   MTU probing control data
  */
 struct inet_connection_sock {
 	/* inet_sock has to be the first member! */
@@ -104,6 +105,18 @@ struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions */
 	} icsk_ack;
+	struct {
+		int		  enabled;
+
+		/* Range of MTUs to search */
+		int		  search_high;
+		int		  search_low;
+
+		/* Information on the current probe. */
+		int		  probe_size;
+		__u32		  probe_seq_start;
+		__u32		  probe_seq_end;
+	} icsk_mtup;
 	u32		  icsk_ca_priv[16];
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 77f21c65bbca..16879fa560de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Minimal RCV_MSS. */
 #define TCP_MIN_RCVMSS		536U
 
+/* The least MTU to use for probing */
+#define TCP_BASE_MSS		512
+
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
 
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_mtu_probing;
+extern int sysctl_tcp_base_mss;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
+extern void tcp_mtup_init(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 16984d4a8a06..ebf2e0b363c4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_MTU_PROBING,
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BASE_MSS,
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e9a54ae7d690..0ac388e3d01d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 	}
 }
 
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* FIXME: breaks with very large cwnd */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		       tcp_mss_to_mtu(sk, tp->mss_cache) /
+		       icsk->icsk_mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		return;
 	}
 
+	/* MTU probe failure: don't reduce cwnd */
+	if (icsk->icsk_ca_state < TCP_CA_CWR &&
+	    icsk->icsk_mtup.probe_size &&
+	    tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+		tcp_mtup_probe_failed(sk);
+		/* Restores the reduction we did in tcp_mtup_probe() */
+		tp->snd_cwnd++;
+		tcp_simple_retransmit(sk);
+		return;
+	}
+
 	/* Otherwise enter Recovery state */
 
 	if (IsReno(tp))
@@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 		tp->retrans_stamp = 0;
 	}
 
+	/* MTU probing checks */
+	if (icsk->icsk_mtup.probe_size) {
+		if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+			tcp_mtup_probe_success(sk, skb);
+		}
+	}
+
 	if (sacked) {
 		if (sacked & TCPCB_RETRANS) {
 			if(sacked & TCPCB_SACKED_RETRANS)
@@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 		tp->rx_opt.sack_ok |= 2;
 
+	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 	tcp_initialize_rcv_mss(sk);
 
@@ -4211,6 +4258,7 @@ discard:
 		if (tp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
 
+		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
@@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 */
 		tp->lsndtime = tcp_time_stamp;
 
+		tcp_mtup_init(sk);
 		tcp_initialize_rcv_mss(sk);
 		tcp_init_buffer_space(sk);
 		tcp_fast_path_on(tp);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 233bdf259965..57e7a26e8213 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 	newinet->id = newtp->write_seq ^ jiffies;
 
+	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c8895..8197b5e12f1f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
  */
 int sysctl_tcp_tso_win_divisor = 3;
 
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	return 0;
 }
 
+/* Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= icsk->icsk_ext_hdr_len;
+
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	return mss_now;
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mtu;
+
+	mtu = mss +
+	      tp->tcp_header_len +
+	      icsk->icsk_ext_hdr_len +
+	      icsk->icsk_af_ops->net_header_len;
+
+	return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+				      icsk->icsk_af_ops->net_header_len;
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+	icsk->icsk_mtup.probe_size = 0;
+}
+
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	/* Calculate base mss without TCP options:
-	   It is MMS_S - sizeof(tcphdr) of rfc1122
-	 */
-	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
-		       sizeof(struct tcphdr));
+	int mss_now;
 
-	/* Clamp it (mss_clamp does not include tcp options) */
-	if (mss_now > tp->rx_opt.mss_clamp)
-		mss_now = tp->rx_opt.mss_clamp;
+	if (icsk->icsk_mtup.search_high > pmtu)
+		icsk->icsk_mtup.search_high = pmtu;
 
-	/* Now subtract optional transport overhead */
-	mss_now -= icsk->icsk_ext_hdr_len;
-
-	/* Then reserve room for full set of TCP options and 8 bytes of data */
-	if (mss_now < 48)
-		mss_now = 48;
-
-	/* Now subtract TCP options size, not including SACKs */
-	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+	mss_now = tcp_mtu_to_mss(sk, pmtu);
 
 	/* Bound mss with half of window */
 	if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	icsk->icsk_pmtu_cookie = pmtu;
+	if (icsk->icsk_mtup.enabled)
+		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
 	tp->mss_cache = mss_now;
 
 	return mss_now;
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 	return 1;
 }
 
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ * 1 if a probe was sent,
+ * -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	int len;
+	int probe_size;
+	unsigned int pif;
+	int copy;
+	int mss_now;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off) */
+	if (!icsk->icsk_mtup.enabled ||
+	    icsk->icsk_mtup.probe_size ||
+	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+	    tp->snd_cwnd < 11 ||
+	    tp->rx_opt.eff_sacks)
+		return -1;
+
+	/* Very simple search strategy: just double the MSS. */
+	mss_now = tcp_current_mss(sk, 0);
+	probe_size = 2*tp->mss_cache;
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+		/* TODO: set timer for probe_converge_event */
+		return -1;
+	}
+
+	/* Have enough data in the send queue to probe? */
+	len = 0;
+	if ((skb = sk->sk_send_head) == NULL)
+		return -1;
+	while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+		skb = skb->next;
+	if (len < probe_size)
+		return -1;
+
+	/* Receive window check. */
+	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+		if (tp->snd_wnd < probe_size)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* Do we need to wait to drain cwnd? */
+	pif = tcp_packets_in_flight(tp);
+	if (pif + 2 > tp->snd_cwnd) {
+		/* With no packets in flight, don't stall. */
+		if (pif == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* We're allowed to probe.  Build it now. */
+	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+		return -1;
+	sk_charge_skb(sk, nskb);
+
+	skb = sk->sk_send_head;
+	__skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+	sk->sk_send_head = nskb;
+
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->sacked = 0;
+	nskb->csum = 0;
+	if (skb->ip_summed == CHECKSUM_HW)
+		nskb->ip_summed = CHECKSUM_HW;
+
+	len = 0;
+	while (len < probe_size) {
+		next = skb->next;
+
+		copy = min_t(int, skb->len, probe_size - len);
+		if (nskb->ip_summed)
+			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+		else
+			nskb->csum = skb_copy_and_csum_bits(skb, 0,
+					 skb_put(nskb, copy), copy, nskb->csum);
+
+		if (skb->len <= copy) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+			__skb_unlink(skb, &sk->sk_write_queue);
+			sk_stream_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+			if (!skb_shinfo(skb)->nr_frags) {
+				skb_pull(skb, copy);
+				if (skb->ip_summed != CHECKSUM_HW)
+					skb->csum = csum_partial(skb->data, skb->len, 0);
+			} else {
+				__pskb_trim_head(skb, copy);
+				tcp_set_skb_tso_segs(sk, skb, mss_now);
+			}
+			TCP_SKB_CB(skb)->seq += copy;
+		}
+
+		len += copy;
+		skb = next;
+	}
+	tcp_init_tso_segs(sk, nskb, nskb->len);
+
+	/* We're ready to send.  If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+		/* Decrement cwnd here because we are sending
+		 * effectively two packets. */
+		tp->snd_cwnd--;
+		update_send_head(sk, tp, nskb);
+
+		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+		icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+		return 1;
+	}
+
+	return -1;
+}
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
+	int result;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		return 0;
 
 	sent_pkts = 0;
+
+	/* Do MTU probing. */
+	if ((result = tcp_mtu_probe(sk)) == 0) {
+		return 0;
+	} else if (result > 0) {
+		sent_pkts = 1;
+	}
+
 	while ((skb = sk->sk_send_head)) {
 		unsigned int limit;
 
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	unsigned int cur_mss = tcp_current_mss(sk, 0);
 	int err;
 
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size) {
+		icsk->icsk_mtup.probe_size = 0;
+	}
+
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
 	 */
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 	tp->max_window = 0;
+	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
 	if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
 EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e1880959614a..7c1bde3cd6cb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	int retry_until;
+	int mss;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
@@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk)
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
 		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
-			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
-			   hole detection. :-(
-
-			   It is place to make it. It is not made. I do not want
-			   to make it. It is disgusting. It does not work in any
-			   case. Let me to cite the same draft, which requires for
-			   us to implement this:
-
-   "The one security concern raised by this memo is that ICMP black holes
-   are often caused by over-zealous security administrators who block
-   all ICMP messages. It is vitally important that those who design and
-   deploy security systems understand the impact of strict filtering on
-   upper-layer protocols. The safest web site in the world is worthless
-   if most TCP implementations cannot transfer data from it. It would
-   be far nicer to have all of the black holes fixed rather than fixing
-   all of the TCP implementations."
-
-			   Golden words :-).
-			 */
+			/* Black hole detection */
+			if (sysctl_tcp_mtu_probing) {
+				if (!icsk->icsk_mtup.enabled) {
+					icsk->icsk_mtup.enabled = 1;
+					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+				} else {
+					mss = min(sysctl_tcp_base_mss,
+						  tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
+					mss = max(mss, 68 - tp->tcp_header_len);
+					icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+				}
+			}
 
 			dst_negative_advice(&sk->sk_dst_cache);
 		}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ca9cf6853755..14de50380f4e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
 						     newnp->opt->opt_flen);
 
+	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);