about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- net/ipv4/tcp_output.c 259
1 files changed, 236 insertions, 23 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c8895..9d79546d384e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,12 +45,23 @@
45/* People can turn this off for buggy TCP's found in printers etc. */ 45/* People can turn this off for buggy TCP's found in printers etc. */
46int sysctl_tcp_retrans_collapse = 1; 46int sysctl_tcp_retrans_collapse = 1;
47 47
48/* People can turn this on to work with those rare, broken TCPs that
49 * interpret the window field as a signed quantity.
50 */
51int sysctl_tcp_workaround_signed_windows = 0;
52
48/* This limits the percentage of the congestion window which we 53/* This limits the percentage of the congestion window which we
49 * will allow a single TSO frame to consume. Building TSO frames 54 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 55 * which are too large can cause TCP streams to be bursty.
51 */ 56 */
52int sysctl_tcp_tso_win_divisor = 3; 57int sysctl_tcp_tso_win_divisor = 3;
53 58
59int sysctl_tcp_mtu_probing = 0;
60int sysctl_tcp_base_mss = 512;
61
62EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
63EXPORT_SYMBOL(sysctl_tcp_base_mss);
64
54static void update_send_head(struct sock *sk, struct tcp_sock *tp, 65static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 66 struct sk_buff *skb)
56{ 67{
@@ -171,12 +182,18 @@ void tcp_select_initial_window(int __space, __u32 mss,
171 space = (space / mss) * mss; 182 space = (space / mss) * mss;
172 183
173 /* NOTE: offering an initial window larger than 32767 184 /* NOTE: offering an initial window larger than 32767
174 * will break some buggy TCP stacks. We try to be nice. 185 * will break some buggy TCP stacks. If the admin tells us
175 * If we are not window scaling, then this truncates 186 * it is likely we could be speaking with such a buggy stack
176 * our initial window offering to 32k. There should also 187 * we will truncate our initial window offering to 32K-1
177 * be a sysctl option to stop being nice. 188 * unless the remote has sent us a window scaling option,
189 * which we interpret as a sign the remote TCP is not
190 * misinterpreting the window field as a signed quantity.
178 */ 191 */
179 (*rcv_wnd) = min(space, MAX_TCP_WINDOW); 192 if (sysctl_tcp_workaround_signed_windows)
193 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
194 else
195 (*rcv_wnd) = space;
196
180 (*rcv_wscale) = 0; 197 (*rcv_wscale) = 0;
181 if (wscale_ok) { 198 if (wscale_ok) {
182 /* Set window scaling on max possible window 199 /* Set window scaling on max possible window
@@ -235,7 +252,7 @@ static u16 tcp_select_window(struct sock *sk)
235 /* Make sure we do not exceed the maximum possible 252 /* Make sure we do not exceed the maximum possible
236 * scaled window. 253 * scaled window.
237 */ 254 */
238 if (!tp->rx_opt.rcv_wscale) 255 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
239 new_win = min(new_win, MAX_TCP_WINDOW); 256 new_win = min(new_win, MAX_TCP_WINDOW);
240 else 257 else
241 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); 258 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -681,6 +698,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
681 return 0; 698 return 0;
682} 699}
683 700
701/* Not accounting for SACKs here. */
702int tcp_mtu_to_mss(struct sock *sk, int pmtu)
703{
704 struct tcp_sock *tp = tcp_sk(sk);
705 struct inet_connection_sock *icsk = inet_csk(sk);
706 int mss_now;
707
708 /* Calculate base mss without TCP options:
709 It is MMS_S - sizeof(tcphdr) of rfc1122
710 */
711 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
712
713 /* Clamp it (mss_clamp does not include tcp options) */
714 if (mss_now > tp->rx_opt.mss_clamp)
715 mss_now = tp->rx_opt.mss_clamp;
716
717 /* Now subtract optional transport overhead */
718 mss_now -= icsk->icsk_ext_hdr_len;
719
720 /* Then reserve room for full set of TCP options and 8 bytes of data */
721 if (mss_now < 48)
722 mss_now = 48;
723
724 /* Now subtract TCP options size, not including SACKs */
725 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
726
727 return mss_now;
728}
729
730/* Inverse of above */
731int tcp_mss_to_mtu(struct sock *sk, int mss)
732{
733 struct tcp_sock *tp = tcp_sk(sk);
734 struct inet_connection_sock *icsk = inet_csk(sk);
735 int mtu;
736
737 mtu = mss +
738 tp->tcp_header_len +
739 icsk->icsk_ext_hdr_len +
740 icsk->icsk_af_ops->net_header_len;
741
742 return mtu;
743}
744
745void tcp_mtup_init(struct sock *sk)
746{
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct inet_connection_sock *icsk = inet_csk(sk);
749
750 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
751 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
752 icsk->icsk_af_ops->net_header_len;
753 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
754 icsk->icsk_mtup.probe_size = 0;
755}
756
684/* This function synchronizes snd mss to current pmtu/exthdr set. 757/* This function synchronizes snd mss to current pmtu/exthdr set.
685 758
686 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count 759 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
@@ -708,25 +781,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
708{ 781{
709 struct tcp_sock *tp = tcp_sk(sk); 782 struct tcp_sock *tp = tcp_sk(sk);
710 struct inet_connection_sock *icsk = inet_csk(sk); 783 struct inet_connection_sock *icsk = inet_csk(sk);
711 /* Calculate base mss without TCP options: 784 int mss_now;
712 It is MMS_S - sizeof(tcphdr) of rfc1122
713 */
714 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715 sizeof(struct tcphdr));
716
717 /* Clamp it (mss_clamp does not include tcp options) */
718 if (mss_now > tp->rx_opt.mss_clamp)
719 mss_now = tp->rx_opt.mss_clamp;
720
721 /* Now subtract optional transport overhead */
722 mss_now -= icsk->icsk_ext_hdr_len;
723 785
724 /* Then reserve room for full set of TCP options and 8 bytes of data */ 786 if (icsk->icsk_mtup.search_high > pmtu)
725 if (mss_now < 48) 787 icsk->icsk_mtup.search_high = pmtu;
726 mss_now = 48;
727 788
728 /* Now subtract TCP options size, not including SACKs */ 789 mss_now = tcp_mtu_to_mss(sk, pmtu);
729 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
730 790
731 /* Bound mss with half of window */ 791 /* Bound mss with half of window */
732 if (tp->max_window && mss_now > (tp->max_window>>1)) 792 if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +794,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
734 794
735 /* And store cached results */ 795 /* And store cached results */
736 icsk->icsk_pmtu_cookie = pmtu; 796 icsk->icsk_pmtu_cookie = pmtu;
797 if (icsk->icsk_mtup.enabled)
798 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
737 tp->mss_cache = mss_now; 799 tp->mss_cache = mss_now;
738 800
739 return mss_now; 801 return mss_now;
@@ -1063,6 +1125,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
1063 return 1; 1125 return 1;
1064} 1126}
1065 1127
1128/* Create a new MTU probe if we are ready.
1129 * Returns 0 if we should wait to probe (cwnd or receive window not yet available),
1130 * 1 if a probe was sent,
1131 * -1 otherwise */
1132static int tcp_mtu_probe(struct sock *sk)
1133{
1134 struct tcp_sock *tp = tcp_sk(sk);
1135 struct inet_connection_sock *icsk = inet_csk(sk);
1136 struct sk_buff *skb, *nskb, *next;
1137 int len;
1138 int probe_size;
1139 unsigned int pif;
1140 int copy;
1141 int mss_now;
1142
1143 /* Not currently probing/verifying,
1144 * not in recovery,
1145 * have enough cwnd, and
1146 * not SACKing (the variable headers throw things off) */
1147 if (!icsk->icsk_mtup.enabled ||
1148 icsk->icsk_mtup.probe_size ||
1149 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1150 tp->snd_cwnd < 11 ||
1151 tp->rx_opt.eff_sacks)
1152 return -1;
1153
1154 /* Very simple search strategy: just double the MSS. */
1155 mss_now = tcp_current_mss(sk, 0);
1156 probe_size = 2*tp->mss_cache;
1157 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1158 /* TODO: set timer for probe_converge_event */
1159 return -1;
1160 }
1161
1162 /* Have enough data in the send queue to probe? */
1163 len = 0;
1164 if ((skb = sk->sk_send_head) == NULL)
1165 return -1;
1166 while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
1167 skb = skb->next;
1168 if (len < probe_size)
1169 return -1;
1170
1171 /* Receive window check. */
1172 if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
1173 if (tp->snd_wnd < probe_size)
1174 return -1;
1175 else
1176 return 0;
1177 }
1178
1179 /* Do we need to wait to drain cwnd? */
1180 pif = tcp_packets_in_flight(tp);
1181 if (pif + 2 > tp->snd_cwnd) {
1182 /* With no packets in flight, don't stall. */
1183 if (pif == 0)
1184 return -1;
1185 else
1186 return 0;
1187 }
1188
1189 /* We're allowed to probe. Build it now. */
1190 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1191 return -1;
1192 sk_charge_skb(sk, nskb);
1193
1194 skb = sk->sk_send_head;
1195 __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
1196 sk->sk_send_head = nskb;
1197
1198 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1199 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1200 TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
1201 TCP_SKB_CB(nskb)->sacked = 0;
1202 nskb->csum = 0;
1203 if (skb->ip_summed == CHECKSUM_HW)
1204 nskb->ip_summed = CHECKSUM_HW;
1205
1206 len = 0;
1207 while (len < probe_size) {
1208 next = skb->next;
1209
1210 copy = min_t(int, skb->len, probe_size - len);
1211 if (nskb->ip_summed)
1212 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1213 else
1214 nskb->csum = skb_copy_and_csum_bits(skb, 0,
1215 skb_put(nskb, copy), copy, nskb->csum);
1216
1217 if (skb->len <= copy) {
1218 /* We've eaten all the data from this skb.
1219 * Throw it away. */
1220 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1221 __skb_unlink(skb, &sk->sk_write_queue);
1222 sk_stream_free_skb(sk, skb);
1223 } else {
1224 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1225 ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
1226 if (!skb_shinfo(skb)->nr_frags) {
1227 skb_pull(skb, copy);
1228 if (skb->ip_summed != CHECKSUM_HW)
1229 skb->csum = csum_partial(skb->data, skb->len, 0);
1230 } else {
1231 __pskb_trim_head(skb, copy);
1232 tcp_set_skb_tso_segs(sk, skb, mss_now);
1233 }
1234 TCP_SKB_CB(skb)->seq += copy;
1235 }
1236
1237 len += copy;
1238 skb = next;
1239 }
1240 tcp_init_tso_segs(sk, nskb, nskb->len);
1241
1242 /* We're ready to send. If this fails, the probe will
1243 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1244 TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1245 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1246 /* Decrement cwnd here because we are sending
1247 * effectively two packets. */
1248 tp->snd_cwnd--;
1249 update_send_head(sk, tp, nskb);
1250
1251 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1252 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1253 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1254
1255 return 1;
1256 }
1257
1258 return -1;
1259}
1260
1261
1066/* This routine writes packets to the network. It advances the 1262/* This routine writes packets to the network. It advances the
1067 * send_head. This happens as incoming acks open up the remote 1263 * send_head. This happens as incoming acks open up the remote
1068 * window for us. 1264 * window for us.
@@ -1076,6 +1272,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1076 struct sk_buff *skb; 1272 struct sk_buff *skb;
1077 unsigned int tso_segs, sent_pkts; 1273 unsigned int tso_segs, sent_pkts;
1078 int cwnd_quota; 1274 int cwnd_quota;
1275 int result;
1079 1276
1080 /* If we are closed, the bytes will have to remain here. 1277 /* If we are closed, the bytes will have to remain here.
1081 * In time closedown will finish, we empty the write queue and all 1278 * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1282,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1085 return 0; 1282 return 0;
1086 1283
1087 sent_pkts = 0; 1284 sent_pkts = 0;
1285
1286 /* Do MTU probing. */
1287 if ((result = tcp_mtu_probe(sk)) == 0) {
1288 return 0;
1289 } else if (result > 0) {
1290 sent_pkts = 1;
1291 }
1292
1088 while ((skb = sk->sk_send_head)) { 1293 while ((skb = sk->sk_send_head)) {
1089 unsigned int limit; 1294 unsigned int limit;
1090 1295
@@ -1455,9 +1660,15 @@ void tcp_simple_retransmit(struct sock *sk)
1455int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 1660int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1456{ 1661{
1457 struct tcp_sock *tp = tcp_sk(sk); 1662 struct tcp_sock *tp = tcp_sk(sk);
1663 struct inet_connection_sock *icsk = inet_csk(sk);
1458 unsigned int cur_mss = tcp_current_mss(sk, 0); 1664 unsigned int cur_mss = tcp_current_mss(sk, 0);
1459 int err; 1665 int err;
1460 1666
1667 /* Inconclusive MTU probe */
1668 if (icsk->icsk_mtup.probe_size) {
1669 icsk->icsk_mtup.probe_size = 0;
1670 }
1671
1461 /* Do not send more than we queued. 1/4 is reserved for possible 1672 /* Do not send more than we queued. 1/4 is reserved for possible
1462 * copying overhead: fragmentation, tunneling, mangling etc. 1673 * copying overhead: fragmentation, tunneling, mangling etc.
1463 */ 1674 */
@@ -1883,6 +2094,7 @@ static void tcp_connect_init(struct sock *sk)
1883 if (tp->rx_opt.user_mss) 2094 if (tp->rx_opt.user_mss)
1884 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 2095 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
1885 tp->max_window = 0; 2096 tp->max_window = 0;
2097 tcp_mtup_init(sk);
1886 tcp_sync_mss(sk, dst_mtu(dst)); 2098 tcp_sync_mss(sk, dst_mtu(dst));
1887 2099
1888 if (!tp->window_clamp) 2100 if (!tp->window_clamp)
@@ -2180,3 +2392,4 @@ EXPORT_SYMBOL(tcp_make_synack);
2180EXPORT_SYMBOL(tcp_simple_retransmit); 2392EXPORT_SYMBOL(tcp_simple_retransmit);
2181EXPORT_SYMBOL(tcp_sync_mss); 2393EXPORT_SYMBOL(tcp_sync_mss);
2182EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); 2394EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
2395EXPORT_SYMBOL(tcp_mtup_init);