diff options
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 259 |
1 files changed, 236 insertions, 23 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f498a6c8895..9d79546d384e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -45,12 +45,23 @@ | |||
45 | /* People can turn this off for buggy TCP's found in printers etc. */ | 45 | /* People can turn this off for buggy TCP's found in printers etc. */ |
46 | int sysctl_tcp_retrans_collapse = 1; | 46 | int sysctl_tcp_retrans_collapse = 1; |
47 | 47 | ||
48 | /* People can turn this on to work with those rare, broken TCPs that | ||
49 | * interpret the window field as a signed quantity. | ||
50 | */ | ||
51 | int sysctl_tcp_workaround_signed_windows = 0; | ||
52 | |||
48 | /* This limits the percentage of the congestion window which we | 53 | /* This limits the percentage of the congestion window which we |
49 | * will allow a single TSO frame to consume. Building TSO frames | 54 | * will allow a single TSO frame to consume. Building TSO frames |
50 | * which are too large can cause TCP streams to be bursty. | 55 | * which are too large can cause TCP streams to be bursty. |
51 | */ | 56 | */ |
52 | int sysctl_tcp_tso_win_divisor = 3; | 57 | int sysctl_tcp_tso_win_divisor = 3; |
53 | 58 | ||
59 | int sysctl_tcp_mtu_probing = 0; | ||
60 | int sysctl_tcp_base_mss = 512; | ||
61 | |||
62 | EXPORT_SYMBOL(sysctl_tcp_mtu_probing); | ||
63 | EXPORT_SYMBOL(sysctl_tcp_base_mss); | ||
64 | |||
54 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, | 65 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
55 | struct sk_buff *skb) | 66 | struct sk_buff *skb) |
56 | { | 67 | { |
@@ -171,12 +182,18 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
171 | space = (space / mss) * mss; | 182 | space = (space / mss) * mss; |
172 | 183 | ||
173 | /* NOTE: offering an initial window larger than 32767 | 184 | /* NOTE: offering an initial window larger than 32767 |
174 | * will break some buggy TCP stacks. We try to be nice. | 185 | * will break some buggy TCP stacks. If the admin tells us |
175 | * If we are not window scaling, then this truncates | 186 | * it is likely we could be speaking with such a buggy stack |
176 | * our initial window offering to 32k. There should also | 187 | * we will truncate our initial window offering to 32K-1 |
177 | * be a sysctl option to stop being nice. | 188 | * unless the remote has sent us a window scaling option, |
189 | * which we interpret as a sign the remote TCP is not | ||
190 | * misinterpreting the window field as a signed quantity. | ||
178 | */ | 191 | */ |
179 | (*rcv_wnd) = min(space, MAX_TCP_WINDOW); | 192 | if (sysctl_tcp_workaround_signed_windows) |
193 | (*rcv_wnd) = min(space, MAX_TCP_WINDOW); | ||
194 | else | ||
195 | (*rcv_wnd) = space; | ||
196 | |||
180 | (*rcv_wscale) = 0; | 197 | (*rcv_wscale) = 0; |
181 | if (wscale_ok) { | 198 | if (wscale_ok) { |
182 | /* Set window scaling on max possible window | 199 | /* Set window scaling on max possible window |
@@ -235,7 +252,7 @@ static u16 tcp_select_window(struct sock *sk) | |||
235 | /* Make sure we do not exceed the maximum possible | 252 | /* Make sure we do not exceed the maximum possible |
236 | * scaled window. | 253 | * scaled window. |
237 | */ | 254 | */ |
238 | if (!tp->rx_opt.rcv_wscale) | 255 | if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) |
239 | new_win = min(new_win, MAX_TCP_WINDOW); | 256 | new_win = min(new_win, MAX_TCP_WINDOW); |
240 | else | 257 | else |
241 | new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); | 258 | new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); |
@@ -681,6 +698,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
681 | return 0; | 698 | return 0; |
682 | } | 699 | } |
683 | 700 | ||
701 | /* Not accounting for SACKs here. */ | ||
702 | int tcp_mtu_to_mss(struct sock *sk, int pmtu) | ||
703 | { | ||
704 | struct tcp_sock *tp = tcp_sk(sk); | ||
705 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
706 | int mss_now; | ||
707 | |||
708 | /* Calculate base mss without TCP options: | ||
709 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
710 | */ | ||
711 | mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); | ||
712 | |||
713 | /* Clamp it (mss_clamp does not include tcp options) */ | ||
714 | if (mss_now > tp->rx_opt.mss_clamp) | ||
715 | mss_now = tp->rx_opt.mss_clamp; | ||
716 | |||
717 | /* Now subtract optional transport overhead */ | ||
718 | mss_now -= icsk->icsk_ext_hdr_len; | ||
719 | |||
720 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | ||
721 | if (mss_now < 48) | ||
722 | mss_now = 48; | ||
723 | |||
724 | /* Now subtract TCP options size, not including SACKs */ | ||
725 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
726 | |||
727 | return mss_now; | ||
728 | } | ||
729 | |||
730 | /* Inverse of above */ | ||
731 | int tcp_mss_to_mtu(struct sock *sk, int mss) | ||
732 | { | ||
733 | struct tcp_sock *tp = tcp_sk(sk); | ||
734 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
735 | int mtu; | ||
736 | |||
737 | mtu = mss + | ||
738 | tp->tcp_header_len + | ||
739 | icsk->icsk_ext_hdr_len + | ||
740 | icsk->icsk_af_ops->net_header_len; | ||
741 | |||
742 | return mtu; | ||
743 | } | ||
744 | |||
745 | void tcp_mtup_init(struct sock *sk) | ||
746 | { | ||
747 | struct tcp_sock *tp = tcp_sk(sk); | ||
748 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
749 | |||
750 | icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; | ||
751 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + | ||
752 | icsk->icsk_af_ops->net_header_len; | ||
753 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); | ||
754 | icsk->icsk_mtup.probe_size = 0; | ||
755 | } | ||
756 | |||
684 | /* This function synchronizes snd mss to the current pmtu/exthdr set. | 757 | /* This function synchronizes snd mss to the current pmtu/exthdr set. |
685 | 758 | ||
686 | tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT count | 759 | tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT count |
@@ -708,25 +781,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
708 | { | 781 | { |
709 | struct tcp_sock *tp = tcp_sk(sk); | 782 | struct tcp_sock *tp = tcp_sk(sk); |
710 | struct inet_connection_sock *icsk = inet_csk(sk); | 783 | struct inet_connection_sock *icsk = inet_csk(sk); |
711 | /* Calculate base mss without TCP options: | 784 | int mss_now; |
712 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
713 | */ | ||
714 | int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - | ||
715 | sizeof(struct tcphdr)); | ||
716 | |||
717 | /* Clamp it (mss_clamp does not include tcp options) */ | ||
718 | if (mss_now > tp->rx_opt.mss_clamp) | ||
719 | mss_now = tp->rx_opt.mss_clamp; | ||
720 | |||
721 | /* Now subtract optional transport overhead */ | ||
722 | mss_now -= icsk->icsk_ext_hdr_len; | ||
723 | 785 | ||
724 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | 786 | if (icsk->icsk_mtup.search_high > pmtu) |
725 | if (mss_now < 48) | 787 | icsk->icsk_mtup.search_high = pmtu; |
726 | mss_now = 48; | ||
727 | 788 | ||
728 | /* Now subtract TCP options size, not including SACKs */ | 789 | mss_now = tcp_mtu_to_mss(sk, pmtu); |
729 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
730 | 790 | ||
731 | /* Bound mss with half of window */ | 791 | /* Bound mss with half of window */ |
732 | if (tp->max_window && mss_now > (tp->max_window>>1)) | 792 | if (tp->max_window && mss_now > (tp->max_window>>1)) |
@@ -734,6 +794,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
734 | 794 | ||
735 | /* And store cached results */ | 795 | /* And store cached results */ |
736 | icsk->icsk_pmtu_cookie = pmtu; | 796 | icsk->icsk_pmtu_cookie = pmtu; |
797 | if (icsk->icsk_mtup.enabled) | ||
798 | mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); | ||
737 | tp->mss_cache = mss_now; | 799 | tp->mss_cache = mss_now; |
738 | 800 | ||
739 | return mss_now; | 801 | return mss_now; |
@@ -1063,6 +1125,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
1063 | return 1; | 1125 | return 1; |
1064 | } | 1126 | } |
1065 | 1127 | ||
1128 | /* Create a new MTU probe if we are ready. | ||
1129 | * Returns 0 if we should wait to probe (cwnd or receive window not yet available), | ||
1130 | * 1 if a probe was sent, | ||
1131 | * -1 otherwise */ | ||
1132 | static int tcp_mtu_probe(struct sock *sk) | ||
1133 | { | ||
1134 | struct tcp_sock *tp = tcp_sk(sk); | ||
1135 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1136 | struct sk_buff *skb, *nskb, *next; | ||
1137 | int len; | ||
1138 | int probe_size; | ||
1139 | unsigned int pif; | ||
1140 | int copy; | ||
1141 | int mss_now; | ||
1142 | |||
1143 | /* Not currently probing/verifying, | ||
1144 | * not in recovery, | ||
1145 | * have enough cwnd, and | ||
1146 | * not SACKing (the variable headers throw things off) */ | ||
1147 | if (!icsk->icsk_mtup.enabled || | ||
1148 | icsk->icsk_mtup.probe_size || | ||
1149 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open || | ||
1150 | tp->snd_cwnd < 11 || | ||
1151 | tp->rx_opt.eff_sacks) | ||
1152 | return -1; | ||
1153 | |||
1154 | /* Very simple search strategy: just double the MSS. */ | ||
1155 | mss_now = tcp_current_mss(sk, 0); | ||
1156 | probe_size = 2*tp->mss_cache; | ||
1157 | if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { | ||
1158 | /* TODO: set timer for probe_converge_event */ | ||
1159 | return -1; | ||
1160 | } | ||
1161 | |||
1162 | /* Have enough data in the send queue to probe? */ | ||
1163 | len = 0; | ||
1164 | if ((skb = sk->sk_send_head) == NULL) | ||
1165 | return -1; | ||
1166 | while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) | ||
1167 | skb = skb->next; | ||
1168 | if (len < probe_size) | ||
1169 | return -1; | ||
1170 | |||
1171 | /* Receive window check. */ | ||
1172 | if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { | ||
1173 | if (tp->snd_wnd < probe_size) | ||
1174 | return -1; | ||
1175 | else | ||
1176 | return 0; | ||
1177 | } | ||
1178 | |||
1179 | /* Do we need to wait to drain cwnd? */ | ||
1180 | pif = tcp_packets_in_flight(tp); | ||
1181 | if (pif + 2 > tp->snd_cwnd) { | ||
1182 | /* With no packets in flight, don't stall. */ | ||
1183 | if (pif == 0) | ||
1184 | return -1; | ||
1185 | else | ||
1186 | return 0; | ||
1187 | } | ||
1188 | |||
1189 | /* We're allowed to probe. Build it now. */ | ||
1190 | if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) | ||
1191 | return -1; | ||
1192 | sk_charge_skb(sk, nskb); | ||
1193 | |||
1194 | skb = sk->sk_send_head; | ||
1195 | __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); | ||
1196 | sk->sk_send_head = nskb; | ||
1197 | |||
1198 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; | ||
1199 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; | ||
1200 | TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; | ||
1201 | TCP_SKB_CB(nskb)->sacked = 0; | ||
1202 | nskb->csum = 0; | ||
1203 | if (skb->ip_summed == CHECKSUM_HW) | ||
1204 | nskb->ip_summed = CHECKSUM_HW; | ||
1205 | |||
1206 | len = 0; | ||
1207 | while (len < probe_size) { | ||
1208 | next = skb->next; | ||
1209 | |||
1210 | copy = min_t(int, skb->len, probe_size - len); | ||
1211 | if (nskb->ip_summed) | ||
1212 | skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); | ||
1213 | else | ||
1214 | nskb->csum = skb_copy_and_csum_bits(skb, 0, | ||
1215 | skb_put(nskb, copy), copy, nskb->csum); | ||
1216 | |||
1217 | if (skb->len <= copy) { | ||
1218 | /* We've eaten all the data from this skb. | ||
1219 | * Throw it away. */ | ||
1220 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; | ||
1221 | __skb_unlink(skb, &sk->sk_write_queue); | ||
1222 | sk_stream_free_skb(sk, skb); | ||
1223 | } else { | ||
1224 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & | ||
1225 | ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); | ||
1226 | if (!skb_shinfo(skb)->nr_frags) { | ||
1227 | skb_pull(skb, copy); | ||
1228 | if (skb->ip_summed != CHECKSUM_HW) | ||
1229 | skb->csum = csum_partial(skb->data, skb->len, 0); | ||
1230 | } else { | ||
1231 | __pskb_trim_head(skb, copy); | ||
1232 | tcp_set_skb_tso_segs(sk, skb, mss_now); | ||
1233 | } | ||
1234 | TCP_SKB_CB(skb)->seq += copy; | ||
1235 | } | ||
1236 | |||
1237 | len += copy; | ||
1238 | skb = next; | ||
1239 | } | ||
1240 | tcp_init_tso_segs(sk, nskb, nskb->len); | ||
1241 | |||
1242 | /* We're ready to send. If this fails, the probe will | ||
1243 | * be resegmented into mss-sized pieces by tcp_write_xmit(). */ | ||
1244 | TCP_SKB_CB(nskb)->when = tcp_time_stamp; | ||
1245 | if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { | ||
1246 | /* Decrement cwnd here because we are sending | ||
1247 | * effectively two packets. */ | ||
1248 | tp->snd_cwnd--; | ||
1249 | update_send_head(sk, tp, nskb); | ||
1250 | |||
1251 | icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); | ||
1252 | tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; | ||
1253 | tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; | ||
1254 | |||
1255 | return 1; | ||
1256 | } | ||
1257 | |||
1258 | return -1; | ||
1259 | } | ||
1260 | |||
1261 | |||
1066 | /* This routine writes packets to the network. It advances the | 1262 | /* This routine writes packets to the network. It advances the |
1067 | * send_head. This happens as incoming acks open up the remote | 1263 | * send_head. This happens as incoming acks open up the remote |
1068 | * window for us. | 1264 | * window for us. |
@@ -1076,6 +1272,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1076 | struct sk_buff *skb; | 1272 | struct sk_buff *skb; |
1077 | unsigned int tso_segs, sent_pkts; | 1273 | unsigned int tso_segs, sent_pkts; |
1078 | int cwnd_quota; | 1274 | int cwnd_quota; |
1275 | int result; | ||
1079 | 1276 | ||
1080 | /* If we are closed, the bytes will have to remain here. | 1277 | /* If we are closed, the bytes will have to remain here. |
1081 | * In time closedown will finish, we empty the write queue and all | 1278 | * In time closedown will finish, we empty the write queue and all |
@@ -1085,6 +1282,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1085 | return 0; | 1282 | return 0; |
1086 | 1283 | ||
1087 | sent_pkts = 0; | 1284 | sent_pkts = 0; |
1285 | |||
1286 | /* Do MTU probing. */ | ||
1287 | if ((result = tcp_mtu_probe(sk)) == 0) { | ||
1288 | return 0; | ||
1289 | } else if (result > 0) { | ||
1290 | sent_pkts = 1; | ||
1291 | } | ||
1292 | |||
1088 | while ((skb = sk->sk_send_head)) { | 1293 | while ((skb = sk->sk_send_head)) { |
1089 | unsigned int limit; | 1294 | unsigned int limit; |
1090 | 1295 | ||
@@ -1455,9 +1660,15 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1455 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | 1660 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) |
1456 | { | 1661 | { |
1457 | struct tcp_sock *tp = tcp_sk(sk); | 1662 | struct tcp_sock *tp = tcp_sk(sk); |
1663 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1458 | unsigned int cur_mss = tcp_current_mss(sk, 0); | 1664 | unsigned int cur_mss = tcp_current_mss(sk, 0); |
1459 | int err; | 1665 | int err; |
1460 | 1666 | ||
1667 | /* Inconclusive MTU probe */ | ||
1668 | if (icsk->icsk_mtup.probe_size) { | ||
1669 | icsk->icsk_mtup.probe_size = 0; | ||
1670 | } | ||
1671 | |||
1461 | /* Do not send more than we queued. 1/4 is reserved for possible | 1672 | /* Do not send more than we queued. 1/4 is reserved for possible |
1462 | * copying overhead: fragmentation, tunneling, mangling etc. | 1673 | * copying overhead: fragmentation, tunneling, mangling etc. |
1463 | */ | 1674 | */ |
@@ -1883,6 +2094,7 @@ static void tcp_connect_init(struct sock *sk) | |||
1883 | if (tp->rx_opt.user_mss) | 2094 | if (tp->rx_opt.user_mss) |
1884 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | 2095 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; |
1885 | tp->max_window = 0; | 2096 | tp->max_window = 0; |
2097 | tcp_mtup_init(sk); | ||
1886 | tcp_sync_mss(sk, dst_mtu(dst)); | 2098 | tcp_sync_mss(sk, dst_mtu(dst)); |
1887 | 2099 | ||
1888 | if (!tp->window_clamp) | 2100 | if (!tp->window_clamp) |
@@ -2180,3 +2392,4 @@ EXPORT_SYMBOL(tcp_make_synack); | |||
2180 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2392 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2181 | EXPORT_SYMBOL(tcp_sync_mss); | 2393 | EXPORT_SYMBOL(tcp_sync_mss); |
2182 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | 2394 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); |
2395 | EXPORT_SYMBOL(tcp_mtup_init); | ||