Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  259
1 files changed, 236 insertions, 23 deletions

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c8895..9d79546d384e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,12 +45,23 @@
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
 
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows = 0;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
 int sysctl_tcp_tso_win_divisor = 3;
 
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
                              struct sk_buff *skb)
 {
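These two knobs land in tcp_output.c but are surfaced to userspace elsewhere in the series. For orientation, a sketch of what the matching entries in net/ipv4/sysctl_net_ipv4.c would look like in this kernel's style; the NET_TCP_MTU_PROBING and NET_TCP_BASE_MSS enum values are assumed here, since that hookup is not part of this file's diff:

static ctl_table tcp_mtup_sysctls[] = {
        {
                .ctl_name       = NET_TCP_MTU_PROBING, /* assumed enum value */
                .procname       = "tcp_mtu_probing",
                .data           = &sysctl_tcp_mtu_probing,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_TCP_BASE_MSS, /* assumed enum value */
                .procname       = "tcp_base_mss",
                .data           = &sysctl_tcp_base_mss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};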
@@ -171,12 +182,18 @@ void tcp_select_initial_window(int __space, __u32 mss,
        space = (space / mss) * mss;
 
        /* NOTE: offering an initial window larger than 32767
-        * will break some buggy TCP stacks. We try to be nice.
-        * If we are not window scaling, then this truncates
-        * our initial window offering to 32k. There should also
-        * be a sysctl option to stop being nice.
+        * will break some buggy TCP stacks. If the admin tells us
+        * it is likely we could be speaking with such a buggy stack
+        * we will truncate our initial window offering to 32K-1
+        * unless the remote has sent us a window scaling option,
+        * which we interpret as a sign the remote TCP is not
+        * misinterpreting the window field as a signed quantity.
         */
-       (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       if (sysctl_tcp_workaround_signed_windows)
+               (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       else
+               (*rcv_wnd) = space;
+
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
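The hunk above is interoperability work rather than MTU probing proper: a stack that reads the 16-bit window field as signed sees any advertisement of 32768 (0x8000) or more as negative. A minimal userspace illustration of the misread (sample value assumed, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t wire_window = 40000;   /* a perfectly legal unsigned window */
        int16_t misread = (int16_t)wire_window;

        /* A broken peer sees -25536 and may stall the connection;
         * clamping to MAX_TCP_WINDOW (32767) sidesteps the bug. */
        printf("unsigned: %u  misread as signed: %d\n", wire_window, misread);
        return 0;
}

With the sysctl off (the new default), peers that negotiated window scaling, or that simply aren't broken, get the full window instead of the old unconditional 32K clamp.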
@@ -235,7 +252,7 @@ static u16 tcp_select_window(struct sock *sk)
        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
-       if (!tp->rx_opt.rcv_wscale)
+       if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
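The same sysctl now gates the steady-state clamp as well: the 32767 cap applies only when scaling was not negotiated and the workaround was requested; otherwise the advertisement may use the full scaled range. A quick illustration of the two caps, assuming a hypothetical negotiated wscale of 7:

#include <stdio.h>

int main(void)
{
        unsigned int max_tcp_window = 32767U;   /* MAX_TCP_WINDOW, unscaled cap */
        int rcv_wscale = 7;                     /* assumed negotiated shift */

        printf("workaround cap: %u\n", max_tcp_window);
        printf("scaled cap:     %u\n", 65535U << rcv_wscale);  /* 8388480 */
        return 0;
}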
@@ -681,6 +698,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
        return 0;
 }
 
+/* Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int mss_now;
+
+       /* Calculate base mss without TCP options:
+          It is MMS_S - sizeof(tcphdr) of rfc1122
+        */
+       mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+       /* Clamp it (mss_clamp does not include tcp options) */
+       if (mss_now > tp->rx_opt.mss_clamp)
+               mss_now = tp->rx_opt.mss_clamp;
+
+       /* Now subtract optional transport overhead */
+       mss_now -= icsk->icsk_ext_hdr_len;
+
+       /* Then reserve room for full set of TCP options and 8 bytes of data */
+       if (mss_now < 48)
+               mss_now = 48;
+
+       /* Now subtract TCP options size, not including SACKs */
+       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+       return mss_now;
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int mtu;
+
+       mtu = mss +
+             tp->tcp_header_len +
+             icsk->icsk_ext_hdr_len +
+             icsk->icsk_af_ops->net_header_len;
+
+       return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+       icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+                              icsk->icsk_af_ops->net_header_len;
+       icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+       icsk->icsk_mtup.probe_size = 0;
+}
+
 /* This function synchronizes snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
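The new helper pair factors the MSS arithmetic out of tcp_sync_mss() (see the next hunk) so the prober can convert in both directions. A worked example as a standalone program, assuming IPv4 (20-byte network header), no extension headers, and the timestamp option (12 option bytes on top of the 20-byte base TCP header):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        int pmtu = 1500;
        int net_header_len = 20;        /* IPv4, assumed */
        int ext_hdr_len = 0;            /* no IPsec etc., assumed */
        int tcp_header_len = 20 + 12;   /* base header + timestamps, assumed */

        /* tcp_mtu_to_mss(): MMS_S - sizeof(tcphdr), then strip options */
        int mss = pmtu - net_header_len - 20;   /* 1460 */
        mss -= ext_hdr_len;
        if (mss < 48)                   /* floor: full options + 8 data bytes */
                mss = 48;
        mss -= tcp_header_len - 20;     /* 1448 */
        assert(mss == 1448);

        /* tcp_mss_to_mtu(): add everything back */
        int mtu = mss + tcp_header_len + ext_hdr_len + net_header_len;
        assert(mtu == 1500);

        printf("pmtu %d -> mss %d -> mtu %d\n", pmtu, mss, mtu);
        return 0;
}

The mss_clamp comparison from the kernel version is skipped here by assuming a clamp of 65535; tcp_mtup_init() above seeds search_high from that clamp and search_low from tcp_base_mss.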
@@ -708,25 +781,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
-       /* Calculate base mss without TCP options:
-          It is MMS_S - sizeof(tcphdr) of rfc1122
-        */
-       int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
-                      sizeof(struct tcphdr));
-
-       /* Clamp it (mss_clamp does not include tcp options) */
-       if (mss_now > tp->rx_opt.mss_clamp)
-               mss_now = tp->rx_opt.mss_clamp;
-
-       /* Now subtract optional transport overhead */
-       mss_now -= icsk->icsk_ext_hdr_len;
+       int mss_now;
 
-       /* Then reserve room for full set of TCP options and 8 bytes of data */
-       if (mss_now < 48)
-               mss_now = 48;
+       if (icsk->icsk_mtup.search_high > pmtu)
+               icsk->icsk_mtup.search_high = pmtu;
 
-       /* Now subtract TCP options size, not including SACKs */
-       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+       mss_now = tcp_mtu_to_mss(sk, pmtu);
 
        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +794,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
        /* And store cached results */
        icsk->icsk_pmtu_cookie = pmtu;
+       if (icsk->icsk_mtup.enabled)
+               mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
        tp->mss_cache = mss_now;
 
        return mss_now;
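Net effect of the two tcp_sync_mss() hunks, worked through with the defaults above and the same assumed 52 bytes of IPv4+TCP+timestamp overhead:

    tcp_base_mss = 512
    search_low   = tcp_mss_to_mtu(512) = 512 + 32 + 0 + 20 = 564
    search_high  = mss_clamp + 40, clamped down to the path MTU here
    mss_cache    = min(tcp_mtu_to_mss(pmtu), tcp_mtu_to_mss(564)) = 512

So while probing is enabled, regular segments stick to the largest verified-safe size rather than the raw path-MTU guess; when the receive path (outside this diff) raises search_low after an acked probe, the next tcp_sync_mss() lifts mss_cache with it.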
@@ -1063,6 +1125,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
        return 1;
 }
 
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *nskb, *next;
+       int len;
+       int probe_size;
+       unsigned int pif;
+       int copy;
+       int mss_now;
+
+       /* Not currently probing/verifying,
+        * not in recovery,
+        * have enough cwnd, and
+        * not SACKing (the variable headers throw things off) */
+       if (!icsk->icsk_mtup.enabled ||
+           icsk->icsk_mtup.probe_size ||
+           inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+           tp->snd_cwnd < 11 ||
+           tp->rx_opt.eff_sacks)
+               return -1;
+
+       /* Very simple search strategy: just double the MSS. */
+       mss_now = tcp_current_mss(sk, 0);
+       probe_size = 2*tp->mss_cache;
+       if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+               /* TODO: set timer for probe_converge_event */
+               return -1;
+       }
+
+       /* Have enough data in the send queue to probe? */
+       len = 0;
+       if ((skb = sk->sk_send_head) == NULL)
+               return -1;
+       while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+               skb = skb->next;
+       if (len < probe_size)
+               return -1;
+
+       /* Receive window check. */
+       if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+               if (tp->snd_wnd < probe_size)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* Do we need to wait to drain cwnd? */
+       pif = tcp_packets_in_flight(tp);
+       if (pif + 2 > tp->snd_cwnd) {
+               /* With no packets in flight, don't stall. */
+               if (pif == 0)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* We're allowed to probe.  Build it now. */
+       if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+               return -1;
+       sk_charge_skb(sk, nskb);
+
+       skb = sk->sk_send_head;
+       __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+       sk->sk_send_head = nskb;
+
+       TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+       TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+       TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+       TCP_SKB_CB(nskb)->sacked = 0;
+       nskb->csum = 0;
+       if (skb->ip_summed == CHECKSUM_HW)
+               nskb->ip_summed = CHECKSUM_HW;
+
+       len = 0;
+       while (len < probe_size) {
+               next = skb->next;
+
+               copy = min_t(int, skb->len, probe_size - len);
+               if (nskb->ip_summed)
+                       skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+               else
+                       nskb->csum = skb_copy_and_csum_bits(skb, 0,
+                                        skb_put(nskb, copy), copy, nskb->csum);
+
+               if (skb->len <= copy) {
+                       /* We've eaten all the data from this skb.
+                        * Throw it away. */
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+                       __skb_unlink(skb, &sk->sk_write_queue);
+                       sk_stream_free_skb(sk, skb);
+               } else {
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+                                                  ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+                       if (!skb_shinfo(skb)->nr_frags) {
+                               skb_pull(skb, copy);
+                               if (skb->ip_summed != CHECKSUM_HW)
+                                       skb->csum = csum_partial(skb->data, skb->len, 0);
+                       } else {
+                               __pskb_trim_head(skb, copy);
+                               tcp_set_skb_tso_segs(sk, skb, mss_now);
+                       }
+                       TCP_SKB_CB(skb)->seq += copy;
+               }
+
+               len += copy;
+               skb = next;
+       }
+       tcp_init_tso_segs(sk, nskb, nskb->len);
+
+       /* We're ready to send.  If this fails, the probe will
+        * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+       TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+       if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+               /* Decrement cwnd here because we are sending
+                * effectively two packets. */
+               tp->snd_cwnd--;
+               update_send_head(sk, tp, nskb);
+
+               icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+               tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+               tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+               return 1;
+       }
+
+       return -1;
+}
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
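tcp_mtu_probe() only builds and transmits the oversized segment; the verdict arrives later through code outside this file, which raises search_low when the probe is acked and lowers search_high when it is lost. A standalone simulation of the resulting doubling search, under assumed numbers (52 bytes of IPv4+TCP+timestamp overhead, tcp_base_mss of 512, a true path MTU of 1400):

#include <stdio.h>

int main(void)
{
        const int overhead = 52;        /* assumed IPv4 + TCP + timestamps */
        int search_low = 564;           /* tcp_mss_to_mtu(sk, 512) */
        int search_high = 1500;         /* already clamped by tcp_sync_mss() */
        const int true_mtu = 1400;      /* what the path really carries */

        for (;;) {
                int mss_cache = search_low - overhead;
                int probe_payload = 2 * mss_cache;      /* "just double the MSS" */
                int probe_mtu = probe_payload + overhead;

                /* mirrors: probe_size > tcp_mtu_to_mss(sk, search_high) */
                if (probe_payload > search_high - overhead) {
                        printf("converged: mss %d (mtu %d)\n", mss_cache, search_low);
                        break;
                }
                if (probe_mtu <= true_mtu) {
                        printf("probe of mtu %d acked\n", probe_mtu);
                        search_low = probe_mtu;         /* success path */
                } else {
                        printf("probe of mtu %d lost\n", probe_mtu);
                        search_high = probe_mtu - 1;    /* failure path */
                }
        }
        return 0;
}

For these numbers the first probe (MTU 1076) succeeds and the search then stops, since doubling again would overshoot search_high; that early stop is exactly the point the "TODO: set timer for probe_converge_event" comment marks.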
@@ -1076,6 +1272,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
        struct sk_buff *skb;
        unsigned int tso_segs, sent_pkts;
        int cwnd_quota;
+       int result;
 
        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1282,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
                return 0;
 
        sent_pkts = 0;
+
+       /* Do MTU probing. */
+       if ((result = tcp_mtu_probe(sk)) == 0) {
+               return 0;
+       } else if (result > 0) {
+               sent_pkts = 1;
+       }
+
        while ((skb = sk->sk_send_head)) {
                unsigned int limit;
 
@@ -1455,9 +1660,15 @@ void tcp_simple_retransmit(struct sock *sk)
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned int cur_mss = tcp_current_mss(sk, 0);
        int err;
 
+       /* Inconclusive MTU probe */
+       if (icsk->icsk_mtup.probe_size) {
+               icsk->icsk_mtup.probe_size = 0;
+       }
+
        /* Do not send more than we queued. 1/4 is reserved for possible
         * copying overhead: fragmentation, tunneling, mangling etc.
         */
@@ -1883,6 +2094,7 @@ static void tcp_connect_init(struct sock *sk)
        if (tp->rx_opt.user_mss)
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
        tp->max_window = 0;
+       tcp_mtup_init(sk);
        tcp_sync_mss(sk, dst_mtu(dst));
 
        if (!tp->window_clamp)
@@ -2180,3 +2392,4 @@ EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
 EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);
