diff options
author | John Heffner <jheffner@psc.edu> | 2006-03-20 20:53:41 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2006-03-20 20:53:41 -0500 |
commit | 5d424d5a674f782d0659a3b66d951f412901faee (patch) | |
tree | 579871172044e02e626a90388d19ec55cf2d1fc4 /net/ipv4/tcp_output.c | |
parent | 1d60290f27e7dc4bce2c43922d0bfa9abd246fc9 (diff) |
[TCP]: MTU probing
Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.
Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 236 |
1 files changed, 219 insertions, 17 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f498a6c8895..8197b5e12f1f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1; | |||
51 | */ | 51 | */ |
52 | int sysctl_tcp_tso_win_divisor = 3; | 52 | int sysctl_tcp_tso_win_divisor = 3; |
53 | 53 | ||
54 | int sysctl_tcp_mtu_probing = 0; /* tcp_mtup_init() enables probing on a socket only when this is greater than 1 */ | ||
55 | int sysctl_tcp_base_mss = 512; /* MSS that tcp_mtup_init() converts to an MTU and uses as the initial search_low */ | ||
56 | |||
57 | EXPORT_SYMBOL(sysctl_tcp_mtu_probing); | ||
58 | EXPORT_SYMBOL(sysctl_tcp_base_mss); | ||
59 | |||
54 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, | 60 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
55 | struct sk_buff *skb) | 61 | struct sk_buff *skb) |
56 | { | 62 | { |
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
681 | return 0; | 687 | return 0; |
682 | } | 688 | } |
683 | 689 | ||
690 | /* Convert a path MTU (pmtu) into an MSS: strip the network and base TCP headers, clamp to mss_clamp, then remove extension-header and fixed TCP-option overhead. Not accounting for SACKs here. */ | ||
691 | int tcp_mtu_to_mss(struct sock *sk, int pmtu) | ||
692 | { | ||
693 | struct tcp_sock *tp = tcp_sk(sk); | ||
694 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
695 | int mss_now; | ||
696 | |||
697 | /* Calculate base mss without TCP options: | ||
698 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
699 | */ | ||
700 | mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); | ||
701 | |||
702 | /* Clamp it to the negotiated/user limit (mss_clamp does not include tcp options) */ | ||
703 | if (mss_now > tp->rx_opt.mss_clamp) | ||
704 | mss_now = tp->rx_opt.mss_clamp; | ||
705 | |||
706 | /* Now subtract optional transport overhead (icsk_ext_hdr_len) */ | ||
707 | mss_now -= icsk->icsk_ext_hdr_len; | ||
708 | |||
709 | /* Then reserve room for full set of TCP options and 8 bytes of data: never go below 48 before the options are subtracted */ | ||
710 | if (mss_now < 48) | ||
711 | mss_now = 48; | ||
712 | |||
713 | /* Now subtract TCP options size (tcp_header_len beyond the base header), not including SACKs */ | ||
714 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
715 | |||
716 | return mss_now; | ||
717 | } | ||
718 | |||
719 | /* Inverse of tcp_mtu_to_mss(): add tcp_header_len, icsk_ext_hdr_len and the network header length back onto mss and return the resulting MTU. The mss_clamp and 48-byte floor applied by tcp_mtu_to_mss() are not undone here. */ | ||
720 | int tcp_mss_to_mtu(struct sock *sk, int mss) | ||
721 | { | ||
722 | struct tcp_sock *tp = tcp_sk(sk); | ||
723 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
724 | int mtu; | ||
725 | |||
726 | mtu = mss + | ||
727 | tp->tcp_header_len + | ||
728 | icsk->icsk_ext_hdr_len + | ||
729 | icsk->icsk_af_ops->net_header_len; | ||
730 | |||
731 | return mtu; | ||
732 | } | ||
733 | |||
734 | void tcp_mtup_init(struct sock *sk) | ||
735 | { /* Initialise the socket's icsk_mtup probing state: enabled flag, search bounds, and no probe outstanding. */ | ||
736 | struct tcp_sock *tp = tcp_sk(sk); | ||
737 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
738 | |||
739 | icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; /* on from the start only when the sysctl is greater than 1 */ | ||
740 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + | ||
741 | icsk->icsk_af_ops->net_header_len; /* upper bound: mss_clamp plus base TCP and network headers */ | ||
742 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); /* lower bound: MTU corresponding to sysctl_tcp_base_mss */ | ||
743 | icsk->icsk_mtup.probe_size = 0; /* 0 means no probe is outstanding; tcp_mtu_probe() refuses to probe while this is non-zero */ | ||
744 | } | ||
745 | |||
684 | /* This function synchronizes snd mss to current pmtu/exthdr set. | 746 | /* This function synchronizes snd mss to current pmtu/exthdr set. |
685 | 747 | ||
686 | tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count | 748 | tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count |
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
708 | { | 770 | { |
709 | struct tcp_sock *tp = tcp_sk(sk); | 771 | struct tcp_sock *tp = tcp_sk(sk); |
710 | struct inet_connection_sock *icsk = inet_csk(sk); | 772 | struct inet_connection_sock *icsk = inet_csk(sk); |
711 | /* Calculate base mss without TCP options: | 773 | int mss_now; |
712 | It is MMS_S - sizeof(tcphdr) of rfc1122 | ||
713 | */ | ||
714 | int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - | ||
715 | sizeof(struct tcphdr)); | ||
716 | 774 | ||
717 | /* Clamp it (mss_clamp does not include tcp options) */ | 775 | if (icsk->icsk_mtup.search_high > pmtu) |
718 | if (mss_now > tp->rx_opt.mss_clamp) | 776 | icsk->icsk_mtup.search_high = pmtu; |
719 | mss_now = tp->rx_opt.mss_clamp; | ||
720 | 777 | ||
721 | /* Now subtract optional transport overhead */ | 778 | mss_now = tcp_mtu_to_mss(sk, pmtu); |
722 | mss_now -= icsk->icsk_ext_hdr_len; | ||
723 | |||
724 | /* Then reserve room for full set of TCP options and 8 bytes of data */ | ||
725 | if (mss_now < 48) | ||
726 | mss_now = 48; | ||
727 | |||
728 | /* Now subtract TCP options size, not including SACKs */ | ||
729 | mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | ||
730 | 779 | ||
731 | /* Bound mss with half of window */ | 780 | /* Bound mss with half of window */ |
732 | if (tp->max_window && mss_now > (tp->max_window>>1)) | 781 | if (tp->max_window && mss_now > (tp->max_window>>1)) |
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
734 | 783 | ||
735 | /* And store cached results */ | 784 | /* And store cached results */ |
736 | icsk->icsk_pmtu_cookie = pmtu; | 785 | icsk->icsk_pmtu_cookie = pmtu; |
786 | if (icsk->icsk_mtup.enabled) | ||
787 | mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); | ||
737 | tp->mss_cache = mss_now; | 788 | tp->mss_cache = mss_now; |
738 | 789 | ||
739 | return mss_now; | 790 | return mss_now; |
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
1063 | return 1; | 1114 | return 1; |
1064 | } | 1115 | } |
1065 | 1116 | ||
1117 | /* Create a new MTU probe if we are ready: build one segment of twice the cached MSS from the data at the head of the send queue and transmit it. | ||
1118 | * Returns 0 if we should wait to probe (fewer than two cwnd slots free while packets are in flight, or the send window is big enough but the probe does not fit in it right now), | ||
1119 | * 1 if a probe was sent, | ||
1120 | * -1 otherwise (probing not applicable or not possible; tcp_write_xmit() then carries on with normal transmission) */ | ||
1121 | static int tcp_mtu_probe(struct sock *sk) | ||
1122 | { | ||
1123 | struct tcp_sock *tp = tcp_sk(sk); | ||
1124 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1125 | struct sk_buff *skb, *nskb, *next; | ||
1126 | int len; | ||
1127 | int probe_size; | ||
1128 | unsigned int pif; | ||
1129 | int copy; | ||
1130 | int mss_now; | ||
1131 | |||
1132 | /* Probing enabled, not currently probing/verifying (probe_size is 0), | ||
1133 | * not in recovery (congestion state is TCP_CA_Open), | ||
1134 | * have enough cwnd (at least 11), and | ||
1135 | * not SACKing (the variable headers throw things off) */ | ||
1136 | if (!icsk->icsk_mtup.enabled || | ||
1137 | icsk->icsk_mtup.probe_size || | ||
1138 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open || | ||
1139 | tp->snd_cwnd < 11 || | ||
1140 | tp->rx_opt.eff_sacks) | ||
1141 | return -1; | ||
1142 | |||
1143 | /* Very simple search strategy: just double the MSS, and give up if that exceeds the MSS for search_high. mss_now is only used below when re-counting TSO segments of a partially consumed skb. */ | ||
1144 | mss_now = tcp_current_mss(sk, 0); | ||
1145 | probe_size = 2*tp->mss_cache; | ||
1146 | if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { | ||
1147 | /* TODO: set timer for probe_converge_event */ | ||
1148 | return -1; | ||
1149 | } | ||
1150 | |||
1151 | /* Have enough data in the send queue to probe? Walk from sk_send_head summing skb lengths until probe_size is reached or the queue ends. */ | ||
1152 | len = 0; | ||
1153 | if ((skb = sk->sk_send_head) == NULL) | ||
1154 | return -1; | ||
1155 | while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) | ||
1156 | skb = skb->next; | ||
1157 | if (len < probe_size) | ||
1158 | return -1; | ||
1159 | |||
1160 | /* Receive window check: -1 if the send window is smaller than the probe, 0 if the probe merely does not fit yet. NOTE(review): skb is wherever the walk above stopped, which need not be sk_send_head - confirm that its seq is the intended starting point. */ | ||
1161 | if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { | ||
1162 | if (tp->snd_wnd < probe_size) | ||
1163 | return -1; | ||
1164 | else | ||
1165 | return 0; | ||
1166 | } | ||
1167 | |||
1168 | /* Do we need to wait to drain cwnd? The probe needs room for two packets beyond those in flight. */ | ||
1169 | pif = tcp_packets_in_flight(tp); | ||
1170 | if (pif + 2 > tp->snd_cwnd) { | ||
1171 | /* With no packets in flight, don't stall. */ | ||
1172 | if (pif == 0) | ||
1173 | return -1; | ||
1174 | else | ||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | /* We're allowed to probe. Build it now: nskb is inserted in front of the current send head and becomes the new send head. */ | ||
1179 | if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) | ||
1180 | return -1; | ||
1181 | sk_charge_skb(sk, nskb); | ||
1182 | |||
1183 | skb = sk->sk_send_head; | ||
1184 | __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); | ||
1185 | sk->sk_send_head = nskb; | ||
1186 | |||
1187 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; | ||
1188 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; | ||
1189 | TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; | ||
1190 | TCP_SKB_CB(nskb)->sacked = 0; | ||
1191 | nskb->csum = 0; | ||
1192 | if (skb->ip_summed == CHECKSUM_HW) | ||
1193 | nskb->ip_summed = CHECKSUM_HW; | ||
1194 | |||
1195 | len = 0; | ||
1196 | while (len < probe_size) { /* move probe_size bytes from the queued skbs into nskb */ | ||
1197 | next = skb->next; /* saved first because skb may be unlinked and freed below */ | ||
1198 | |||
1199 | copy = min_t(int, skb->len, probe_size - len); | ||
1200 | if (nskb->ip_summed) /* checksum flagged as CHECKSUM_HW above: plain copy; otherwise accumulate the checksum in nskb->csum while copying */ | ||
1201 | skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); | ||
1202 | else | ||
1203 | nskb->csum = skb_copy_and_csum_bits(skb, 0, | ||
1204 | skb_put(nskb, copy), copy, nskb->csum); | ||
1205 | |||
1206 | if (skb->len <= copy) { | ||
1207 | /* We've eaten all the data from this skb. | ||
1208 | * Merge its flags into the probe and throw it away. */ | ||
1209 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; | ||
1210 | __skb_unlink(skb, &sk->sk_write_queue); | ||
1211 | sk_stream_free_skb(sk, skb); | ||
1212 | } else { /* partially consumed: the probe inherits the flags except FIN and PSH, and copy bytes are trimmed off the front of skb */ | ||
1213 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & | ||
1214 | ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); | ||
1215 | if (!skb_shinfo(skb)->nr_frags) { | ||
1216 | skb_pull(skb, copy); | ||
1217 | if (skb->ip_summed != CHECKSUM_HW) | ||
1218 | skb->csum = csum_partial(skb->data, skb->len, 0); | ||
1219 | } else { | ||
1220 | __pskb_trim_head(skb, copy); | ||
1221 | tcp_set_skb_tso_segs(sk, skb, mss_now); | ||
1222 | } | ||
1223 | TCP_SKB_CB(skb)->seq += copy; | ||
1224 | } | ||
1225 | |||
1226 | len += copy; | ||
1227 | skb = next; | ||
1228 | } | ||
1229 | tcp_init_tso_segs(sk, nskb, nskb->len); | ||
1230 | |||
1231 | /* We're ready to send. If this fails, the probe will | ||
1232 | * be resegmented into mss-sized pieces by tcp_write_xmit(). */ | ||
1233 | TCP_SKB_CB(nskb)->when = tcp_time_stamp; | ||
1234 | if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { | ||
1235 | /* Decrement cwnd here because we are sending | ||
1236 | * effectively two packets. */ | ||
1237 | tp->snd_cwnd--; | ||
1238 | update_send_head(sk, tp, nskb); | ||
1239 | |||
1240 | icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); | ||
1241 | icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq; | ||
1242 | icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; | ||
1243 | |||
1244 | return 1; | ||
1245 | } | ||
1246 | |||
1247 | return -1; | ||
1248 | } | ||
1249 | |||
1250 | |||
1066 | /* This routine writes packets to the network. It advances the | 1251 | /* This routine writes packets to the network. It advances the |
1067 | * send_head. This happens as incoming acks open up the remote | 1252 | * send_head. This happens as incoming acks open up the remote |
1068 | * window for us. | 1253 | * window for us. |
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1076 | struct sk_buff *skb; | 1261 | struct sk_buff *skb; |
1077 | unsigned int tso_segs, sent_pkts; | 1262 | unsigned int tso_segs, sent_pkts; |
1078 | int cwnd_quota; | 1263 | int cwnd_quota; |
1264 | int result; | ||
1079 | 1265 | ||
1080 | /* If we are closed, the bytes will have to remain here. | 1266 | /* If we are closed, the bytes will have to remain here. |
1081 | * In time closedown will finish, we empty the write queue and all | 1267 | * In time closedown will finish, we empty the write queue and all |
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1085 | return 0; | 1271 | return 0; |
1086 | 1272 | ||
1087 | sent_pkts = 0; | 1273 | sent_pkts = 0; |
1274 | |||
1275 | /* Do MTU probing. */ | ||
1276 | if ((result = tcp_mtu_probe(sk)) == 0) { | ||
1277 | return 0; | ||
1278 | } else if (result > 0) { | ||
1279 | sent_pkts = 1; | ||
1280 | } | ||
1281 | |||
1088 | while ((skb = sk->sk_send_head)) { | 1282 | while ((skb = sk->sk_send_head)) { |
1089 | unsigned int limit; | 1283 | unsigned int limit; |
1090 | 1284 | ||
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1455 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | 1649 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) |
1456 | { | 1650 | { |
1457 | struct tcp_sock *tp = tcp_sk(sk); | 1651 | struct tcp_sock *tp = tcp_sk(sk); |
1652 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1458 | unsigned int cur_mss = tcp_current_mss(sk, 0); | 1653 | unsigned int cur_mss = tcp_current_mss(sk, 0); |
1459 | int err; | 1654 | int err; |
1460 | 1655 | ||
1656 | /* Inconclusive MTU probe */ | ||
1657 | if (icsk->icsk_mtup.probe_size) { | ||
1658 | icsk->icsk_mtup.probe_size = 0; | ||
1659 | } | ||
1660 | |||
1461 | /* Do not send more than we queued. 1/4 is reserved for possible | 1661 | /* Do not send more than we queued. 1/4 is reserved for possible |
1462 | * copying overhead: fragmentation, tunneling, mangling etc. | 1662 | * copying overhead: fragmentation, tunneling, mangling etc. |
1463 | */ | 1663 | */ |
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk) | |||
1883 | if (tp->rx_opt.user_mss) | 2083 | if (tp->rx_opt.user_mss) |
1884 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | 2084 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; |
1885 | tp->max_window = 0; | 2085 | tp->max_window = 0; |
2086 | tcp_mtup_init(sk); | ||
1886 | tcp_sync_mss(sk, dst_mtu(dst)); | 2087 | tcp_sync_mss(sk, dst_mtu(dst)); |
1887 | 2088 | ||
1888 | if (!tp->window_clamp) | 2089 | if (!tp->window_clamp) |
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack); | |||
2180 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2381 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2181 | EXPORT_SYMBOL(tcp_sync_mss); | 2382 | EXPORT_SYMBOL(tcp_sync_mss); |
2182 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | 2383 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); |
2384 | EXPORT_SYMBOL(tcp_mtup_init); | ||