aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
authorJohn Heffner <jheffner@psc.edu>2006-03-20 20:53:41 -0500
committerDavid S. Miller <davem@davemloft.net>2006-03-20 20:53:41 -0500
commit5d424d5a674f782d0659a3b66d951f412901faee (patch)
tree579871172044e02e626a90388d19ec55cf2d1fc4 /net/ipv4/tcp_output.c
parent1d60290f27e7dc4bce2c43922d0bfa9abd246fc9 (diff)
[TCP]: MTU probing
Implementation of packetization layer path mtu discovery for TCP, based on the internet-draft currently found at <http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>. Signed-off-by: John Heffner <jheffner@psc.edu> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c236
1 files changed, 219 insertions, 17 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c8895..8197b5e12f1f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 3; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
/* MTU probing knobs.  sysctl_tcp_mtu_probing > 1 turns probing on for new
 * sockets from the start (see tcp_mtup_init); sysctl_tcp_base_mss sets the
 * low end of the MTU search range, expressed as an MSS. */
int sysctl_tcp_mtu_probing = 0;
int sysctl_tcp_base_mss = 512;

EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
EXPORT_SYMBOL(sysctl_tcp_base_mss);
59
54static void update_send_head(struct sock *sk, struct tcp_sock *tp, 60static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 61 struct sk_buff *skb)
56{ 62{
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
681 return 0; 687 return 0;
682} 688}
683 689
690/* Not accounting for SACKs here. */
691int tcp_mtu_to_mss(struct sock *sk, int pmtu)
692{
693 struct tcp_sock *tp = tcp_sk(sk);
694 struct inet_connection_sock *icsk = inet_csk(sk);
695 int mss_now;
696
697 /* Calculate base mss without TCP options:
698 It is MMS_S - sizeof(tcphdr) of rfc1122
699 */
700 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
701
702 /* Clamp it (mss_clamp does not include tcp options) */
703 if (mss_now > tp->rx_opt.mss_clamp)
704 mss_now = tp->rx_opt.mss_clamp;
705
706 /* Now subtract optional transport overhead */
707 mss_now -= icsk->icsk_ext_hdr_len;
708
709 /* Then reserve room for full set of TCP options and 8 bytes of data */
710 if (mss_now < 48)
711 mss_now = 48;
712
713 /* Now subtract TCP options size, not including SACKs */
714 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
715
716 return mss_now;
717}
718
719/* Inverse of above */
720int tcp_mss_to_mtu(struct sock *sk, int mss)
721{
722 struct tcp_sock *tp = tcp_sk(sk);
723 struct inet_connection_sock *icsk = inet_csk(sk);
724 int mtu;
725
726 mtu = mss +
727 tp->tcp_header_len +
728 icsk->icsk_ext_hdr_len +
729 icsk->icsk_af_ops->net_header_len;
730
731 return mtu;
732}
733
734void tcp_mtup_init(struct sock *sk)
735{
736 struct tcp_sock *tp = tcp_sk(sk);
737 struct inet_connection_sock *icsk = inet_csk(sk);
738
739 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
740 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
741 icsk->icsk_af_ops->net_header_len;
742 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
743 icsk->icsk_mtup.probe_size = 0;
744}
745
684/* This function synchronize snd mss to current pmtu/exthdr set. 746/* This function synchronize snd mss to current pmtu/exthdr set.
685 747
686 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts 748 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
708{ 770{
709 struct tcp_sock *tp = tcp_sk(sk); 771 struct tcp_sock *tp = tcp_sk(sk);
710 struct inet_connection_sock *icsk = inet_csk(sk); 772 struct inet_connection_sock *icsk = inet_csk(sk);
711 /* Calculate base mss without TCP options: 773 int mss_now;
712 It is MMS_S - sizeof(tcphdr) of rfc1122
713 */
714 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715 sizeof(struct tcphdr));
716 774
717 /* Clamp it (mss_clamp does not include tcp options) */ 775 if (icsk->icsk_mtup.search_high > pmtu)
718 if (mss_now > tp->rx_opt.mss_clamp) 776 icsk->icsk_mtup.search_high = pmtu;
719 mss_now = tp->rx_opt.mss_clamp;
720 777
721 /* Now subtract optional transport overhead */ 778 mss_now = tcp_mtu_to_mss(sk, pmtu);
722 mss_now -= icsk->icsk_ext_hdr_len;
723
724 /* Then reserve room for full set of TCP options and 8 bytes of data */
725 if (mss_now < 48)
726 mss_now = 48;
727
728 /* Now subtract TCP options size, not including SACKs */
729 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
730 779
731 /* Bound mss with half of window */ 780 /* Bound mss with half of window */
732 if (tp->max_window && mss_now > (tp->max_window>>1)) 781 if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
734 783
735 /* And store cached results */ 784 /* And store cached results */
736 icsk->icsk_pmtu_cookie = pmtu; 785 icsk->icsk_pmtu_cookie = pmtu;
786 if (icsk->icsk_mtup.enabled)
787 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
737 tp->mss_cache = mss_now; 788 tp->mss_cache = mss_now;
738 789
739 return mss_now; 790 return mss_now;
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
1063 return 1; 1114 return 1;
1064} 1115}
1065 1116
1117/* Create a new MTU probe if we are ready.
1118 * Returns 0 if we should wait to probe (no cwnd available),
1119 * 1 if a probe was sent,
1120 * -1 otherwise */
1121static int tcp_mtu_probe(struct sock *sk)
1122{
1123 struct tcp_sock *tp = tcp_sk(sk);
1124 struct inet_connection_sock *icsk = inet_csk(sk);
1125 struct sk_buff *skb, *nskb, *next;
1126 int len;
1127 int probe_size;
1128 unsigned int pif;
1129 int copy;
1130 int mss_now;
1131
1132 /* Not currently probing/verifying,
1133 * not in recovery,
1134 * have enough cwnd, and
1135 * not SACKing (the variable headers throw things off) */
1136 if (!icsk->icsk_mtup.enabled ||
1137 icsk->icsk_mtup.probe_size ||
1138 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1139 tp->snd_cwnd < 11 ||
1140 tp->rx_opt.eff_sacks)
1141 return -1;
1142
1143 /* Very simple search strategy: just double the MSS. */
1144 mss_now = tcp_current_mss(sk, 0);
1145 probe_size = 2*tp->mss_cache;
1146 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1147 /* TODO: set timer for probe_converge_event */
1148 return -1;
1149 }
1150
1151 /* Have enough data in the send queue to probe? */
1152 len = 0;
1153 if ((skb = sk->sk_send_head) == NULL)
1154 return -1;
1155 while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
1156 skb = skb->next;
1157 if (len < probe_size)
1158 return -1;
1159
1160 /* Receive window check. */
1161 if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
1162 if (tp->snd_wnd < probe_size)
1163 return -1;
1164 else
1165 return 0;
1166 }
1167
1168 /* Do we need to wait to drain cwnd? */
1169 pif = tcp_packets_in_flight(tp);
1170 if (pif + 2 > tp->snd_cwnd) {
1171 /* With no packets in flight, don't stall. */
1172 if (pif == 0)
1173 return -1;
1174 else
1175 return 0;
1176 }
1177
1178 /* We're allowed to probe. Build it now. */
1179 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1180 return -1;
1181 sk_charge_skb(sk, nskb);
1182
1183 skb = sk->sk_send_head;
1184 __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
1185 sk->sk_send_head = nskb;
1186
1187 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1188 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1189 TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
1190 TCP_SKB_CB(nskb)->sacked = 0;
1191 nskb->csum = 0;
1192 if (skb->ip_summed == CHECKSUM_HW)
1193 nskb->ip_summed = CHECKSUM_HW;
1194
1195 len = 0;
1196 while (len < probe_size) {
1197 next = skb->next;
1198
1199 copy = min_t(int, skb->len, probe_size - len);
1200 if (nskb->ip_summed)
1201 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1202 else
1203 nskb->csum = skb_copy_and_csum_bits(skb, 0,
1204 skb_put(nskb, copy), copy, nskb->csum);
1205
1206 if (skb->len <= copy) {
1207 /* We've eaten all the data from this skb.
1208 * Throw it away. */
1209 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1210 __skb_unlink(skb, &sk->sk_write_queue);
1211 sk_stream_free_skb(sk, skb);
1212 } else {
1213 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1214 ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
1215 if (!skb_shinfo(skb)->nr_frags) {
1216 skb_pull(skb, copy);
1217 if (skb->ip_summed != CHECKSUM_HW)
1218 skb->csum = csum_partial(skb->data, skb->len, 0);
1219 } else {
1220 __pskb_trim_head(skb, copy);
1221 tcp_set_skb_tso_segs(sk, skb, mss_now);
1222 }
1223 TCP_SKB_CB(skb)->seq += copy;
1224 }
1225
1226 len += copy;
1227 skb = next;
1228 }
1229 tcp_init_tso_segs(sk, nskb, nskb->len);
1230
1231 /* We're ready to send. If this fails, the probe will
1232 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1233 TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1234 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1235 /* Decrement cwnd here because we are sending
1236 * effectively two packets. */
1237 tp->snd_cwnd--;
1238 update_send_head(sk, tp, nskb);
1239
1240 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1241 icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1242 icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1243
1244 return 1;
1245 }
1246
1247 return -1;
1248}
1249
1250
1066/* This routine writes packets to the network. It advances the 1251/* This routine writes packets to the network. It advances the
1067 * send_head. This happens as incoming acks open up the remote 1252 * send_head. This happens as incoming acks open up the remote
1068 * window for us. 1253 * window for us.
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1076 struct sk_buff *skb; 1261 struct sk_buff *skb;
1077 unsigned int tso_segs, sent_pkts; 1262 unsigned int tso_segs, sent_pkts;
1078 int cwnd_quota; 1263 int cwnd_quota;
1264 int result;
1079 1265
1080 /* If we are closed, the bytes will have to remain here. 1266 /* If we are closed, the bytes will have to remain here.
1081 * In time closedown will finish, we empty the write queue and all 1267 * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
1085 return 0; 1271 return 0;
1086 1272
1087 sent_pkts = 0; 1273 sent_pkts = 0;
1274
1275 /* Do MTU probing. */
1276 if ((result = tcp_mtu_probe(sk)) == 0) {
1277 return 0;
1278 } else if (result > 0) {
1279 sent_pkts = 1;
1280 }
1281
1088 while ((skb = sk->sk_send_head)) { 1282 while ((skb = sk->sk_send_head)) {
1089 unsigned int limit; 1283 unsigned int limit;
1090 1284
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
1455int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 1649int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1456{ 1650{
1457 struct tcp_sock *tp = tcp_sk(sk); 1651 struct tcp_sock *tp = tcp_sk(sk);
1652 struct inet_connection_sock *icsk = inet_csk(sk);
1458 unsigned int cur_mss = tcp_current_mss(sk, 0); 1653 unsigned int cur_mss = tcp_current_mss(sk, 0);
1459 int err; 1654 int err;
1460 1655
1656 /* Inconclusive MTU probe */
1657 if (icsk->icsk_mtup.probe_size) {
1658 icsk->icsk_mtup.probe_size = 0;
1659 }
1660
1461 /* Do not sent more than we queued. 1/4 is reserved for possible 1661 /* Do not sent more than we queued. 1/4 is reserved for possible
1462 * copying overhead: fragmentation, tunneling, mangling etc. 1662 * copying overhead: fragmentation, tunneling, mangling etc.
1463 */ 1663 */
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
1883 if (tp->rx_opt.user_mss) 2083 if (tp->rx_opt.user_mss)
1884 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 2084 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
1885 tp->max_window = 0; 2085 tp->max_window = 0;
2086 tcp_mtup_init(sk);
1886 tcp_sync_mss(sk, dst_mtu(dst)); 2087 tcp_sync_mss(sk, dst_mtu(dst));
1887 2088
1888 if (!tp->window_clamp) 2089 if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
2180EXPORT_SYMBOL(tcp_simple_retransmit); 2381EXPORT_SYMBOL(tcp_simple_retransmit);
2181EXPORT_SYMBOL(tcp_sync_mss); 2382EXPORT_SYMBOL(tcp_sync_mss);
2182EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); 2383EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
2384EXPORT_SYMBOL(tcp_mtup_init);