diff options
author | David S. Miller <davem@davemloft.net> | 2005-12-06 19:24:52 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2005-12-06 19:24:52 -0500 |
commit | dfb4b9dceb35c567a595ae5e9d035cfda044a103 (patch) | |
tree | 1a76209896509b38458df03593b78ff1abeb0cd9 | |
parent | 0d7bef600acab393898bd5553e167496587da3e1 (diff) |
[TCP] Vegas: timestamp before clone
We have to store the congestion control timestamp on the SKB before we
clone it, not after. Else we get no timestamping information at all.
tcp_transmit_skb() has been reworked so that we can still do the
timestamping in one spot, instead of at all the call sites.
Problem discovered, and initial fix, from Tom Young
<tyo@ee.unimelb.edu.au>.
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/tcp_output.c | 233 |
1 file changed, 124 insertions, 109 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 029c70dfb585..b7325e0b406a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) | |||
262 | * We are working here with either a clone of the original | 262 | * We are working here with either a clone of the original |
263 | * SKB, or a fresh unique copy made by the retransmit engine. | 263 | * SKB, or a fresh unique copy made by the retransmit engine. |
264 | */ | 264 | */ |
265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | 265 | static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) |
266 | { | 266 | { |
267 | if (skb != NULL) { | 267 | const struct inet_connection_sock *icsk = inet_csk(sk); |
268 | const struct inet_connection_sock *icsk = inet_csk(sk); | 268 | struct inet_sock *inet; |
269 | struct inet_sock *inet = inet_sk(sk); | 269 | struct tcp_sock *tp; |
270 | struct tcp_sock *tp = tcp_sk(sk); | 270 | struct tcp_skb_cb *tcb; |
271 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | 271 | int tcp_header_size; |
272 | int tcp_header_size = tp->tcp_header_len; | 272 | struct tcphdr *th; |
273 | struct tcphdr *th; | 273 | int sysctl_flags; |
274 | int sysctl_flags; | 274 | int err; |
275 | int err; | 275 | |
276 | BUG_ON(!skb || !tcp_skb_pcount(skb)); | ||
277 | |||
278 | /* If congestion control is doing timestamping, we must | ||
279 | * take such a timestamp before we potentially clone/copy. | ||
280 | */ | ||
281 | if (icsk->icsk_ca_ops->rtt_sample) | ||
282 | __net_timestamp(skb); | ||
283 | |||
284 | if (likely(clone_it)) { | ||
285 | if (unlikely(skb_cloned(skb))) | ||
286 | skb = pskb_copy(skb, gfp_mask); | ||
287 | else | ||
288 | skb = skb_clone(skb, gfp_mask); | ||
289 | if (unlikely(!skb)) | ||
290 | return -ENOBUFS; | ||
291 | } | ||
276 | 292 | ||
277 | BUG_ON(!tcp_skb_pcount(skb)); | 293 | inet = inet_sk(sk); |
294 | tp = tcp_sk(sk); | ||
295 | tcb = TCP_SKB_CB(skb); | ||
296 | tcp_header_size = tp->tcp_header_len; | ||
278 | 297 | ||
279 | #define SYSCTL_FLAG_TSTAMPS 0x1 | 298 | #define SYSCTL_FLAG_TSTAMPS 0x1 |
280 | #define SYSCTL_FLAG_WSCALE 0x2 | 299 | #define SYSCTL_FLAG_WSCALE 0x2 |
281 | #define SYSCTL_FLAG_SACK 0x4 | 300 | #define SYSCTL_FLAG_SACK 0x4 |
282 | 301 | ||
283 | /* If congestion control is doing timestamping */ | 302 | sysctl_flags = 0; |
284 | if (icsk->icsk_ca_ops->rtt_sample) | 303 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { |
285 | __net_timestamp(skb); | 304 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; |
286 | 305 | if(sysctl_tcp_timestamps) { | |
287 | sysctl_flags = 0; | 306 | tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; |
288 | if (tcb->flags & TCPCB_FLAG_SYN) { | 307 | sysctl_flags |= SYSCTL_FLAG_TSTAMPS; |
289 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; | ||
290 | if(sysctl_tcp_timestamps) { | ||
291 | tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; | ||
292 | sysctl_flags |= SYSCTL_FLAG_TSTAMPS; | ||
293 | } | ||
294 | if(sysctl_tcp_window_scaling) { | ||
295 | tcp_header_size += TCPOLEN_WSCALE_ALIGNED; | ||
296 | sysctl_flags |= SYSCTL_FLAG_WSCALE; | ||
297 | } | ||
298 | if(sysctl_tcp_sack) { | ||
299 | sysctl_flags |= SYSCTL_FLAG_SACK; | ||
300 | if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) | ||
301 | tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; | ||
302 | } | ||
303 | } else if (tp->rx_opt.eff_sacks) { | ||
304 | /* A SACK is 2 pad bytes, a 2 byte header, plus | ||
305 | * 2 32-bit sequence numbers for each SACK block. | ||
306 | */ | ||
307 | tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + | ||
308 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); | ||
309 | } | 308 | } |
310 | 309 | if (sysctl_tcp_window_scaling) { | |
311 | if (tcp_packets_in_flight(tp) == 0) | 310 | tcp_header_size += TCPOLEN_WSCALE_ALIGNED; |
312 | tcp_ca_event(sk, CA_EVENT_TX_START); | 311 | sysctl_flags |= SYSCTL_FLAG_WSCALE; |
313 | |||
314 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | ||
315 | skb->h.th = th; | ||
316 | skb_set_owner_w(skb, sk); | ||
317 | |||
318 | /* Build TCP header and checksum it. */ | ||
319 | th->source = inet->sport; | ||
320 | th->dest = inet->dport; | ||
321 | th->seq = htonl(tcb->seq); | ||
322 | th->ack_seq = htonl(tp->rcv_nxt); | ||
323 | *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); | ||
324 | if (tcb->flags & TCPCB_FLAG_SYN) { | ||
325 | /* RFC1323: The window in SYN & SYN/ACK segments | ||
326 | * is never scaled. | ||
327 | */ | ||
328 | th->window = htons(tp->rcv_wnd); | ||
329 | } else { | ||
330 | th->window = htons(tcp_select_window(sk)); | ||
331 | } | 312 | } |
332 | th->check = 0; | 313 | if (sysctl_tcp_sack) { |
333 | th->urg_ptr = 0; | 314 | sysctl_flags |= SYSCTL_FLAG_SACK; |
334 | 315 | if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) | |
335 | if (tp->urg_mode && | 316 | tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; |
336 | between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { | ||
337 | th->urg_ptr = htons(tp->snd_up-tcb->seq); | ||
338 | th->urg = 1; | ||
339 | } | 317 | } |
318 | } else if (unlikely(tp->rx_opt.eff_sacks)) { | ||
319 | /* A SACK is 2 pad bytes, a 2 byte header, plus | ||
320 | * 2 32-bit sequence numbers for each SACK block. | ||
321 | */ | ||
322 | tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + | ||
323 | (tp->rx_opt.eff_sacks * | ||
324 | TCPOLEN_SACK_PERBLOCK)); | ||
325 | } | ||
326 | |||
327 | if (tcp_packets_in_flight(tp) == 0) | ||
328 | tcp_ca_event(sk, CA_EVENT_TX_START); | ||
329 | |||
330 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | ||
331 | skb->h.th = th; | ||
332 | skb_set_owner_w(skb, sk); | ||
333 | |||
334 | /* Build TCP header and checksum it. */ | ||
335 | th->source = inet->sport; | ||
336 | th->dest = inet->dport; | ||
337 | th->seq = htonl(tcb->seq); | ||
338 | th->ack_seq = htonl(tp->rcv_nxt); | ||
339 | *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | | ||
340 | tcb->flags); | ||
341 | |||
342 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { | ||
343 | /* RFC1323: The window in SYN & SYN/ACK segments | ||
344 | * is never scaled. | ||
345 | */ | ||
346 | th->window = htons(tp->rcv_wnd); | ||
347 | } else { | ||
348 | th->window = htons(tcp_select_window(sk)); | ||
349 | } | ||
350 | th->check = 0; | ||
351 | th->urg_ptr = 0; | ||
340 | 352 | ||
341 | if (tcb->flags & TCPCB_FLAG_SYN) { | 353 | if (unlikely(tp->urg_mode && |
342 | tcp_syn_build_options((__u32 *)(th + 1), | 354 | between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { |
343 | tcp_advertise_mss(sk), | 355 | th->urg_ptr = htons(tp->snd_up-tcb->seq); |
344 | (sysctl_flags & SYSCTL_FLAG_TSTAMPS), | 356 | th->urg = 1; |
345 | (sysctl_flags & SYSCTL_FLAG_SACK), | 357 | } |
346 | (sysctl_flags & SYSCTL_FLAG_WSCALE), | ||
347 | tp->rx_opt.rcv_wscale, | ||
348 | tcb->when, | ||
349 | tp->rx_opt.ts_recent); | ||
350 | } else { | ||
351 | tcp_build_and_update_options((__u32 *)(th + 1), | ||
352 | tp, tcb->when); | ||
353 | 358 | ||
354 | TCP_ECN_send(sk, tp, skb, tcp_header_size); | 359 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { |
355 | } | 360 | tcp_syn_build_options((__u32 *)(th + 1), |
356 | tp->af_specific->send_check(sk, th, skb->len, skb); | 361 | tcp_advertise_mss(sk), |
362 | (sysctl_flags & SYSCTL_FLAG_TSTAMPS), | ||
363 | (sysctl_flags & SYSCTL_FLAG_SACK), | ||
364 | (sysctl_flags & SYSCTL_FLAG_WSCALE), | ||
365 | tp->rx_opt.rcv_wscale, | ||
366 | tcb->when, | ||
367 | tp->rx_opt.ts_recent); | ||
368 | } else { | ||
369 | tcp_build_and_update_options((__u32 *)(th + 1), | ||
370 | tp, tcb->when); | ||
371 | TCP_ECN_send(sk, tp, skb, tcp_header_size); | ||
372 | } | ||
357 | 373 | ||
358 | if (tcb->flags & TCPCB_FLAG_ACK) | 374 | tp->af_specific->send_check(sk, th, skb->len, skb); |
359 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); | ||
360 | 375 | ||
361 | if (skb->len != tcp_header_size) | 376 | if (likely(tcb->flags & TCPCB_FLAG_ACK)) |
362 | tcp_event_data_sent(tp, skb, sk); | 377 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); |
363 | 378 | ||
364 | TCP_INC_STATS(TCP_MIB_OUTSEGS); | 379 | if (skb->len != tcp_header_size) |
380 | tcp_event_data_sent(tp, skb, sk); | ||
365 | 381 | ||
366 | err = tp->af_specific->queue_xmit(skb, 0); | 382 | TCP_INC_STATS(TCP_MIB_OUTSEGS); |
367 | if (err <= 0) | ||
368 | return err; | ||
369 | 383 | ||
370 | tcp_enter_cwr(sk); | 384 | err = tp->af_specific->queue_xmit(skb, 0); |
385 | if (unlikely(err <= 0)) | ||
386 | return err; | ||
387 | |||
388 | tcp_enter_cwr(sk); | ||
389 | |||
390 | /* NET_XMIT_CN is special. It does not guarantee, | ||
391 | * that this packet is lost. It tells that device | ||
392 | * is about to start to drop packets or already | ||
393 | * drops some packets of the same priority and | ||
394 | * invokes us to send less aggressively. | ||
395 | */ | ||
396 | return err == NET_XMIT_CN ? 0 : err; | ||
371 | 397 | ||
372 | /* NET_XMIT_CN is special. It does not guarantee, | ||
373 | * that this packet is lost. It tells that device | ||
374 | * is about to start to drop packets or already | ||
375 | * drops some packets of the same priority and | ||
376 | * invokes us to send less aggressively. | ||
377 | */ | ||
378 | return err == NET_XMIT_CN ? 0 : err; | ||
379 | } | ||
380 | return -ENOBUFS; | ||
381 | #undef SYSCTL_FLAG_TSTAMPS | 398 | #undef SYSCTL_FLAG_TSTAMPS |
382 | #undef SYSCTL_FLAG_WSCALE | 399 | #undef SYSCTL_FLAG_WSCALE |
383 | #undef SYSCTL_FLAG_SACK | 400 | #undef SYSCTL_FLAG_SACK |
@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) | |||
1036 | 1053 | ||
1037 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1054 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1038 | 1055 | ||
1039 | if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) | 1056 | if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) |
1040 | break; | 1057 | break; |
1041 | 1058 | ||
1042 | /* Advance the send_head. This one is sent out. | 1059 | /* Advance the send_head. This one is sent out. |
@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) | |||
1109 | /* Send it out now. */ | 1126 | /* Send it out now. */ |
1110 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1127 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1111 | 1128 | ||
1112 | if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { | 1129 | if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { |
1113 | update_send_head(sk, tp, skb); | 1130 | update_send_head(sk, tp, skb); |
1114 | tcp_cwnd_validate(sk, tp); | 1131 | tcp_cwnd_validate(sk, tp); |
1115 | return; | 1132 | return; |
@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1429 | */ | 1446 | */ |
1430 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1447 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1431 | 1448 | ||
1432 | err = tcp_transmit_skb(sk, (skb_cloned(skb) ? | 1449 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
1433 | pskb_copy(skb, GFP_ATOMIC): | ||
1434 | skb_clone(skb, GFP_ATOMIC))); | ||
1435 | 1450 | ||
1436 | if (err == 0) { | 1451 | if (err == 0) { |
1437 | /* Update global TCP statistics. */ | 1452 | /* Update global TCP statistics. */ |
@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) | |||
1665 | TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); | 1680 | TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); |
1666 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; | 1681 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; |
1667 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1682 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1668 | if (tcp_transmit_skb(sk, skb)) | 1683 | if (tcp_transmit_skb(sk, skb, 0, priority)) |
1669 | NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); | 1684 | NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); |
1670 | } | 1685 | } |
1671 | 1686 | ||
@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) | |||
1700 | TCP_ECN_send_synack(tcp_sk(sk), skb); | 1715 | TCP_ECN_send_synack(tcp_sk(sk), skb); |
1701 | } | 1716 | } |
1702 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 1717 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1703 | return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); | 1718 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
1704 | } | 1719 | } |
1705 | 1720 | ||
1706 | /* | 1721 | /* |
@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) | |||
1861 | __skb_queue_tail(&sk->sk_write_queue, buff); | 1876 | __skb_queue_tail(&sk->sk_write_queue, buff); |
1862 | sk_charge_skb(sk, buff); | 1877 | sk_charge_skb(sk, buff); |
1863 | tp->packets_out += tcp_skb_pcount(buff); | 1878 | tp->packets_out += tcp_skb_pcount(buff); |
1864 | tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); | 1879 | tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); |
1865 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); | 1880 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); |
1866 | 1881 | ||
1867 | /* Timer for repeating the SYN until an answer. */ | 1882 | /* Timer for repeating the SYN until an answer. */ |
@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) | |||
1957 | /* Send it off, this clears delayed acks for us. */ | 1972 | /* Send it off, this clears delayed acks for us. */ |
1958 | TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); | 1973 | TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); |
1959 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 1974 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
1960 | tcp_transmit_skb(sk, buff); | 1975 | tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); |
1961 | } | 1976 | } |
1962 | } | 1977 | } |
1963 | 1978 | ||
@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) | |||
1997 | TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; | 2012 | TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; |
1998 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; | 2013 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; |
1999 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2014 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2000 | return tcp_transmit_skb(sk, skb); | 2015 | return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); |
2001 | } | 2016 | } |
2002 | 2017 | ||
2003 | int tcp_write_wakeup(struct sock *sk) | 2018 | int tcp_write_wakeup(struct sock *sk) |
@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) | |||
2030 | 2045 | ||
2031 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 2046 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
2032 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2047 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2033 | err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); | 2048 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2034 | if (!err) { | 2049 | if (!err) { |
2035 | update_send_head(sk, tp, skb); | 2050 | update_send_head(sk, tp, skb); |
2036 | } | 2051 | } |