Diffstat (limited to 'net/ipv4/tcp_input.c')
 net/ipv4/tcp_input.c | 288
 1 file changed, 194 insertions(+), 94 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
  * Andi Kleen : Moved open_request checking here
  * and process RSTs for open_requests.
  * Andi Kleen : Better prune_queue, and other fixes.
- * Andrey Savochkin: Fix RTT measurements in the presnce of
+ * Andrey Savochkin: Fix RTT measurements in the presence of
  * timestamps.
  * Andrey Savochkin: Check sequence numbers correctly when
  * removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;

 int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;

 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
  * of receiver window. Check #2.
  *
  * The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
  * in common situations. Otherwise, we have to rely on queue collapsing.
  */

@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
 {
         /* Optimize this! */
         int truesize = tcp_win_from_space(skb->truesize)/2;
-        int window = tcp_full_space(sk)/2;
+        int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;

         while (tp->rcv_ssthresh <= window) {
                 if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
         int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);

         /* Try to select rcvbuf so that 4 mss-sized segments
-         * will fit to window and correspoding skbs will fit to our rcvbuf.
+         * will fit to window and corresponding skbs will fit to our rcvbuf.
          * (was 3; 4 is minimum to allow fast retransmit to work.)
          */
         while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
         sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
 }

-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
  * established state.
  */
 static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
-        struct sk_buff *skb;
-        unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
-        int ofo_win = 0;

         icsk->icsk_ack.quick = 0;

-        skb_queue_walk(&tp->out_of_order_queue, skb) {
-                ofo_win += skb->len;
-        }
-
-        /* If overcommit is due to out of order segments,
-         * do not clamp window. Try to expand rcvbuf instead.
-         */
-        if (ofo_win) {
-                if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
-                    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-                    !tcp_memory_pressure &&
-                    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
-                        sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-                                            sysctl_tcp_rmem[2]);
+        if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+            !tcp_memory_pressure &&
+            atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+                                    sysctl_tcp_rmem[2]);
         }
-        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
-                app_win += ofo_win;
-                if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
-                        app_win >>= 1;
-                if (app_win > icsk->icsk_ack.rcv_mss)
-                        app_win -= icsk->icsk_ack.rcv_mss;
-                app_win = max(app_win, 2U*tp->advmss);
-
+        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
                 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
-        }
 }

 /* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
          * are stalled on filesystem I/O.
          *
          * Also, since we are only going for a minimum in the
-         * non-timestamp case, we do not smoothe things out
-         * else with timestamps disabled convergance takes too
+         * non-timestamp case, we do not smoother things out
+         * else with timestamps disabled convergence takes too
          * long.
          */
         if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
                 } else if (m < new_sample)
                         new_sample = m << 3;
         } else {
-                /* No previous mesaure. */
+                /* No previous measure. */
                 new_sample = m << 3;
         }

@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
                 if (icsk->icsk_ack.ato > icsk->icsk_rto)
                         icsk->icsk_ack.ato = icsk->icsk_rto;
         } else if (m > icsk->icsk_rto) {
-                /* Too long gap. Apparently sender falled to
+                /* Too long gap. Apparently sender failed to
                  * restart window, so that we send ACKs quickly.
                  */
                 tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        const struct inet_connection_sock *icsk = inet_csk(sk);
         long m = mrtt; /* RTT */

         /* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
          *
          * Funny. This algorithm seems to be very broken.
          * These formulae increase RTO, when it should be decreased, increase
-         * too slowly, when it should be incresed fastly, decrease too fastly
+         * too slowly, when it should be increased fastly, decrease too fastly
          * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
          * does not matter how to _calculate_ it. Seems, it was trap
          * that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
                 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
                 tp->rtt_seq = tp->snd_nxt;
         }
-
-        if (icsk->icsk_ca_ops->rtt_sample)
-                icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }

 /* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
          * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
          * to do with delayed acks, because at cwnd>2 true delack timeout
          * is invisible. Actually, Linux-2.4 also generates erratic
-         * ACKs in some curcumstances.
+         * ACKs in some circumstances.
          */
         inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;

         /* 2. Fixups made earlier cannot be right.
          * If we do not estimate RTO correctly without them,
          * all the algo is pure shit and should be replaced
-         * with correct one. It is exaclty, which we pretend to do.
+         * with correct one. It is exactly, which we pretend to do.
          */
 }

@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
  * to make it more realistic.
  *
  * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
  * packets force peer to delay ACKs and calculation is correct too.
  * The algorithm is adaptive and, provided we follow specs, it
  * NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
         int prior_fackets;
         u32 lost_retrans = 0;
         int flag = 0;
+        int dup_sack = 0;
         int i;

         if (!tp->sacked_out)
                 tp->fackets_out = 0;
         prior_fackets = tp->fackets_out;

-        for (i=0; i<num_sacks; i++, sp++) {
-                struct sk_buff *skb;
-                __u32 start_seq = ntohl(sp->start_seq);
-                __u32 end_seq = ntohl(sp->end_seq);
-                int fack_count = 0;
-                int dup_sack = 0;
+        /* SACK fastpath:
+         * if the only SACK change is the increase of the end_seq of
+         * the first block then only apply that SACK block
+         * and use retrans queue hinting otherwise slowpath */
+        flag = 1;
+        for (i = 0; i< num_sacks; i++) {
+                __u32 start_seq = ntohl(sp[i].start_seq);
+                __u32 end_seq = ntohl(sp[i].end_seq);
+
+                if (i == 0){
+                        if (tp->recv_sack_cache[i].start_seq != start_seq)
+                                flag = 0;
+                } else {
+                        if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+                            (tp->recv_sack_cache[i].end_seq != end_seq))
+                                flag = 0;
+                }
+                tp->recv_sack_cache[i].start_seq = start_seq;
+                tp->recv_sack_cache[i].end_seq = end_seq;

                 /* Check for D-SACK. */
                 if (i == 0) {
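
The fast path above memoizes the previous ACK's SACK blocks in tp->recv_sack_cache: if nothing changed except that the first block's end_seq grew, only that one block needs to be applied, and (as the next hunk shows) the retransmission-queue walk can resume from a saved hint. A minimal userspace sketch of the same cache-compare test, with illustrative names rather than kernel API:

#include <stdint.h>

#define MAX_SACK_BLOCKS 4

struct sack_block { uint32_t start_seq, end_seq; };

/* Nonzero when the new blocks match the cached ones closely enough
 * for the fast path: every block identical, except that the first
 * block's end_seq is allowed to differ (i.e. to grow).
 */
static int sack_fastpath_ok(const struct sack_block *cache,
                            const struct sack_block *sp, int num_sacks)
{
        int i;

        for (i = 0; i < num_sacks && i < MAX_SACK_BLOCKS; i++) {
                if (cache[i].start_seq != sp[i].start_seq)
                        return 0;
                if (i > 0 && cache[i].end_seq != sp[i].end_seq)
                        return 0;
        }
        return 1;
}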
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                 if (before(ack, prior_snd_una - tp->max_window))
                         return 0;
                 }
+        }
+
+        if (flag)
+                num_sacks = 1;
+        else {
+                int j;
+                tp->fastpath_skb_hint = NULL;
+
+                /* order SACK blocks to allow in order walk of the retrans queue */
+                for (i = num_sacks-1; i > 0; i--) {
+                        for (j = 0; j < i; j++){
+                                if (after(ntohl(sp[j].start_seq),
+                                          ntohl(sp[j+1].start_seq))){
+                                        sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+                                        sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+                                        sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+                                        sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+                                }
+
+                        }
+                }
+        }
+
+        /* clear flag as used for different purpose in following code */
+        flag = 0;
+
+        for (i=0; i<num_sacks; i++, sp++) {
+                struct sk_buff *skb;
+                __u32 start_seq = ntohl(sp->start_seq);
+                __u32 end_seq = ntohl(sp->end_seq);
+                int fack_count;
+
+                /* Use SACK fastpath hint if valid */
+                if (tp->fastpath_skb_hint) {
+                        skb = tp->fastpath_skb_hint;
+                        fack_count = tp->fastpath_cnt_hint;
+                } else {
+                        skb = sk->sk_write_queue.next;
+                        fack_count = 0;
+                }

                 /* Event "B" in the comment above. */
                 if (after(end_seq, tp->high_seq))
                         flag |= FLAG_DATA_LOST;

-                sk_stream_for_retrans_queue(skb, sk) {
+                sk_stream_for_retrans_queue_from(skb, sk) {
                         int in_sack, pcount;
                         u8 sacked;

+                        tp->fastpath_skb_hint = skb;
+                        tp->fastpath_cnt_hint = fack_count;
+
                         /* The retransmission queue is always in order, so
                          * we can short-circuit the walk early.
                          */
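
When the fast path misses, the slow path first orders the (at most four) blocks by start_seq so that a single front-to-back walk of the retransmission queue, resumed through fastpath_skb_hint/fastpath_cnt_hint, can service every block. A standalone sketch of that ordering step, assuming host-order values and a plain comparison; the hunk above instead swaps through the just-filled recv_sack_cache (which holds the same values in host order) and compares with after() to honor sequence-number wraparound:

#include <stdint.h>

struct sack_block { uint32_t start_seq, end_seq; };

static void sort_sack_blocks(struct sack_block *sp, int num_sacks)
{
        int i, j;

        /* bubble sort: at most 4 blocks, simplicity beats asymptotics */
        for (i = num_sacks - 1; i > 0; i--) {
                for (j = 0; j < i; j++) {
                        if (sp[j].start_seq > sp[j + 1].start_seq) {
                                struct sack_block tmp = sp[j];

                                sp[j] = sp[j + 1];
                                sp[j + 1] = tmp;
                        }
                }
        }
}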
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                                 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
                                 tp->lost_out -= tcp_skb_pcount(skb);
                                 tp->retrans_out -= tcp_skb_pcount(skb);
+
+                                /* clear lost hint */
+                                tp->retransmit_skb_hint = NULL;
                         }
1049 } else { 1087 } else {
1050 /* New sack for not retransmitted frame, 1088 /* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                         if (sacked & TCPCB_LOST) {
                                 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
                                 tp->lost_out -= tcp_skb_pcount(skb);
+
+                                /* clear lost hint */
+                                tp->retransmit_skb_hint = NULL;
                         }
                 }

@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                     (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                         tp->retrans_out -= tcp_skb_pcount(skb);
+                        tp->retransmit_skb_hint = NULL;
                 }
         }
 }
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                         tp->retrans_out -= tcp_skb_pcount(skb);

+                        /* clear lost hint */
+                        tp->retransmit_skb_hint = NULL;
+
                         if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                                 tp->lost_out += tcp_skb_pcount(skb);
                                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
         tcp_set_ca_state(sk, TCP_CA_Loss);
         tp->high_seq = tp->frto_highmark;
         TCP_ECN_queue_cwr(tp);
+
+        clear_all_retrans_hints(tp);
 }

 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
         tp->snd_cwnd_cnt = 0;
         tp->snd_cwnd_stamp = tcp_time_stamp;

+        tp->bytes_acked = 0;
         tcp_clear_retrans(tp);

         /* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
         tcp_set_ca_state(sk, TCP_CA_Loss);
         tp->high_seq = tp->snd_nxt;
         TCP_ECN_queue_cwr(tp);
+
+        clear_all_retrans_hints(tp);
 }

 static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
                                int packets, u32 high_seq)
 {
         struct sk_buff *skb;
-        int cnt = packets;
+        int cnt;

-        BUG_TRAP(cnt <= tp->packets_out);
+        BUG_TRAP(packets <= tp->packets_out);
+        if (tp->lost_skb_hint) {
+                skb = tp->lost_skb_hint;
+                cnt = tp->lost_cnt_hint;
+        } else {
+                skb = sk->sk_write_queue.next;
+                cnt = 0;
+        }

-        sk_stream_for_retrans_queue(skb, sk) {
-                cnt -= tcp_skb_pcount(skb);
-                if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+        sk_stream_for_retrans_queue_from(skb, sk) {
+                /* TODO: do this better */
+                /* this is not the most efficient way to do this... */
+                tp->lost_skb_hint = skb;
+                tp->lost_cnt_hint = cnt;
+                cnt += tcp_skb_pcount(skb);
+                if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
                         break;
                 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                         TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                         tp->lost_out += tcp_skb_pcount(skb);
+
+                        /* clear xmit_retransmit_queue hints
+                         *  if this is beyond hint */
+                        if(tp->retransmit_skb_hint != NULL &&
+                           before(TCP_SKB_CB(skb)->seq,
+                                  TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+                                tp->retransmit_skb_hint = NULL;
+                        }
                 }
         }
         tcp_sync_left_out(tp);
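
The hint pair is the heart of this change: tcp_mark_head_lost() used to restart from the queue head on every call, O(n) per ACK and quadratic across a long recovery, while lost_skb_hint/lost_cnt_hint let the next call resume where the previous one stopped. A simplified sketch of the pattern on a plain singly linked list (illustrative types, not kernel API):

#include <stddef.h>

struct pkt {
        struct pkt *next;
        unsigned int pcount;    /* segments represented by this entry */
};

struct walk_hint {
        struct pkt *skb;        /* where the previous walk stopped */
        unsigned int cnt;       /* packets counted before that entry */
};

static void mark_head_lost(struct pkt *head, struct walk_hint *hint,
                           unsigned int packets,
                           void (*mark_lost)(struct pkt *))
{
        struct pkt *skb = hint->skb ? hint->skb : head;
        unsigned int cnt = hint->skb ? hint->cnt : 0;

        for (; skb != NULL; skb = skb->next) {
                hint->skb = skb;        /* resume point for the next call */
                hint->cnt = cnt;
                cnt += skb->pcount;
                if (cnt > packets)
                        break;
                mark_lost(skb);
        }
}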
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
         if (tcp_head_timedout(sk, tp)) {
                 struct sk_buff *skb;

-                sk_stream_for_retrans_queue(skb, sk) {
-                        if (tcp_skb_timedout(sk, skb) &&
-                            !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+                skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+                        : sk->sk_write_queue.next;
+
+                sk_stream_for_retrans_queue_from(skb, sk) {
+                        if (!tcp_skb_timedout(sk, skb))
+                                break;
+
+                        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                 tp->lost_out += tcp_skb_pcount(skb);
+
+                                /* clear xmit_retrans hint */
+                                if (tp->retransmit_skb_hint &&
+                                    before(TCP_SKB_CB(skb)->seq,
+                                           TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+                                        tp->retransmit_skb_hint = NULL;
                         }
                 }
+
+                tp->scoreboard_skb_hint = skb;
+
                 tcp_sync_left_out(tp);
         }
 }
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
         }
         tcp_moderate_cwnd(tp);
         tp->snd_cwnd_stamp = tcp_time_stamp;
+
+        /* There is something screwy going on with the retrans hints after
+           an undo */
+        clear_all_retrans_hints(tp);
 }

 static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
         sk_stream_for_retrans_queue(skb, sk) {
                 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
         }
+
+        clear_all_retrans_hints(tp);
+
         DBGUNDO(sk, tp, "partial loss");
         tp->lost_out = 0;
         tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
                         TCP_ECN_queue_cwr(tp);
                 }

+                tp->bytes_acked = 0;
                 tp->snd_cwnd_cnt = 0;
                 tcp_set_ca_state(sk, TCP_CA_Recovery);
         }
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 }

 /* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
  */
-static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
 {
         /* RTTM Rule: A TSecr value received in a segment is used to
          * update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
          * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
          *
          * Changed: reset backoff as soon as we see the first valid sample.
-         * If we do not, we get strongly overstimated rto. With timestamps
+         * If we do not, we get strongly overestimated rto. With timestamps
          * samples are accepted even from very old segments: f.e., when rtt=1
          * increases to 8, we retransmit 5 times and after 8 seconds delayed
          * answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
          */
         struct tcp_sock *tp = tcp_sk(sk);
         const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-        tcp_rtt_estimator(sk, seq_rtt, usrtt);
+        tcp_rtt_estimator(sk, seq_rtt);
         tcp_set_rto(sk);
         inet_csk(sk)->icsk_backoff = 0;
         tcp_bound_rto(sk);
 }

-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
 {
         /* We don't have a timestamp. Can only use
          * packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
         if (flag & FLAG_RETRANS_DATA_ACKED)
                 return;

-        tcp_rtt_estimator(sk, seq_rtt, usrtt);
+        tcp_rtt_estimator(sk, seq_rtt);
         tcp_set_rto(sk);
         inet_csk(sk)->icsk_backoff = 0;
         tcp_bound_rto(sk);
 }

 static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-                                      const s32 seq_rtt, u32 *usrtt)
+                                      const s32 seq_rtt)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
         /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-                tcp_ack_saw_tstamp(sk, usrtt, flag);
+                tcp_ack_saw_tstamp(sk, flag);
         else if (seq_rtt >= 0)
-                tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
+                tcp_ack_no_tstamp(sk, seq_rtt, flag);
 }

 static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
         return acked;
 }

+static inline u32 tcp_usrtt(const struct sk_buff *skb)
+{
+        struct timeval tv, now;
+
+        do_gettimeofday(&now);
+        skb_get_timestamp(skb, &tv);
+        return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+}

 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
         struct tcp_sock *tp = tcp_sk(sk);
+        const struct inet_connection_sock *icsk = inet_csk(sk);
         struct sk_buff *skb;
         __u32 now = tcp_time_stamp;
         int acked = 0;
         __s32 seq_rtt = -1;
-        struct timeval usnow;
         u32 pkts_acked = 0;
-
-        if (seq_usrtt)
-                do_gettimeofday(&usnow);
+        void (*rtt_sample)(struct sock *sk, u32 usrtt)
+                = icsk->icsk_ca_ops->rtt_sample;

         while ((skb = skb_peek(&sk->sk_write_queue)) &&
                skb != sk->sk_send_head) {
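
tcp_usrtt() returns the age of the skb's transmit timestamp in microseconds. No explicit borrow is needed in the two-term expression because a negative microsecond difference is absorbed by the seconds term: with a send time of {4 s, 900000 us} and now = {5 s, 100 us}, the result is 1 * 1000000 + (100 - 900000) = 100100 us. Folding the clock read into this helper also means do_gettimeofday() is paid only when a sample is actually taken, where the old code read the clock once per tcp_clean_rtx_queue() call whenever a congestion module had installed rtt_sample.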
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
                                 tp->retrans_out -= tcp_skb_pcount(skb);
                                 acked |= FLAG_RETRANS_DATA_ACKED;
                                 seq_rtt = -1;
-                        } else if (seq_rtt < 0)
+                        } else if (seq_rtt < 0) {
                                 seq_rtt = now - scb->when;
-                        if (seq_usrtt) {
-                                struct timeval tv;
-
-                                skb_get_timestamp(skb, &tv);
-                                *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
-                                        + (usnow.tv_usec - tv.tv_usec);
+                                if (rtt_sample)
+                                        (*rtt_sample)(sk, tcp_usrtt(skb));
                         }
-
                         if (sacked & TCPCB_SACKED_ACKED)
                                 tp->sacked_out -= tcp_skb_pcount(skb);
                         if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
                             !before(scb->end_seq, tp->snd_up))
                                 tp->urg_mode = 0;
                         }
-                } else if (seq_rtt < 0)
+                } else if (seq_rtt < 0) {
                         seq_rtt = now - scb->when;
+                        if (rtt_sample)
+                                (*rtt_sample)(sk, tcp_usrtt(skb));
+                }
                 tcp_dec_pcount_approx(&tp->fackets_out, skb);
                 tcp_packets_out_dec(tp, skb);
                 __skb_unlink(skb, &sk->sk_write_queue);
                 sk_stream_free_skb(sk, skb);
+                clear_all_retrans_hints(tp);
         }

         if (acked&FLAG_ACKED) {
-                const struct inet_connection_sock *icsk = inet_csk(sk);
-                tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
+                tcp_ack_update_rtt(sk, acked, seq_rtt);
                 tcp_ack_packets_out(sk, tp);

                 if (icsk->icsk_ca_ops->pkts_acked)
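
Two details above are easy to miss: rtt_sample is looked up from icsk->icsk_ca_ops once, before the loop, and the call site is guarded, so tcp_usrtt() and its do_gettimeofday() are never evaluated when no congestion-control module installed the hook. Together these replace the old scheme of threading a seq_usrtt out-parameter from tcp_ack() through tcp_clean_rtx_queue() into tcp_rtt_estimator().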
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
         }

         /* F-RTO affects on two new ACKs following RTO.
-         * At latest on third ACK the TCP behavor is back to normal.
+         * At latest on third ACK the TCP behavior is back to normal.
          */
         tp->frto_counter = (tp->frto_counter + 1) % 3;
 }
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
         u32 ack = TCP_SKB_CB(skb)->ack_seq;
         u32 prior_in_flight;
         s32 seq_rtt;
-        s32 seq_usrtt = 0;
         int prior_packets;

         /* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
         if (before(ack, prior_snd_una))
                 goto old_ack;

+        if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
+                tp->bytes_acked += ack - prior_snd_una;
+
         if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
                 /* Window is constant, pure forward advance.
                  * No more checks are required.
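
sysctl_tcp_abc and the bytes_acked accumulator enable Appropriate Byte Counting (RFC 3465): congestion-window growth is driven by how many bytes each ACK covers rather than by the number of ACKs, so delayed-ACK receivers no longer halve the growth rate and stretch ACKs no longer inflate it. The counter only advances below CWR state and, per the hunks further up, is reset on loss and on entering recovery. A minimal sketch of the congestion-avoidance half, assuming a Reno-style counter and illustrative types rather than the tcp_cong.c interface:

#include <stdint.h>

struct abc_state {
        uint32_t snd_cwnd;      /* congestion window, in segments */
        uint32_t bytes_acked;   /* bytes newly ACKed since last growth */
};

/* Grow cwnd by one segment only after a full window's worth of bytes
 * has been cumulatively acknowledged (RFC 3465 with L=1, congestion
 * avoidance).
 */
static void abc_cong_avoid(struct abc_state *s, uint32_t acked_bytes,
                           uint32_t mss)
{
        s->bytes_acked += acked_bytes;
        if (s->bytes_acked >= s->snd_cwnd * mss) {
                s->bytes_acked -= s->snd_cwnd * mss;
                s->snd_cwnd++;
        }
}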
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
         prior_in_flight = tcp_packets_in_flight(tp);

         /* See if we can take anything off of the retransmit queue. */
-        flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
-                                    icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
+        flag |= tcp_clean_rtx_queue(sk, &seq_rtt);

         if (tp->frto_counter)
                 tcp_process_frto(sk, prior_snd_una);

         if (tcp_ack_is_dubious(sk, flag)) {
-                /* Advanve CWND, if state allows this. */
+                /* Advance CWND, if state allows this. */
                 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
                         tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
                 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 {
         struct sk_buff *skb;

-        /* First, check that queue is collapsable and find
+        /* First, check that queue is collapsible and find
          * the point where collapsing can be useful. */
         for (skb = head; skb != tail; ) {
                 /* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)

 /*
  * This routine is only called when we have urgent data
- * signalled. Its the 'slow' part of tcp_urg. It could be
+ * signaled. Its the 'slow' part of tcp_urg. It could be
  * moved inline now as tcp_urg is only called from one
  * place. We handle URGent data wrong. We have to - as
  * BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
          * urgent. To do this requires some care. We cannot just ignore
          * tp->copied_seq since we would read the last urgent byte again
          * as data, nor can we alter copied_seq until this data arrives
-         * or we break the sematics of SIOCATMARK (and thus sockatmark())
+         * or we break the semantics of SIOCATMARK (and thus sockatmark())
          *
          * NOTE. Double Dutch. Rendering to plain English: author of comment
          * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
         tp->rx_opt.saw_tstamp = 0;

         /* pred_flags is 0xS?10 << 16 + snd_wnd
-         * if header_predition is to be made
+         * if header_prediction is to be made
          * 'S' will always be tp->tcp_header_len >> 2
          * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
          * turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
          */
         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
             !tp->srtt)
-                tcp_ack_saw_tstamp(sk, NULL, 0);
+                tcp_ack_saw_tstamp(sk, 0);

         if (tp->rx_opt.tstamp_ok)
                 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:

 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_abc);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);