author    Jeff Garzik <jgarzik@pobox.com>  2005-08-29 16:40:27 -0400
committer Jeff Garzik <jgarzik@pobox.com>  2005-08-29 16:40:27 -0400
commit    c1b054d03f5b31c33eaa0b267c629b118eaf3790 (patch)
tree      9333907ca767be24fcb3667877242976c3e3c8dd /net/ipv4/tcp_output.c
parent    559fb51ba7e66fe298b8355fabde1275b7def35f (diff)
parent    bf4e70e54cf31dcca48d279c7f7e71328eebe749 (diff)
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 584
1 file changed, 445 insertions(+), 139 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..dd30dd137b74 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
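
The value change above caps a single TSO frame at 1/3 of the congestion window instead of 1/8, allowing noticeably larger frames. A minimal userspace sketch of the divisor-style cap that the old in-line code applied (the function name and standalone form are assumptions of this sketch, not kernel code):

#include <stdio.h>

/* Minimal sketch of a divisor-style TSO cap: at most cwnd/divisor
 * segments per frame, never less than one. The function name and the
 * standalone setting are assumptions of this sketch, not kernel code.
 */
static unsigned int tso_frame_cap(unsigned int snd_cwnd,
                                  unsigned int win_divisor)
{
        unsigned int limit = snd_cwnd;

        if (win_divisor)
                limit /= win_divisor;
        return limit ? limit : 1;
}

int main(void)
{
        /* With a 40-segment window: divisor 8 caps frames at 5 segments,
         * divisor 3 allows 13, so frames can grow noticeably larger.
         */
        printf("divisor 8: %u segments\n", tso_frame_cap(40, 8));
        printf("divisor 3: %u segments\n", tso_frame_cap(40, 3));
        return 0;
}

The deferral heuristic added further down in this patch uses the same divisor to decide when it is worth waiting for more data before building a frame.
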
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
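
The lines added above stamp the skb at transmit time whenever the congestion-control module supplies an rtt_sample hook, so the module can take its own RTT measurements from the echoed data. A small userspace sketch of the stamp-on-send, subtract-on-ack idea; plain gettimeofday() and the struct below are stand-ins, not the kernel's do_gettimeofday()/skb->stamp plumbing:

#include <stdio.h>
#include <sys/time.h>

/* Sketch of stamp-on-send, subtract-on-ack RTT sampling. This is an
 * illustration only; it is not the kernel API.
 */
struct pkt {
        struct timeval sent;
};

static void stamp_on_send(struct pkt *p)
{
        gettimeofday(&p->sent, NULL);
}

static long rtt_usec_on_ack(const struct pkt *p)
{
        struct timeval now;

        gettimeofday(&now, NULL);
        return (now.tv_sec - p->sent.tv_sec) * 1000000L +
               (now.tv_usec - p->sent.tv_usec);
}

int main(void)
{
        struct pkt p;

        stamp_on_send(&p);
        /* ... segment travels, ACK comes back ... */
        printf("rtt ~ %ld us\n", rtt_usec_on_ack(&p));
        return 0;
}
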
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
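
Both this hunk and the tcp_cwnd_restart() hunk earlier replace Vegas-specific checks with a generic congestion-control event (CA_EVENT_TX_START, CA_EVENT_CWND_RESTART) dispatched through tp->ca_ops. A toy sketch of that ops-table pattern; the enum and struct here are simplified assumptions, not the kernel's tcp_congestion_ops:

#include <stdio.h>

/* Toy version of the ops-table event dispatch: the sender raises a
 * generic event, and whichever congestion-control module is registered
 * reacts to it.
 */
enum ca_event { CA_EVENT_TX_START, CA_EVENT_CWND_RESTART };

struct ca_ops {
        const char *name;
        void (*cwnd_event)(void *ca_priv, enum ca_event ev);
};

static void vegas_cwnd_event(void *ca_priv, enum ca_event ev)
{
        (void)ca_priv;
        if (ev == CA_EVENT_TX_START || ev == CA_EVENT_CWND_RESTART)
                printf("vegas: reset RTT sampling state\n");
}

static const struct ca_ops vegas_ops = { "vegas", vegas_cwnd_event };

static void ca_event(const struct ca_ops *ops, void *priv, enum ca_event ev)
{
        if (ops->cwnd_event)            /* the hook is optional */
                ops->cwnd_event(priv, ev);
}

int main(void)
{
        ca_event(&vegas_ops, NULL, CA_EVENT_CWND_RESTART);
        return 0;
}
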
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{ 407{
445 struct tcp_sock *tp = tcp_sk(sk); 408 if (skb->len <= mss_now ||
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 409 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 410 /* Avoid the costly divide in the normal
450 * non-TSO case. 411 * non-TSO case.
@@ -454,10 +415,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 415 } else {
455 unsigned int factor; 416 unsigned int factor;
456 417
457 factor = skb->len + (tp->mss_cache_std - 1); 418 factor = skb->len + (mss_now - 1);
458 factor /= tp->mss_cache_std; 419 factor /= mss_now;
459 skb_shinfo(skb)->tso_segs = factor; 420 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 421 skb_shinfo(skb)->tso_size = mss_now;
461 } 422 }
462} 423}
463 424
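
tcp_set_skb_tso_segs() now takes mss_now from the caller and computes the tso_segs factor as a ceiling division of skb->len by mss_now, with an early-out that avoids the divide for single-segment skbs (or when TSO is off). The arithmetic in isolation, as a standalone check (illustration only):

#include <stdio.h>

/* The tso_segs factor is a ceiling division of the payload length by
 * mss_now, with a cheap path for skbs that fit in one segment.
 */
static unsigned int tso_factor(unsigned int len, unsigned int mss_now)
{
        if (len <= mss_now)
                return 1;
        return (len + mss_now - 1) / mss_now;
}

int main(void)
{
        printf("%u\n", tso_factor(1000, 1448));         /* 1 */
        printf("%u\n", tso_factor(4000, 1448));         /* 3 */
        printf("%u\n", tso_factor(3 * 1448, 1448));     /* 3, exact multiple */
        return 0;
}
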
@@ -466,7 +427,7 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
466 * packet to the list. This won't be called frequently, I hope. 427 * packet to the list. This won't be called frequently, I hope.
467 * Remember, these are still headerless SKBs at this point. 428 * Remember, these are still headerless SKBs at this point.
468 */ 429 */
469static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) 430static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
470{ 431{
471 struct tcp_sock *tp = tcp_sk(sk); 432 struct tcp_sock *tp = tcp_sk(sk);
472 struct sk_buff *buff; 433 struct sk_buff *buff;
@@ -521,6 +482,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 482 * skbs, which it never sent before. --ANK
522 */ 483 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 484 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
485 buff->stamp = skb->stamp;
524 486
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 487 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 488 tp->lost_out -= tcp_skb_pcount(skb);
@@ -528,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
528 } 490 }
529 491
530 /* Fix up tso_factor for both original and new SKB. */ 492 /* Fix up tso_factor for both original and new SKB. */
531 tcp_set_skb_tso_segs(sk, skb); 493 tcp_set_skb_tso_segs(sk, skb, mss_now);
532 tcp_set_skb_tso_segs(sk, buff); 494 tcp_set_skb_tso_segs(sk, buff, mss_now);
533 495
534 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 496 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
535 tp->lost_out += tcp_skb_pcount(skb); 497 tp->lost_out += tcp_skb_pcount(skb);
@@ -542,6 +504,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 504 }
543 505
544 /* Link BUFF into the send queue. */ 506 /* Link BUFF into the send queue. */
507 skb_header_release(buff);
545 __skb_append(skb, buff); 508 __skb_append(skb, buff);
546 509
547 return 0; 510 return 0;
@@ -604,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
604 * factor and mss. 567 * factor and mss.
605 */ 568 */
606 if (tcp_skb_pcount(skb) > 1) 569 if (tcp_skb_pcount(skb) > 1)
607 tcp_set_skb_tso_segs(sk, skb); 570 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
608 571
609 return 0; 572 return 0;
610} 573}
@@ -662,7 +625,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 625
663 /* And store cached results */ 626 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 627 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 628 tp->mss_cache = mss_now;
666 629
667 return mss_now; 630 return mss_now;
668} 631}
@@ -674,57 +637,315 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 637 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 638 * is not a big flaw.
676 */ 639 */
677 640unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 641{
680 struct tcp_sock *tp = tcp_sk(sk); 642 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 643 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 644 u32 mss_now;
645 u16 xmit_size_goal;
646 int doing_tso = 0;
647
648 mss_now = tp->mss_cache;
649
650 if (large_allowed &&
651 (sk->sk_route_caps & NETIF_F_TSO) &&
652 !tp->urg_mode)
653 doing_tso = 1;
683 654
684 mss_now = tp->mss_cache_std;
685 if (dst) { 655 if (dst) {
686 u32 mtu = dst_mtu(dst); 656 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 657 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 658 mss_now = tcp_sync_mss(sk, mtu);
689 } 659 }
690 660
691 do_large = (large && 661 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 662 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 663 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 664
695 if (do_large) { 665 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 666
698 large_mss = 65535 - tp->af_specific->net_header_len - 667 if (doing_tso) {
668 xmit_size_goal = 65535 -
669 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 670 tp->ext_header_len - tp->tcp_header_len;
700 671
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 672 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 673 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 674 xmit_size_goal = max((tp->max_window >> 1),
675 68U - tp->tcp_header_len);
704 676
705 factor = large_mss / mss_now; 677 xmit_size_goal -= (xmit_size_goal % mss_now);
678 }
679 tp->xmit_size_goal = xmit_size_goal;
706 680
707 /* Always keep large mss multiple of real mss, but 681 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 682}
709 * so we can keep the ACK clock ticking and minimize 683
710 * bursting. 684/* Congestion window validation. (RFC2861) */
711 */
712 limit = tp->snd_cwnd;
713 if (sysctl_tcp_tso_win_divisor)
714 limit /= sysctl_tcp_tso_win_divisor;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 685
719 tp->mss_cache = mss_now * factor; 686static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
687{
688 __u32 packets_out = tp->packets_out;
689
690 if (packets_out >= tp->snd_cwnd) {
691 /* Network is feed fully. */
692 tp->snd_cwnd_used = 0;
693 tp->snd_cwnd_stamp = tcp_time_stamp;
694 } else {
695 /* Network starves. */
696 if (tp->packets_out > tp->snd_cwnd_used)
697 tp->snd_cwnd_used = tp->packets_out;
720 698
721 mss_now = tp->mss_cache; 699 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
700 tcp_cwnd_application_limited(sk);
722 } 701 }
702}
723 703
724 if (tp->rx_opt.eff_sacks) 704static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 705{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 706 u32 window, cwnd_len;
727 return mss_now; 707
708 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
709 cwnd_len = mss_now * cwnd;
710 return min(window, cwnd_len);
711}
712
713/* Can at least one segment of SKB be sent right now, according to the
714 * congestion window rules? If so, return how many segments are allowed.
715 */
716static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
717{
718 u32 in_flight, cwnd;
719
720 /* Don't be strict about the congestion window for the final FIN. */
721 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
722 return 1;
723
724 in_flight = tcp_packets_in_flight(tp);
725 cwnd = tp->snd_cwnd;
726 if (in_flight < cwnd)
727 return (cwnd - in_flight);
728
729 return 0;
730}
731
732/* This must be invoked the first time we consider transmitting
733 * SKB onto the wire.
734 */
735static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
736{
737 int tso_segs = tcp_skb_pcount(skb);
738
739 if (!tso_segs ||
740 (tso_segs > 1 &&
741 skb_shinfo(skb)->tso_size != mss_now)) {
742 tcp_set_skb_tso_segs(sk, skb, mss_now);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violation Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames, which sit in the middle of the
780 * write_queue (they have no chances to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb, cur_mss);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 if (skb->len != skb->data_len)
865 return tcp_fragment(sk, skb, len, mss_now);
866
867 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
868 if (unlikely(buff == NULL))
869 return -ENOMEM;
870
871 buff->truesize = nlen;
872 skb->truesize -= nlen;
873
874 /* Correct the sequence numbers. */
875 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
876 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
877 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
878
879 /* PSH and FIN should only be set in the second packet. */
880 flags = TCP_SKB_CB(skb)->flags;
881 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
882 TCP_SKB_CB(buff)->flags = flags;
883
884 /* This packet was never sent out yet, so no SACK bits. */
885 TCP_SKB_CB(buff)->sacked = 0;
886
887 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
888 skb_split(skb, buff, len);
889
890 /* Fix up tso_factor for both original and new SKB. */
891 tcp_set_skb_tso_segs(sk, skb, mss_now);
892 tcp_set_skb_tso_segs(sk, buff, mss_now);
893
894 /* Link BUFF into the send queue. */
895 skb_header_release(buff);
896 __skb_append(skb, buff);
897
898 return 0;
899}
900
901/* Try to defer sending, if possible, in order to minimize the amount
902 * of TSO splitting we do. View it as a kind of TSO Nagle test.
903 *
904 * This algorithm is from John Heffner.
905 */
906static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
907{
908 u32 send_win, cong_win, limit, in_flight;
909
910 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
911 return 0;
912
913 if (tp->ca_state != TCP_CA_Open)
914 return 0;
915
916 in_flight = tcp_packets_in_flight(tp);
917
918 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
919 (tp->snd_cwnd <= in_flight));
920
921 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
922
923 /* From in_flight test above, we know that cwnd > in_flight. */
924 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
925
926 limit = min(send_win, cong_win);
927
928 if (sysctl_tcp_tso_win_divisor) {
929 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
930
931 /* If at least some fraction of a window is available,
932 * just use it.
933 */
934 chunk /= sysctl_tcp_tso_win_divisor;
935 if (limit >= chunk)
936 return 0;
937 } else {
938 /* Different approach, try not to defer past a single
939 * ACK. Receiver should ACK every other full sized
940 * frame, so if we have space for more than 3 frames
941 * then send now.
942 */
943 if (limit > tcp_max_burst(tp) * tp->mss_cache)
944 return 0;
945 }
946
947 /* Ok, it looks like it is advisable to defer. */
948 return 1;
728} 949}
729 950
730/* This routine writes packets to the network. It advances the 951/* This routine writes packets to the network. It advances the
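
Two helpers added in the hunk above are worth spelling out. tcp_nagle_check() holds back a sub-MSS segment when the socket is corked, or when Nagle is in effect and a previous small segment is still unacknowledged (Minshall's modification). A standalone restatement of that decision, with simplified field names and flattened flags that are assumptions of this sketch:

#include <stdbool.h>
#include <stdio.h>

/* Standalone restatement of the Nagle/Minshall decision: a sub-MSS
 * segment is held back if the socket is corked, or if Nagle is enabled
 * and an earlier small segment is still unacknowledged.
 */
struct conn {
        bool corked;                    /* TCP_CORK set */
        bool nodelay;                   /* TCP_NODELAY set */
        bool small_seg_unacked;         /* Minshall: runt still in flight */
        unsigned int packets_out;
};

static bool nagle_delays(const struct conn *c, unsigned int len,
                         unsigned int mss)
{
        if (len >= mss)                 /* full-sized segments always go */
                return false;
        if (c->corked)
                return true;
        if (c->nodelay)
                return false;
        return c->packets_out && c->small_seg_unacked;
}

int main(void)
{
        struct conn c = { .packets_out = 2, .small_seg_unacked = true };

        printf("%d\n", nagle_delays(&c, 100, 1448));    /* 1: wait for ACK */
        c.small_seg_unacked = false;
        printf("%d\n", nagle_delays(&c, 100, 1448));    /* 0: send now */
        return 0;
}
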
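
tcp_tso_should_defer() implements John Heffner's heuristic: if the usable window already covers a good fraction of the whole window (1/tso_win_divisor of it, or more than three full-sized frames when the divisor is disabled), send now; otherwise wait so the TSO frame can grow. A simplified userspace restatement, where the fixed burst of 3 stands in for tcp_max_burst() and the window terms are collapsed into plain byte counts, so this is an approximation rather than the kernel logic:

#include <stdbool.h>
#include <stdio.h>

/* Send now if the usable window (min of send window and free congestion
 * window, in bytes) is at least 1/divisor of the whole window, or, with
 * the divisor disabled, if it holds more than three full frames;
 * otherwise defer so the TSO frame can grow.
 */
static bool tso_defer(unsigned int send_win, unsigned int cwnd_segs,
                      unsigned int in_flight, unsigned int mss,
                      unsigned int win_divisor)
{
        unsigned int cong_win = (cwnd_segs - in_flight) * mss;
        unsigned int limit = send_win < cong_win ? send_win : cong_win;

        if (win_divisor) {
                unsigned int whole = cwnd_segs * mss;

                if (whole > send_win)
                        whole = send_win;
                if (limit >= whole / win_divisor)
                        return false;           /* enough room, send now */
        } else if (limit > 3 * mss) {
                return false;                   /* > 3 frames fit, send now */
        }
        return true;                            /* defer and coalesce */
}

int main(void)
{
        /* Plenty of free window: do not defer (prints 0). */
        printf("%d\n", tso_defer(64000, 40, 10, 1448, 3));
        /* Congestion window nearly full: defer (prints 1). */
        printf("%d\n", tso_defer(64000, 40, 38, 1448, 3));
        return 0;
}
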
@@ -734,57 +955,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 955 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 956 * cannot send anything now because of SWS or another problem.
736 */ 957 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 958static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 959{
739 struct tcp_sock *tp = tcp_sk(sk); 960 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 961 struct sk_buff *skb;
962 unsigned int tso_segs, sent_pkts;
963 int cwnd_quota;
741 964
742 /* If we are closed, the bytes will have to remain here. 965 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 966 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 967 * will be happy.
745 */ 968 */
746 if (sk->sk_state != TCP_CLOSE) { 969 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 970 return 0;
748 int sent_pkts = 0;
749 971
750 /* Account for SACKS, we may need to fragment due to this. 972 sent_pkts = 0;
751 * It is just like the real MSS changing on us midstream. 973 while ((skb = sk->sk_send_head)) {
752 * We also handle things correctly when the user adds some 974 unsigned int limit;
753 * IP options mid-stream. Silly to do, but cover it.
754 */
755 mss_now = tcp_current_mss(sk, 1);
756
757 while ((skb = sk->sk_send_head) &&
758 tcp_snd_test(sk, skb, mss_now,
759 tcp_skb_is_last(sk, skb) ? nonagle :
760 TCP_NAGLE_PUSH)) {
761 if (skb->len > mss_now) {
762 if (tcp_fragment(sk, skb, mss_now))
763 break;
764 }
765 975
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 976 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
767 tcp_tso_set_push(skb); 977 BUG_ON(!tso_segs);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) 978
979 cwnd_quota = tcp_cwnd_test(tp, skb);
980 if (!cwnd_quota)
981 break;
982
983 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
984 break;
985
986 if (tso_segs == 1) {
987 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
988 (tcp_skb_is_last(sk, skb) ?
989 nonagle : TCP_NAGLE_PUSH))))
769 break; 990 break;
991 } else {
992 if (tcp_tso_should_defer(sk, tp, skb))
993 break;
994 }
770 995
771 /* Advance the send_head. This one is sent out. 996 limit = mss_now;
772 * This call will increment packets_out. 997 if (tso_segs > 1) {
773 */ 998 limit = tcp_window_allows(tp, skb,
774 update_send_head(sk, tp, skb); 999 mss_now, cwnd_quota);
1000
1001 if (skb->len < limit) {
1002 unsigned int trim = skb->len % mss_now;
775 1003
776 tcp_minshall_update(tp, mss_now, skb); 1004 if (trim)
777 sent_pkts = 1; 1005 limit = skb->len - trim;
1006 }
778 } 1007 }
779 1008
780 if (sent_pkts) { 1009 if (skb->len > limit &&
781 tcp_cwnd_validate(sk, tp); 1010 unlikely(tso_fragment(sk, skb, limit, mss_now)))
782 return 0; 1011 break;
1012
1013 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1014
1015 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
1016 break;
1017
1018 /* Advance the send_head. This one is sent out.
1019 * This call will increment packets_out.
1020 */
1021 update_send_head(sk, tp, skb);
1022
1023 tcp_minshall_update(tp, mss_now, skb);
1024 sent_pkts++;
1025 }
1026
1027 if (likely(sent_pkts)) {
1028 tcp_cwnd_validate(sk, tp);
1029 return 0;
1030 }
1031 return !tp->packets_out && sk->sk_send_head;
1032}
1033
1034/* Push out any pending frames which were held back due to
1035 * TCP_CORK or attempt at coalescing tiny packets.
1036 * The socket must be locked by the caller.
1037 */
1038void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1039 unsigned int cur_mss, int nonagle)
1040{
1041 struct sk_buff *skb = sk->sk_send_head;
1042
1043 if (skb) {
1044 if (tcp_write_xmit(sk, cur_mss, nonagle))
1045 tcp_check_probe_timer(sk, tp);
1046 }
1047}
1048
1049/* Send _single_ skb sitting at the send head. This function requires
1050 * true push pending frames to setup probe timer etc.
1051 */
1052void tcp_push_one(struct sock *sk, unsigned int mss_now)
1053{
1054 struct tcp_sock *tp = tcp_sk(sk);
1055 struct sk_buff *skb = sk->sk_send_head;
1056 unsigned int tso_segs, cwnd_quota;
1057
1058 BUG_ON(!skb || skb->len < mss_now);
1059
1060 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1061 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1062
1063 if (likely(cwnd_quota)) {
1064 unsigned int limit;
1065
1066 BUG_ON(!tso_segs);
1067
1068 limit = mss_now;
1069 if (tso_segs > 1) {
1070 limit = tcp_window_allows(tp, skb,
1071 mss_now, cwnd_quota);
1072
1073 if (skb->len < limit) {
1074 unsigned int trim = skb->len % mss_now;
1075
1076 if (trim)
1077 limit = skb->len - trim;
1078 }
783 } 1079 }
784 1080
785 return !tp->packets_out && sk->sk_send_head; 1081 if (skb->len > limit &&
1082 unlikely(tso_fragment(sk, skb, limit, mss_now)))
1083 return;
1084
1085 /* Send it out now. */
1086 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1087
1088 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1089 update_send_head(sk, tp, skb);
1090 tcp_cwnd_validate(sk, tp);
1091 return;
1092 }
786 } 1093 }
787 return 0;
788} 1094}
789 1095
790/* This function returns the amount that we can raise the 1096/* This function returns the amount that we can raise the
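
Both tcp_write_xmit() and tcp_push_one() in the hunk above round the per-send limit down to a whole number of MSS when the queued skb is shorter than what the window allows, so a TSO send never ends in a partial segment; the sub-MSS tail stays queued where later writes can fill it out. The trim in isolation (function and parameter names are assumptions of this standalone restatement):

#include <stdio.h>

/* Per-send limit, rounded down to whole segments when the queued skb is
 * shorter than what the window allows.
 */
static unsigned int mss_aligned_limit(unsigned int skb_len,
                                      unsigned int window_allows,
                                      unsigned int mss_now)
{
        unsigned int limit = window_allows;

        if (skb_len < limit) {
                unsigned int trim = skb_len % mss_now;

                if (trim)
                        limit = skb_len - trim;
        }
        return limit;
}

int main(void)
{
        /* A 10000-byte skb with a generous window and a 1448-byte MSS:
         * send 8688 bytes (6 full segments) now; the 1312-byte tail stays
         * queued until it can be filled or pushed on its own.
         */
        printf("%u\n", mss_aligned_limit(10000, 65535, 1448));
        return 0;
}
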
@@ -1044,7 +1350,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1350 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1351 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1352 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1353 }
1049 1354
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1355 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1062,15 +1367,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1062 1367
1063 if (skb->len > cur_mss) { 1368 if (skb->len > cur_mss) {
1064 int old_factor = tcp_skb_pcount(skb); 1369 int old_factor = tcp_skb_pcount(skb);
1065 int new_factor; 1370 int diff;
1066 1371
1067 if (tcp_fragment(sk, skb, cur_mss)) 1372 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1068 return -ENOMEM; /* We'll try again later. */ 1373 return -ENOMEM; /* We'll try again later. */
1069 1374
1070 /* New SKB created, account for it. */ 1375 /* New SKB created, account for it. */
1071 new_factor = tcp_skb_pcount(skb); 1376 diff = old_factor - tcp_skb_pcount(skb) -
1072 tp->packets_out -= old_factor - new_factor; 1377 tcp_skb_pcount(skb->next);
1073 tp->packets_out += tcp_skb_pcount(skb->next); 1378 tp->packets_out -= diff;
1379
1380 if (diff > 0) {
1381 tp->fackets_out -= diff;
1382 if ((int)tp->fackets_out < 0)
1383 tp->fackets_out = 0;
1384 }
1074 } 1385 }
1075 1386
1076 /* Collapse two adjacent packets if worthwhile and we can. */ 1387 /* Collapse two adjacent packets if worthwhile and we can. */
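
The retransmit accounting above switches from adjusting packets_out by the old and new halves separately to computing one signed diff, and a positive diff now also shrinks fackets_out. A made-up numeric walk-through of that arithmetic (the values are illustrative only):

#include <stdio.h>

/* After fragmenting a TSO skb at the current MSS, the total segment
 * count can change; packets_out must move by the real difference rather
 * than by the two halves separately.
 */
int main(void)
{
        int old_factor = 5;     /* tcp_skb_pcount(skb) before the split */
        int head_segs  = 1;     /* tcp_skb_pcount(skb) after the split */
        int tail_segs  = 3;     /* tcp_skb_pcount(skb->next), the new skb */
        int diff = old_factor - head_segs - tail_segs;

        printf("packets_out -= %d\n", diff);    /* here: 1 */
        return 0;
}
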
@@ -1106,7 +1417,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1417 * is still in somebody's hands, else make a clone.
1107 */ 1418 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1419 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1420
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1421 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1422 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1600,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1600 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1601 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1602 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1603void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1604{
1295 struct tcp_sock *tp = tcp_sk(sk); 1605 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1606 struct sk_buff *skb;
@@ -1449,7 +1759,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1759 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1760 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1761 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1762
1454 tcp_select_initial_window(tcp_full_space(sk), 1763 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1764 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1812,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1812 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1813 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1814 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1815
1508 /* Send it off. */ 1816 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1817 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1670,21 +1978,19 @@ int tcp_write_wakeup(struct sock *sk)
1670 skb->len > mss) { 1978 skb->len > mss) {
1671 seg_size = min(seg_size, mss); 1979 seg_size = min(seg_size, mss);
1672 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1980 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1673 if (tcp_fragment(sk, skb, seg_size)) 1981 if (tcp_fragment(sk, skb, seg_size, mss))
1674 return -1; 1982 return -1;
1675 /* SWS override triggered forced fragmentation. 1983 /* SWS override triggered forced fragmentation.
1676 * Disable TSO, the connection is too sick. */ 1984 * Disable TSO, the connection is too sick. */
1677 if (sk->sk_route_caps & NETIF_F_TSO) { 1985 if (sk->sk_route_caps & NETIF_F_TSO) {
1678 sock_set_flag(sk, SOCK_NO_LARGESEND); 1986 sock_set_flag(sk, SOCK_NO_LARGESEND);
1679 sk->sk_route_caps &= ~NETIF_F_TSO; 1987 sk->sk_route_caps &= ~NETIF_F_TSO;
1680 tp->mss_cache = tp->mss_cache_std;
1681 } 1988 }
1682 } else if (!tcp_skb_pcount(skb)) 1989 } else if (!tcp_skb_pcount(skb))
1683 tcp_set_skb_tso_segs(sk, skb); 1990 tcp_set_skb_tso_segs(sk, skb, mss);
1684 1991
1685 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1992 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1686 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1993 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1687 tcp_tso_set_push(skb);
1688 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 1994 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1689 if (!err) { 1995 if (!err) {
1690 update_send_head(sk, tp, skb); 1996 update_send_head(sk, tp, skb);