Diffstat (limited to 'net/ipv4/tcp_output.c')
 net/ipv4/tcp_output.c | 571 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 437 insertions(+), 134 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..7d076f0db100 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
  * will allow a single TSO frame to consume. Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
 
 static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
				     struct sk_buff *skb)
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
 
-	if (tcp_is_vegas(tp))
-		tcp_vegas_enable(tp);
+	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
 
 	tp->snd_ssthresh = tcp_current_ssthresh(tp);
 	restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
 	tp->ack.pingpong = 1;
 }
 
-static __inline__ void tcp_event_ack_sent(struct sock *sk)
+static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tcp_dec_quickack_mode(tp);
+	tcp_dec_quickack_mode(tp, pkts);
 	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
 }
 
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_WSCALE	0x2
 #define SYSCTL_FLAG_SACK	0x4
 
+	/* If congestion control is doing timestamping */
+	if (tp->ca_ops->rtt_sample)
+		do_gettimeofday(&skb->stamp);
+
 	sysctl_flags = 0;
 	if (tcb->flags & TCPCB_FLAG_SYN) {
 		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 				    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 	}
 
-	/*
-	 * If the connection is idle and we are restarting,
-	 * then we don't want to do any Vegas calculations
-	 * until we get fresh RTT samples. So when we
-	 * restart, we reset our Vegas state to a clean
-	 * slate. After we get acks for this flight of
-	 * packets, _then_ we can make Vegas calculations
-	 * again.
-	 */
-	if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
-		tcp_vegas_enable(tp);
+	if (tcp_packets_in_flight(tp) == 0)
+		tcp_ca_event(tp, CA_EVENT_TX_START);
 
 	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 	skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 	tp->af_specific->send_check(sk, th, skb->len, skb);
 
 	if (tcb->flags & TCPCB_FLAG_ACK)
-		tcp_event_ack_sent(sk);
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
 	if (skb->len != tcp_header_size)
 		tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 	sk->sk_send_head = skb;
 }
 
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
-	/* Force push to be on for any TSO frames to workaround
-	 * problems with busted implementations like Mac OS-X that
-	 * hold off socket receive wakeups until push is seen.
-	 */
-	if (tcp_skb_pcount(skb) > 1)
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = sk->sk_send_head;
-
-	if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
-		/* Send it out now. */
-		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
-		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
-			sk->sk_send_head = NULL;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tcp_packets_out_inc(sk, tp, skb);
-			return;
-		}
-	}
-}
-
-void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (skb->len <= tp->mss_cache_std ||
+	if (skb->len <= mss_now ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -454,10 +415,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache_std - 1);
-		factor /= tp->mss_cache_std;
+		factor = skb->len + (mss_now - 1);
+		factor /= mss_now;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache_std;
+		skb_shinfo(skb)->tso_size = mss_now;
 	}
 }
 
@@ -466,7 +427,7 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -521,6 +482,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+	buff->stamp = skb->stamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
@@ -528,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	}
 
 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out += tcp_skb_pcount(skb);
@@ -542,6 +504,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	}
 
 	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
 	__skb_append(skb, buff);
 
 	return 0;
@@ -604,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * factor and mss.
 	 */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb);
+		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
 
 	return 0;
 }
@@ -662,7 +625,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = tp->mss_cache_std = mss_now;
+	tp->mss_cache = mss_now;
 
 	return mss_now;
 }
@@ -674,57 +637,318 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	unsigned int do_large, mss_now;
+	u32 mss_now;
+	u16 xmit_size_goal;
+	int doing_tso = 0;
+
+	mss_now = tp->mss_cache;
+
+	if (large_allowed &&
+	    (sk->sk_route_caps & NETIF_F_TSO) &&
+	    !tp->urg_mode)
+		doing_tso = 1;
 
-	mss_now = tp->mss_cache_std;
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
 		if (mtu != tp->pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
-	do_large = (large &&
-		    (sk->sk_route_caps & NETIF_F_TSO) &&
-		    !tp->urg_mode);
+	if (tp->rx_opt.eff_sacks)
+		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
-	if (do_large) {
-		unsigned int large_mss, factor, limit;
+	xmit_size_goal = mss_now;
 
-		large_mss = 65535 - tp->af_specific->net_header_len -
+	if (doing_tso) {
+		xmit_size_goal = 65535 -
+			tp->af_specific->net_header_len -
 			tp->ext_header_len - tp->tcp_header_len;
 
-		if (tp->max_window && large_mss > (tp->max_window>>1))
-			large_mss = max((tp->max_window>>1),
-					68U - tp->tcp_header_len);
+		if (tp->max_window &&
+		    (xmit_size_goal > (tp->max_window >> 1)))
+			xmit_size_goal = max((tp->max_window >> 1),
+					     68U - tp->tcp_header_len);
 
-		factor = large_mss / mss_now;
+		xmit_size_goal -= (xmit_size_goal % mss_now);
+	}
+	tp->xmit_size_goal = xmit_size_goal;
 
-		/* Always keep large mss multiple of real mss, but
-		 * do not exceed 1/tso_win_divisor of the congestion window
-		 * so we can keep the ACK clock ticking and minimize
-		 * bursting.
-		 */
-		limit = tp->snd_cwnd;
-		if (sysctl_tcp_tso_win_divisor)
-			limit /= sysctl_tcp_tso_win_divisor;
-		limit = max(1U, limit);
-		if (factor > limit)
-			factor = limit;
+	return mss_now;
+}
 
-		tp->mss_cache = mss_now * factor;
+/* Congestion window validation. (RFC2861) */
 
-		mss_now = tp->mss_cache;
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+{
+	__u32 packets_out = tp->packets_out;
+
+	if (packets_out >= tp->snd_cwnd) {
+		/* Network is feed fully. */
+		tp->snd_cwnd_used = 0;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	} else {
+		/* Network starves. */
+		if (tp->packets_out > tp->snd_cwnd_used)
+			tp->snd_cwnd_used = tp->packets_out;
+
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+			tcp_cwnd_application_limited(sk);
 	}
+}
 
-	if (tp->rx_opt.eff_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
-	return mss_now;
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+{
+	u32 window, cwnd_len;
+
+	window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+	cwnd_len = mss_now * cwnd;
+	return min(window, cwnd_len);
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules?  If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 in_flight, cwnd;
+
+	/* Don't be strict about the congestion window for the final FIN.  */
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 1;
+
+	in_flight = tcp_packets_in_flight(tp);
+	cwnd = tp->snd_cwnd;
+	if (in_flight < cwnd)
+		return (cwnd - in_flight);
+
+	return 0;
+}
+
+/* This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+{
+	int tso_segs = tcp_skb_pcount(skb);
+
+	if (!tso_segs ||
+	    (tso_segs > 1 &&
+	     skb_shinfo(skb)->tso_size != mss_now)) {
+		tcp_set_skb_tso_segs(sk, skb, mss_now);
+		tso_segs = tcp_skb_pcount(skb);
+	}
+	return tso_segs;
+}
+
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml,tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+				  const struct sk_buff *skb,
+				  unsigned mss_now, int nonagle)
+{
+	return (skb->len < mss_now &&
+		((nonagle&TCP_NAGLE_CORK) ||
+		 (!nonagle &&
+		  tp->packets_out &&
+		  tcp_minshall_check(tp))));
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	/* Nagle rule does not apply to frames, which sit in the middle of the
+	 * write_queue (they have no chances to get new data).
+	 *
+	 * This is implemented in the callers, where they modify the 'nonagle'
+	 * argument based upon the location of SKB in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return 1;
+
+	/* Don't use the nagle rule for urgent data (or for the final FIN).  */
+	if (tp->urg_mode ||
+	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+		return 1;
+
+	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+		return 1;
+
+	return 0;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+{
+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (skb->len > cur_mss)
+		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+	return !after(end_seq, tp->snd_una + tp->snd_wnd);
+}
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
+ * should be put on the wire right now.  If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int cwnd_quota;
+
+	tcp_init_tso_segs(sk, skb, cur_mss);
+
+	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+		return 0;
+
+	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (cwnd_quota &&
+	    !tcp_snd_wnd_test(tp, skb, cur_mss))
+		cwnd_quota = 0;
+
+	return cwnd_quota;
+}
+
+static inline int tcp_skb_is_last(const struct sock *sk,
+				  const struct sk_buff *skb)
+{
+	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
+}
+
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb = sk->sk_send_head;
+
+	return (skb &&
+		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+			     (tcp_skb_is_last(sk, skb) ?
+			      TCP_NAGLE_PUSH :
+			      tp->nonagle)));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list.  It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation.  In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
+{
+	struct sk_buff *buff;
+	int nlen = skb->len - len;
+	u16 flags;
+
+	/* All of a TSO frame must be composed of paged data.  */
+	BUG_ON(skb->len != skb->data_len);
+
+	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+	if (unlikely(buff == NULL))
+		return -ENOMEM;
+
+	buff->truesize = nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+
+	/* This packet was never sent out yet, so no SACK bits. */
+	TCP_SKB_CB(buff)->sacked = 0;
+
+	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+	skb_split(skb, buff, len);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	__skb_append(skb, buff);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do.  View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 send_win, cong_win, limit, in_flight;
+
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 0;
+
+	if (tp->ca_state != TCP_CA_Open)
+		return 0;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+	       (tp->snd_cwnd <= in_flight));
+
+	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+	/* From in_flight test above, we know that cwnd > in_flight.  */
+	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If sk_send_head can be sent fully now, just do it.  */
+	if (skb->len <= limit)
+		return 0;
+
+	if (sysctl_tcp_tso_win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= sysctl_tcp_tso_win_divisor;
+		if (limit >= chunk)
+			return 0;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK.  Receiver should ACK every other full sized
+		 * frame, so if we have space for more than 3 frames
+		 * then send now.
+		 */
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
+			return 0;
+	}
+
+	/* Ok, it looks like it is advisable to defer.  */
+	return 1;
 }
 
 /* This routine writes packets to the network. It advances the
@@ -734,57 +958,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
  * Returns 1, if no segments are in flight and we have queued segments, but
  * cannot send anything now because of SWS or another problem.
  */
-int tcp_write_xmit(struct sock *sk, int nonagle)
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int mss_now;
+	struct sk_buff *skb;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
 	 * will be happy.
 	 */
-	if (sk->sk_state != TCP_CLOSE) {
-		struct sk_buff *skb;
-		int sent_pkts = 0;
+	if (unlikely(sk->sk_state == TCP_CLOSE))
+		return 0;
 
-		/* Account for SACKS, we may need to fragment due to this.
-		 * It is just like the real MSS changing on us midstream.
-		 * We also handle things correctly when the user adds some
-		 * IP options mid-stream.  Silly to do, but cover it.
-		 */
-		mss_now = tcp_current_mss(sk, 1);
-
-		while ((skb = sk->sk_send_head) &&
-		       tcp_snd_test(sk, skb, mss_now,
-				    tcp_skb_is_last(sk, skb) ? nonagle :
-				    TCP_NAGLE_PUSH)) {
-			if (skb->len > mss_now) {
-				if (tcp_fragment(sk, skb, mss_now))
+	sent_pkts = 0;
+	while ((skb = sk->sk_send_head)) {
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+		BUG_ON(!tso_segs);
+
+		cwnd_quota = tcp_cwnd_test(tp, skb);
+		if (!cwnd_quota)
+			break;
+
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+			break;
+
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (tcp_tso_should_defer(sk, tp, skb))
+				break;
+		}
+
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (tso_fragment(sk, skb, limit, mss_now))
 					break;
 			}
-
-			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tcp_tso_set_push(skb);
-			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
 				break;
+		}
 
-			/* Advance the send_head.  This one is sent out.
-			 * This call will increment packets_out.
-			 */
-			update_send_head(sk, tp, skb);
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+			break;
+
+		/* Advance the send_head.  This one is sent out.
+		 * This call will increment packets_out.
+		 */
+		update_send_head(sk, tp, skb);
+
+		tcp_minshall_update(tp, mss_now, skb);
+		sent_pkts++;
+	}
+
+	if (likely(sent_pkts)) {
+		tcp_cwnd_validate(sk, tp);
+		return 0;
+	}
+	return !tp->packets_out && sk->sk_send_head;
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
+			       unsigned int cur_mss, int nonagle)
+{
+	struct sk_buff *skb = sk->sk_send_head;
 
-			tcp_minshall_update(tp, mss_now, skb);
-			sent_pkts = 1;
+	if (skb) {
+		if (tcp_write_xmit(sk, cur_mss, nonagle))
+			tcp_check_probe_timer(sk, tp);
+	}
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = sk->sk_send_head;
+	unsigned int tso_segs, cwnd_quota;
+
+	BUG_ON(!skb || skb->len < mss_now);
+
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+	if (likely(cwnd_quota)) {
+		BUG_ON(!tso_segs);
+
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (unlikely(tso_fragment(sk, skb, limit, mss_now)))
+					return;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
+				return;
 		}
 
-		if (sent_pkts) {
+		/* Send it out now. */
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+			update_send_head(sk, tp, skb);
 			tcp_cwnd_validate(sk, tp);
-			return 0;
+			return;
 		}
-
-		return !tp->packets_out && sk->sk_send_head;
 	}
-	return 0;
 }
 
 /* This function returns the amount that we can raise the
@@ -1044,7 +1353,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		sk->sk_route_caps &= ~NETIF_F_TSO;
 		sock_set_flag(sk, SOCK_NO_LARGESEND);
-		tp->mss_cache = tp->mss_cache_std;
 	}
 
 	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1064,7 +1372,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		int old_factor = tcp_skb_pcount(skb);
 		int new_factor;
 
-		if (tcp_fragment(sk, skb, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
@@ -1106,7 +1414,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * is still in somebody's hands, else make a clone.
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_tso_set_push(skb);
 
 	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
 				    pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1597,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue. This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, int priority)
+void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -1449,7 +1756,6 @@ static inline void tcp_connect_init(struct sock *sk)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(sk);
-	tcp_ca_init(tp);
 
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1809,6 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
-	tcp_ca_init(tp);
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1670,21 +1975,19 @@ int tcp_write_wakeup(struct sock *sk)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-			if (tcp_fragment(sk, skb, seg_size))
+			if (tcp_fragment(sk, skb, seg_size, mss))
 				return -1;
 			/* SWS override triggered forced fragmentation.
 			 * Disable TSO, the connection is too sick. */
 			if (sk->sk_route_caps & NETIF_F_TSO) {
 				sock_set_flag(sk, SOCK_NO_LARGESEND);
 				sk->sk_route_caps &= ~NETIF_F_TSO;
-				tp->mss_cache = tp->mss_cache_std;
 			}
 		} else if (!tcp_skb_pcount(skb))
-			tcp_set_skb_tso_segs(sk, skb);
+			tcp_set_skb_tso_segs(sk, skb, mss);
 
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
 		err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 		if (!err) {
 			update_send_head(sk, tp, skb);