author    David S. Miller <davem@davemloft.net>    2005-07-05 18:24:38 -0400
committer David S. Miller <davem@davemloft.net>    2005-07-05 18:24:38 -0400
commit    c1b4a7e69576d65efc31a8cea0714173c2841244 (patch)
tree      92082532651cddc6f0649a9d7ca9ca63e381d310 /net/ipv4
parent    0d9901df62fe4820aee86b49f1a074cdb5c6928e (diff)
[TCP]: Move to new TSO segmenting scheme.
Make TSO segment transmit size decisions at send time, not earlier.

The basic scheme is that we try to build as large a TSO frame as possible when pulling in the user data, but the size of the TSO frame output to the card is determined at transmit time.

This is guided by tp->xmit_size_goal. It is always set to a multiple of MSS and tells sendmsg/sendpage how large an SKB to try and build. Later, tcp_write_xmit() and tcp_push_one() chop up the packet if necessary and conditions warrant. These routines can also decide to "defer" in order to wait for more ACKs to arrive and thus allow larger TSO frames to be emitted.

A general observation is that TSO elongates the pipe, thus requiring a larger congestion window and larger buffering, especially at the sender side. Therefore, it is important that applications 1) get a large enough socket send buffer (this is accomplished by our dynamic send buffer expansion code) and 2) do large enough writes.

Signed-off-by: David S. Miller <davem@davemloft.net>
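For orientation, here is a minimal stand-alone sketch of the size-goal idea described above: build each SKB up to a goal that is a whole multiple of the MSS, then at transmit time trim what actually goes out to whatever the send window and congestion window allow, again on an MSS boundary. This is illustrative C, not the kernel code; all names and numbers are made up, and header overheads, the max_window/2 clamp, and SACK option space are ignored.

    /* Illustrative sketch only -- not from the patch. */
    #include <stdio.h>

    /* Pick a per-SKB build goal: the largest multiple of the MSS that fits
     * under the 64KB TSO limit (the real code also subtracts header lengths). */
    static unsigned int pick_size_goal(unsigned int mss)
    {
            unsigned int goal = 65535;

            goal -= goal % mss;
            return goal;
    }

    /* At transmit time, decide how much of a built frame may go out now:
     * no more than the open send window or congestion window allows, and
     * always a whole number of MSS-sized segments. */
    static unsigned int xmit_now(unsigned int skb_len, unsigned int mss,
                                 unsigned int snd_wnd_bytes, unsigned int cwnd_segs)
    {
            unsigned int limit = cwnd_segs * mss;

            if (snd_wnd_bytes < limit)
                    limit = snd_wnd_bytes;
            if (skb_len < limit)
                    limit = skb_len;
            return limit - (limit % mss);
    }

    int main(void)
    {
            unsigned int mss = 1448;
            unsigned int goal = pick_size_goal(mss);

            printf("size goal: %u bytes (%u segments)\n", goal, goal / mss);
            printf("send now:  %u bytes\n", xmit_now(goal, mss, 65535, 10));
            return 0;
    }

In the patch itself the goal lives in tp->xmit_size_goal (recomputed alongside the MSS in tcp_current_mss()), and the transmit-time trimming is done by tcp_write_xmit() and tcp_push_one() via tso_fragment().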
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c          26
-rw-r--r--  net/ipv4/tcp_input.c    10
-rw-r--r--  net/ipv4/tcp_ipv4.c      2
-rw-r--r--  net/ipv4/tcp_output.c  578
4 files changed, 380 insertions, 236 deletions
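The other half of the change is the deferral test, tcp_tso_should_defer() (credited to John Heffner in the patch), added in the tcp_output.c hunk below. As a rough, self-contained restatement of that heuristic -- with illustrative parameter names, the socket/skb plumbing stripped out, and the FIN special case and BUG_ON sanity checks omitted -- it amounts to:

    #include <stdio.h>

    /* Simplified restatement of the deferral heuristic in the patch below.
     * Nonzero means: hold the frame and wait for more ACKs so that a larger
     * TSO frame can be sent in one shot. Parameter names are illustrative. */
    static int tso_should_defer(unsigned int skb_len, unsigned int mss,
                                unsigned int send_win,    /* window left for this skb        */
                                unsigned int cong_win,    /* (cwnd - in_flight) * mss        */
                                unsigned int full_win,    /* min(snd_wnd, cwnd * mss)        */
                                unsigned int win_divisor, /* like sysctl_tcp_tso_win_divisor */
                                unsigned int max_burst)   /* like tcp_max_burst(), in segments */
    {
            unsigned int limit = send_win < cong_win ? send_win : cong_win;

            /* If the whole frame can be sent right now, just do it. */
            if (skb_len <= limit)
                    return 0;

            if (win_divisor) {
                    /* If at least 1/win_divisor of the window is open, use it. */
                    if (limit >= full_win / win_divisor)
                            return 0;
            } else if (limit > max_burst * mss) {
                    /* Never defer past roughly one ACK's worth of data, since
                     * the receiver ACKs every other full-sized segment. */
                    return 0;
            }

            return 1;       /* looks advisable to defer */
    }

    int main(void)
    {
            /* Large frame built, but only two segments of send window open. */
            printf("defer: %d\n",
                   tso_should_defer(65160, 1448, 2896, 14480, 43440, 3, 3));
            return 0;
    }

Note that the patch also lowers the default sysctl_tcp_tso_win_divisor from 8 to 3 (first tcp_output.c hunk below), so by default a TSO frame goes out once roughly a third of the window is open rather than being deferred further.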
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2ba73bf3a8f9..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 if (sk->sk_route_caps & NETIF_F_TSO) 762 if (sk->sk_route_caps & NETIF_F_TSO)
@@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
778 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
779 struct sk_buff *skb; 781 struct sk_buff *skb;
780 int iovlen, flags; 782 int iovlen, flags;
781 int mss_now; 783 int mss_now, size_goal;
782 int err, copied; 784 int err, copied;
783 long timeo; 785 long timeo;
784 786
@@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
798 800
799 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
800 803
801 /* Ok commence sending. */ 804 /* Ok commence sending. */
802 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
819 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
820 823
821 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
822 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
823 826
824new_segment: 827new_segment:
825 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -842,7 +845,7 @@ new_segment:
842 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
843 846
844 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
845 copy = mss_now; 848 copy = size_goal;
846 } 849 }
847 850
848 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2128 2132
2129 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2132 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133 2137
2134 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2178 2182
2179 switch (optname) { 2183 switch (optname) {
2180 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2181 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2182 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2184 break; 2188 break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ef2f355b8b8..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
741 741
742 if (!cwnd) { 742 if (!cwnd) {
743 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
744 cwnd = 2; 744 cwnd = 2;
745 else 745 else
746 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
747 } 747 }
748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
749} 749}
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
914 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
918 } 918 }
919 919
920 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1077 (IsFack(tp) || 1077 (IsFack(tp) ||
1078 !before(lost_retrans, 1078 !before(lost_retrans,
1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1080 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083 1083
@@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk)
3334 struct tcp_sock *tp = tcp_sk(sk); 3334 struct tcp_sock *tp = tcp_sk(sk);
3335 3335
3336 if (tcp_should_expand_sndbuf(sk, tp)) { 3336 if (tcp_should_expand_sndbuf(sk, tp)) {
3337 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + 3337 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3338 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3338 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3339 demanded = max_t(unsigned int, tp->snd_cwnd, 3339 demanded = max_t(unsigned int, tp->snd_cwnd,
3340 tp->reordering + 1); 3340 tp->reordering + 1);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 2051 tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0a4cd24b6578..fd3ce38184ae 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
403 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
404} 404}
405 405
406static inline void tcp_tso_set_push(struct sk_buff *skb)
407{
408 /* Force push to be on for any TSO frames to workaround
409 * problems with busted implementations like Mac OS-X that
410 * hold off socket receive wakeups until push is seen.
411 */
412 if (tcp_skb_pcount(skb) > 1)
413 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
414}
415
416static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
417{ 407{
418 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
419 409
420 if (skb->len <= tp->mss_cache_std || 410 if (skb->len <= tp->mss_cache ||
421 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
422 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
423 * non-TSO case. 413 * non-TSO case.
@@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
427 } else { 417 } else {
428 unsigned int factor; 418 unsigned int factor;
429 419
430 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
431 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
432 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
433 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
434 }
435}
436
437/* Does SKB fit into the send window? */
438static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
439{
440 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
441
442 return !after(end_seq, tp->snd_una + tp->snd_wnd);
443}
444
445/* Can at least one segment of SKB be sent right now, according to the
446 * congestion window rules? If so, return how many segments are allowed.
447 */
448static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
449{
450 u32 in_flight, cwnd;
451
452 /* Don't be strict about the congestion window for the final FIN. */
453 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
454 return 1;
455
456 in_flight = tcp_packets_in_flight(tp);
457 cwnd = tp->snd_cwnd;
458 if (in_flight < cwnd)
459 return (cwnd - in_flight);
460
461 return 0;
462}
463
464static inline int tcp_minshall_check(const struct tcp_sock *tp)
465{
466 return after(tp->snd_sml,tp->snd_una) &&
467 !after(tp->snd_sml, tp->snd_nxt);
468}
469
470/* Return 0, if packet can be sent now without violation Nagle's rules:
471 * 1. It is full sized.
472 * 2. Or it contains FIN. (already checked by caller)
473 * 3. Or TCP_NODELAY was set.
474 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
475 * With Minshall's modification: all sent small packets are ACKed.
476 */
477
478static inline int tcp_nagle_check(const struct tcp_sock *tp,
479 const struct sk_buff *skb,
480 unsigned mss_now, int nonagle)
481{
482 return (skb->len < mss_now &&
483 ((nonagle&TCP_NAGLE_CORK) ||
484 (!nonagle &&
485 tp->packets_out &&
486 tcp_minshall_check(tp))));
487}
488
489/* Return non-zero if the Nagle test allows this packet to be
490 * sent now.
491 */
492static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
493 unsigned int cur_mss, int nonagle)
494{
495 /* Nagle rule does not apply to frames, which sit in the middle of the
496 * write_queue (they have no chances to get new data).
497 *
498 * This is implemented in the callers, where they modify the 'nonagle'
499 * argument based upon the location of SKB in the send queue.
500 */
501 if (nonagle & TCP_NAGLE_PUSH)
502 return 1;
503
504 /* Don't use the nagle rule for urgent data (or for the final FIN). */
505 if (tp->urg_mode ||
506 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
507 return 1;
508
509 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
510 return 1;
511
512 return 0;
513}
514
515/* This must be invoked the first time we consider transmitting
516 * SKB onto the wire.
517 */
518static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
519{
520 int tso_segs = tcp_skb_pcount(skb);
521
522 if (!tso_segs) {
523 tcp_set_skb_tso_segs(sk, skb);
524 tso_segs = tcp_skb_pcount(skb);
525 }
526 return tso_segs;
527}
528
529/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
530 * should be put on the wire right now. If so, it returns the number of
531 * packets allowed by the congestion window.
532 */
533static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
534 unsigned int cur_mss, int nonagle)
535{
536 struct tcp_sock *tp = tcp_sk(sk);
537 unsigned int cwnd_quota;
538
539 tcp_init_tso_segs(sk, skb);
540
541 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
542 return 0;
543
544 cwnd_quota = tcp_cwnd_test(tp, skb);
545 if (cwnd_quota &&
546 !tcp_snd_wnd_test(tp, skb, cur_mss))
547 cwnd_quota = 0;
548
549 return cwnd_quota;
550}
551
552static inline int tcp_skb_is_last(const struct sock *sk,
553 const struct sk_buff *skb)
554{
555 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
556}
557
558int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
559{
560 struct sk_buff *skb = sk->sk_send_head;
561
562 return (skb &&
563 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
564 (tcp_skb_is_last(sk, skb) ?
565 TCP_NAGLE_PUSH :
566 tp->nonagle)));
567}
568
569
570/* Send _single_ skb sitting at the send head. This function requires
571 * true push pending frames to setup probe timer etc.
572 */
573void tcp_push_one(struct sock *sk, unsigned cur_mss)
574{
575 struct tcp_sock *tp = tcp_sk(sk);
576 struct sk_buff *skb = sk->sk_send_head;
577
578 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
579 /* Send it out now. */
580 TCP_SKB_CB(skb)->when = tcp_time_stamp;
581 tcp_tso_set_push(skb);
582 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
583 sk->sk_send_head = NULL;
584 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
585 tcp_packets_out_inc(sk, tp, skb);
586 return;
587 }
588 } 424 }
589} 425}
590 426
@@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
791 627
792 /* And store cached results */ 628 /* And store cached results */
793 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
794 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
795 631
796 return mss_now; 632 return mss_now;
797} 633}
@@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
803 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
804 * is not a big flaw. 640 * is not a big flaw.
805 */ 641 */
806 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
807unsigned int tcp_current_mss(struct sock *sk, int large)
808{ 643{
809 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
810 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
811 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
812 656
813 mss_now = tp->mss_cache_std;
814 if (dst) { 657 if (dst) {
815 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
816 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
817 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
818 } 661 }
819 662
820 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
821 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
822 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
823 666
824 if (do_large) { 667 xmit_size_goal = mss_now;
825 unsigned int large_mss, factor, limit;
826 668
827 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
828 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
829 673
830 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
831 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
832 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
833 677 68U - tp->tcp_header_len);
834 factor = large_mss / mss_now;
835 678
836 /* Always keep large mss multiple of real mss, but 679 xmit_size_goal -= (xmit_size_goal % mss_now);
837 * do not exceed 1/tso_win_divisor of the congestion window
838 * so we can keep the ACK clock ticking and minimize
839 * bursting.
840 */
841 limit = tp->snd_cwnd;
842 if (sysctl_tcp_tso_win_divisor)
843 limit /= sysctl_tcp_tso_win_divisor;
844 limit = max(1U, limit);
845 if (factor > limit)
846 factor = limit;
847
848 tp->mss_cache = mss_now * factor;
849
850 mss_now = tp->mss_cache;
851 } 680 }
681 tp->xmit_size_goal = xmit_size_goal;
852 682
853 if (tp->rx_opt.eff_sacks)
854 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
855 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
856 return mss_now; 683 return mss_now;
857} 684}
858 685
@@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
876 } 703 }
877} 704}
878 705
706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
707{
708 u32 window, cwnd_len;
709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violation Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames, which sit in the middle of the
780 * write_queue (they have no chances to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 in_flight = tcp_packets_in_flight(tp);
913
914 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
915 (tp->snd_cwnd <= in_flight));
916
917 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
918
919 /* From in_flight test above, we know that cwnd > in_flight. */
920 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
921
922 limit = min(send_win, cong_win);
923
924 /* If sk_send_head can be sent fully now, just do it. */
925 if (skb->len <= limit)
926 return 0;
927
928 if (sysctl_tcp_tso_win_divisor) {
929 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
930
931 /* If at least some fraction of a window is available,
932 * just use it.
933 */
934 chunk /= sysctl_tcp_tso_win_divisor;
935 if (limit >= chunk)
936 return 0;
937 } else {
938 /* Different approach, try not to defer past a single
939 * ACK. Receiver should ACK every other full sized
940 * frame, so if we have space for more than 3 frames
941 * then send now.
942 */
943 if (limit > tcp_max_burst(tp) * tp->mss_cache)
944 return 0;
945 }
946
947 /* Ok, it looks like it is advisable to defer. */
948 return 1;
949}
950
879/* This routine writes packets to the network. It advances the 951/* This routine writes packets to the network. It advances the
880 * send_head. This happens as incoming acks open up the remote 952 * send_head. This happens as incoming acks open up the remote
881 * window for us. 953 * window for us.
@@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
887{ 959{
888 struct tcp_sock *tp = tcp_sk(sk); 960 struct tcp_sock *tp = tcp_sk(sk);
889 struct sk_buff *skb; 961 struct sk_buff *skb;
890 unsigned int tso_segs, cwnd_quota; 962 unsigned int tso_segs, sent_pkts;
891 int sent_pkts; 963 int cwnd_quota;
892 964
893 /* If we are closed, the bytes will have to remain here. 965 /* If we are closed, the bytes will have to remain here.
894 * In time closedown will finish, we empty the write queue and all 966 * In time closedown will finish, we empty the write queue and all
@@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
903 975
904 tso_segs = tcp_init_tso_segs(sk, skb); 976 tso_segs = tcp_init_tso_segs(sk, skb);
905 cwnd_quota = tcp_cwnd_test(tp, skb); 977 cwnd_quota = tcp_cwnd_test(tp, skb);
978 if (unlikely(!cwnd_quota))
979 goto out;
980
906 sent_pkts = 0; 981 sent_pkts = 0;
982 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
983 BUG_ON(!tso_segs);
907 984
908 while (cwnd_quota >= tso_segs) { 985 if (tso_segs == 1) {
909 if (unlikely(!tcp_nagle_test(tp, skb, mss_now, 986 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
910 (tcp_skb_is_last(sk, skb) ? 987 (tcp_skb_is_last(sk, skb) ?
911 nonagle : TCP_NAGLE_PUSH)))) 988 nonagle : TCP_NAGLE_PUSH))))
912 break; 989 break;
990 } else {
991 if (tcp_tso_should_defer(sk, tp, skb))
992 break;
993 }
913 994
914 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) 995 if (tso_segs > 1) {
915 break; 996 u32 limit = tcp_window_allows(tp, skb,
997 mss_now, cwnd_quota);
998
999 if (skb->len < limit) {
1000 unsigned int trim = skb->len % mss_now;
916 1001
917 if (unlikely(skb->len > mss_now)) { 1002 if (trim)
1003 limit = skb->len - trim;
1004 }
1005 if (skb->len > limit) {
1006 if (tso_fragment(sk, skb, limit))
1007 break;
1008 }
1009 } else if (unlikely(skb->len > mss_now)) {
918 if (unlikely(tcp_fragment(sk, skb, mss_now))) 1010 if (unlikely(tcp_fragment(sk, skb, mss_now)))
919 break; 1011 break;
920 } 1012 }
921 1013
922 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1014 TCP_SKB_CB(skb)->when = tcp_time_stamp;
923 tcp_tso_set_push(skb); 1015
924 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) 1016 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
925 break; 1017 break;
926 1018
@@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
936 * the packet above, tso_segs will no longer be valid. 1028 * the packet above, tso_segs will no longer be valid.
937 */ 1029 */
938 cwnd_quota -= tcp_skb_pcount(skb); 1030 cwnd_quota -= tcp_skb_pcount(skb);
1031
1032 BUG_ON(cwnd_quota < 0);
1033 if (!cwnd_quota)
1034 break;
1035
939 skb = sk->sk_send_head; 1036 skb = sk->sk_send_head;
940 if (!skb) 1037 if (!skb)
941 break; 1038 break;
@@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
946 tcp_cwnd_validate(sk, tp); 1043 tcp_cwnd_validate(sk, tp);
947 return 0; 1044 return 0;
948 } 1045 }
949 1046out:
950 return !tp->packets_out && sk->sk_send_head; 1047 return !tp->packets_out && sk->sk_send_head;
951} 1048}
952 1049
@@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
965 } 1062 }
966} 1063}
967 1064
1065/* Send _single_ skb sitting at the send head. This function requires
1066 * true push pending frames to setup probe timer etc.
1067 */
1068void tcp_push_one(struct sock *sk, unsigned int mss_now)
1069{
1070 struct tcp_sock *tp = tcp_sk(sk);
1071 struct sk_buff *skb = sk->sk_send_head;
1072 unsigned int tso_segs, cwnd_quota;
1073
1074 BUG_ON(!skb || skb->len < mss_now);
1075
1076 tso_segs = tcp_init_tso_segs(sk, skb);
1077 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1078
1079 if (likely(cwnd_quota)) {
1080 BUG_ON(!tso_segs);
1081
1082 if (tso_segs > 1) {
1083 u32 limit = tcp_window_allows(tp, skb,
1084 mss_now, cwnd_quota);
1085
1086 if (skb->len < limit) {
1087 unsigned int trim = skb->len % mss_now;
1088
1089 if (trim)
1090 limit = skb->len - trim;
1091 }
1092 if (skb->len > limit) {
1093 if (unlikely(tso_fragment(sk, skb, limit)))
1094 return;
1095 }
1096 } else if (unlikely(skb->len > mss_now)) {
1097 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1098 return;
1099 }
1100
1101 /* Send it out now. */
1102 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1103
1104 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1105 update_send_head(sk, tp, skb);
1106 tcp_cwnd_validate(sk, tp);
1107 return;
1108 }
1109 }
1110}
1111
968/* This function returns the amount that we can raise the 1112/* This function returns the amount that we can raise the
969 * usable window based on the following constraints 1113 * usable window based on the following constraints
970 * 1114 *
@@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1222 if (sk->sk_route_caps & NETIF_F_TSO) { 1366 if (sk->sk_route_caps & NETIF_F_TSO) {
1223 sk->sk_route_caps &= ~NETIF_F_TSO; 1367 sk->sk_route_caps &= ~NETIF_F_TSO;
1224 sock_set_flag(sk, SOCK_NO_LARGESEND); 1368 sock_set_flag(sk, SOCK_NO_LARGESEND);
1225 tp->mss_cache = tp->mss_cache_std;
1226 } 1369 }
1227 1370
1228 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1371 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1284 * is still in somebody's hands, else make a clone. 1427 * is still in somebody's hands, else make a clone.
1285 */ 1428 */
1286 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1429 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1287 tcp_tso_set_push(skb);
1288 1430
1289 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1431 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1290 pskb_copy(skb, GFP_ATOMIC): 1432 pskb_copy(skb, GFP_ATOMIC):
@@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk)
1853 if (sk->sk_route_caps & NETIF_F_TSO) { 1995 if (sk->sk_route_caps & NETIF_F_TSO) {
1854 sock_set_flag(sk, SOCK_NO_LARGESEND); 1996 sock_set_flag(sk, SOCK_NO_LARGESEND);
1855 sk->sk_route_caps &= ~NETIF_F_TSO; 1997 sk->sk_route_caps &= ~NETIF_F_TSO;
1856 tp->mss_cache = tp->mss_cache_std;
1857 } 1998 }
1858 } else if (!tcp_skb_pcount(skb)) 1999 } else if (!tcp_skb_pcount(skb))
1859 tcp_set_skb_tso_segs(sk, skb); 2000 tcp_set_skb_tso_segs(sk, skb);
1860 2001
1861 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2002 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1862 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2003 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1863 tcp_tso_set_push(skb);
1864 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2004 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1865 if (!err) { 2005 if (!err) {
1866 update_send_head(sk, tp, skb); 2006 update_send_head(sk, tp, skb);