Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c        |  26
-rw-r--r--  net/ipv4/tcp_input.c  |  10
-rw-r--r--  net/ipv4/tcp_ipv4.c   |   2
-rw-r--r--  net/ipv4/tcp_output.c | 578
4 files changed, 380 insertions, 236 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2ba73bf3a8f9..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 			 size_t psize, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now;
+	int mss_now, size_goal;
 	int err;
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 	copied = 0;
 
 	err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 
-		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
 				goto wait_for_memory;
 
 			skb_entail(sk, tp, skb);
-			copy = mss_now;
+			copy = size_goal;
 		}
 
 		if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
 		if (!(psize -= copy))
 			goto out;
 
-		if (skb->len != mss_now || (flags & MSG_OOB))
+		if (skb->len < mss_now || (flags & MSG_OOB))
 			continue;
 
 		if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
 			goto do_error;
 
 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+		size_goal = tp->xmit_size_goal;
 	}
 
 out:
@@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 
 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 {
-	int tmp = tp->mss_cache_std;
+	int tmp = tp->mss_cache;
 
 	if (sk->sk_route_caps & NETIF_F_SG) {
 		if (sk->sk_route_caps & NETIF_F_TSO)
@@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags;
-	int mss_now;
+	int mss_now, size_goal;
 	int err, copied;
 	long timeo;
 
@@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 
 	/* Ok commence sending. */
 	iovlen = msg->msg_iovlen;
@@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			skb = sk->sk_write_queue.prev;
 
 			if (!sk->sk_send_head ||
-			    (copy = mss_now - skb->len) <= 0) {
+			    (copy = size_goal - skb->len) <= 0) {
 
 new_segment:
 				/* Allocate new segment. If the interface is SG,
@@ -842,7 +845,7 @@ new_segment:
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
-				copy = mss_now;
+				copy = size_goal;
 			}
 
 			/* Try to append data to the end of skb. */
@@ -937,7 +940,7 @@ new_segment:
 			if ((seglen -= copy) == 0 && iovlen == 0)
 				goto out;
 
-			if (skb->len != mss_now || (flags & MSG_OOB))
+			if (skb->len < mss_now || (flags & MSG_OOB))
 				continue;
 
 			if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
 				goto do_error;
 
 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+			size_goal = tp->xmit_size_goal;
 		}
 	}
 
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
-	info->tcpi_snd_mss = tp->mss_cache_std;
+	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
 
 	info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	switch (optname) {
 	case TCP_MAXSEG:
-		val = tp->mss_cache_std;
+		val = tp->mss_cache;
 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
 			val = tp->rx_opt.user_mss;
 		break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ef2f355b8b8..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
 	if (!cwnd) {
-		if (tp->mss_cache_std > 1460)
+		if (tp->mss_cache > 1460)
 			cwnd = 2;
 		else
-			cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
 	}
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		sk->sk_route_caps &= ~NETIF_F_TSO;
 		sock_set_flag(sk, SOCK_NO_LARGESEND);
-		tp->mss_cache = tp->mss_cache_std;
+		tp->mss_cache = tp->mss_cache;
 	}
 
 	if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		    (IsFack(tp) ||
 		     !before(lost_retrans,
 			     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
-			     tp->mss_cache_std))) {
+			     tp->mss_cache))) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
 
@@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk, tp)) {
-		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
 			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
 		    demanded = max_t(unsigned int, tp->snd_cwnd,
 				     tp->reordering + 1);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache_std = tp->mss_cache = 536;
+	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
 	tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0a4cd24b6578..fd3ce38184ae 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
  * will allow a single TSO frame to consume. Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
 
 static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
 				    struct sk_buff *skb)
@@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 	sk->sk_send_head = skb;
 }
 
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
-	/* Force push to be on for any TSO frames to workaround
-	 * problems with busted implementations like Mac OS-X that
-	 * hold off socket receive wakeups until push is seen.
-	 */
-	if (tcp_skb_pcount(skb) > 1)
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (skb->len <= tp->mss_cache_std ||
+	if (skb->len <= tp->mss_cache ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache_std - 1);
-		factor /= tp->mss_cache_std;
+		factor = skb->len + (tp->mss_cache - 1);
+		factor /= tp->mss_cache;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache_std;
-	}
-}
-
-/* Does SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
-{
-	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
-
-	return !after(end_seq, tp->snd_una + tp->snd_wnd);
-}
-
-/* Can at least one segment of SKB be sent right now, according to the
- * congestion window rules? If so, return how many segments are allowed.
- */
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
-{
-	u32 in_flight, cwnd;
-
-	/* Don't be strict about the congestion window for the final FIN. */
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
-		return 1;
-
-	in_flight = tcp_packets_in_flight(tp);
-	cwnd = tp->snd_cwnd;
-	if (in_flight < cwnd)
-		return (cwnd - in_flight);
-
-	return 0;
-}
-
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
-{
-	return after(tp->snd_sml,tp->snd_una) &&
-		!after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return 0, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_NODELAY was set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
-				  const struct sk_buff *skb,
-				  unsigned mss_now, int nonagle)
-{
-	return (skb->len < mss_now &&
-		((nonagle&TCP_NAGLE_CORK) ||
-		 (!nonagle &&
-		  tp->packets_out &&
-		  tcp_minshall_check(tp))));
-}
-
-/* Return non-zero if the Nagle test allows this packet to be
- * sent now.
- */
-static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
-				 unsigned int cur_mss, int nonagle)
-{
-	/* Nagle rule does not apply to frames, which sit in the middle of the
-	 * write_queue (they have no chances to get new data).
-	 *
-	 * This is implemented in the callers, where they modify the 'nonagle'
-	 * argument based upon the location of SKB in the send queue.
-	 */
-	if (nonagle & TCP_NAGLE_PUSH)
-		return 1;
-
-	/* Don't use the nagle rule for urgent data (or for the final FIN). */
-	if (tp->urg_mode ||
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
-		return 1;
-
-	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
-		return 1;
-
-	return 0;
-}
-
-/* This must be invoked the first time we consider transmitting
- * SKB onto the wire.
- */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
-	int tso_segs = tcp_skb_pcount(skb);
-
-	if (!tso_segs) {
-		tcp_set_skb_tso_segs(sk, skb);
-		tso_segs = tcp_skb_pcount(skb);
-	}
-	return tso_segs;
-}
-
-/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
- * should be put on the wire right now. If so, it returns the number of
- * packets allowed by the congestion window.
- */
-static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
-				 unsigned int cur_mss, int nonagle)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int cwnd_quota;
-
-	tcp_init_tso_segs(sk, skb);
-
-	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
-		return 0;
-
-	cwnd_quota = tcp_cwnd_test(tp, skb);
-	if (cwnd_quota &&
-	    !tcp_snd_wnd_test(tp, skb, cur_mss))
-		cwnd_quota = 0;
-
-	return cwnd_quota;
-}
-
-static inline int tcp_skb_is_last(const struct sock *sk,
-				  const struct sk_buff *skb)
-{
-	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
-}
-
-int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
-{
-	struct sk_buff *skb = sk->sk_send_head;
-
-	return (skb &&
-		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
-			     (tcp_skb_is_last(sk, skb) ?
-			      TCP_NAGLE_PUSH :
-			      tp->nonagle)));
-}
-
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = sk->sk_send_head;
-
-	if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
-		/* Send it out now. */
-		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
-		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
-			sk->sk_send_head = NULL;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tcp_packets_out_inc(sk, tp, skb);
-			return;
-		}
-	}
-}
+		skb_shinfo(skb)->tso_size = tp->mss_cache;
 	}
 }
 
@@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = tp->mss_cache_std = mss_now;
+	tp->mss_cache = mss_now;
 
 	return mss_now;
 }
@@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	unsigned int do_large, mss_now;
+	u32 mss_now;
+	u16 xmit_size_goal;
+	int doing_tso = 0;
+
+	mss_now = tp->mss_cache;
+
+	if (large_allowed &&
+	    (sk->sk_route_caps & NETIF_F_TSO) &&
+	    !tp->urg_mode)
+		doing_tso = 1;
 
-	mss_now = tp->mss_cache_std;
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
 		if (mtu != tp->pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
-	do_large = (large &&
-		    (sk->sk_route_caps & NETIF_F_TSO) &&
-		    !tp->urg_mode);
+	if (tp->rx_opt.eff_sacks)
+		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
-	if (do_large) {
-		unsigned int large_mss, factor, limit;
+	xmit_size_goal = mss_now;
 
-		large_mss = 65535 - tp->af_specific->net_header_len -
+	if (doing_tso) {
+		xmit_size_goal = 65535 -
+			tp->af_specific->net_header_len -
 			tp->ext_header_len - tp->tcp_header_len;
 
-		if (tp->max_window && large_mss > (tp->max_window>>1))
-			large_mss = max((tp->max_window>>1),
-					68U - tp->tcp_header_len);
-
-		factor = large_mss / mss_now;
+		if (tp->max_window &&
+		    (xmit_size_goal > (tp->max_window >> 1)))
+			xmit_size_goal = max((tp->max_window >> 1),
+					     68U - tp->tcp_header_len);
 
-		/* Always keep large mss multiple of real mss, but
-		 * do not exceed 1/tso_win_divisor of the congestion window
-		 * so we can keep the ACK clock ticking and minimize
-		 * bursting.
-		 */
-		limit = tp->snd_cwnd;
-		if (sysctl_tcp_tso_win_divisor)
-			limit /= sysctl_tcp_tso_win_divisor;
-		limit = max(1U, limit);
-		if (factor > limit)
-			factor = limit;
-
-		tp->mss_cache = mss_now * factor;
-
-		mss_now = tp->mss_cache;
+		xmit_size_goal -= (xmit_size_goal % mss_now);
 	}
+	tp->xmit_size_goal = xmit_size_goal;
 
-	if (tp->rx_opt.eff_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 	return mss_now;
 }
 
@@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 	}
 }
 
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+{
+	u32 window, cwnd_len;
+
+	window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+	cwnd_len = mss_now * cwnd;
+	return min(window, cwnd_len);
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 in_flight, cwnd;
+
+	/* Don't be strict about the congestion window for the final FIN. */
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 1;
+
+	in_flight = tcp_packets_in_flight(tp);
+	cwnd = tp->snd_cwnd;
+	if (in_flight < cwnd)
+		return (cwnd - in_flight);
+
+	return 0;
+}
+
+/* This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+{
+	int tso_segs = tcp_skb_pcount(skb);
+
+	if (!tso_segs) {
+		tcp_set_skb_tso_segs(sk, skb);
+		tso_segs = tcp_skb_pcount(skb);
+	}
+	return tso_segs;
+}
+
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml,tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+				  const struct sk_buff *skb,
+				  unsigned mss_now, int nonagle)
+{
+	return (skb->len < mss_now &&
+		((nonagle&TCP_NAGLE_CORK) ||
+		 (!nonagle &&
+		  tp->packets_out &&
+		  tcp_minshall_check(tp))));
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	/* Nagle rule does not apply to frames, which sit in the middle of the
+	 * write_queue (they have no chances to get new data).
+	 *
+	 * This is implemented in the callers, where they modify the 'nonagle'
+	 * argument based upon the location of SKB in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return 1;
+
+	/* Don't use the nagle rule for urgent data (or for the final FIN). */
+	if (tp->urg_mode ||
+	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+		return 1;
+
+	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+		return 1;
+
+	return 0;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+{
+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (skb->len > cur_mss)
+		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+	return !after(end_seq, tp->snd_una + tp->snd_wnd);
+}
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
+ * should be put on the wire right now. If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int cwnd_quota;
+
+	tcp_init_tso_segs(sk, skb);
+
+	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+		return 0;
+
+	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (cwnd_quota &&
+	    !tcp_snd_wnd_test(tp, skb, cur_mss))
+		cwnd_quota = 0;
+
+	return cwnd_quota;
+}
+
+static inline int tcp_skb_is_last(const struct sock *sk,
+				  const struct sk_buff *skb)
+{
+	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
+}
+
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb = sk->sk_send_head;
+
+	return (skb &&
+		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+			     (tcp_skb_is_last(sk, skb) ?
+			      TCP_NAGLE_PUSH :
+			      tp->nonagle)));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list. It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation. In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+{
+	struct sk_buff *buff;
+	int nlen = skb->len - len;
+	u16 flags;
+
+	/* All of a TSO frame must be composed of paged data. */
+	BUG_ON(skb->len != skb->data_len);
+
+	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+	if (unlikely(buff == NULL))
+		return -ENOMEM;
+
+	buff->truesize = nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+
+	/* This packet was never sent out yet, so no SACK bits. */
+	TCP_SKB_CB(buff)->sacked = 0;
+
+	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+	skb_split(skb, buff, len);
+
+	/* Fix up tso_factor for both original and new SKB. */
+	tcp_set_skb_tso_segs(sk, skb);
+	tcp_set_skb_tso_segs(sk, buff);
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	__skb_append(skb, buff);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do. View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 send_win, cong_win, limit, in_flight;
+
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 0;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+	       (tp->snd_cwnd <= in_flight));
+
+	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+	/* From in_flight test above, we know that cwnd > in_flight. */
+	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If sk_send_head can be sent fully now, just do it. */
+	if (skb->len <= limit)
+		return 0;
+
+	if (sysctl_tcp_tso_win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= sysctl_tcp_tso_win_divisor;
+		if (limit >= chunk)
+			return 0;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK. Receiver should ACK every other full sized
+		 * frame, so if we have space for more than 3 frames
+		 * then send now.
+		 */
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
+			return 0;
+	}
+
+	/* Ok, it looks like it is advisable to defer. */
+	return 1;
+}
+
 /* This routine writes packets to the network. It advances the
  * send_head. This happens as incoming acks open up the remote
  * window for us.
@@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, cwnd_quota;
-	int sent_pkts;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
 	tso_segs = tcp_init_tso_segs(sk, skb);
 	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (unlikely(!cwnd_quota))
+		goto out;
+
 	sent_pkts = 0;
+	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+		BUG_ON(!tso_segs);
 
-	while (cwnd_quota >= tso_segs) {
-		if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
-					     (tcp_skb_is_last(sk, skb) ?
-					      nonagle : TCP_NAGLE_PUSH))))
-			break;
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (tcp_tso_should_defer(sk, tp, skb))
+				break;
+		}
 
-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
-			break;
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
 
-		if (unlikely(skb->len > mss_now)) {
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (tso_fragment(sk, skb, limit))
+					break;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
 			if (unlikely(tcp_fragment(sk, skb, mss_now)))
 				break;
 		}
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
+
 		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
 			break;
 
@@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		 * the packet above, tso_segs will no longer be valid.
 		 */
 		cwnd_quota -= tcp_skb_pcount(skb);
+
+		BUG_ON(cwnd_quota < 0);
+		if (!cwnd_quota)
+			break;
+
 		skb = sk->sk_send_head;
 		if (!skb)
 			break;
@@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		tcp_cwnd_validate(sk, tp);
 		return 0;
 	}
-
+out:
 	return !tp->packets_out && sk->sk_send_head;
 }
 
@@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
 	}
 }
 
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = sk->sk_send_head;
+	unsigned int tso_segs, cwnd_quota;
+
+	BUG_ON(!skb || skb->len < mss_now);
+
+	tso_segs = tcp_init_tso_segs(sk, skb);
+	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+	if (likely(cwnd_quota)) {
+		BUG_ON(!tso_segs);
+
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (unlikely(tso_fragment(sk, skb, limit)))
+					return;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+				return;
+		}
+
+		/* Send it out now. */
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+			update_send_head(sk, tp, skb);
+			tcp_cwnd_validate(sk, tp);
+			return;
+		}
+	}
+}
+
 /* This function returns the amount that we can raise the
  * usable window based on the following constraints
  *
@@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (sk->sk_route_caps & NETIF_F_TSO) {
 			sk->sk_route_caps &= ~NETIF_F_TSO;
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
-			tp->mss_cache = tp->mss_cache_std;
 		}
 
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * is still in somebody's hands, else make a clone.
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_tso_set_push(skb);
 
 	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
 				    pskb_copy(skb, GFP_ATOMIC):
@@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk)
 			if (sk->sk_route_caps & NETIF_F_TSO) {
 				sock_set_flag(sk, SOCK_NO_LARGESEND);
 				sk->sk_route_caps &= ~NETIF_F_TSO;
-				tp->mss_cache = tp->mss_cache_std;
 			}
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb);
 
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
 		err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 		if (!err) {
 			update_send_head(sk, tp, skb);