Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  189
1 file changed, 120 insertions(+), 69 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a7dd70..17a11e65e57f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         skb->ip_summed = CHECKSUM_PARTIAL;
         skb->csum = 0;
 
         TCP_SKB_CB(skb)->tcp_flags = flags;
         TCP_SKB_CB(skb)->sacked = 0;
 
-        skb_shinfo(skb)->gso_segs = 1;
-        skb_shinfo(skb)->gso_size = 0;
-        skb_shinfo(skb)->gso_type = 0;
+        shinfo->gso_segs = 1;
+        shinfo->gso_size = 0;
+        shinfo->gso_type = 0;
 
         TCP_SKB_CB(skb)->seq = seq;
         if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -406,7 +408,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
  * the ordering which we have been using if we want to keep working with
  * those broken things (not that it currently hurts anybody as there isn't
  * particular reason why the ordering would need to be changed).
@@ -679,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -696,12 +698,13 @@ static void tcp_tsq_handler(struct sock *sk)
         if ((1 << sk->sk_state) &
             (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
              TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-                tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+                tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+                               0, GFP_ATOMIC);
 }
 /*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -764,6 +767,17 @@ void tcp_release_cb(struct sock *sk)
         if (flags & (1UL << TCP_TSQ_DEFERRED))
                 tcp_tsq_handler(sk);
 
+        /* Here begins the tricky part :
+         * We are called from release_sock() with :
+         * 1) BH disabled
+         * 2) sk_lock.slock spinlock held
+         * 3) socket owned by us (sk->sk_lock.owned == 1)
+         *
+         * But following code is meant to be called from BH handlers,
+         * so we should keep BH disabled, but early release socket ownership
+         */
+        sock_release_ownership(sk);
+
         if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
                 tcp_write_timer_handler(sk);
                 __sock_put(sk);
@@ -795,7 +809,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -861,8 +875,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
         if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
                      fclone->fclone == SKB_FCLONE_CLONE))
-                NET_INC_STATS_BH(sock_net(sk),
-                                 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                NET_INC_STATS(sock_net(sk),
+                              LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 
         if (unlikely(skb_cloned(skb)))
                 skb = pskb_copy(skb, gfp_mask);
@@ -986,6 +1000,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                                  unsigned int mss_now)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         /* Make sure we own this skb before messing gso_size/gso_segs */
         WARN_ON_ONCE(skb_cloned(skb));
 
@@ -993,13 +1009,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                 /* Avoid the costly divide in the normal
                  * non-TSO case.
                  */
-                skb_shinfo(skb)->gso_segs = 1;
-                skb_shinfo(skb)->gso_size = 0;
-                skb_shinfo(skb)->gso_type = 0;
+                shinfo->gso_segs = 1;
+                shinfo->gso_size = 0;
+                shinfo->gso_type = 0;
         } else {
-                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-                skb_shinfo(skb)->gso_size = mss_now;
-                skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+                shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+                shinfo->gso_size = mss_now;
+                shinfo->gso_type = sk->sk_gso_type;
         }
 }
 
@@ -1146,6 +1162,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
+        struct skb_shared_info *shinfo;
         int i, k, eat;
 
         eat = min_t(int, len, skb_headlen(skb));
@@ -1157,23 +1174,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
         }
         eat = len;
         k = 0;
-        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+        shinfo = skb_shinfo(skb);
+        for (i = 0; i < shinfo->nr_frags; i++) {
+                int size = skb_frag_size(&shinfo->frags[i]);
 
                 if (size <= eat) {
                         skb_frag_unref(skb, i);
                         eat -= size;
                 } else {
-                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+                        shinfo->frags[k] = shinfo->frags[i];
                         if (eat) {
-                                skb_shinfo(skb)->frags[k].page_offset += eat;
-                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+                                shinfo->frags[k].page_offset += eat;
+                                skb_frag_size_sub(&shinfo->frags[k], eat);
                                 eat = 0;
                         }
                         k++;
                 }
         }
-        skb_shinfo(skb)->nr_frags = k;
+        shinfo->nr_frags = k;
 
         skb_reset_tail_pointer(skb);
         skb->data_len -= len;
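The loop in __pskb_trim_head() above eats a number of bytes from the front of the paged data: fully consumed fragments are dropped, the first surviving fragment is advanced in place, and the frag array is compacted. The same compaction on a plain array of lengths, as a userspace sketch (trim_frags() is a local stand-in; page offsets and refcounting are omitted):

#include <stdio.h>

/* Trim 'eat' bytes from the front of an array of fragment lengths,
 * compacting surviving fragments to the start, the same shape as the
 * loop in __pskb_trim_head() shown above.
 */
static int trim_frags(unsigned int *size, int nr_frags, unsigned int eat)
{
        int i, k = 0;

        for (i = 0; i < nr_frags; i++) {
                if (size[i] <= eat) {
                        eat -= size[i];          /* fragment fully consumed */
                } else {
                        size[k] = size[i] - eat; /* first survivor shrinks, later ones copy as-is */
                        eat = 0;
                        k++;
                }
        }
        return k;                                /* new nr_frags */
}

int main(void)
{
        unsigned int frags[] = { 500, 1000, 2000, 300 };
        int n = trim_frags(frags, 4, 1700);

        /* 500 and 1000 are gone, 2000 shrinks to 1800, 300 survives: 2 frags left */
        printf("nr_frags=%d first=%u second=%u\n", n, frags[0], frags[1]);
        return 0;
}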
@@ -1378,23 +1396,51 @@ static void tcp_cwnd_validate(struct sock *sk)
         }
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+        return after(tp->snd_sml, tp->snd_una) &&
+                !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ *      if ((skb->len % mss) != 0)
+ *              tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+                                const struct sk_buff *skb)
+{
+        if (skb->len < tcp_skb_pcount(skb) * mss_now)
+                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-                                        unsigned int mss_now, unsigned int max_segs)
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+                            unsigned int mss_now, int nonagle)
+{
+        return partial &&
+                ((nonagle & TCP_NAGLE_CORK) ||
+                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+                                        const struct sk_buff *skb,
+                                        unsigned int mss_now,
+                                        unsigned int max_segs,
+                                        int nonagle)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
-        u32 needed, window, max_len;
+        u32 partial, needed, window, max_len;
 
         window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
         max_len = mss_now * max_segs;
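The tcp_minshall_update() comment above leans on gso_segs already holding skb->len / mss_now rounded up, so the cheap test skb->len < tcp_skb_pcount(skb) * mss_now matches (skb->len % mss_now) != 0 without a second divide. A standalone check of that equivalence in plain userspace C (DIV_ROUND_UP and pcount() are local copies, the MSS value is illustrative only):

#include <assert.h>
#include <stdio.h>

/* Local stand-in for the kernel macro. */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* pcount mirrors what tcp_set_skb_tso_segs() stores in gso_segs. */
static unsigned int pcount(unsigned int len, unsigned int mss)
{
        return DIV_ROUND_UP(len, mss);
}

int main(void)
{
        unsigned int mss = 1448;        /* typical Ethernet MSS, illustrative */
        unsigned int len;

        for (len = 1; len < 10 * mss; len++) {
                int cheap  = len < pcount(len, mss) * mss;      /* no extra divide */
                int modulo = (len % mss) != 0;                  /* the "real" test */

                assert(cheap == modulo);
        }
        printf("len %% mss != 0  <=>  len < pcount(len) * mss, for all tested lengths\n");
        return 0;
}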
@@ -1407,7 +1453,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
         if (max_len <= needed)
                 return max_len;
 
-        return needed - needed % mss_now;
+        partial = needed % mss_now;
+        /* If last segment is not a full MSS, check if Nagle rules allow us
+         * to include this last segment in this skb.
+         * Otherwise, we'll split the skb at last MSS boundary
+         */
+        if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+                return needed - partial;
+
+        return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
@@ -1447,28 +1501,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
         return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-        return after(tp->snd_sml, tp->snd_una) &&
-                !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- * With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-                                   const struct sk_buff *skb,
-                                   unsigned int mss_now, int nonagle)
-{
-        return skb->len < mss_now &&
-                ((nonagle & TCP_NAGLE_CORK) ||
-                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1489,7 +1521,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
         if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                 return true;
 
-        if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+        if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
                 return true;
 
         return false;
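After this refactor, tcp_nagle_check() only receives the result of the sub-MSS test as a bool, and tcp_mss_split_point() trims back to an MSS boundary only when that check says the tail must be held back. A simplified userspace sketch of that decision (nagle_defers_tail() and split_point() are local stand-ins, not the kernel helpers; receive-window and cwnd clamping are left out):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for tcp_nagle_check(): defer the sub-MSS tail only
 * when it exists and either TCP_CORK is set or small packets are still
 * unacknowledged (Minshall's rule).
 */
static bool nagle_defers_tail(bool partial, bool corked, bool small_pkt_in_flight)
{
        return partial && (corked || small_pkt_in_flight);
}

/* Mirror of the new split-point logic: how many bytes may go out now. */
static unsigned int split_point(unsigned int needed, unsigned int mss,
                                bool corked, bool small_pkt_in_flight)
{
        unsigned int partial = needed % mss;

        if (nagle_defers_tail(partial != 0, corked, small_pkt_in_flight))
                return needed - partial;        /* stop at the last MSS boundary */
        return needed;                          /* send the sub-MSS tail too */
}

int main(void)
{
        unsigned int mss = 1448;        /* illustrative MSS */

        /* 3 full segments plus a 100-byte tail */
        printf("%u\n", split_point(3 * mss + 100, mss, false, false)); /* 4444: tail sent */
        printf("%u\n", split_point(3 * mss + 100, mss, false, true));  /* 4344: tail deferred */
        return 0;
}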
@@ -1884,7 +1916,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
                 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
                         set_bit(TSQ_THROTTLED, &tp->tsq_flags);
-                        break;
+                        /* It is possible TX completion already happened
+                         * before we set TSQ_THROTTLED, so we must
+                         * test again the condition.
+                         * We abuse smp_mb__after_clear_bit() because
+                         * there is no smp_mb__after_set_bit() yet
+                         */
+                        smp_mb__after_clear_bit();
+                        if (atomic_read(&sk->sk_wmem_alloc) > limit)
+                                break;
                 }
 
                 limit = mss_now;
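The re-check added above closes a lost-wakeup window: a TX completion can drop sk_wmem_alloc and test TSQ_THROTTLED before the freshly set bit is visible, leaving nobody to restart the sender. The same set-flag-then-recheck shape in a userspace sketch with C11 atomics (should_stop_sending() and tx_completion() are illustrative stand-ins, not the kernel's set_bit()/smp_mb__after_clear_bit() machinery):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the state touched by the hunk above (illustrative only). */
static atomic_uint wmem_alloc = 4096;   /* bytes still queued below the socket */
static atomic_bool throttled;           /* TSQ_THROTTLED stand-in */
static const unsigned int limit = 1024;

/* Producer side, mirroring tcp_write_xmit(): set the flag, then re-check. */
static bool should_stop_sending(void)
{
        if (atomic_load(&wmem_alloc) <= limit)
                return false;

        atomic_store(&throttled, true);
        /* Full barrier: the flag must be visible before wmem_alloc is re-read. */
        atomic_thread_fence(memory_order_seq_cst);

        /* A completion may have run between the first test and the store;
         * without this second look we would stop with nobody left to restart us.
         */
        return atomic_load(&wmem_alloc) > limit;
}

/* Completion side, mirroring tcp_wfree(): drop bytes, then check the flag. */
static void tx_completion(unsigned int freed)
{
        atomic_fetch_sub(&wmem_alloc, freed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_exchange(&throttled, false))
                printf("completion: restart the sender\n");
}

int main(void)
{
        printf("stop? %d\n", should_stop_sending());    /* 1: over limit, throttled */
        tx_completion(4096);                            /* clears flag, reschedules */
        printf("stop? %d\n", should_stop_sending());    /* 0: below limit */
        return 0;
}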
@@ -1892,7 +1932,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                         limit = tcp_mss_split_point(sk, skb, mss_now,
                                                     min_t(unsigned int,
                                                           cwnd_quota,
-                                                          sk->sk_gso_max_segs));
+                                                          sk->sk_gso_max_segs),
+                                                     nonagle);
 
                 if (skb->len > limit &&
                     unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -1956,7 +1997,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         /* Schedule a loss probe in 2*RTT for SACK capable connections
          * in Open state, that are either limited by cwnd or application.
          */
-        if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+        if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
             !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
                 return false;
 
@@ -2307,6 +2348,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
         unsigned int cur_mss;
+        int err;
 
         /* Inconslusive MTU probe */
         if (icsk->icsk_mtup.probe_size) {
@@ -2370,11 +2412,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                      skb_headroom(skb) >= 0xFFFF)) {
                 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
                                                    GFP_ATOMIC);
-                return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-                              -ENOBUFS;
+                err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+                             -ENOBUFS;
         } else {
-                return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
         }
+
+        if (likely(!err))
+                TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+        return err;
 }
 
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
@@ -2756,7 +2802,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
         const struct dst_entry *dst = __sk_dst_get(sk);
         struct tcp_sock *tp = tcp_sk(sk);
@@ -2878,7 +2924,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
                 MAX_TCP_OPTION_SPACE;
 
-        syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+        space = min_t(size_t, space, fo->size);
+
+        /* limit to order-0 allocations */
+        space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+        syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
                                    sk->sk_allocation);
         if (syn_data == NULL)
                 goto fallback;