 include/linux/skbuff.h |  33
 include/net/tcp.h      |   5
 net/core/skbuff.c      | 140
 net/ipv4/tcp_input.c   | 256
 4 files changed, 427 insertions, 7 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a01b6f84e3bc..acf17af45af9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -493,6 +493,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list,
 }
 
 /**
+ * skb_queue_is_first - check if skb is the first entry in the queue
+ * @list: queue head
+ * @skb: buffer
+ *
+ * Returns true if @skb is the first buffer on the list.
+ */
+static inline bool skb_queue_is_first(const struct sk_buff_head *list,
+				      const struct sk_buff *skb)
+{
+	return (skb->prev == (struct sk_buff *) list);
+}
+
+/**
  * skb_queue_next - return the next packet in the queue
  * @list: queue head
  * @skb: current buffer
@@ -511,6 +524,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
 }
 
 /**
+ * skb_queue_prev - return the prev packet in the queue
+ * @list: queue head
+ * @skb: current buffer
+ *
+ * Return the prev packet in @list before @skb.  It is only valid to
+ * call this if skb_queue_is_first() evaluates to false.
+ */
+static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
+					     const struct sk_buff *skb)
+{
+	/* This BUG_ON may seem severe, but if we just return then we
+	 * are going to dereference garbage.
+	 */
+	BUG_ON(skb_queue_is_first(list, skb));
+	return skb->prev;
+}
+
+/**
  * skb_get - reference buffer
  * @skb: buffer to reference
  *
@@ -1652,6 +1683,8 @@ extern int skb_splice_bits(struct sk_buff *skb,
 extern void	skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 extern void	skb_split(struct sk_buff *skb,
 			  struct sk_buff *skb1, const u32 len);
+extern int	skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
+			  int shiftlen);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
 
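For illustration only, not part of the patch: the two new helpers mirror skb_queue_is_last()/skb_queue_next() and make a tail-to-head walk of a queue possible. A minimal sketch of the intended pattern; the function name walk_queue_backwards and the locking assumption are mine.

/* Sketch: walk a queue from tail to head using the new helpers.
 * Assumes the caller holds whatever lock protects @list.
 */
static void walk_queue_backwards(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	if (skb_queue_empty(list))
		return;

	skb = skb_peek_tail(list);
	for (;;) {
		/* ... inspect skb here ... */
		if (skb_queue_is_first(list, skb))
			break;			/* reached the head, stop */
		skb = skb_queue_prev(list, skb);
	}
}
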
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 90b4c3b4c336..265392470b26 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1192,6 +1192,11 @@ static inline struct sk_buff *tcp_write_queue_next(struct sock *sk, struct sk_bu
 	return skb_queue_next(&sk->sk_write_queue, skb);
 }
 
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb)
+{
+	return skb_queue_prev(&sk->sk_write_queue, skb);
+}
+
 #define tcp_for_write_queue(skb, sk)					\
 	skb_queue_walk(&(sk)->sk_write_queue, skb)
 
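The wrapper inherits skb_queue_prev()'s precondition: it must not be called on the head of the write queue, or the BUG_ON() above fires. tcp_shift_skb_data() below guards for this before calling it; the same guard in isolation (sketch only, prev_or_null is a made-up name):

/* Sketch of the required guard around tcp_write_queue_prev(). */
static struct sk_buff *prev_or_null(struct sock *sk, struct sk_buff *skb)
{
	if (skb == tcp_write_queue_head(sk))
		return NULL;			/* no predecessor exists */
	return tcp_write_queue_prev(sk, skb);
}
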
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 267185a848f6..844b8abeb18c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2018,6 +2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 	skb_split_no_header(skb, skb1, len, pos);
 }
 
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * TODO: handle cloned skbs by using pskb_expand_head()
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+	return skb_cloned(skb);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from @skb to @tgt. Returns the number of bytes
+ * shifted. It's up to the caller to free @skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+	int from, to, merge, todo;
+	struct skb_frag_struct *fragfrom, *fragto;
+
+	BUG_ON(shiftlen > skb->len);
+	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */
+
+	todo = shiftlen;
+	from = 0;
+	to = skb_shinfo(tgt)->nr_frags;
+	fragfrom = &skb_shinfo(skb)->frags[from];
+
+	/* Actual merge is delayed until the point when we know we can
+	 * commit all, so that we don't have to undo partial changes
+	 */
+	if (!to ||
+	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+		merge = -1;
+	} else {
+		merge = to - 1;
+
+		todo -= fragfrom->size;
+		if (todo < 0) {
+			if (skb_prepare_for_shift(skb) ||
+			    skb_prepare_for_shift(tgt))
+				return 0;
+
+			fragto = &skb_shinfo(tgt)->frags[merge];
+
+			fragto->size += shiftlen;
+			fragfrom->size -= shiftlen;
+			fragfrom->page_offset += shiftlen;
+
+			goto onlymerged;
+		}
+
+		from++;
+	}
+
+	/* Skip full, not-fitting skb to avoid expensive operations */
+	if ((shiftlen == skb->len) &&
+	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+		return 0;
+
+	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+		return 0;
+
+	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+		if (to == MAX_SKB_FRAGS)
+			return 0;
+
+		fragfrom = &skb_shinfo(skb)->frags[from];
+		fragto = &skb_shinfo(tgt)->frags[to];
+
+		if (todo >= fragfrom->size) {
+			*fragto = *fragfrom;
+			todo -= fragfrom->size;
+			from++;
+			to++;
+
+		} else {
+			get_page(fragfrom->page);
+			fragto->page = fragfrom->page;
+			fragto->page_offset = fragfrom->page_offset;
+			fragto->size = todo;
+
+			fragfrom->page_offset += todo;
+			fragfrom->size -= todo;
+			todo = 0;
+
+			to++;
+			break;
+		}
+	}
+
+	/* Ready to "commit" this state change to tgt */
+	skb_shinfo(tgt)->nr_frags = to;
+
+	if (merge >= 0) {
+		fragfrom = &skb_shinfo(skb)->frags[0];
+		fragto = &skb_shinfo(tgt)->frags[merge];
+
+		fragto->size += fragfrom->size;
+		put_page(fragfrom->page);
+	}
+
+	/* Reposition in the original skb */
+	to = 0;
+	while (from < skb_shinfo(skb)->nr_frags)
+		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+	skb_shinfo(skb)->nr_frags = to;
+
+	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+	/* Most likely the tgt won't ever need its checksum anymore, skb on
+	 * the other hand might need it if it needs to be resent
+	 */
+	tgt->ip_summed = CHECKSUM_PARTIAL;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	/* Yak, is it really working this way? Some helper please? */
+	skb->len -= shiftlen;
+	skb->data_len -= shiftlen;
+	skb->truesize -= shiftlen;
+	tgt->len += shiftlen;
+	tgt->data_len += shiftlen;
+	tgt->truesize += shiftlen;
+
+	return shiftlen;
+}
+
 /**
  * skb_prepare_seq_read - Prepare a sequential read of skb data
  * @skb: the buffer to read
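A sketch of the calling contract for skb_shift(), for illustration only (the function try_shift and its cleanup policy are assumptions, not part of the patch): the source skb must carry only paged data, fewer bytes than requested may be moved, and a fully drained skb is left for the caller to dispose of.

/* Sketch: shift up to "want" bytes of paged data from skb into tgt.
 * Caller must ensure want <= skb->len (skb_shift() BUG()s otherwise).
 * For a generic, unqueued skb the emptied skb can simply be freed;
 * the TCP code in this patch instead uses tcp_unlink_write_queue()
 * followed by sk_wmem_free_skb().
 */
static void try_shift(struct sk_buff *tgt, struct sk_buff *skb, int want)
{
	int moved;

	if (skb_headlen(skb))		/* only pure paged data can move */
		return;

	moved = skb_shift(tgt, skb, want);	/* 0..want bytes shifted */
	if (moved && !skb->len)
		kfree_skb(skb);		/* whole skb was eaten */
}
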
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 				 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
 		if (fack_count > tp->fackets_out)
 			tp->fackets_out = fack_count;
-
-		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-			tcp_advance_highest_sack(sk, skb);
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 	return flag;
 }
 
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb, unsigned int pcount,
+			   int shifted, int fack_count, int *reord,
+			   int *flag, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;	/* We discard results */
+
+	BUG_ON(!pcount);
+
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	*flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+				 pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	tcp_clear_all_retrans_hints(tp);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+	int mss = tcp_skb_mss(skb);
+
+	if (!mss)
+		mss = skb->len;
+
+	return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack, int *fack_count,
+					  int *reord, int *flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_shift_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is the non-MSS split case only? Note that
+		 * this will cause skipped skbs due to the advancing loop;
+		 * the original code has that feature too.
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth the additional hassle
+			 *
+			 * ...we can probably just fall back to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to pay off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking a considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things a bit trickier.
+		 * Optimize the common case to avoid most of the divides.
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+			     flag, mss))
+		goto out;
+
+	/* A filled hole allows collapsing with the next skb as well, which
+	 * is very useful when a hole-on-every-nth-skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb))
+		goto out;
+	if (skb == tcp_send_head(sk))
+		goto out;
+	if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+				*fack_count, reord, flag, mss);
+	}
+
+out:
+	*fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sack_block *next_dup,
 					u32 start_seq, u32 end_seq,
 					int dup_sack_in, int *fack_count,
 					int *reord, int *flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
 	tcp_for_write_queue_from(skb, sk) {
 		int in_sack = 0;
 		int dup_sack = dup_sack_in;
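The partial-coverage branch of tcp_shift_skb_data() above only ever shifts whole MSS-sized chunks, so that pcount accounting stays exact. The same rounding rule shown standalone as plain userspace arithmetic (illustrative only; rounded_shift and the example numbers are mine):

#include <stdio.h>

/* Standalone illustration of the MSS rounding above: when a SACK block
 * covers an skb only partially, shift whole-MSS units only.
 */
static int rounded_shift(int covered_len, int mss, int *pcount)
{
	if (covered_len < mss)
		return 0;			/* the "goto noop" case */
	if (covered_len == mss) {
		*pcount = 1;
		return covered_len;
	}
	*pcount = covered_len / mss;		/* whole segments only */
	return *pcount * mss;			/* bytes actually shifted */
}

int main(void)
{
	int pcount = 0;
	int len = rounded_shift(3200, 1460, &pcount);

	printf("shift %d bytes as %d segments\n", len, pcount);	/* 2920, 2 */
	return 0;
}
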
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 			dup_sack = 1;
 		}
 
-		if (in_sack <= 0)
-			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-							end_seq);
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even the _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, start_seq,
+						 end_seq, dup_sack,
+						 fack_count, reord, flag);
+			if (tmp != NULL) {
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
 		if (unlikely(in_sack < 0))
 			break;
 
-		if (in_sack)
+		if (in_sack) {
 			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
 						 *fack_count,
 						 &(TCP_SKB_CB(skb)->sacked),
 						 tcp_skb_pcount(skb));
 
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
+
 		*fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
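
Putting it together, tcp_shifted_skb() only has to fix up sequence and gso accounting once skb_shift() has moved the payload between write-queue skbs. A standalone userspace model of that bookkeeping, for illustration only (struct seg, model_shifted and the numbers are invented, not kernel code):

#include <assert.h>
#include <stdio.h>

/* Model of the bookkeeping tcp_shifted_skb() performs after skb_shift()
 * has moved "shifted" bytes / "pcount" segments from skb into the
 * previous, already-SACKed skb.
 */
struct seg {
	unsigned int seq;
	unsigned int end_seq;
	int gso_segs;
};

static void model_shifted(struct seg *prev, struct seg *skb,
			  int pcount, int shifted)
{
	prev->end_seq += shifted;	/* prev now covers the shifted bytes */
	skb->seq += shifted;		/* skb shrinks from the front */
	prev->gso_segs += pcount;
	skb->gso_segs -= pcount;
}

int main(void)
{
	struct seg prev = { .seq = 1000, .end_seq = 2460, .gso_segs = 1 };
	struct seg skb  = { .seq = 2460, .end_seq = 5380, .gso_segs = 2 };

	model_shifted(&prev, &skb, 2, 2920);	/* whole skb eaten */

	assert(prev.end_seq == skb.seq);	/* queue stays contiguous */
	printf("prev: %u-%u (%d segs), skb: %u-%u (%d segs)\n",
	       prev.seq, prev.end_seq, prev.gso_segs,
	       skb.seq, skb.end_seq, skb.gso_segs);
	return 0;
}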