Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp_input.c | 256
1 file changed, 249 insertions, 7 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                                  u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
         if (fack_count > tp->fackets_out)
                 tp->fackets_out = fack_count;
-
-        if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-                tcp_advance_highest_sack(sk, skb);
 }
 
 /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
         return flag;
 }
 
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                           struct sk_buff *skb, unsigned int pcount,
+                           int shifted, int fack_count, int *reord,
+                           int *flag, int mss)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;      /* We discard results */
+
+        BUG_ON(!pcount);
+
+        TCP_SKB_CB(prev)->end_seq += shifted;
+        TCP_SKB_CB(skb)->seq += shifted;
+
+        skb_shinfo(prev)->gso_segs += pcount;
+        BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+        skb_shinfo(skb)->gso_segs -= pcount;
+
+        /* When we're adding to gso_segs == 1, gso_size will be zero,
+         * in theory this shouldn't be necessary but as long as DSACK
+         * code can come after this skb later on it's better to keep
+         * setting gso_size to something.
+         */
+        if (!skb_shinfo(prev)->gso_size) {
+                skb_shinfo(prev)->gso_size = mss;
+                skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+        }
+
+        /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+        if (skb_shinfo(skb)->gso_segs <= 1) {
+                skb_shinfo(skb)->gso_size = 0;
+                skb_shinfo(skb)->gso_type = 0;
+        }
+
+        *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+                                 pcount);
+
+        /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+        TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+        tcp_clear_all_retrans_hints(tp);
+
+        if (skb->len > 0) {
+                BUG_ON(!tcp_skb_pcount(skb));
+                return 0;
+        }
+
+        /* Whole SKB was eaten :-) */
+
+        TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+        if (skb == tcp_highest_sack(sk))
+                tcp_advance_highest_sack(sk, skb);
+
+        tcp_unlink_write_queue(skb, sk);
+        sk_wmem_free_skb(sk, skb);
+
+        return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+        int mss = tcp_skb_mss(skb);
+
+        if (!mss)
+                mss = skb->len;
+
+        return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+        return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+                                          u32 start_seq, u32 end_seq,
+                                          int dup_sack, int *fack_count,
+                                          int *reord, int *flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct sk_buff *prev;
+        int mss;
+        int pcount = 0;
+        int len;
+        int in_sack;
+
+        if (!sk_can_gso(sk))
+                goto fallback;
+
+        /* Normally R but no L won't result in plain S */
+        if (!dup_sack &&
+            (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+                goto fallback;
+        if (!skb_can_shift(skb))
+                goto fallback;
+        /* This frame is about to be dropped (was ACKed). */
+        if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+                goto fallback;
+
+        /* Can only happen with delayed DSACK + discard craziness */
+        if (unlikely(skb == tcp_write_queue_head(sk)))
+                goto fallback;
+        prev = tcp_write_queue_prev(sk, skb);
+
+        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+                goto fallback;
+
+        in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+                  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+        if (in_sack) {
+                len = skb->len;
+                pcount = tcp_skb_pcount(skb);
+                mss = tcp_shift_mss(skb);
+
+                /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                 * drop this restriction as unnecessary
+                 */
+                if (mss != tcp_shift_mss(prev))
+                        goto fallback;
+        } else {
+                if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+                        goto noop;
+                /* CHECKME: This is non-MSS split case only?, this will
+                 * cause skipped skbs due to advancing loop btw, original
+                 * has that feature too
+                 */
+                if (tcp_skb_pcount(skb) <= 1)
+                        goto noop;
+
+                in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+                if (!in_sack) {
+                        /* TODO: head merge to next could be attempted here
+                         * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+                         * though it might not be worth of the additional hassle
+                         *
+                         * ...we can probably just fallback to what was done
+                         * previously. We could try merging non-SACKed ones
+                         * as well but it probably isn't going to buy off
+                         * because later SACKs might again split them, and
+                         * it would make skb timestamp tracking considerably
+                         * harder problem.
+                         */
+                        goto fallback;
+                }
+
+                len = end_seq - TCP_SKB_CB(skb)->seq;
+                BUG_ON(len < 0);
+                BUG_ON(len > skb->len);
+
+                /* MSS boundaries should be honoured or else pcount will
+                 * severely break even though it makes things bit trickier.
+                 * Optimize common case to avoid most of the divides
+                 */
+                mss = tcp_skb_mss(skb);
+
+                /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                 * drop this restriction as unnecessary
+                 */
+                if (mss != tcp_shift_mss(prev))
+                        goto fallback;
+
+                if (len == mss) {
+                        pcount = 1;
+                } else if (len < mss) {
+                        goto noop;
+                } else {
+                        pcount = len / mss;
+                        len = pcount * mss;
+                }
+        }
+
+        if (!skb_shift(prev, skb, len))
+                goto fallback;
+        if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+                             flag, mss))
+                goto out;
+
+        /* Hole filled allows collapsing with the next as well, this is very
+         * useful when hole on every nth skb pattern happens
+         */
+        if (prev == tcp_write_queue_tail(sk))
+                goto out;
+        skb = tcp_write_queue_next(sk, prev);
+
+        if (!skb_can_shift(skb))
+                goto out;
+        if (skb == tcp_send_head(sk))
+                goto out;
+        if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+                goto out;
+
+        len = skb->len;
+        if (skb_shift(prev, skb, len)) {
+                pcount += tcp_skb_pcount(skb);
+                tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+                                *fack_count, reord, flag, mss);
+        }
+
+out:
+        *fack_count += pcount;
+        return prev;
+
+noop:
+        return skb;
+
+fallback:
+        return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                         struct tcp_sack_block *next_dup,
                                         u32 start_seq, u32 end_seq,
                                         int dup_sack_in, int *fack_count,
                                         int *reord, int *flag)
 {
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct sk_buff *tmp;
+
         tcp_for_write_queue_from(skb, sk) {
                 int in_sack = 0;
                 int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                         dup_sack = 1;
                 }
 
-                if (in_sack <= 0)
-                        in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-                                                        end_seq);
+                /* skb reference here is a bit tricky to get right, since
+                 * shifting can eat and free both this skb and the next,
+                 * so not even _safe variant of the loop is enough.
+                 */
+                if (in_sack <= 0) {
+                        tmp = tcp_shift_skb_data(sk, skb, start_seq,
+                                                 end_seq, dup_sack,
+                                                 fack_count, reord, flag);
+                        if (tmp != NULL) {
+                                if (tmp != skb) {
+                                        skb = tmp;
+                                        continue;
+                                }
+
+                                in_sack = 0;
+                        } else {
+                                in_sack = tcp_match_skb_to_sack(sk, skb,
+                                                                start_seq,
+                                                                end_seq);
+                        }
+                }
+
                 if (unlikely(in_sack < 0))
                         break;
 
-                if (in_sack)
+                if (in_sack) {
                         *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
                                                  *fack_count,
                                                  &(TCP_SKB_CB(skb)->sacked),
                                                  tcp_skb_pcount(skb));
 
+                        if (!before(TCP_SKB_CB(skb)->seq,
+                                    tcp_highest_sack_seq(tp)))
+                                tcp_advance_highest_sack(sk, skb);
+                }
+
                 *fack_count += tcp_skb_pcount(skb);
         }
         return skb;
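
The partial-coverage branch of tcp_shift_skb_data() above only ever shifts whole MSS-sized chunks: a SACKed span shorter than one MSS is left alone (noop), exactly one MSS shifts a single segment without a divide, and anything longer is rounded down to a multiple of the MSS so the byte count and pcount stay consistent. The stand-alone user-space sketch below only illustrates that rounding rule; round_to_mss() is a hypothetical helper for illustration, not part of the patch or the kernel.

#include <stdio.h>

/* Hypothetical helper (not in the patch): round a byte count down to a
 * whole number of MSS-sized segments, mirroring the len == mss /
 * len < mss / len > mss cases in tcp_shift_skb_data().  Returns the
 * segment count and writes the rounded length back through *len;
 * 0 means the span is too short to shift anything.
 */
static unsigned int round_to_mss(unsigned int *len, unsigned int mss)
{
        unsigned int pcount;

        if (*len == mss)
                return 1;               /* common case, no divide needed */
        if (*len < mss)
                return 0;               /* less than one segment: leave skb alone */

        pcount = *len / mss;            /* whole segments only */
        *len = pcount * mss;            /* drop the sub-MSS remainder */
        return pcount;
}

int main(void)
{
        unsigned int len = 3500, mss = 1460;
        unsigned int pcount = round_to_mss(&len, mss);

        /* 3500 bytes at MSS 1460 -> 2 segments, 2920 bytes eligible to shift */
        printf("pcount=%u len=%u\n", pcount, len);
        return 0;
}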