 include/linux/skbuff.h |  33
 include/net/tcp.h      |   5
 net/core/skbuff.c      | 140
 net/ipv4/tcp_input.c   | 256
 4 files changed, 427 insertions, 7 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a01b6f84e3bc..acf17af45af9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -493,6 +493,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list,
 }
 
 /**
+ * skb_queue_is_first - check if skb is the first entry in the queue
+ * @list: queue head
+ * @skb: buffer
+ *
+ * Returns true if @skb is the first buffer on the list.
+ */
+static inline bool skb_queue_is_first(const struct sk_buff_head *list,
+				      const struct sk_buff *skb)
+{
+	return (skb->prev == (struct sk_buff *) list);
+}
+
+/**
  * skb_queue_next - return the next packet in the queue
  * @list: queue head
  * @skb: current buffer
@@ -511,6 +524,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
 }
 
 /**
+ * skb_queue_prev - return the prev packet in the queue
+ * @list: queue head
+ * @skb: current buffer
+ *
+ * Return the prev packet in @list before @skb.  It is only valid to
+ * call this if skb_queue_is_first() evaluates to false.
+ */
+static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
+					     const struct sk_buff *skb)
+{
+	/* This BUG_ON may seem severe, but if we just return then we
+	 * are going to dereference garbage.
+	 */
+	BUG_ON(skb_queue_is_first(list, skb));
+	return skb->prev;
+}
+
+/**
  * skb_get - reference buffer
  * @skb: buffer to reference
  *
@@ -1652,6 +1683,8 @@ extern int skb_splice_bits(struct sk_buff *skb,
 extern void	skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 extern void	skb_split(struct sk_buff *skb,
 			  struct sk_buff *skb1, const u32 len);
+extern int	skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
+			  int shiftlen);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
 
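For illustration only, not part of the patch: the two new helpers mirror skb_queue_is_last()/skb_queue_next() and make a tail-to-head walk of a queue possible. A minimal sketch of the intended pattern; the function name walk_queue_backwards and the locking assumption are mine.

/* Sketch: walk a queue from tail to head using the new helpers.
 * Assumes the caller holds whatever lock protects @list.
 */
static void walk_queue_backwards(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	if (skb_queue_empty(list))
		return;

	skb = skb_peek_tail(list);
	for (;;) {
		/* ... inspect skb here ... */
		if (skb_queue_is_first(list, skb))
			break;			/* reached the head, stop */
		skb = skb_queue_prev(list, skb);
	}
}
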
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 90b4c3b4c336..265392470b26 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1192,6 +1192,11 @@ static inline struct sk_buff *tcp_write_queue_next(struct sock *sk, struct sk_bu
 	return skb_queue_next(&sk->sk_write_queue, skb);
 }
 
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb)
+{
+	return skb_queue_prev(&sk->sk_write_queue, skb);
+}
+
 #define tcp_for_write_queue(skb, sk)					\
 	skb_queue_walk(&(sk)->sk_write_queue, skb)
 
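The wrapper inherits skb_queue_prev()'s precondition: it must not be called on the head of the write queue, or the BUG_ON() above fires. tcp_shift_skb_data() below guards for this before calling it; the same guard in isolation (sketch only, prev_or_null is a made-up name):

/* Sketch of the required guard around tcp_write_queue_prev(). */
static struct sk_buff *prev_or_null(struct sock *sk, struct sk_buff *skb)
{
	if (skb == tcp_write_queue_head(sk))
		return NULL;			/* no predecessor exists */
	return tcp_write_queue_prev(sk, skb);
}
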
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 267185a848f6..844b8abeb18c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2018,6 +2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 	skb_split_no_header(skb, skb1, len, pos);
 }
 
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * TODO: handle cloned skbs by using pskb_expand_head()
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+	return skb_cloned(skb);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from @skb to @tgt. Returns the number of bytes
+ * shifted. It's up to the caller to free @skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+	int from, to, merge, todo;
+	struct skb_frag_struct *fragfrom, *fragto;
+
+	BUG_ON(shiftlen > skb->len);
+	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */
+
+	todo = shiftlen;
+	from = 0;
+	to = skb_shinfo(tgt)->nr_frags;
+	fragfrom = &skb_shinfo(skb)->frags[from];
+
+	/* Actual merge is delayed until the point when we know we can
+	 * commit all, so that we don't have to undo partial changes
+	 */
+	if (!to ||
+	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+		merge = -1;
+	} else {
+		merge = to - 1;
+
+		todo -= fragfrom->size;
+		if (todo < 0) {
+			if (skb_prepare_for_shift(skb) ||
+			    skb_prepare_for_shift(tgt))
+				return 0;
+
+			fragto = &skb_shinfo(tgt)->frags[merge];
+
+			fragto->size += shiftlen;
+			fragfrom->size -= shiftlen;
+			fragfrom->page_offset += shiftlen;
+
+			goto onlymerged;
+		}
+
+		from++;
+	}
+
+	/* Skip full, not-fitting skb to avoid expensive operations */
+	if ((shiftlen == skb->len) &&
+	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+		return 0;
+
+	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+		return 0;
+
+	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+		if (to == MAX_SKB_FRAGS)
+			return 0;
+
+		fragfrom = &skb_shinfo(skb)->frags[from];
+		fragto = &skb_shinfo(tgt)->frags[to];
+
+		if (todo >= fragfrom->size) {
+			*fragto = *fragfrom;
+			todo -= fragfrom->size;
+			from++;
+			to++;
+
+		} else {
+			get_page(fragfrom->page);
+			fragto->page = fragfrom->page;
+			fragto->page_offset = fragfrom->page_offset;
+			fragto->size = todo;
+
+			fragfrom->page_offset += todo;
+			fragfrom->size -= todo;
+			todo = 0;
+
+			to++;
+			break;
+		}
+	}
+
+	/* Ready to "commit" this state change to tgt */
+	skb_shinfo(tgt)->nr_frags = to;
+
+	if (merge >= 0) {
+		fragfrom = &skb_shinfo(skb)->frags[0];
+		fragto = &skb_shinfo(tgt)->frags[merge];
+
+		fragto->size += fragfrom->size;
+		put_page(fragfrom->page);
+	}
+
+	/* Reposition in the original skb */
+	to = 0;
+	while (from < skb_shinfo(skb)->nr_frags)
+		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+	skb_shinfo(skb)->nr_frags = to;
+
+	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+	/* Most likely the tgt won't ever need its checksum anymore, skb on
+	 * the other hand might need it if it needs to be resent
+	 */
+	tgt->ip_summed = CHECKSUM_PARTIAL;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	/* Yak, is it really working this way? Some helper please? */
+	skb->len -= shiftlen;
+	skb->data_len -= shiftlen;
+	skb->truesize -= shiftlen;
+	tgt->len += shiftlen;
+	tgt->data_len += shiftlen;
+	tgt->truesize += shiftlen;
+
+	return shiftlen;
+}
+
 /**
  * skb_prepare_seq_read - Prepare a sequential read of skb data
  * @skb: the buffer to read
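A sketch of the calling contract for skb_shift(), for illustration only (the function try_shift and its cleanup policy are assumptions, not part of the patch): the source skb must carry only paged data, fewer bytes than requested may be moved, and a fully drained skb is left for the caller to dispose of.

/* Sketch: shift up to "want" bytes of paged data from skb into tgt.
 * Caller must ensure want <= skb->len (skb_shift() BUG()s otherwise).
 * For a generic, unqueued skb the emptied skb can simply be freed;
 * the TCP code in this patch instead uses tcp_unlink_write_queue()
 * followed by sk_wmem_free_skb().
 */
static void try_shift(struct sk_buff *tgt, struct sk_buff *skb, int want)
{
	int moved;

	if (skb_headlen(skb))		/* only pure paged data can move */
		return;

	moved = skb_shift(tgt, skb, want);	/* 0..want bytes shifted */
	if (moved && !skb->len)
		kfree_skb(skb);		/* whole skb was eaten */
}
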
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 				 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
 		if (fack_count > tp->fackets_out)
 			tp->fackets_out = fack_count;
-
-		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-			tcp_advance_highest_sack(sk, skb);
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 	return flag;
 }
 
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb, unsigned int pcount,
+			   int shifted, int fack_count, int *reord,
+			   int *flag, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;	/* We discard results */
+
+	BUG_ON(!pcount);
+
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	*flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+				 pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	tcp_clear_all_retrans_hints(tp);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+	int mss = tcp_skb_mss(skb);
+
+	if (!mss)
+		mss = skb->len;
+
+	return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack, int *fack_count,
+					  int *reord, int *flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_shift_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is the non-MSS split case only? Note that
+		 * this will cause skipped skbs due to the advancing loop;
+		 * the original code has that feature too.
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth the additional hassle
+			 *
+			 * ...we can probably just fall back to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to pay off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking a considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things a bit trickier.
+		 * Optimize the common case to avoid most of the divides.
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+			     flag, mss))
+		goto out;
+
+	/* A filled hole allows collapsing with the next skb as well, which
+	 * is very useful when a hole-on-every-nth-skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb))
+		goto out;
+	if (skb == tcp_send_head(sk))
+		goto out;
+	if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+				*fack_count, reord, flag, mss);
+	}
+
+out:
+	*fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sack_block *next_dup,
 					u32 start_seq, u32 end_seq,
 					int dup_sack_in, int *fack_count,
 					int *reord, int *flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
 	tcp_for_write_queue_from(skb, sk) {
 		int in_sack = 0;
 		int dup_sack = dup_sack_in;
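The partial-coverage branch of tcp_shift_skb_data() above only ever shifts whole MSS-sized chunks, so that pcount accounting stays exact. The same rounding rule shown standalone as plain userspace arithmetic (illustrative only; rounded_shift and the example numbers are mine):

#include <stdio.h>

/* Standalone illustration of the MSS rounding above: when a SACK block
 * covers an skb only partially, shift whole-MSS units only.
 */
static int rounded_shift(int covered_len, int mss, int *pcount)
{
	if (covered_len < mss)
		return 0;			/* the "goto noop" case */
	if (covered_len == mss) {
		*pcount = 1;
		return covered_len;
	}
	*pcount = covered_len / mss;		/* whole segments only */
	return *pcount * mss;			/* bytes actually shifted */
}

int main(void)
{
	int pcount = 0;
	int len = rounded_shift(3200, 1460, &pcount);

	printf("shift %d bytes as %d segments\n", len, pcount);	/* 2920, 2 */
	return 0;
}
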
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 			dup_sack = 1;
 		}
 
-		if (in_sack <= 0)
-			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-							end_seq);
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even the _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, start_seq,
+						 end_seq, dup_sack,
+						 fack_count, reord, flag);
+			if (tmp != NULL) {
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
 		if (unlikely(in_sack < 0))
 			break;
 
-		if (in_sack)
+		if (in_sack) {
 			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
 						 *fack_count,
 						 &(TCP_SKB_CB(skb)->sacked),
 						 tcp_skb_pcount(skb));
 
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
+
 		*fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
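
Putting it together, tcp_shifted_skb() only has to fix up sequence and gso accounting once skb_shift() has moved the payload between write-queue skbs. A standalone userspace model of that bookkeeping, for illustration only (struct seg, model_shifted and the numbers are invented, not kernel code):

#include <assert.h>
#include <stdio.h>

/* Model of the bookkeeping tcp_shifted_skb() performs after skb_shift()
 * has moved "shifted" bytes / "pcount" segments from skb into the
 * previous, already-SACKed skb.
 */
struct seg {
	unsigned int seq;
	unsigned int end_seq;
	int gso_segs;
};

static void model_shifted(struct seg *prev, struct seg *skb,
			  int pcount, int shifted)
{
	prev->end_seq += shifted;	/* prev now covers the shifted bytes */
	skb->seq += shifted;		/* skb shrinks from the front */
	prev->gso_segs += pcount;
	skb->gso_segs -= pcount;
}

int main(void)
{
	struct seg prev = { .seq = 1000, .end_seq = 2460, .gso_segs = 1 };
	struct seg skb  = { .seq = 2460, .end_seq = 5380, .gso_segs = 2 };

	model_shifted(&prev, &skb, 2, 2920);	/* whole skb eaten */

	assert(prev.end_seq == skb.seq);	/* queue stays contiguous */
	printf("prev: %u-%u (%d segs), skb: %u-%u (%d segs)\n",
	       prev.seq, prev.end_seq, prev.gso_segs,
	       skb.seq, skb.end_seq, skb.gso_segs);
	return 0;
}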