aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorIlpo Järvinen <ilpo.jarvinen@helsinki.fi>2008-11-25 00:20:15 -0500
committerDavid S. Miller <davem@davemloft.net>2008-11-25 00:20:15 -0500
commit832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 (patch)
tree95b22ad16d1ff414cab39578ed8c927c2ce08723 /net/ipv4
parentf58b22fd3c16444edc393a217a74208f1894b601 (diff)
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by the SACK blocks that one-by-one fragment SKBs to MSS-sized chunks. Then we run into trouble when the cleanup work for them has to be done when a large cumulative ACK comes. Try to return to the pre-split state while more and more SACK info gets discovered, by combining newly discovered SACK areas with the previous skb if that's SACKed as well. This approach has a number of benefits: 1) The processing overhead is spread more equally over the RTT 2) Write queue has fewer skbs to process (affects everything which has to walk in the queue past the sacked areas) 3) Write queue is consistent the whole time, so no other part of TCP has to be aware of this (this was not the case with some other approach that was, well, quite intrusive all around). 4) Clean_rtx_queue can release most of the pages using a single put_page instead of the previous PAGE_SIZE/mss+1 calls In case a hole is fully filled by the new SACK block, we attempt to combine the next skb too, which allows construction of skbs that are even larger than what TSO split them to, and it handles the hole-on-every-nth-skb patterns that often occur during slow start overshoot pretty nicely. Though for this to be really useful, a retransmission would also have to get lost, since cumulative ACKs advance one hole at a time in the most typical case. TODO: handle upwards-only merging. That should be rather easy when the segment is fully sacked but I'm leaving that as a future work item (it won't make a very large difference anyway since this current approach already covers quite a lot of normal cases). I was earlier thinking of some sophisticated way of tracking timestamps of the first and the last segment but later on realized that it isn't really necessary to store the timestamp of the last segment at all. 
The cases that can occur are basically either: 1) ambiguous => no sensible measurement can be taken anyway 2) non-ambiguous is due to reordering => having the timestamp of the last segment there is just skewing things further off rather than doing any good, since the ack got triggered by one of the holes (besides some subtle issues that would make determining the right hole/skb an even harder problem). Anyway, it has nothing to do with this change then. I chose to route some abnormal-looking cases with goto noop; some could be handled differently (e.g., by stopping the walk at that skb). In general, they either shouldn't happen at all or are rare enough to make no difference in practice. In theory this change (as a whole) could cause some macroscale (global) regression because of cache misses that are taken over the round-trip time, but it very likely gets better because of far fewer (local) cache misses for the other write queue walkers and for the big recovery-clearing cumulative ack. It is worth noting that these benefits would be very easy to get also without TSO/GSO being on, as long as the data is in pages so that we can merge them. Currently I won't let that happen because DSACK splitting at a fragment would mess up pcounts due to sk_can_gso in tcp_set_skb_tso_segs. Once DSACK fragments are avoided, we have some conditions that can be made less strict. TODO: I will probably have to convert the excessive pointer passing to struct sacktag_state... :-) My testing revealed that a considerable amount of skbs couldn't be shifted because they were cloned (most likely still awaiting tx reclaim)... 
[The rest concerns future work instead, since I repeatedly got EFAULT from tcpdump's recvfrom when I added pskb_expand_head to deal with clones, so I separated that into another, later patch] ...To counter that, I gave up on the fifth advantage: 5) When growing the previous SACK block, fewer allocs for new skbs are done; basically a new alloc is needed only when a new hole is detected and when the previous skb runs out of frags space ...which now only happens if reclaim is fast enough to dispose of the clone before the SACK block comes in (the window is RTT long), otherwise we'll have to alloc some. With clones being handled I got these numbers (will be somewhat worse without that), taken with fine-grained mibs: TCPSackShifted 398 TCPSackMerged 877 TCPSackShiftFallback 320 TCPSACKCOLLAPSEFALLBACKGSO 0 TCPSACKCOLLAPSEFALLBACKSKBBITS 0 TCPSACKCOLLAPSEFALLBACKSKBDATA 0 TCPSACKCOLLAPSEFALLBACKBELOW 0 TCPSACKCOLLAPSEFALLBACKFIRST 1 TCPSACKCOLLAPSEFALLBACKPREVBITS 318 TCPSACKCOLLAPSEFALLBACKMSS 1 TCPSACKCOLLAPSEFALLBACKNOHEAD 0 TCPSACKCOLLAPSEFALLBACKSHIFT 0 TCPSACKCOLLAPSENOOPSEQ 0 TCPSACKCOLLAPSENOOPSMALLPCOUNT 0 TCPSACKCOLLAPSENOOPSMALLLEN 0 TCPSACKCOLLAPSEHOLE 12 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_input.c256
1 files changed, 249 insertions, 7 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1242 * aligned portion of it that matches. Therefore we might need to fragment 1242 * aligned portion of it that matches. Therefore we might need to fragment
1243 * which may fail and creates some hassle (caller must handle error case 1243 * which may fail and creates some hassle (caller must handle error case
1244 * returns). 1244 * returns).
1245 *
1246 * FIXME: this could be merged to shift decision code
1245 */ 1247 */
1246static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1248static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1247 u32 start_seq, u32 end_seq) 1249 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1353 1355
1354 if (fack_count > tp->fackets_out) 1356 if (fack_count > tp->fackets_out)
1355 tp->fackets_out = fack_count; 1357 tp->fackets_out = fack_count;
1356
1357 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
1358 tcp_advance_highest_sack(sk, skb);
1359 } 1358 }
1360 1359
1361 /* D-SACK. We can detect redundant retransmission in S|R and plain R 1360 /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1370 return flag; 1369 return flag;
1371} 1370}
1372 1371
1372static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1373 struct sk_buff *skb, unsigned int pcount,
1374 int shifted, int fack_count, int *reord,
1375 int *flag, int mss)
1376{
1377 struct tcp_sock *tp = tcp_sk(sk);
1378 u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */
1379
1380 BUG_ON(!pcount);
1381
1382 TCP_SKB_CB(prev)->end_seq += shifted;
1383 TCP_SKB_CB(skb)->seq += shifted;
1384
1385 skb_shinfo(prev)->gso_segs += pcount;
1386 BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
1387 skb_shinfo(skb)->gso_segs -= pcount;
1388
1389 /* When we're adding to gso_segs == 1, gso_size will be zero,
1390 * in theory this shouldn't be necessary but as long as DSACK
1391 * code can come after this skb later on it's better to keep
1392 * setting gso_size to something.
1393 */
1394 if (!skb_shinfo(prev)->gso_size) {
1395 skb_shinfo(prev)->gso_size = mss;
1396 skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1397 }
1398
1399 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1400 if (skb_shinfo(skb)->gso_segs <= 1) {
1401 skb_shinfo(skb)->gso_size = 0;
1402 skb_shinfo(skb)->gso_type = 0;
1403 }
1404
1405 *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
1406 pcount);
1407
1408 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1409 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1410
1411 tcp_clear_all_retrans_hints(tp);
1412
1413 if (skb->len > 0) {
1414 BUG_ON(!tcp_skb_pcount(skb));
1415 return 0;
1416 }
1417
1418 /* Whole SKB was eaten :-) */
1419
1420 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1421 if (skb == tcp_highest_sack(sk))
1422 tcp_advance_highest_sack(sk, skb);
1423
1424 tcp_unlink_write_queue(skb, sk);
1425 sk_wmem_free_skb(sk, skb);
1426
1427 return 1;
1428}
1429
1430/* I wish gso_size would have a bit more sane initialization than
1431 * something-or-zero which complicates things
1432 */
1433static int tcp_shift_mss(struct sk_buff *skb)
1434{
1435 int mss = tcp_skb_mss(skb);
1436
1437 if (!mss)
1438 mss = skb->len;
1439
1440 return mss;
1441}
1442
/* Shifting pages past head area doesn't work, so an skb qualifies only
 * when it has no data in the head area and is nonlinear.
 * Returns 1 if @skb can be shifted, 0 otherwise.
 */
static int skb_can_shift(struct sk_buff *skb)
{
	if (skb_headlen(skb) != 0)
		return 0;

	return skb_is_nonlinear(skb) ? 1 : 0;
}
1448
1449/* Try collapsing SACK blocks spanning across multiple skbs to a single
1450 * skb.
1451 */
1452static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1453 u32 start_seq, u32 end_seq,
1454 int dup_sack, int *fack_count,
1455 int *reord, int *flag)
1456{
1457 struct tcp_sock *tp = tcp_sk(sk);
1458 struct sk_buff *prev;
1459 int mss;
1460 int pcount = 0;
1461 int len;
1462 int in_sack;
1463
1464 if (!sk_can_gso(sk))
1465 goto fallback;
1466
1467 /* Normally R but no L won't result in plain S */
1468 if (!dup_sack &&
1469 (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
1470 goto fallback;
1471 if (!skb_can_shift(skb))
1472 goto fallback;
1473 /* This frame is about to be dropped (was ACKed). */
1474 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1475 goto fallback;
1476
1477 /* Can only happen with delayed DSACK + discard craziness */
1478 if (unlikely(skb == tcp_write_queue_head(sk)))
1479 goto fallback;
1480 prev = tcp_write_queue_prev(sk, skb);
1481
1482 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1483 goto fallback;
1484
1485 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1486 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1487
1488 if (in_sack) {
1489 len = skb->len;
1490 pcount = tcp_skb_pcount(skb);
1491 mss = tcp_shift_mss(skb);
1492
1493 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1494 * drop this restriction as unnecessary
1495 */
1496 if (mss != tcp_shift_mss(prev))
1497 goto fallback;
1498 } else {
1499 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1500 goto noop;
1501 /* CHECKME: This is non-MSS split case only?, this will
1502 * cause skipped skbs due to advancing loop btw, original
1503 * has that feature too
1504 */
1505 if (tcp_skb_pcount(skb) <= 1)
1506 goto noop;
1507
1508 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1509 if (!in_sack) {
1510 /* TODO: head merge to next could be attempted here
1511 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1512 * though it might not be worth of the additional hassle
1513 *
1514 * ...we can probably just fallback to what was done
1515 * previously. We could try merging non-SACKed ones
1516 * as well but it probably isn't going to buy off
1517 * because later SACKs might again split them, and
1518 * it would make skb timestamp tracking considerably
1519 * harder problem.
1520 */
1521 goto fallback;
1522 }
1523
1524 len = end_seq - TCP_SKB_CB(skb)->seq;
1525 BUG_ON(len < 0);
1526 BUG_ON(len > skb->len);
1527
1528 /* MSS boundaries should be honoured or else pcount will
1529 * severely break even though it makes things bit trickier.
1530 * Optimize common case to avoid most of the divides
1531 */
1532 mss = tcp_skb_mss(skb);
1533
1534 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1535 * drop this restriction as unnecessary
1536 */
1537 if (mss != tcp_shift_mss(prev))
1538 goto fallback;
1539
1540 if (len == mss) {
1541 pcount = 1;
1542 } else if (len < mss) {
1543 goto noop;
1544 } else {
1545 pcount = len / mss;
1546 len = pcount * mss;
1547 }
1548 }
1549
1550 if (!skb_shift(prev, skb, len))
1551 goto fallback;
1552 if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
1553 flag, mss))
1554 goto out;
1555
1556 /* Hole filled allows collapsing with the next as well, this is very
1557 * useful when hole on every nth skb pattern happens
1558 */
1559 if (prev == tcp_write_queue_tail(sk))
1560 goto out;
1561 skb = tcp_write_queue_next(sk, prev);
1562
1563 if (!skb_can_shift(skb))
1564 goto out;
1565 if (skb == tcp_send_head(sk))
1566 goto out;
1567 if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1568 goto out;
1569
1570 len = skb->len;
1571 if (skb_shift(prev, skb, len)) {
1572 pcount += tcp_skb_pcount(skb);
1573 tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
1574 *fack_count, reord, flag, mss);
1575 }
1576
1577out:
1578 *fack_count += pcount;
1579 return prev;
1580
1581noop:
1582 return skb;
1583
1584fallback:
1585 return NULL;
1586}
1587
1373static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, 1588static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1374 struct tcp_sack_block *next_dup, 1589 struct tcp_sack_block *next_dup,
1375 u32 start_seq, u32 end_seq, 1590 u32 start_seq, u32 end_seq,
1376 int dup_sack_in, int *fack_count, 1591 int dup_sack_in, int *fack_count,
1377 int *reord, int *flag) 1592 int *reord, int *flag)
1378{ 1593{
1594 struct tcp_sock *tp = tcp_sk(sk);
1595 struct sk_buff *tmp;
1596
1379 tcp_for_write_queue_from(skb, sk) { 1597 tcp_for_write_queue_from(skb, sk) {
1380 int in_sack = 0; 1598 int in_sack = 0;
1381 int dup_sack = dup_sack_in; 1599 int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1396 dup_sack = 1; 1614 dup_sack = 1;
1397 } 1615 }
1398 1616
1399 if (in_sack <= 0) 1617 /* skb reference here is a bit tricky to get right, since
1400 in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, 1618 * shifting can eat and free both this skb and the next,
1401 end_seq); 1619 * so not even _safe variant of the loop is enough.
1620 */
1621 if (in_sack <= 0) {
1622 tmp = tcp_shift_skb_data(sk, skb, start_seq,
1623 end_seq, dup_sack,
1624 fack_count, reord, flag);
1625 if (tmp != NULL) {
1626 if (tmp != skb) {
1627 skb = tmp;
1628 continue;
1629 }
1630
1631 in_sack = 0;
1632 } else {
1633 in_sack = tcp_match_skb_to_sack(sk, skb,
1634 start_seq,
1635 end_seq);
1636 }
1637 }
1638
1402 if (unlikely(in_sack < 0)) 1639 if (unlikely(in_sack < 0))
1403 break; 1640 break;
1404 1641
1405 if (in_sack) 1642 if (in_sack) {
1406 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, 1643 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
1407 *fack_count, 1644 *fack_count,
1408 &(TCP_SKB_CB(skb)->sacked), 1645 &(TCP_SKB_CB(skb)->sacked),
1409 tcp_skb_pcount(skb)); 1646 tcp_skb_pcount(skb));
1410 1647
1648 if (!before(TCP_SKB_CB(skb)->seq,
1649 tcp_highest_sack_seq(tp)))
1650 tcp_advance_highest_sack(sk, skb);
1651 }
1652
1411 *fack_count += tcp_skb_pcount(skb); 1653 *fack_count += tcp_skb_pcount(skb);
1412 } 1654 }
1413 return skb; 1655 return skb;