author		Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>	2007-11-15 22:50:37 -0500
committer	David S. Miller <davem@davemloft.net>	2008-01-28 17:54:07 -0500
commit		68f8353b480e5f2e136c38a511abdbb88eaa8ce2 (patch)
tree		3e412890c3caa98619872f15e117daffb68e9edf
parent		fd6dad616d4fe2f08d690f25ca76b0102158fb3a (diff)
[TCP]: Rewrite SACK block processing & sack_recv_cache use
Key points of this patch are:

  - In case new SACK information is advance only type, no skb
    processing below the previously discovered highest point is done.

  - Cases below the highest point are optimized too, since there is no
    need to always go up to the highest point (which is very likely
    still present in that SACK). This is not entirely true, though,
    because dropping the fastpath_skb_hint loses an optimization that
    could previously handle those cases even better; whether that is
    significant, I'm not too sure.

Currently the skipping is done by walking. Combined with an RB-tree,
all skipping would become fast regardless of window size (that can be
done incrementally later).

Previously, a number of cases in TCP SACK processing failed to take
advantage of the costly stored information in sack_recv_cache, most
importantly the expected events such as cumulative ACKs and new hole
ACKs. Processing such ACKs resulted in rather long walks, building up
latencies (which easily get nasty when the window is huge). Those
latencies are often completely unnecessary compared with the amount of
_new_ information received; usually a cumulative ACK carries no new
information at all, yet TCP walked the whole queue unnecessarily,
potentially taking a number of costly cache misses on the way.

Since the inclusion of highest_sack, there is a lot of information
that is very likely redundant (the SACK fastpath hint stuff,
fackets_out, highest_sack), though there is no ultimate guarantee that
they will remain the same the whole time (in all unearthly scenarios).
Take advantage of this knowledge here: drop the fastpath hint and use
direct access to the highest SACKed skb as a replacement. The
"special cased" fastpath is effectively dropped.

This change adds some complexity in exchange for a better-covering
"fastpath", and the added complexity should make TCP behave more cache
friendly. The current ACK's SACK blocks are compared against each
cached block individually, and only the ranges that are new are then
scanned by the high-constant walk. For other parts of the write queue,
even within a previously known part of the SACK blocks, a faster skip
function is used (if necessary at all). In addition, whenever
possible, TCP fast-forwards to the highest_sack skb that was made
available by an earlier patch. In the typical case, nothing but this
fast-forward and the mandatory markings after it occur, making the
access pattern quite similar to the former fastpath "special case".

DSACKs are a special case that must always be walked. The local
copying into recv_sack_cache could be more intelligent w.r.t. DSACKs,
which are likely to be there only once, but that is left to a separate
patch.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
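In user-space terms, the per-block decision described above is roughly
the following (an illustrative C sketch, not the kernel code;
before()/after() mirror the kernel's wraparound-safe sequence
comparisons, while struct sack_block and classify() are made-up names):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;

/* Sequence comparisons that survive wraparound, as in the kernel */
static int before(u32 seq1, u32 seq2) { return (int32_t)(seq1 - seq2) < 0; }
static int after(u32 seq1, u32 seq2) { return before(seq2, seq1); }

struct sack_block { u32 start_seq, end_seq; };

/* Compare one incoming SACK block against one cached block and report
 * which ranges still need the expensive per-skb walk.
 */
static void classify(const struct sack_block *sp, const struct sack_block *cache)
{
	if (!after(sp->end_seq, cache->start_seq) ||
	    !before(sp->start_seq, cache->end_seq)) {
		printf("no usable overlap: walk [%u,%u) in full\n",
		       sp->start_seq, sp->end_seq);
		return;
	}
	if (before(sp->start_seq, cache->start_seq))
		printf("head todo: walk [%u,%u)\n",
		       sp->start_seq, cache->start_seq);
	if (after(sp->end_seq, cache->end_seq))
		printf("tail todo: skip to %u, walk [%u,%u)\n",
		       cache->end_seq, cache->end_seq, sp->end_seq);
	else
		printf("rest already processed: advance to next SACK block\n");
}

int main(void)
{
	struct sack_block cached = { 1000, 2000 };	/* from previous ACK */
	struct sack_block grown = { 1000, 2500 };	/* end_seq advanced */

	classify(&grown, &cached);	/* only [2000,2500) gets walked */
	return 0;
}

For the common "advance only" ACK above, just the new tail range is
walked instead of the whole queue up to the highest point.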
-rw-r--r--	include/linux/tcp.h	|   3
-rw-r--r--	include/net/tcp.h	|   1
-rw-r--r--	net/ipv4/tcp_input.c	| 271
-rw-r--r--	net/ipv4/tcp_output.c	|  14
4 files changed, 172 insertions, 117 deletions
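The recv_sack_cache copying mentioned in the message lands at the end
of the tcp_input.c hunk below. As a standalone sketch (illustrative C;
CACHE_SIZE mirrors the kernel's 4-entry recv_sack_cache, the function
name is made up): the unused head entries are zeroed so the next pass
can skip them cheaply, and this ACK's sorted blocks are copied to the
tail in order.

#define CACHE_SIZE 4	/* mirrors the kernel's 4-entry recv_sack_cache */

struct sack_block { unsigned int start_seq, end_seq; };

static void update_recv_sack_cache(struct sack_block cache[CACHE_SIZE],
				   const struct sack_block *sp, int used_sacks)
{
	int i, j;

	/* Zero the unused head so the next pass can skip it cheaply */
	for (i = 0; i < CACHE_SIZE - used_sacks; i++) {
		cache[i].start_seq = 0;
		cache[i].end_seq = 0;
	}
	/* Copy this ACK's sorted blocks to the tail, preserving order */
	for (j = 0; j < used_sacks; j++)
		cache[i++] = sp[j];
}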
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 794497c7d755..08027f1d7f31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -343,10 +343,7 @@ struct tcp_sock {
 	struct sk_buff *scoreboard_skb_hint;
 	struct sk_buff *retransmit_skb_hint;
 	struct sk_buff *forward_skb_hint;
-	struct sk_buff *fastpath_skb_hint;
 
-	int fastpath_cnt_hint;	/* Lags behind by current skb's pcount
-				 * compared to respective fackets_out */
 	int lost_cnt_hint;
 	int retransmit_cnt_hint;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ede804b16d4..f0c5e7a29400 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1081,7 +1081,6 @@ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
 static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
 {
 	tcp_clear_retrans_hints_partial(tp);
-	tp->fastpath_skb_hint = NULL;
 }
 
 /* MD5 Signature */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a287747e9dd6..3ad6a19ad30f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1333,6 +1333,88 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct tcp_sock *tp,
 	return flag;
 }
 
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+					struct tcp_sack_block *next_dup,
+					u32 start_seq, u32 end_seq,
+					int dup_sack_in, int *fack_count,
+					int *reord, int *flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_for_write_queue_from(skb, sk) {
+		int in_sack = 0;
+		int dup_sack = dup_sack_in;
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* queue is in-order => we can short-circuit the walk early */
+		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+			break;
+
+		if ((next_dup != NULL) &&
+		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+			in_sack = tcp_match_skb_to_sack(sk, skb,
+							next_dup->start_seq,
+							next_dup->end_seq);
+			if (in_sack > 0)
+				dup_sack = 1;
+		}
+
+		if (in_sack <= 0)
+			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
+		if (unlikely(in_sack < 0))
+			break;
+
+		if (in_sack)
+			*flag |= tcp_sacktag_one(skb, tp, reord, dup_sack, *fack_count);
+
+		*fack_count += tcp_skb_pcount(skb);
+	}
+	return skb;
+}
+
+/* Avoid all extra work that is being done by sacktag while walking in
+ * a normal way
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+					u32 skip_to_seq)
+{
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		if (before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+			break;
+	}
+	return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+						struct sock *sk,
+						struct tcp_sack_block *next_dup,
+						u32 skip_to_seq,
+						int *fack_count, int *reord,
+						int *flag)
+{
+	if (next_dup == NULL)
+		return skb;
+
+	if (before(next_dup->start_seq, skip_to_seq)) {
+		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
+		tcp_sacktag_walk(skb, sk, NULL,
+				 next_dup->start_seq, next_dup->end_seq,
+				 1, fack_count, reord, flag);
+	}
+
+	return skb;
+}
+
+static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+{
+	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
+
 static int
 tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
 {
@@ -1342,16 +1424,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			      TCP_SKB_CB(ack_skb)->sacked);
 	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
 	struct tcp_sack_block sp[4];
-	struct sk_buff *cached_skb;
+	struct tcp_sack_block *cache;
+	struct sk_buff *skb;
 	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
 	int used_sacks;
 	int reord = tp->packets_out;
 	int flag = 0;
 	int found_dup_sack = 0;
-	int cached_fack_count;
-	int i;
+	int fack_count;
+	int i, j;
 	int first_sack_index;
-	int force_one_sack;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1409,132 +1491,123 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		used_sacks++;
 	}
 
-	/* SACK fastpath:
-	 * if the only SACK change is the increase of the end_seq of
-	 * the first block then only apply that SACK block
-	 * and use retrans queue hinting otherwise slowpath */
-	force_one_sack = 1;
-	for (i = 0; i < used_sacks; i++) {
-		u32 start_seq = sp[i].start_seq;
-		u32 end_seq = sp[i].end_seq;
-
-		if (i == 0) {
-			if (tp->recv_sack_cache[i].start_seq != start_seq)
-				force_one_sack = 0;
-		} else {
-			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
-			    (tp->recv_sack_cache[i].end_seq != end_seq))
-				force_one_sack = 0;
-		}
-		tp->recv_sack_cache[i].start_seq = start_seq;
-		tp->recv_sack_cache[i].end_seq = end_seq;
-	}
-	/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
-	for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
-		tp->recv_sack_cache[i].start_seq = 0;
-		tp->recv_sack_cache[i].end_seq = 0;
-	}
+	/* order SACK blocks to allow in order walk of the retrans queue */
+	for (i = used_sacks - 1; i > 0; i--) {
+		for (j = 0; j < i; j++){
+			if (after(sp[j].start_seq, sp[j+1].start_seq)) {
+				struct tcp_sack_block tmp;
 
-	if (force_one_sack)
-		used_sacks = 1;
-	else {
-		int j;
-		tp->fastpath_skb_hint = NULL;
-
-		/* order SACK blocks to allow in order walk of the retrans queue */
-		for (i = used_sacks - 1; i > 0; i--) {
-			for (j = 0; j < i; j++){
-				if (after(sp[j].start_seq, sp[j+1].start_seq)) {
-					struct tcp_sack_block tmp;
-
-					tmp = sp[j];
-					sp[j] = sp[j+1];
-					sp[j+1] = tmp;
-
-					/* Track where the first SACK block goes to */
-					if (j == first_sack_index)
-						first_sack_index = j+1;
-				}
+				tmp = sp[j];
+				sp[j] = sp[j+1];
+				sp[j+1] = tmp;
 
+				/* Track where the first SACK block goes to */
+				if (j == first_sack_index)
+					first_sack_index = j+1;
 			}
 		}
 	}
 
-	/* Use SACK fastpath hint if valid */
-	cached_skb = tp->fastpath_skb_hint;
-	cached_fack_count = tp->fastpath_cnt_hint;
-	if (!cached_skb) {
-		cached_skb = tcp_write_queue_head(sk);
-		cached_fack_count = 0;
+	skb = tcp_write_queue_head(sk);
+	fack_count = 0;
+	i = 0;
+
+	if (!tp->sacked_out) {
+		/* It's already past, so skip checking against it */
+		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+	} else {
+		cache = tp->recv_sack_cache;
+		/* Skip empty blocks in at head of the cache */
+		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+		       !cache->end_seq)
+			cache++;
 	}
 
-	for (i = 0; i < used_sacks; i++) {
-		struct sk_buff *skb;
+	while (i < used_sacks) {
 		u32 start_seq = sp[i].start_seq;
 		u32 end_seq = sp[i].end_seq;
-		int fack_count;
 		int dup_sack = (found_dup_sack && (i == first_sack_index));
-		int next_dup = (found_dup_sack && (i+1 == first_sack_index));
+		struct tcp_sack_block *next_dup = NULL;
 
-		skb = cached_skb;
-		fack_count = cached_fack_count;
+		if (found_dup_sack && ((i + 1) == first_sack_index))
+			next_dup = &sp[i + 1];
 
 		/* Event "B" in the comment above. */
 		if (after(end_seq, tp->high_seq))
 			flag |= FLAG_DATA_LOST;
 
-		tcp_for_write_queue_from(skb, sk) {
-			int in_sack = 0;
-
-			if (skb == tcp_send_head(sk))
-				break;
-
-			cached_skb = skb;
-			cached_fack_count = fack_count;
-			if (i == first_sack_index) {
-				tp->fastpath_skb_hint = skb;
-				tp->fastpath_cnt_hint = fack_count;
+		/* Skip too early cached blocks */
+		while (tcp_sack_cache_ok(tp, cache) &&
+		       !before(start_seq, cache->end_seq))
+			cache++;
+
+		/* Can skip some work by looking recv_sack_cache? */
+		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+		    after(end_seq, cache->start_seq)) {
+
+			/* Head todo? */
+			if (before(start_seq, cache->start_seq)) {
+				skb = tcp_sacktag_skip(skb, sk, start_seq);
+				skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq,
+						       cache->start_seq, dup_sack,
+						       &fack_count, &reord, &flag);
 			}
 
-			/* The retransmission queue is always in order, so
-			 * we can short-circuit the walk early.
-			 */
-			if (!before(TCP_SKB_CB(skb)->seq, end_seq))
-				break;
-
-			dup_sack = (found_dup_sack && (i == first_sack_index));
+			/* Rest of the block already fully processed? */
+			if (!after(end_seq, cache->end_seq)) {
+				skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
+							       &fack_count, &reord, &flag);
+				goto advance_sp;
+			}
 
-			/* Due to sorting DSACK may reside within this SACK block! */
-			if (next_dup) {
-				u32 dup_start = sp[i+1].start_seq;
-				u32 dup_end = sp[i+1].end_seq;
+			/* ...tail remains todo... */
+			if (TCP_SKB_CB(tp->highest_sack)->end_seq == cache->end_seq) {
+				/* ...but better entrypoint exists! Check that DSACKs are
+				 * properly accounted while skipping here
+				 */
+				tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
+							 &fack_count, &reord, &flag);
 
-				if (before(TCP_SKB_CB(skb)->seq, dup_end)) {
-					in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end);
-					if (in_sack > 0)
-						dup_sack = 1;
-				}
+				skb = tcp_write_queue_next(sk, tp->highest_sack);
+				fack_count = tp->fackets_out;
+				cache++;
+				goto walk;
 			}
 
-			/* DSACK info lost if out-of-mem, try SACK still */
-			if (in_sack <= 0)
-				in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
-			if (unlikely(in_sack < 0))
-				break;
-
-			if (in_sack)
-				flag |= tcp_sacktag_one(skb, tp, &reord, dup_sack, fack_count);
+			skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
+			/* Check overlap against next cached too (past this one already) */
+			cache++;
+			continue;
+		}
 
-			fack_count += tcp_skb_pcount(skb);
+		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+			skb = tcp_write_queue_next(sk, tp->highest_sack);
+			fack_count = tp->fackets_out;
 		}
+		skb = tcp_sacktag_skip(skb, sk, start_seq);
+
+walk:
+		skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
+				       dup_sack, &fack_count, &reord, &flag);
 
+advance_sp:
 		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
 		 * due to in-order walk
 		 */
 		if (after(end_seq, tp->frto_highmark))
 			flag &= ~FLAG_ONLY_ORIG_SACKED;
+
+		i++;
 	}
 
+	/* Clear the head of the cache sack blocks so we can skip it next time */
+	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+		tp->recv_sack_cache[i].start_seq = 0;
+		tp->recv_sack_cache[i].end_seq = 0;
+	}
+	for (j = 0; j < used_sacks; j++)
+		tp->recv_sack_cache[i++] = sp[j];
+
 	flag |= tcp_mark_lost_retrans(sk);
 
 	tcp_verify_left_out(tp);
@@ -2821,9 +2894,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
 	}
 
 	tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-	/* hint's skb might be NULL but we don't need to care */
-	tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
-				       tp->fastpath_cnt_hint);
+
 	if (ca_ops->pkts_acked) {
 		s32 rtt_us = -1;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ce506af5ce07..030fc69ea217 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -653,9 +653,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned
 }
 
 /* When a modification to fackets out becomes necessary, we need to check
- * skb is counted to fackets_out or not. Another important thing is to
- * tweak SACK fastpath hint too as it would overwrite all changes unless
- * hint is also changed.
+ * skb is counted to fackets_out or not.
  */
 static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
 				   int decr)
@@ -667,11 +665,6 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
 
 	if (!before(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
 		tp->fackets_out -= decr;
-
-	/* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
-	if (tp->fastpath_skb_hint != NULL &&
-	    after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
-		tp->fastpath_cnt_hint -= decr;
 }
 
 /* Function to create two new TCP segments. Shrinks the given segment
@@ -1753,11 +1746,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 
 		/* changed transmit queue under us so clear hints */
 		tcp_clear_retrans_hints_partial(tp);
-		/* manually tune sacktag skb hint */
-		if (tp->fastpath_skb_hint == next_skb) {
-			tp->fastpath_skb_hint = skb;
-			tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
-		}
 
 		sk_stream_free_skb(sk, next_skb);
 	}