author          Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>      2007-11-15 22:50:37 -0500
committer       David S. Miller <davem@davemloft.net>          2008-01-28 17:54:07 -0500
commit          68f8353b480e5f2e136c38a511abdbb88eaa8ce2
tree            3e412890c3caa98619872f15e117daffb68e9edf        /net/ipv4/tcp_input.c
parent          fd6dad616d4fe2f08d690f25ca76b0102158fb3a
[TCP]: Rewrite SACK block processing & sack_recv_cache use
Key points of this patch are:
- In case the new SACK information is of the advance-only type, no skb
  processing is done below the previously discovered highest point (a
  short sketch of this shortcut follows the list).
- Optimize the cases below the highest point too, since there is no need
  to always go up to the highest point (which is very likely still
  present in that SACK). This is not entirely true, though, because I'm
  dropping the fastpath_skb_hint, which could previously optimize some of
  those cases even better; whether that's significant, I'm not too sure.
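As a rough illustration of that advance-only shortcut (paraphrased from
the rewritten tcp_sacktag_write_queue() in the diff below; simplified,
not the complete logic): when a SACK block starts at or above the
highest sequence already marked SACKed, the walk jumps straight past
tp->highest_sack instead of revisiting anything below it:

/* Block cannot tag anything below the highest SACKed skb: fast-forward
 * past it and only then skip up to the block's start.
 */
if (!before(start_seq, tcp_highest_sack_seq(tp))) {
        skb = tcp_write_queue_next(sk, tp->highest_sack);
        fack_count = tp->fackets_out;
}
skb = tcp_sacktag_skip(skb, sk, start_seq);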
Currently the skipping is done by walking. Combined with an RB-tree, all
skipping would become fast regardless of window size (this can be done
incrementally later).
Previously, a number of cases in TCP SACK processing failed to take
advantage of the costly information stored in sack_recv_cache, most
importantly for expected events such as cumulative ACKs and ACKs
announcing a new hole. Processing such ACKs resulted in rather long
walks, building up latency (which easily gets nasty when the window is
huge). Those latencies are often completely unnecessary relative to the
amount of _new_ information received; for a cumulative ACK there is
usually no new information at all, yet TCP walked the whole queue
anyway, potentially taking a number of costly cache misses on the way.
Since the introduction of highest_sack, much of this information is very
likely redundant (the SACK fastpath hint state, fackets_out,
highest_sack), though there is no absolute guarantee that they remain
equivalent the whole time (in every unlikely scenario). Take advantage
of that here: drop the fastpath hint and use direct access to the
highest SACKed skb as a replacement.
Effectively, the "special cased" fastpath is dropped. This change adds
some complexity in order to introduce a "fastpath" with better coverage,
but the added complexity should make TCP behave in a more cache-friendly
way.
The current ACK's SACK blocks are compared against each cached block
individually, and only the ranges that are new are then scanned by the
expensive full walk. For other parts of the write queue, even within a
previously known part of the SACK blocks, a cheaper skip function is
used (if it is needed at all). In addition, whenever possible, TCP
fast-forwards to the highest_sack skb, which an earlier patch made
available. In the typical case nothing happens beyond this fast-forward
and the mandatory markings after it, which makes the access pattern
quite similar to the former fastpath "special case".
DSACKs are a special case that must always be walked. The copying of the
local blocks to recv_sack_cache could be more intelligent w.r.t. DSACKs,
which are likely to be there only once, but that is left to a separate
patch.
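For reference, the cache update at the end of the rewritten
tcp_sacktag_write_queue() (excerpted from the diff below) zero-pads the
unused head entries so they can be skipped on the next ACK and copies
this ACK's sorted blocks to the tail:

/* Clear the head of the cache sack blocks so we can skip it next time */
for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
        tp->recv_sack_cache[i].start_seq = 0;
        tp->recv_sack_cache[i].end_seq = 0;
}
for (j = 0; j < used_sacks; j++)
        tp->recv_sack_cache[i++] = sp[j];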
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--      net/ipv4/tcp_input.c    271
1 file changed, 171 insertions(+), 100 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a287747e9dd6..3ad6a19ad30f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1333,6 +1333,88 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct tcp_sock *tp,
         return flag;
 }
 
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+                                        struct tcp_sack_block *next_dup,
+                                        u32 start_seq, u32 end_seq,
+                                        int dup_sack_in, int *fack_count,
+                                        int *reord, int *flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        tcp_for_write_queue_from(skb, sk) {
+                int in_sack = 0;
+                int dup_sack = dup_sack_in;
+
+                if (skb == tcp_send_head(sk))
+                        break;
+
+                /* queue is in-order => we can short-circuit the walk early */
+                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+                        break;
+
+                if ((next_dup != NULL) &&
+                    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+                        in_sack = tcp_match_skb_to_sack(sk, skb,
+                                                        next_dup->start_seq,
+                                                        next_dup->end_seq);
+                        if (in_sack > 0)
+                                dup_sack = 1;
+                }
+
+                if (in_sack <= 0)
+                        in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
+                if (unlikely(in_sack < 0))
+                        break;
+
+                if (in_sack)
+                        *flag |= tcp_sacktag_one(skb, tp, reord, dup_sack, *fack_count);
+
+                *fack_count += tcp_skb_pcount(skb);
+        }
+        return skb;
+}
+
+/* Avoid all extra work that is being done by sacktag while walking in
+ * a normal way
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+                                        u32 skip_to_seq)
+{
+        tcp_for_write_queue_from(skb, sk) {
+                if (skb == tcp_send_head(sk))
+                        break;
+
+                if (before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+                        break;
+        }
+        return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+                                                struct sock *sk,
+                                                struct tcp_sack_block *next_dup,
+                                                u32 skip_to_seq,
+                                                int *fack_count, int *reord,
+                                                int *flag)
+{
+        if (next_dup == NULL)
+                return skb;
+
+        if (before(next_dup->start_seq, skip_to_seq)) {
+                skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
+                tcp_sacktag_walk(skb, sk, NULL,
+                                 next_dup->start_seq, next_dup->end_seq,
+                                 1, fack_count, reord, flag);
+        }
+
+        return skb;
+}
+
+static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+{
+        return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
+
 static int
 tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
 {
@@ -1342,16 +1424,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                               TCP_SKB_CB(ack_skb)->sacked);
         struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
         struct tcp_sack_block sp[4];
-        struct sk_buff *cached_skb;
+        struct tcp_sack_block *cache;
+        struct sk_buff *skb;
         int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
         int used_sacks;
         int reord = tp->packets_out;
         int flag = 0;
         int found_dup_sack = 0;
-        int cached_fack_count;
-        int i;
+        int fack_count;
+        int i, j;
         int first_sack_index;
-        int force_one_sack;
 
         if (!tp->sacked_out) {
                 if (WARN_ON(tp->fackets_out))
@@ -1409,132 +1491,123 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                 used_sacks++;
         }
 
-        /* SACK fastpath:
-         * if the only SACK change is the increase of the end_seq of
-         * the first block then only apply that SACK block
-         * and use retrans queue hinting otherwise slowpath */
-        force_one_sack = 1;
-        for (i = 0; i < used_sacks; i++) {
-                u32 start_seq = sp[i].start_seq;
-                u32 end_seq = sp[i].end_seq;
-
-                if (i == 0) {
-                        if (tp->recv_sack_cache[i].start_seq != start_seq)
-                                force_one_sack = 0;
-                } else {
-                        if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
-                            (tp->recv_sack_cache[i].end_seq != end_seq))
-                                force_one_sack = 0;
-                }
-                tp->recv_sack_cache[i].start_seq = start_seq;
-                tp->recv_sack_cache[i].end_seq = end_seq;
-        }
-        /* Clear the rest of the cache sack blocks so they won't match mistakenly. */
-        for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
-                tp->recv_sack_cache[i].start_seq = 0;
-                tp->recv_sack_cache[i].end_seq = 0;
-        }
+        /* order SACK blocks to allow in order walk of the retrans queue */
+        for (i = used_sacks - 1; i > 0; i--) {
+                for (j = 0; j < i; j++){
+                        if (after(sp[j].start_seq, sp[j+1].start_seq)) {
+                                struct tcp_sack_block tmp;
 
-        if (force_one_sack)
-                used_sacks = 1;
-        else {
-                int j;
-                tp->fastpath_skb_hint = NULL;
-
-                /* order SACK blocks to allow in order walk of the retrans queue */
-                for (i = used_sacks - 1; i > 0; i--) {
-                        for (j = 0; j < i; j++){
-                                if (after(sp[j].start_seq, sp[j+1].start_seq)) {
-                                        struct tcp_sack_block tmp;
-
-                                        tmp = sp[j];
-                                        sp[j] = sp[j+1];
-                                        sp[j+1] = tmp;
-
-                                        /* Track where the first SACK block goes to */
-                                        if (j == first_sack_index)
-                                                first_sack_index = j+1;
-                                }
+                                tmp = sp[j];
+                                sp[j] = sp[j+1];
+                                sp[j+1] = tmp;
 
+                                /* Track where the first SACK block goes to */
+                                if (j == first_sack_index)
+                                        first_sack_index = j+1;
                         }
                 }
         }
 
-        /* Use SACK fastpath hint if valid */
-        cached_skb = tp->fastpath_skb_hint;
-        cached_fack_count = tp->fastpath_cnt_hint;
-        if (!cached_skb) {
-                cached_skb = tcp_write_queue_head(sk);
-                cached_fack_count = 0;
+        skb = tcp_write_queue_head(sk);
+        fack_count = 0;
+        i = 0;
+
+        if (!tp->sacked_out) {
+                /* It's already past, so skip checking against it */
+                cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+        } else {
+                cache = tp->recv_sack_cache;
+                /* Skip empty blocks in at head of the cache */
+                while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+                       !cache->end_seq)
+                        cache++;
         }
 
-        for (i = 0; i < used_sacks; i++) {
-                struct sk_buff *skb;
+        while (i < used_sacks) {
                 u32 start_seq = sp[i].start_seq;
                 u32 end_seq = sp[i].end_seq;
-                int fack_count;
                 int dup_sack = (found_dup_sack && (i == first_sack_index));
-                int next_dup = (found_dup_sack && (i+1 == first_sack_index));
+                struct tcp_sack_block *next_dup = NULL;
 
-                skb = cached_skb;
-                fack_count = cached_fack_count;
+                if (found_dup_sack && ((i + 1) == first_sack_index))
+                        next_dup = &sp[i + 1];
 
                 /* Event "B" in the comment above. */
                 if (after(end_seq, tp->high_seq))
                         flag |= FLAG_DATA_LOST;
 
-                tcp_for_write_queue_from(skb, sk) {
-                        int in_sack = 0;
-
-                        if (skb == tcp_send_head(sk))
-                                break;
-
-                        cached_skb = skb;
-                        cached_fack_count = fack_count;
-                        if (i == first_sack_index) {
-                                tp->fastpath_skb_hint = skb;
-                                tp->fastpath_cnt_hint = fack_count;
+                /* Skip too early cached blocks */
+                while (tcp_sack_cache_ok(tp, cache) &&
+                       !before(start_seq, cache->end_seq))
+                        cache++;
+
+                /* Can skip some work by looking recv_sack_cache? */
+                if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+                    after(end_seq, cache->start_seq)) {
+
+                        /* Head todo? */
+                        if (before(start_seq, cache->start_seq)) {
+                                skb = tcp_sacktag_skip(skb, sk, start_seq);
+                                skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq,
+                                                       cache->start_seq, dup_sack,
+                                                       &fack_count, &reord, &flag);
                         }
 
-                        /* The retransmission queue is always in order, so
-                         * we can short-circuit the walk early.
-                         */
-                        if (!before(TCP_SKB_CB(skb)->seq, end_seq))
-                                break;
-
-                        dup_sack = (found_dup_sack && (i == first_sack_index));
+                        /* Rest of the block already fully processed? */
+                        if (!after(end_seq, cache->end_seq)) {
+                                skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
+                                                               &fack_count, &reord, &flag);
+                                goto advance_sp;
+                        }
 
-                        /* Due to sorting DSACK may reside within this SACK block! */
-                        if (next_dup) {
-                                u32 dup_start = sp[i+1].start_seq;
-                                u32 dup_end = sp[i+1].end_seq;
+                        /* ...tail remains todo... */
+                        if (TCP_SKB_CB(tp->highest_sack)->end_seq == cache->end_seq) {
+                                /* ...but better entrypoint exists! Check that DSACKs are
+                                 * properly accounted while skipping here
+                                 */
+                                tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
+                                                         &fack_count, &reord, &flag);
 
-                                if (before(TCP_SKB_CB(skb)->seq, dup_end)) {
-                                        in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end);
-                                        if (in_sack > 0)
-                                                dup_sack = 1;
-                                }
+                                skb = tcp_write_queue_next(sk, tp->highest_sack);
+                                fack_count = tp->fackets_out;
+                                cache++;
+                                goto walk;
                         }
 
-                        /* DSACK info lost if out-of-mem, try SACK still */
-                        if (in_sack <= 0)
-                                in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
-                        if (unlikely(in_sack < 0))
-                                break;
-
-                        if (in_sack)
-                                flag |= tcp_sacktag_one(skb, tp, &reord, dup_sack, fack_count);
+                        skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
+                        /* Check overlap against next cached too (past this one already) */
+                        cache++;
+                        continue;
+                }
 
-                        fack_count += tcp_skb_pcount(skb);
+                if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+                        skb = tcp_write_queue_next(sk, tp->highest_sack);
+                        fack_count = tp->fackets_out;
                 }
+                skb = tcp_sacktag_skip(skb, sk, start_seq);
+
+walk:
+                skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
+                                       dup_sack, &fack_count, &reord, &flag);
 
+advance_sp:
                 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
                  * due to in-order walk
                  */
                 if (after(end_seq, tp->frto_highmark))
                         flag &= ~FLAG_ONLY_ORIG_SACKED;
+
+                i++;
         }
 
+        /* Clear the head of the cache sack blocks so we can skip it next time */
+        for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+                tp->recv_sack_cache[i].start_seq = 0;
+                tp->recv_sack_cache[i].end_seq = 0;
+        }
+        for (j = 0; j < used_sacks; j++)
+                tp->recv_sack_cache[i++] = sp[j];
+
         flag |= tcp_mark_lost_retrans(sk);
 
         tcp_verify_left_out(tp);
@@ -2821,9 +2894,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
         }
 
         tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-        /* hint's skb might be NULL but we don't need to care */
-        tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
-                                       tp->fastpath_cnt_hint);
+
         if (ca_ops->pkts_acked) {
                 s32 rtt_us = -1;
 