diff options
-rw-r--r-- | include/linux/tcp.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 271 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 14 |
4 files changed, 172 insertions, 117 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 794497c7d755..08027f1d7f31 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -343,10 +343,7 @@ struct tcp_sock { | |||
343 | struct sk_buff *scoreboard_skb_hint; | 343 | struct sk_buff *scoreboard_skb_hint; |
344 | struct sk_buff *retransmit_skb_hint; | 344 | struct sk_buff *retransmit_skb_hint; |
345 | struct sk_buff *forward_skb_hint; | 345 | struct sk_buff *forward_skb_hint; |
346 | struct sk_buff *fastpath_skb_hint; | ||
347 | 346 | ||
348 | int fastpath_cnt_hint; /* Lags behind by current skb's pcount | ||
349 | * compared to respective fackets_out */ | ||
350 | int lost_cnt_hint; | 347 | int lost_cnt_hint; |
351 | int retransmit_cnt_hint; | 348 | int retransmit_cnt_hint; |
352 | 349 | ||
diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ede804b16d4..f0c5e7a29400 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -1081,7 +1081,6 @@ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) | |||
1081 | static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) | 1081 | static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) |
1082 | { | 1082 | { |
1083 | tcp_clear_retrans_hints_partial(tp); | 1083 | tcp_clear_retrans_hints_partial(tp); |
1084 | tp->fastpath_skb_hint = NULL; | ||
1085 | } | 1084 | } |
1086 | 1085 | ||
1087 | /* MD5 Signature */ | 1086 | /* MD5 Signature */ |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a287747e9dd6..3ad6a19ad30f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -1333,6 +1333,88 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct tcp_sock *tp, | |||
1333 | return flag; | 1333 | return flag; |
1334 | } | 1334 | } |
1335 | 1335 | ||
1336 | static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | ||
1337 | struct tcp_sack_block *next_dup, | ||
1338 | u32 start_seq, u32 end_seq, | ||
1339 | int dup_sack_in, int *fack_count, | ||
1340 | int *reord, int *flag) | ||
1341 | { | ||
1342 | struct tcp_sock *tp = tcp_sk(sk); | ||
1343 | |||
1344 | tcp_for_write_queue_from(skb, sk) { | ||
1345 | int in_sack = 0; | ||
1346 | int dup_sack = dup_sack_in; | ||
1347 | |||
1348 | if (skb == tcp_send_head(sk)) | ||
1349 | break; | ||
1350 | |||
1351 | /* queue is in-order => we can short-circuit the walk early */ | ||
1352 | if (!before(TCP_SKB_CB(skb)->seq, end_seq)) | ||
1353 | break; | ||
1354 | |||
1355 | if ((next_dup != NULL) && | ||
1356 | before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { | ||
1357 | in_sack = tcp_match_skb_to_sack(sk, skb, | ||
1358 | next_dup->start_seq, | ||
1359 | next_dup->end_seq); | ||
1360 | if (in_sack > 0) | ||
1361 | dup_sack = 1; | ||
1362 | } | ||
1363 | |||
1364 | if (in_sack <= 0) | ||
1365 | in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); | ||
1366 | if (unlikely(in_sack < 0)) | ||
1367 | break; | ||
1368 | |||
1369 | if (in_sack) | ||
1370 | *flag |= tcp_sacktag_one(skb, tp, reord, dup_sack, *fack_count); | ||
1371 | |||
1372 | *fack_count += tcp_skb_pcount(skb); | ||
1373 | } | ||
1374 | return skb; | ||
1375 | } | ||
1376 | |||
1377 | /* Avoid all extra work that is being done by sacktag while walking in | ||
1378 | * a normal way | ||
1379 | */ | ||
1380 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, | ||
1381 | u32 skip_to_seq) | ||
1382 | { | ||
1383 | tcp_for_write_queue_from(skb, sk) { | ||
1384 | if (skb == tcp_send_head(sk)) | ||
1385 | break; | ||
1386 | |||
1387 | if (before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) | ||
1388 | break; | ||
1389 | } | ||
1390 | return skb; | ||
1391 | } | ||
1392 | |||
1393 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, | ||
1394 | struct sock *sk, | ||
1395 | struct tcp_sack_block *next_dup, | ||
1396 | u32 skip_to_seq, | ||
1397 | int *fack_count, int *reord, | ||
1398 | int *flag) | ||
1399 | { | ||
1400 | if (next_dup == NULL) | ||
1401 | return skb; | ||
1402 | |||
1403 | if (before(next_dup->start_seq, skip_to_seq)) { | ||
1404 | skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); | ||
1405 | tcp_sacktag_walk(skb, sk, NULL, | ||
1406 | next_dup->start_seq, next_dup->end_seq, | ||
1407 | 1, fack_count, reord, flag); | ||
1408 | } | ||
1409 | |||
1410 | return skb; | ||
1411 | } | ||
1412 | |||
1413 | static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) | ||
1414 | { | ||
1415 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); | ||
1416 | } | ||
1417 | |||
1336 | static int | 1418 | static int |
1337 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) | 1419 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) |
1338 | { | 1420 | { |
@@ -1342,16 +1424,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1342 | TCP_SKB_CB(ack_skb)->sacked); | 1424 | TCP_SKB_CB(ack_skb)->sacked); |
1343 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); | 1425 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); |
1344 | struct tcp_sack_block sp[4]; | 1426 | struct tcp_sack_block sp[4]; |
1345 | struct sk_buff *cached_skb; | 1427 | struct tcp_sack_block *cache; |
1428 | struct sk_buff *skb; | ||
1346 | int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; | 1429 | int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; |
1347 | int used_sacks; | 1430 | int used_sacks; |
1348 | int reord = tp->packets_out; | 1431 | int reord = tp->packets_out; |
1349 | int flag = 0; | 1432 | int flag = 0; |
1350 | int found_dup_sack = 0; | 1433 | int found_dup_sack = 0; |
1351 | int cached_fack_count; | 1434 | int fack_count; |
1352 | int i; | 1435 | int i, j; |
1353 | int first_sack_index; | 1436 | int first_sack_index; |
1354 | int force_one_sack; | ||
1355 | 1437 | ||
1356 | if (!tp->sacked_out) { | 1438 | if (!tp->sacked_out) { |
1357 | if (WARN_ON(tp->fackets_out)) | 1439 | if (WARN_ON(tp->fackets_out)) |
@@ -1409,132 +1491,123 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1409 | used_sacks++; | 1491 | used_sacks++; |
1410 | } | 1492 | } |
1411 | 1493 | ||
1412 | /* SACK fastpath: | 1494 | /* order SACK blocks to allow in order walk of the retrans queue */ |
1413 | * if the only SACK change is the increase of the end_seq of | 1495 | for (i = used_sacks - 1; i > 0; i--) { |
1414 | * the first block then only apply that SACK block | 1496 | for (j = 0; j < i; j++){ |
1415 | * and use retrans queue hinting otherwise slowpath */ | 1497 | if (after(sp[j].start_seq, sp[j+1].start_seq)) { |
1416 | force_one_sack = 1; | 1498 | struct tcp_sack_block tmp; |
1417 | for (i = 0; i < used_sacks; i++) { | ||
1418 | u32 start_seq = sp[i].start_seq; | ||
1419 | u32 end_seq = sp[i].end_seq; | ||
1420 | |||
1421 | if (i == 0) { | ||
1422 | if (tp->recv_sack_cache[i].start_seq != start_seq) | ||
1423 | force_one_sack = 0; | ||
1424 | } else { | ||
1425 | if ((tp->recv_sack_cache[i].start_seq != start_seq) || | ||
1426 | (tp->recv_sack_cache[i].end_seq != end_seq)) | ||
1427 | force_one_sack = 0; | ||
1428 | } | ||
1429 | tp->recv_sack_cache[i].start_seq = start_seq; | ||
1430 | tp->recv_sack_cache[i].end_seq = end_seq; | ||
1431 | } | ||
1432 | /* Clear the rest of the cache sack blocks so they won't match mistakenly. */ | ||
1433 | for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) { | ||
1434 | tp->recv_sack_cache[i].start_seq = 0; | ||
1435 | tp->recv_sack_cache[i].end_seq = 0; | ||
1436 | } | ||
1437 | 1499 | ||
1438 | if (force_one_sack) | 1500 | tmp = sp[j]; |
1439 | used_sacks = 1; | 1501 | sp[j] = sp[j+1]; |
1440 | else { | 1502 | sp[j+1] = tmp; |
1441 | int j; | ||
1442 | tp->fastpath_skb_hint = NULL; | ||
1443 | |||
1444 | /* order SACK blocks to allow in order walk of the retrans queue */ | ||
1445 | for (i = used_sacks - 1; i > 0; i--) { | ||
1446 | for (j = 0; j < i; j++){ | ||
1447 | if (after(sp[j].start_seq, sp[j+1].start_seq)) { | ||
1448 | struct tcp_sack_block tmp; | ||
1449 | |||
1450 | tmp = sp[j]; | ||
1451 | sp[j] = sp[j+1]; | ||
1452 | sp[j+1] = tmp; | ||
1453 | |||
1454 | /* Track where the first SACK block goes to */ | ||
1455 | if (j == first_sack_index) | ||
1456 | first_sack_index = j+1; | ||
1457 | } | ||
1458 | 1503 | ||
1504 | /* Track where the first SACK block goes to */ | ||
1505 | if (j == first_sack_index) | ||
1506 | first_sack_index = j+1; | ||
1459 | } | 1507 | } |
1460 | } | 1508 | } |
1461 | } | 1509 | } |
1462 | 1510 | ||
1463 | /* Use SACK fastpath hint if valid */ | 1511 | skb = tcp_write_queue_head(sk); |
1464 | cached_skb = tp->fastpath_skb_hint; | 1512 | fack_count = 0; |
1465 | cached_fack_count = tp->fastpath_cnt_hint; | 1513 | i = 0; |
1466 | if (!cached_skb) { | 1514 | |
1467 | cached_skb = tcp_write_queue_head(sk); | 1515 | if (!tp->sacked_out) { |
1468 | cached_fack_count = 0; | 1516 | /* It's already past, so skip checking against it */ |
1517 | cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); | ||
1518 | } else { | ||
1519 | cache = tp->recv_sack_cache; | ||
1520 | /* Skip empty blocks at the head of the cache */ | ||
1521 | while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && | ||
1522 | !cache->end_seq) | ||
1523 | cache++; | ||
1469 | } | 1524 | } |
1470 | 1525 | ||
1471 | for (i = 0; i < used_sacks; i++) { | 1526 | while (i < used_sacks) { |
1472 | struct sk_buff *skb; | ||
1473 | u32 start_seq = sp[i].start_seq; | 1527 | u32 start_seq = sp[i].start_seq; |
1474 | u32 end_seq = sp[i].end_seq; | 1528 | u32 end_seq = sp[i].end_seq; |
1475 | int fack_count; | ||
1476 | int dup_sack = (found_dup_sack && (i == first_sack_index)); | 1529 | int dup_sack = (found_dup_sack && (i == first_sack_index)); |
1477 | int next_dup = (found_dup_sack && (i+1 == first_sack_index)); | 1530 | struct tcp_sack_block *next_dup = NULL; |
1478 | 1531 | ||
1479 | skb = cached_skb; | 1532 | if (found_dup_sack && ((i + 1) == first_sack_index)) |
1480 | fack_count = cached_fack_count; | 1533 | next_dup = &sp[i + 1]; |
1481 | 1534 | ||
1482 | /* Event "B" in the comment above. */ | 1535 | /* Event "B" in the comment above. */ |
1483 | if (after(end_seq, tp->high_seq)) | 1536 | if (after(end_seq, tp->high_seq)) |
1484 | flag |= FLAG_DATA_LOST; | 1537 | flag |= FLAG_DATA_LOST; |
1485 | 1538 | ||
1486 | tcp_for_write_queue_from(skb, sk) { | 1539 | /* Skip too early cached blocks */ |
1487 | int in_sack = 0; | 1540 | while (tcp_sack_cache_ok(tp, cache) && |
1488 | 1541 | !before(start_seq, cache->end_seq)) | |
1489 | if (skb == tcp_send_head(sk)) | 1542 | cache++; |
1490 | break; | 1543 | |
1491 | 1544 | /* Can skip some work by looking at recv_sack_cache? */ | 
1492 | cached_skb = skb; | 1545 | if (tcp_sack_cache_ok(tp, cache) && !dup_sack && |
1493 | cached_fack_count = fack_count; | 1546 | after(end_seq, cache->start_seq)) { |
1494 | if (i == first_sack_index) { | 1547 | |
1495 | tp->fastpath_skb_hint = skb; | 1548 | /* Head todo? */ |
1496 | tp->fastpath_cnt_hint = fack_count; | 1549 | if (before(start_seq, cache->start_seq)) { |
1550 | skb = tcp_sacktag_skip(skb, sk, start_seq); | ||
1551 | skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, | ||
1552 | cache->start_seq, dup_sack, | ||
1553 | &fack_count, &reord, &flag); | ||
1497 | } | 1554 | } |
1498 | 1555 | ||
1499 | /* The retransmission queue is always in order, so | 1556 | /* Rest of the block already fully processed? */ |
1500 | * we can short-circuit the walk early. | 1557 | if (!after(end_seq, cache->end_seq)) { |
1501 | */ | 1558 | skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq, |
1502 | if (!before(TCP_SKB_CB(skb)->seq, end_seq)) | 1559 | &fack_count, &reord, &flag); |
1503 | break; | 1560 | goto advance_sp; |
1504 | 1561 | } | |
1505 | dup_sack = (found_dup_sack && (i == first_sack_index)); | ||
1506 | 1562 | ||
1507 | /* Due to sorting DSACK may reside within this SACK block! */ | 1563 | /* ...tail remains todo... */ |
1508 | if (next_dup) { | 1564 | if (TCP_SKB_CB(tp->highest_sack)->end_seq == cache->end_seq) { |
1509 | u32 dup_start = sp[i+1].start_seq; | 1565 | /* ...but better entrypoint exists! Check that DSACKs are |
1510 | u32 dup_end = sp[i+1].end_seq; | 1566 | * properly accounted while skipping here |
1567 | */ | ||
1568 | tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq, | ||
1569 | &fack_count, &reord, &flag); | ||
1511 | 1570 | ||
1512 | if (before(TCP_SKB_CB(skb)->seq, dup_end)) { | 1571 | skb = tcp_write_queue_next(sk, tp->highest_sack); |
1513 | in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end); | 1572 | fack_count = tp->fackets_out; |
1514 | if (in_sack > 0) | 1573 | cache++; |
1515 | dup_sack = 1; | 1574 | goto walk; |
1516 | } | ||
1517 | } | 1575 | } |
1518 | 1576 | ||
1519 | /* DSACK info lost if out-of-mem, try SACK still */ | 1577 | skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
1520 | if (in_sack <= 0) | 1578 | /* Check overlap against next cached too (past this one already) */ |
1521 | in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); | 1579 | cache++; |
1522 | if (unlikely(in_sack < 0)) | 1580 | continue; |
1523 | break; | 1581 | } |
1524 | |||
1525 | if (in_sack) | ||
1526 | flag |= tcp_sacktag_one(skb, tp, &reord, dup_sack, fack_count); | ||
1527 | 1582 | ||
1528 | fack_count += tcp_skb_pcount(skb); | 1583 | if (!before(start_seq, tcp_highest_sack_seq(tp))) { |
1584 | skb = tcp_write_queue_next(sk, tp->highest_sack); | ||
1585 | fack_count = tp->fackets_out; | ||
1529 | } | 1586 | } |
1587 | skb = tcp_sacktag_skip(skb, sk, start_seq); | ||
1588 | |||
1589 | walk: | ||
1590 | skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, | ||
1591 | dup_sack, &fack_count, &reord, &flag); | ||
1530 | 1592 | ||
1593 | advance_sp: | ||
1531 | /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct | 1594 | /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct |
1532 | * due to in-order walk | 1595 | * due to in-order walk |
1533 | */ | 1596 | */ |
1534 | if (after(end_seq, tp->frto_highmark)) | 1597 | if (after(end_seq, tp->frto_highmark)) |
1535 | flag &= ~FLAG_ONLY_ORIG_SACKED; | 1598 | flag &= ~FLAG_ONLY_ORIG_SACKED; |
1599 | |||
1600 | i++; | ||
1536 | } | 1601 | } |
1537 | 1602 | ||
1603 | /* Clear the head of the cache sack blocks so we can skip it next time */ | ||
1604 | for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { | ||
1605 | tp->recv_sack_cache[i].start_seq = 0; | ||
1606 | tp->recv_sack_cache[i].end_seq = 0; | ||
1607 | } | ||
1608 | for (j = 0; j < used_sacks; j++) | ||
1609 | tp->recv_sack_cache[i++] = sp[j]; | ||
1610 | |||
1538 | flag |= tcp_mark_lost_retrans(sk); | 1611 | flag |= tcp_mark_lost_retrans(sk); |
1539 | 1612 | ||
1540 | tcp_verify_left_out(tp); | 1613 | tcp_verify_left_out(tp); |
@@ -2821,9 +2894,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, | |||
2821 | } | 2894 | } |
2822 | 2895 | ||
2823 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); | 2896 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); |
2824 | /* hint's skb might be NULL but we don't need to care */ | 2897 | |
2825 | tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, | ||
2826 | tp->fastpath_cnt_hint); | ||
2827 | if (ca_ops->pkts_acked) { | 2898 | if (ca_ops->pkts_acked) { |
2828 | s32 rtt_us = -1; | 2899 | s32 rtt_us = -1; |
2829 | 2900 | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce506af5ce07..030fc69ea217 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -653,9 +653,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned | |||
653 | } | 653 | } |
654 | 654 | ||
655 | /* When a modification to fackets out becomes necessary, we need to check | 655 | /* When a modification to fackets out becomes necessary, we need to check |
656 | * skb is counted to fackets_out or not. Another important thing is to | 656 | * skb is counted to fackets_out or not. |
657 | * tweak SACK fastpath hint too as it would overwrite all changes unless | ||
658 | * hint is also changed. | ||
659 | */ | 657 | */ |
660 | static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, | 658 | static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, |
661 | int decr) | 659 | int decr) |
@@ -667,11 +665,6 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, | |||
667 | 665 | ||
668 | if (!before(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) | 666 | if (!before(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) |
669 | tp->fackets_out -= decr; | 667 | tp->fackets_out -= decr; |
670 | |||
671 | /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */ | ||
672 | if (tp->fastpath_skb_hint != NULL && | ||
673 | after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) | ||
674 | tp->fastpath_cnt_hint -= decr; | ||
675 | } | 668 | } |
676 | 669 | ||
677 | /* Function to create two new TCP segments. Shrinks the given segment | 670 | /* Function to create two new TCP segments. Shrinks the given segment |
@@ -1753,11 +1746,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
1753 | 1746 | ||
1754 | /* changed transmit queue under us so clear hints */ | 1747 | /* changed transmit queue under us so clear hints */ |
1755 | tcp_clear_retrans_hints_partial(tp); | 1748 | tcp_clear_retrans_hints_partial(tp); |
1756 | /* manually tune sacktag skb hint */ | ||
1757 | if (tp->fastpath_skb_hint == next_skb) { | ||
1758 | tp->fastpath_skb_hint = skb; | ||
1759 | tp->fastpath_cnt_hint -= tcp_skb_pcount(skb); | ||
1760 | } | ||
1761 | 1749 | ||
1762 | sk_stream_free_skb(sk, next_skb); | 1750 | sk_stream_free_skb(sk, next_skb); |
1763 | } | 1751 | } |