Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp_input.c | 256
1 file changed, 249 insertions, 7 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                                  u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
         if (fack_count > tp->fackets_out)
                 tp->fackets_out = fack_count;
-
-        if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-                tcp_advance_highest_sack(sk, skb);
 }
 
 /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
         return flag;
 }
 
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                           struct sk_buff *skb, unsigned int pcount,
+                           int shifted, int fack_count, int *reord,
+                           int *flag, int mss)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;      /* We discard results */
+
+        BUG_ON(!pcount);
+
+        TCP_SKB_CB(prev)->end_seq += shifted;
+        TCP_SKB_CB(skb)->seq += shifted;
+
+        skb_shinfo(prev)->gso_segs += pcount;
+        BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+        skb_shinfo(skb)->gso_segs -= pcount;
+
+        /* When we're adding to gso_segs == 1, gso_size will be zero,
+         * in theory this shouldn't be necessary but as long as DSACK
+         * code can come after this skb later on it's better to keep
+         * setting gso_size to something.
+         */
+        if (!skb_shinfo(prev)->gso_size) {
+                skb_shinfo(prev)->gso_size = mss;
+                skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+        }
+
+        /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+        if (skb_shinfo(skb)->gso_segs <= 1) {
+                skb_shinfo(skb)->gso_size = 0;
+                skb_shinfo(skb)->gso_type = 0;
+        }
+
+        *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+                                 pcount);
+
+        /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+        TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+        tcp_clear_all_retrans_hints(tp);
+
+        if (skb->len > 0) {
+                BUG_ON(!tcp_skb_pcount(skb));
+                return 0;
+        }
+
+        /* Whole SKB was eaten :-) */
+
+        TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+        if (skb == tcp_highest_sack(sk))
+                tcp_advance_highest_sack(sk, skb);
+
+        tcp_unlink_write_queue(skb, sk);
+        sk_wmem_free_skb(sk, skb);
+
+        return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+        int mss = tcp_skb_mss(skb);
+
+        if (!mss)
+                mss = skb->len;
+
+        return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+        return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+                                          u32 start_seq, u32 end_seq,
+                                          int dup_sack, int *fack_count,
+                                          int *reord, int *flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct sk_buff *prev;
+        int mss;
+        int pcount = 0;
+        int len;
+        int in_sack;
+
+        if (!sk_can_gso(sk))
+                goto fallback;
+
+        /* Normally R but no L won't result in plain S */
+        if (!dup_sack &&
+            (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+                goto fallback;
+        if (!skb_can_shift(skb))
+                goto fallback;
+        /* This frame is about to be dropped (was ACKed). */
+        if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+                goto fallback;
+
+        /* Can only happen with delayed DSACK + discard craziness */
+        if (unlikely(skb == tcp_write_queue_head(sk)))
+                goto fallback;
+        prev = tcp_write_queue_prev(sk, skb);
+
+        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+                goto fallback;
+
+        in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+                  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+        if (in_sack) {
+                len = skb->len;
+                pcount = tcp_skb_pcount(skb);
+                mss = tcp_shift_mss(skb);
+
+                /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                 * drop this restriction as unnecessary
+                 */
+                if (mss != tcp_shift_mss(prev))
+                        goto fallback;
+        } else {
+                if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+                        goto noop;
+                /* CHECKME: This is non-MSS split case only?, this will
+                 * cause skipped skbs due to advancing loop btw, original
+                 * has that feature too
+                 */
+                if (tcp_skb_pcount(skb) <= 1)
+                        goto noop;
+
+                in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+                if (!in_sack) {
+                        /* TODO: head merge to next could be attempted here
+                         * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+                         * though it might not be worth of the additional hassle
+                         *
+                         * ...we can probably just fallback to what was done
+                         * previously. We could try merging non-SACKed ones
+                         * as well but it probably isn't going to buy off
+                         * because later SACKs might again split them, and
+                         * it would make skb timestamp tracking considerably
+                         * harder problem.
+                         */
+                        goto fallback;
+                }
+
+                len = end_seq - TCP_SKB_CB(skb)->seq;
+                BUG_ON(len < 0);
+                BUG_ON(len > skb->len);
+
+                /* MSS boundaries should be honoured or else pcount will
+                 * severely break even though it makes things bit trickier.
+                 * Optimize common case to avoid most of the divides
+                 */
+                mss = tcp_skb_mss(skb);
+
+                /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                 * drop this restriction as unnecessary
+                 */
+                if (mss != tcp_shift_mss(prev))
+                        goto fallback;
+
+                if (len == mss) {
+                        pcount = 1;
+                } else if (len < mss) {
+                        goto noop;
+                } else {
+                        pcount = len / mss;
+                        len = pcount * mss;
+                }
+        }
+
+        if (!skb_shift(prev, skb, len))
+                goto fallback;
+        if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+                             flag, mss))
+                goto out;
+
+        /* Hole filled allows collapsing with the next as well, this is very
+         * useful when hole on every nth skb pattern happens
+         */
+        if (prev == tcp_write_queue_tail(sk))
+                goto out;
+        skb = tcp_write_queue_next(sk, prev);
+
+        if (!skb_can_shift(skb))
+                goto out;
+        if (skb == tcp_send_head(sk))
+                goto out;
+        if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+                goto out;
+
+        len = skb->len;
+        if (skb_shift(prev, skb, len)) {
+                pcount += tcp_skb_pcount(skb);
+                tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+                                *fack_count, reord, flag, mss);
+        }
+
+out:
+        *fack_count += pcount;
+        return prev;
+
+noop:
+        return skb;
+
+fallback:
+        return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                         struct tcp_sack_block *next_dup,
                                         u32 start_seq, u32 end_seq,
                                         int dup_sack_in, int *fack_count,
                                         int *reord, int *flag)
 {
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct sk_buff *tmp;
+
         tcp_for_write_queue_from(skb, sk) {
                 int in_sack = 0;
                 int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                         dup_sack = 1;
                 }
 
-                if (in_sack <= 0)
-                        in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-                                                        end_seq);
+                /* skb reference here is a bit tricky to get right, since
+                 * shifting can eat and free both this skb and the next,
+                 * so not even _safe variant of the loop is enough.
+                 */
+                if (in_sack <= 0) {
+                        tmp = tcp_shift_skb_data(sk, skb, start_seq,
+                                                 end_seq, dup_sack,
+                                                 fack_count, reord, flag);
+                        if (tmp != NULL) {
+                                if (tmp != skb) {
+                                        skb = tmp;
+                                        continue;
+                                }
+
+                                in_sack = 0;
+                        } else {
+                                in_sack = tcp_match_skb_to_sack(sk, skb,
+                                                                start_seq,
+                                                                end_seq);
+                        }
+                }
+
                 if (unlikely(in_sack < 0))
                         break;
 
-                if (in_sack)
+                if (in_sack) {
                         *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
                                                  *fack_count,
                                                  &(TCP_SKB_CB(skb)->sacked),
                                                  tcp_skb_pcount(skb));
 
+                        if (!before(TCP_SKB_CB(skb)->seq,
+                                    tcp_highest_sack_seq(tp)))
+                                tcp_advance_highest_sack(sk, skb);
+                }
+
                 *fack_count += tcp_skb_pcount(skb);
         }
         return skb;
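
The partial-coverage branch of tcp_shift_skb_data() above only ever shifts whole MSS-sized chunks: a SACKed span shorter than one MSS is left alone (noop), exactly one MSS shifts a single segment without a divide, and anything longer is rounded down to a multiple of the MSS so the byte count and pcount stay consistent. The stand-alone user-space sketch below only illustrates that rounding rule; round_to_mss() is a hypothetical helper for illustration, not part of the patch or the kernel.

#include <stdio.h>

/* Hypothetical helper (not in the patch): round a byte count down to a
 * whole number of MSS-sized segments, mirroring the len == mss /
 * len < mss / len > mss cases in tcp_shift_skb_data().  Returns the
 * segment count and writes the rounded length back through *len;
 * 0 means the span is too short to shift anything.
 */
static unsigned int round_to_mss(unsigned int *len, unsigned int mss)
{
        unsigned int pcount;

        if (*len == mss)
                return 1;               /* common case, no divide needed */
        if (*len < mss)
                return 0;               /* less than one segment: leave skb alone */

        pcount = *len / mss;            /* whole segments only */
        *len = pcount * mss;            /* drop the sub-MSS remainder */
        return pcount;
}

int main(void)
{
        unsigned int len = 3500, mss = 1460;
        unsigned int pcount = round_to_mss(&len, mss);

        /* 3500 bytes at MSS 1460 -> 2 segments, 2920 bytes eligible to shift */
        printf("pcount=%u len=%u\n", pcount, len);
        return 0;
}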