aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_input.c256
1 files changed, 249 insertions, 7 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1242 * aligned portion of it that matches. Therefore we might need to fragment 1242 * aligned portion of it that matches. Therefore we might need to fragment
1243 * which may fail and creates some hassle (caller must handle error case 1243 * which may fail and creates some hassle (caller must handle error case
1244 * returns). 1244 * returns).
1245 *
1246 * FIXME: this could be merged to shift decision code
1245 */ 1247 */
1246static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1248static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1247 u32 start_seq, u32 end_seq) 1249 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1353 1355
1354 if (fack_count > tp->fackets_out) 1356 if (fack_count > tp->fackets_out)
1355 tp->fackets_out = fack_count; 1357 tp->fackets_out = fack_count;
1356
1357 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
1358 tcp_advance_highest_sack(sk, skb);
1359 } 1358 }
1360 1359
1361 /* D-SACK. We can detect redundant retransmission in S|R and plain R 1360 /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1370 return flag; 1369 return flag;
1371} 1370}
1372 1371
1372static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1373 struct sk_buff *skb, unsigned int pcount,
1374 int shifted, int fack_count, int *reord,
1375 int *flag, int mss)
1376{
1377 struct tcp_sock *tp = tcp_sk(sk);
1378 u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */
1379
1380 BUG_ON(!pcount);
1381
1382 TCP_SKB_CB(prev)->end_seq += shifted;
1383 TCP_SKB_CB(skb)->seq += shifted;
1384
1385 skb_shinfo(prev)->gso_segs += pcount;
1386 BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
1387 skb_shinfo(skb)->gso_segs -= pcount;
1388
1389 /* When we're adding to gso_segs == 1, gso_size will be zero,
1390 * in theory this shouldn't be necessary but as long as DSACK
1391 * code can come after this skb later on it's better to keep
1392 * setting gso_size to something.
1393 */
1394 if (!skb_shinfo(prev)->gso_size) {
1395 skb_shinfo(prev)->gso_size = mss;
1396 skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1397 }
1398
1399 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1400 if (skb_shinfo(skb)->gso_segs <= 1) {
1401 skb_shinfo(skb)->gso_size = 0;
1402 skb_shinfo(skb)->gso_type = 0;
1403 }
1404
1405 *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
1406 pcount);
1407
1408 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1409 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1410
1411 tcp_clear_all_retrans_hints(tp);
1412
1413 if (skb->len > 0) {
1414 BUG_ON(!tcp_skb_pcount(skb));
1415 return 0;
1416 }
1417
1418 /* Whole SKB was eaten :-) */
1419
1420 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1421 if (skb == tcp_highest_sack(sk))
1422 tcp_advance_highest_sack(sk, skb);
1423
1424 tcp_unlink_write_queue(skb, sk);
1425 sk_wmem_free_skb(sk, skb);
1426
1427 return 1;
1428}
1429
1430/* I wish gso_size would have a bit more sane initialization than
1431 * something-or-zero which complicates things
1432 */
1433static int tcp_shift_mss(struct sk_buff *skb)
1434{
1435 int mss = tcp_skb_mss(skb);
1436
1437 if (!mss)
1438 mss = skb->len;
1439
1440 return mss;
1441}
1442
/* An skb is eligible for shifting only when its head area is empty and it
 * is non-linear: shifting pages past data in the head area doesn't work.
 * Returns 1 if @skb can be shifted, 0 otherwise.
 */
static int skb_can_shift(struct sk_buff *skb)
{
	if (skb_headlen(skb))
		return 0;

	return skb_is_nonlinear(skb) ? 1 : 0;
}
1448
1449/* Try collapsing SACK blocks spanning across multiple skbs to a single
1450 * skb.
1451 */
1452static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1453 u32 start_seq, u32 end_seq,
1454 int dup_sack, int *fack_count,
1455 int *reord, int *flag)
1456{
1457 struct tcp_sock *tp = tcp_sk(sk);
1458 struct sk_buff *prev;
1459 int mss;
1460 int pcount = 0;
1461 int len;
1462 int in_sack;
1463
1464 if (!sk_can_gso(sk))
1465 goto fallback;
1466
1467 /* Normally R but no L won't result in plain S */
1468 if (!dup_sack &&
1469 (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
1470 goto fallback;
1471 if (!skb_can_shift(skb))
1472 goto fallback;
1473 /* This frame is about to be dropped (was ACKed). */
1474 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1475 goto fallback;
1476
1477 /* Can only happen with delayed DSACK + discard craziness */
1478 if (unlikely(skb == tcp_write_queue_head(sk)))
1479 goto fallback;
1480 prev = tcp_write_queue_prev(sk, skb);
1481
1482 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1483 goto fallback;
1484
1485 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1486 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1487
1488 if (in_sack) {
1489 len = skb->len;
1490 pcount = tcp_skb_pcount(skb);
1491 mss = tcp_shift_mss(skb);
1492
1493 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1494 * drop this restriction as unnecessary
1495 */
1496 if (mss != tcp_shift_mss(prev))
1497 goto fallback;
1498 } else {
1499 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1500 goto noop;
1501 /* CHECKME: This is non-MSS split case only?, this will
1502 * cause skipped skbs due to advancing loop btw, original
1503 * has that feature too
1504 */
1505 if (tcp_skb_pcount(skb) <= 1)
1506 goto noop;
1507
1508 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1509 if (!in_sack) {
1510 /* TODO: head merge to next could be attempted here
1511 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1512 * though it might not be worth of the additional hassle
1513 *
1514 * ...we can probably just fallback to what was done
1515 * previously. We could try merging non-SACKed ones
1516 * as well but it probably isn't going to buy off
1517 * because later SACKs might again split them, and
1518 * it would make skb timestamp tracking considerably
1519 * harder problem.
1520 */
1521 goto fallback;
1522 }
1523
1524 len = end_seq - TCP_SKB_CB(skb)->seq;
1525 BUG_ON(len < 0);
1526 BUG_ON(len > skb->len);
1527
1528 /* MSS boundaries should be honoured or else pcount will
1529 * severely break even though it makes things bit trickier.
1530 * Optimize common case to avoid most of the divides
1531 */
1532 mss = tcp_skb_mss(skb);
1533
1534 /* TODO: Fix DSACKs to not fragment already SACKed and we can
1535 * drop this restriction as unnecessary
1536 */
1537 if (mss != tcp_shift_mss(prev))
1538 goto fallback;
1539
1540 if (len == mss) {
1541 pcount = 1;
1542 } else if (len < mss) {
1543 goto noop;
1544 } else {
1545 pcount = len / mss;
1546 len = pcount * mss;
1547 }
1548 }
1549
1550 if (!skb_shift(prev, skb, len))
1551 goto fallback;
1552 if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
1553 flag, mss))
1554 goto out;
1555
1556 /* Hole filled allows collapsing with the next as well, this is very
1557 * useful when hole on every nth skb pattern happens
1558 */
1559 if (prev == tcp_write_queue_tail(sk))
1560 goto out;
1561 skb = tcp_write_queue_next(sk, prev);
1562
1563 if (!skb_can_shift(skb))
1564 goto out;
1565 if (skb == tcp_send_head(sk))
1566 goto out;
1567 if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1568 goto out;
1569
1570 len = skb->len;
1571 if (skb_shift(prev, skb, len)) {
1572 pcount += tcp_skb_pcount(skb);
1573 tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
1574 *fack_count, reord, flag, mss);
1575 }
1576
1577out:
1578 *fack_count += pcount;
1579 return prev;
1580
1581noop:
1582 return skb;
1583
1584fallback:
1585 return NULL;
1586}
1587
1373static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, 1588static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1374 struct tcp_sack_block *next_dup, 1589 struct tcp_sack_block *next_dup,
1375 u32 start_seq, u32 end_seq, 1590 u32 start_seq, u32 end_seq,
1376 int dup_sack_in, int *fack_count, 1591 int dup_sack_in, int *fack_count,
1377 int *reord, int *flag) 1592 int *reord, int *flag)
1378{ 1593{
1594 struct tcp_sock *tp = tcp_sk(sk);
1595 struct sk_buff *tmp;
1596
1379 tcp_for_write_queue_from(skb, sk) { 1597 tcp_for_write_queue_from(skb, sk) {
1380 int in_sack = 0; 1598 int in_sack = 0;
1381 int dup_sack = dup_sack_in; 1599 int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1396 dup_sack = 1; 1614 dup_sack = 1;
1397 } 1615 }
1398 1616
1399 if (in_sack <= 0) 1617 /* skb reference here is a bit tricky to get right, since
1400 in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, 1618 * shifting can eat and free both this skb and the next,
1401 end_seq); 1619 * so not even _safe variant of the loop is enough.
1620 */
1621 if (in_sack <= 0) {
1622 tmp = tcp_shift_skb_data(sk, skb, start_seq,
1623 end_seq, dup_sack,
1624 fack_count, reord, flag);
1625 if (tmp != NULL) {
1626 if (tmp != skb) {
1627 skb = tmp;
1628 continue;
1629 }
1630
1631 in_sack = 0;
1632 } else {
1633 in_sack = tcp_match_skb_to_sack(sk, skb,
1634 start_seq,
1635 end_seq);
1636 }
1637 }
1638
1402 if (unlikely(in_sack < 0)) 1639 if (unlikely(in_sack < 0))
1403 break; 1640 break;
1404 1641
1405 if (in_sack) 1642 if (in_sack) {
1406 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, 1643 *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
1407 *fack_count, 1644 *fack_count,
1408 &(TCP_SKB_CB(skb)->sacked), 1645 &(TCP_SKB_CB(skb)->sacked),
1409 tcp_skb_pcount(skb)); 1646 tcp_skb_pcount(skb));
1410 1647
1648 if (!before(TCP_SKB_CB(skb)->seq,
1649 tcp_highest_sack_seq(tp)))
1650 tcp_advance_highest_sack(sk, skb);
1651 }
1652
1411 *fack_count += tcp_skb_pcount(skb); 1653 *fack_count += tcp_skb_pcount(skb);
1412 } 1654 }
1413 return skb; 1655 return skb;