Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	511
1 file changed, 417 insertions(+), 94 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d77c0d29e239..99b7ecbe8893 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -701,13 +701,10 @@ static inline void tcp_set_rto(struct sock *sk)
 	 * all the algo is pure shit and should be replaced
 	 * with correct one. It is exactly, which we pretend to do.
 	 */
-}
 
-/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
- * guarantees that rto is higher.
- */
-static inline void tcp_bound_rto(struct sock *sk)
-{
+	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+	 * guarantees that rto is higher.
+	 */
 	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
 		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
 }
@@ -928,7 +925,6 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
 	tcp_set_rto(sk);
-	tcp_bound_rto(sk);
 	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
 		goto reset;
 	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
@@ -1002,7 +998,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
+					     struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -1236,31 +1233,58 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
 	return dup_sack;
 }
 
+struct tcp_sacktag_state {
+	int reord;
+	int fack_count;
+	int flag;
+};
+
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
  * the incoming SACK may not exactly match but we can find smaller MSS
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 				 u32 start_seq, u32 end_seq)
 {
 	int in_sack, err;
 	unsigned int pkt_len;
+	unsigned int mss;
 
 	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
 		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
 
 	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
 	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
-
+		mss = tcp_skb_mss(skb);
 		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
 
-		if (!in_sack)
+		if (!in_sack) {
 			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
-		else
+			if (pkt_len < mss)
+				pkt_len = mss;
+		} else {
 			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
-		err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size);
+			if (pkt_len < mss)
+				return -EINVAL;
+		}
+
+		/* Round if necessary so that SACKs cover only full MSSes
+		 * and/or the remaining small portion (if present)
+		 */
+		if (pkt_len > mss) {
+			unsigned int new_len = (pkt_len / mss) * mss;
+			if (!in_sack && new_len < pkt_len) {
+				new_len += mss;
+				if (new_len > skb->len)
+					return 0;
+			}
+			pkt_len = new_len;
+		}
+		err = tcp_fragment(sk, skb, pkt_len, mss);
 		if (err < 0)
 			return err;
 	}
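For reference, a small standalone sketch (plain C with a hypothetical helper name, not kernel code) of the rounding rule added above: when a SACK block only partially covers a GSO skb, the split point is rounded to an MSS boundary before tcp_fragment() is called, so fragments cover whole MSSes plus at most one short tail.

	#include <stdio.h>

	/* Hypothetical stand-in for the rounding done before tcp_fragment(). */
	static unsigned int round_sack_split(unsigned int pkt_len, unsigned int mss,
					     unsigned int skb_len, int in_sack)
	{
		if (pkt_len > mss) {
			unsigned int new_len = (pkt_len / mss) * mss;

			/* Head of the skb is not SACKed: round the split point up
			 * so only full MSSes (plus a possible short tail) remain;
			 * 0 means "no usable split", mirroring the return 0 above.
			 */
			if (!in_sack && new_len < pkt_len) {
				new_len += mss;
				if (new_len > skb_len)
					return 0;
			}
			pkt_len = new_len;
		}
		return pkt_len;
	}

	int main(void)
	{
		/* 3000-byte skb, 1460-byte MSS, SACK starts 2000 bytes in. */
		printf("%u\n", round_sack_split(2000, 1460, 3000, 0));	/* prints 2920 */
		return 0;
	}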
@@ -1268,24 +1292,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 	return in_sack;
 }
 
-static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
-			   int *reord, int dup_sack, int fack_count)
+static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
+			  struct tcp_sacktag_state *state,
+			  int dup_sack, int pcount)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u8 sacked = TCP_SKB_CB(skb)->sacked;
-	int flag = 0;
+	int fack_count = state->fack_count;
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
 		if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
 			tp->undo_retrans--;
 		if (sacked & TCPCB_SACKED_ACKED)
-			*reord = min(fack_count, *reord);
+			state->reord = min(fack_count, state->reord);
 	}
 
 	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-		return flag;
+		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
 		if (sacked & TCPCB_SACKED_RETRANS) {
@@ -1294,10 +1319,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 			 * that retransmission is still in flight.
 			 */
 			if (sacked & TCPCB_LOST) {
-				TCP_SKB_CB(skb)->sacked &=
-					~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
-				tp->lost_out -= tcp_skb_pcount(skb);
-				tp->retrans_out -= tcp_skb_pcount(skb);
+				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+				tp->lost_out -= pcount;
+				tp->retrans_out -= pcount;
 			}
 		} else {
 			if (!(sacked & TCPCB_RETRANS)) {
@@ -1306,56 +1330,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 				 */
 				if (before(TCP_SKB_CB(skb)->seq,
 					   tcp_highest_sack_seq(tp)))
-					*reord = min(fack_count, *reord);
+					state->reord = min(fack_count,
+							   state->reord);
 
 				/* SACK enhanced F-RTO (RFC4138; Appendix B) */
 				if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
-					flag |= FLAG_ONLY_ORIG_SACKED;
+					state->flag |= FLAG_ONLY_ORIG_SACKED;
 			}
 
 			if (sacked & TCPCB_LOST) {
-				TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
-				tp->lost_out -= tcp_skb_pcount(skb);
+				sacked &= ~TCPCB_LOST;
+				tp->lost_out -= pcount;
 			}
 		}
 
-		TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
-		flag |= FLAG_DATA_SACKED;
-		tp->sacked_out += tcp_skb_pcount(skb);
+		sacked |= TCPCB_SACKED_ACKED;
+		state->flag |= FLAG_DATA_SACKED;
+		tp->sacked_out += pcount;
 
-		fack_count += tcp_skb_pcount(skb);
+		fack_count += pcount;
 
 		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
 		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
 		    before(TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(tp->lost_skb_hint)->seq))
-			tp->lost_cnt_hint += tcp_skb_pcount(skb);
+			tp->lost_cnt_hint += pcount;
 
 		if (fack_count > tp->fackets_out)
 			tp->fackets_out = fack_count;
-
-		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-			tcp_advance_highest_sack(sk, skb);
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
 	 * frames and clear it. undo_retrans is decreased above, L|R frames
 	 * are accounted above as well.
 	 */
-	if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
-		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-		tp->retrans_out -= tcp_skb_pcount(skb);
+	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+		sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= pcount;
 	}
 
-	return flag;
+	return sacked;
+}
+
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+			   struct tcp_sacktag_state *state,
+			   unsigned int pcount, int shifted, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+
+	BUG_ON(!pcount);
+
+	/* Tweak before seqno plays */
+	if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
+	    !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
+		tp->lost_cnt_hint += pcount;
+
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	/* We discard results */
+	tcp_sacktag_one(skb, sk, state, 0, pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	if (skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = prev;
+	if (skb == tp->scoreboard_skb_hint)
+		tp->scoreboard_skb_hint = prev;
+	if (skb == tp->lost_skb_hint) {
+		tp->lost_skb_hint = prev;
+		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+	}
+
+	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+	return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_skb_seglen(struct sk_buff *skb)
+{
+	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  struct tcp_sacktag_state *state,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_skb_seglen(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is non-MSS split case only?, this will
+		 * cause skipped skbs due to advancing loop btw, original
+		 * has that feature too
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth of the additional hassle
+			 *
+			 * ...we can probably just fallback to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to buy off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things bit trickier.
+		 * Optimize common case to avoid most of the divides
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
+		goto out;
+
+	/* Hole filled allows collapsing with the next as well, this is very
+	 * useful when hole on every nth skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb) ||
+	    (skb == tcp_send_head(sk)) ||
+	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+	    (mss != tcp_skb_seglen(skb)))
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
+	}
+
+out:
+	state->fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+	return NULL;
 }
 
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sack_block *next_dup,
+					struct tcp_sacktag_state *state,
 					u32 start_seq, u32 end_seq,
-					int dup_sack_in, int *fack_count,
-					int *reord, int *flag)
+					int dup_sack_in)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
 	tcp_for_write_queue_from(skb, sk) {
 		int in_sack = 0;
 		int dup_sack = dup_sack_in;
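As an aside, a self-contained sketch (hypothetical struct, not the kernel's skb or shared-info layout) of the sequence-number and GSO-segment bookkeeping that tcp_shifted_skb() above performs when 'pcount' segments worth of 'shifted' bytes move from one skb into the previous, already-SACKed one:

	#include <assert.h>
	#include <stdio.h>

	struct seg {
		unsigned int seq, end_seq;	/* TCP sequence range        */
		unsigned int gso_segs;		/* packet count of this skb  */
		unsigned int gso_size;		/* MSS used for segmentation */
	};

	static void shift_segments(struct seg *prev, struct seg *skb,
				   unsigned int pcount, unsigned int shifted,
				   unsigned int mss)
	{
		assert(pcount && skb->gso_segs >= pcount);

		prev->end_seq += shifted;	/* prev now covers the moved bytes */
		skb->seq += shifted;		/* skb shrinks from the front      */

		prev->gso_segs += pcount;
		skb->gso_segs -= pcount;

		/* A single-segment skb may carry gso_size == 0; give prev a real
		 * MSS so later (D)SACK processing can still fragment it.
		 */
		if (!prev->gso_size)
			prev->gso_size = mss;
		if (skb->gso_segs <= 1)
			skb->gso_size = 0;
	}

	int main(void)
	{
		struct seg prev = { 1000, 2460, 1, 0 };
		struct seg skb  = { 2460, 6840, 3, 1460 };

		shift_segments(&prev, &skb, 2, 2920, 1460);
		printf("prev: %u-%u (%u segs)\n", prev.seq, prev.end_seq, prev.gso_segs);
		printf("skb:  %u-%u (%u segs)\n", skb.seq, skb.end_seq, skb.gso_segs);
		return 0;
	}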
@@ -1376,17 +1624,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 			dup_sack = 1;
 		}
 
-		if (in_sack <= 0)
-			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-							end_seq);
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, state,
+						 start_seq, end_seq, dup_sack);
+			if (tmp != NULL) {
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
 		if (unlikely(in_sack < 0))
 			break;
 
-		if (in_sack)
-			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
-						 *fack_count);
+		if (in_sack) {
+			TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
+								  state,
+								  dup_sack,
+								  tcp_skb_pcount(skb));
+
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
 
-		*fack_count += tcp_skb_pcount(skb);
+		state->fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
 }
@@ -1395,16 +1668,17 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
  * a normal way
  */
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
-					u32 skip_to_seq, int *fack_count)
+					struct tcp_sacktag_state *state,
+					u32 skip_to_seq)
 {
 	tcp_for_write_queue_from(skb, sk) {
 		if (skb == tcp_send_head(sk))
 			break;
 
-		if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
 			break;
 
-		*fack_count += tcp_skb_pcount(skb);
+		state->fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
 }
@@ -1412,18 +1686,17 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
 						struct sock *sk,
 						struct tcp_sack_block *next_dup,
-						u32 skip_to_seq,
-						int *fack_count, int *reord,
-						int *flag)
+						struct tcp_sacktag_state *state,
+						u32 skip_to_seq)
 {
 	if (next_dup == NULL)
 		return skb;
 
 	if (before(next_dup->start_seq, skip_to_seq)) {
-		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
-		skb = tcp_sacktag_walk(skb, sk, NULL,
+		skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+		skb = tcp_sacktag_walk(skb, sk, NULL, state,
 				       next_dup->start_seq, next_dup->end_seq,
-				       1, fack_count, reord, flag);
+				       1);
 	}
 
 	return skb;
@@ -1445,16 +1718,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
 	struct tcp_sack_block sp[TCP_NUM_SACKS];
 	struct tcp_sack_block *cache;
+	struct tcp_sacktag_state state;
 	struct sk_buff *skb;
 	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
 	int used_sacks;
-	int reord = tp->packets_out;
-	int flag = 0;
 	int found_dup_sack = 0;
-	int fack_count;
 	int i, j;
 	int first_sack_index;
 
+	state.flag = 0;
+	state.reord = tp->packets_out;
+
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
 			tp->fackets_out = 0;
@@ -1464,7 +1738,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
 					 num_sacks, prior_snd_una);
 	if (found_dup_sack)
-		flag |= FLAG_DSACKING_ACK;
+		state.flag |= FLAG_DSACKING_ACK;
 
 	/* Eliminate too old ACKs, but take into
 	 * account more or less fresh ones, they can
@@ -1533,7 +1807,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 	}
 
 	skb = tcp_write_queue_head(sk);
-	fack_count = 0;
+	state.fack_count = 0;
 	i = 0;
 
 	if (!tp->sacked_out) {
@@ -1558,7 +1832,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 
 		/* Event "B" in the comment above. */
 		if (after(end_seq, tp->high_seq))
-			flag |= FLAG_DATA_LOST;
+			state.flag |= FLAG_DATA_LOST;
 
 		/* Skip too early cached blocks */
 		while (tcp_sack_cache_ok(tp, cache) &&
@@ -1571,13 +1845,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 
 		/* Head todo? */
 		if (before(start_seq, cache->start_seq)) {
-			skb = tcp_sacktag_skip(skb, sk, start_seq,
-					       &fack_count);
+			skb = tcp_sacktag_skip(skb, sk, &state,
+					       start_seq);
 			skb = tcp_sacktag_walk(skb, sk, next_dup,
+					       &state,
 					       start_seq,
 					       cache->start_seq,
-					       dup_sack, &fack_count,
-					       &reord, &flag);
+					       dup_sack);
 		}
 
 		/* Rest of the block already fully processed? */
@@ -1585,9 +1859,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 			goto advance_sp;
 
 		skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
-					       cache->end_seq,
-					       &fack_count, &reord,
-					       &flag);
+					       &state,
+					       cache->end_seq);
 
 		/* ...tail remains todo... */
 		if (tcp_highest_sack_seq(tp) == cache->end_seq) {
@@ -1595,13 +1868,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 			skb = tcp_highest_sack(sk);
 			if (skb == NULL)
 				break;
-			fack_count = tp->fackets_out;
+			state.fack_count = tp->fackets_out;
 			cache++;
 			goto walk;
 		}
 
-		skb = tcp_sacktag_skip(skb, sk, cache->end_seq,
-				       &fack_count);
+		skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
 		/* Check overlap against next cached too (past this one already) */
 		cache++;
 		continue;
@@ -1611,20 +1883,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 			skb = tcp_highest_sack(sk);
 			if (skb == NULL)
 				break;
-			fack_count = tp->fackets_out;
+			state.fack_count = tp->fackets_out;
 		}
-		skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count);
+		skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
 
 walk:
-		skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
-				       dup_sack, &fack_count, &reord, &flag);
+		skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+				       start_seq, end_seq, dup_sack);
 
 advance_sp:
 		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
 		 * due to in-order walk
 		 */
 		if (after(end_seq, tp->frto_highmark))
-			flag &= ~FLAG_ONLY_ORIG_SACKED;
+			state.flag &= ~FLAG_ONLY_ORIG_SACKED;
 
 		i++;
 	}
@@ -1641,10 +1913,10 @@ advance_sp:
 
 	tcp_verify_left_out(tp);
 
-	if ((reord < tp->fackets_out) &&
+	if ((state.reord < tp->fackets_out) &&
 	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
 	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
-		tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
 
 out:
 
@@ -1654,13 +1926,13 @@ out:
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
-	return flag;
+	return state.flag;
 }
 
 /* Limits sacked_out so that sum with lost_out isn't ever larger than
  * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
  */
-int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static int tcp_limit_reno_sacked(struct tcp_sock *tp)
 {
 	u32 holes;
 
@@ -2336,9 +2608,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 	struct inet_sock *inet = inet_sk(sk);
 
 	if (sk->sk_family == AF_INET) {
-		printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n",
+		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       NIPQUAD(inet->daddr), ntohs(inet->dport),
+		       &inet->daddr, ntohs(inet->dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2346,9 +2618,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	else if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-		printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n",
+		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       NIP6(np->daddr), ntohs(inet->dport),
+		       &np->daddr, ntohs(inet->dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2559,6 +2831,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 }
 
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int mss = tcp_current_mss(sk, 0);
+	u32 prior_lost = tp->lost_out;
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		if (tcp_skb_seglen(skb) > mss &&
+		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+			}
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+		}
+	}
+
+	tcp_clear_retrans_hints_partial(tp);
+
+	if (prior_lost == tp->lost_out)
+		return;
+
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
+	tcp_verify_left_out(tp);
+
+	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase amount of _data_
+	 * in network, but units changed and effective
+	 * cwnd/ssthresh really reduced now.
+	 */
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = 0;
+		tcp_set_ca_state(sk, TCP_CA_Loss);
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
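A minimal sketch (hypothetical types and names, not kernel code) of the selection rule tcp_simple_retransmit() above applies after an MTU reduction: any queued segment larger than the new MSS that has not been SACKed is marked lost and handed to the retransmit machinery.

	#include <stdbool.h>
	#include <stdio.h>

	struct queued_seg {
		unsigned int seglen;	/* per-segment length, as tcp_skb_seglen() */
		bool sacked;		/* TCPCB_SACKED_ACKED already set?         */
	};

	/* Mirror of the test inside the write-queue walk above. */
	static bool oversized_and_unsacked(const struct queued_seg *seg,
					   unsigned int new_mss)
	{
		return seg->seglen > new_mss && !seg->sacked;
	}

	int main(void)
	{
		struct queued_seg q[] = { { 1460, false }, { 1460, true }, { 1200, false } };
		unsigned int new_mss = 1400;	/* path MTU just shrank */

		for (unsigned int i = 0; i < sizeof(q) / sizeof(q[0]); i++)
			printf("seg %u: %s\n", i,
			       oversized_and_unsacked(&q[i], new_mss) ? "mark lost" : "leave");
		return 0;
	}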
@@ -2730,6 +3052,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	tcp_xmit_retransmit_queue(sk);
 }
 
+static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+{
+	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;
+}
+
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Supersedes RFC1323)
  */
@@ -2751,11 +3080,8 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
 	 * in window is lost... Voila.	 --ANK (010210)
 	 */
 	struct tcp_sock *tp = tcp_sk(sk);
-	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-	tcp_bound_rto(sk);
+
+	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
 }
 
 static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
@@ -2772,10 +3098,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-	tcp_bound_rto(sk);
+	tcp_valid_rtt_meas(sk, seq_rtt);
 }
 
 static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,