diff options
author | Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> | 2007-11-15 22:39:31 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-01-28 17:54:03 -0500 |
commit | 85cc391c0e4584db594bfc4005c63c07c76c5077 (patch) | |
tree | d62f02260161ca6d0fa03986ae70a24e260d1c75 | |
parent | f577111302677e6d1448475821cc19ba8835f60e (diff) |
[TCP]: non-FACK SACK follows conservative SACK loss recovery
Many assumptions that are true when no reordering or other
strange events happen are not part of RFC3517. The FACK
implementation is based on such assumptions. Previously (before
the rewrite) the non-FACK SACK was basically doing a fast rexmit
and then timing out all skbs when the first cumulative ACK arrived,
which cannot really be called SACK based recovery :-).
RFC3517 SACK disables these things:
- Per SKB timeouts & head timeout entry to recovery
- Marking at least one skb while in recovery (RFC3517 does this
only for the fast retransmission but not for the other skbs
when cumulative ACKs arrive in the recovery)
- Sacktag's loss detection flavors B and C (see comment before
tcp_sacktag_write_queue)
This does not implement the "last resort" rule 3 of NextSeg, which
allows retransmissions also when not enough SACK blocks have yet
arrived above a segment for IsLost to return true [RFC3517].
The implementation differs from RFC3517 in these points:
- Rate-halving is used instead of FlightSize / 2
- Instead of using dupACKs to trigger the recovery, the number
of SACK blocks is used as FACK does with SACK blocks+holes
(which provides a more accurate number). It seems that the
difference can have a negative effect only if the receiver does
not generate SACK blocks at all even though it claimed to be
SACK-capable.
- Dupthresh is not constant. Dynamic adjustments include
both holes and sacked segments (equal to what FACK has) due to
the complexity involved in determining the number of sacked blocks
between highest_sack and the reordered segment. Thus it will
be an over-estimate.
Implementation note:
tcp_clean_rtx_queue doesn't need a lost_cnt tweak because head
skb at that point cannot be SACKED_ACKED (nor would such
situation last for long enough to cause problems).
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/tcp_input.c | 80 |
1 files changed, 62 insertions, 18 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 26713e5d89db..c0e8f2b1fa7e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -863,6 +863,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | |||
863 | */ | 863 | */ |
864 | static void tcp_disable_fack(struct tcp_sock *tp) | 864 | static void tcp_disable_fack(struct tcp_sock *tp) |
865 | { | 865 | { |
866 | /* RFC3517 uses different metric in lost marker => reset on change */ | ||
867 | if (tcp_is_fack(tp)) | ||
868 | tp->lost_skb_hint = NULL; | ||
866 | tp->rx_opt.sack_ok &= ~2; | 869 | tp->rx_opt.sack_ok &= ~2; |
867 | } | 870 | } |
868 | 871 | ||
@@ -1470,6 +1473,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1470 | tp->sacked_out += tcp_skb_pcount(skb); | 1473 | tp->sacked_out += tcp_skb_pcount(skb); |
1471 | 1474 | ||
1472 | fack_count += tcp_skb_pcount(skb); | 1475 | fack_count += tcp_skb_pcount(skb); |
1476 | |||
1477 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ | ||
1478 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && | ||
1479 | before(TCP_SKB_CB(skb)->seq, | ||
1480 | TCP_SKB_CB(tp->lost_skb_hint)->seq)) | ||
1481 | tp->lost_cnt_hint += tcp_skb_pcount(skb); | ||
1482 | |||
1473 | if (fack_count > tp->fackets_out) | 1483 | if (fack_count > tp->fackets_out) |
1474 | tp->fackets_out = fack_count; | 1484 | tp->fackets_out = fack_count; |
1475 | 1485 | ||
@@ -1504,7 +1514,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ | |||
1504 | flag &= ~FLAG_ONLY_ORIG_SACKED; | 1514 | flag &= ~FLAG_ONLY_ORIG_SACKED; |
1505 | } | 1515 | } |
1506 | 1516 | ||
1507 | if (tp->retrans_out && | 1517 | if (tcp_is_fack(tp) && tp->retrans_out && |
1508 | after(highest_sack_end_seq, tp->lost_retrans_low) && | 1518 | after(highest_sack_end_seq, tp->lost_retrans_low) && |
1509 | icsk->icsk_ca_state == TCP_CA_Recovery) | 1519 | icsk->icsk_ca_state == TCP_CA_Recovery) |
1510 | flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); | 1520 | flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); |
@@ -1858,6 +1868,26 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) | |||
1858 | return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; | 1868 | return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; |
1859 | } | 1869 | } |
1860 | 1870 | ||
1871 | /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs | ||
1872 | * counter when SACK is enabled (without SACK, sacked_out is used for | ||
1873 | * that purpose). | ||
1874 | * | ||
1875 | * Instead, with FACK TCP uses fackets_out that includes both SACKed | ||
1876 | * segments up to the highest received SACK block so far and holes in | ||
1877 | * between them. | ||
1878 | * | ||
1879 | * With reordering, holes may still be in flight, so RFC3517 recovery | ||
1880 | * uses pure sacked_out (total number of SACKed segments) even though | ||
1881 | * it violates the RFC that uses duplicate ACKs, often these are equal | ||
1882 | * but when e.g. out-of-window ACKs or packet duplication occurs, | ||
1883 | * they differ. Since neither occurs due to loss, TCP should really | ||
1884 | * ignore them. | ||
1885 | */ | ||
1886 | static inline int tcp_dupack_heurestics(struct tcp_sock *tp) | ||
1887 | { | ||
1888 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | ||
1889 | } | ||
1890 | |||
1861 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) | 1891 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
1862 | { | 1892 | { |
1863 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); | 1893 | return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); |
@@ -1978,13 +2008,13 @@ static int tcp_time_to_recover(struct sock *sk) | |||
1978 | return 1; | 2008 | return 1; |
1979 | 2009 | ||
1980 | /* Not-A-Trick#2 : Classic rule... */ | 2010 | /* Not-A-Trick#2 : Classic rule... */ |
1981 | if (tcp_fackets_out(tp) > tp->reordering) | 2011 | if (tcp_dupack_heurestics(tp) > tp->reordering) |
1982 | return 1; | 2012 | return 1; |
1983 | 2013 | ||
1984 | /* Trick#3 : when we use RFC2988 timer restart, fast | 2014 | /* Trick#3 : when we use RFC2988 timer restart, fast |
1985 | * retransmit can be triggered by timeout of queue head. | 2015 | * retransmit can be triggered by timeout of queue head. |
1986 | */ | 2016 | */ |
1987 | if (tcp_head_timedout(sk)) | 2017 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) |
1988 | return 1; | 2018 | return 1; |
1989 | 2019 | ||
1990 | /* Trick#4: It is still not OK... But will it be useful to delay | 2020 | /* Trick#4: It is still not OK... But will it be useful to delay |
@@ -2017,8 +2047,10 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, | |||
2017 | tp->retransmit_skb_hint = NULL; | 2047 | tp->retransmit_skb_hint = NULL; |
2018 | } | 2048 | } |
2019 | 2049 | ||
2020 | /* Mark head of queue up as lost. */ | 2050 | /* Mark head of queue up as lost. With RFC3517 SACK, the packets is |
2021 | static void tcp_mark_head_lost(struct sock *sk, int packets) | 2051 | * is against sacked "cnt", otherwise it's against facked "cnt" |
2052 | */ | ||
2053 | static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) | ||
2022 | { | 2054 | { |
2023 | struct tcp_sock *tp = tcp_sk(sk); | 2055 | struct tcp_sock *tp = tcp_sk(sk); |
2024 | struct sk_buff *skb; | 2056 | struct sk_buff *skb; |
@@ -2040,8 +2072,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2040 | /* this is not the most efficient way to do this... */ | 2072 | /* this is not the most efficient way to do this... */ |
2041 | tp->lost_skb_hint = skb; | 2073 | tp->lost_skb_hint = skb; |
2042 | tp->lost_cnt_hint = cnt; | 2074 | tp->lost_cnt_hint = cnt; |
2043 | cnt += tcp_skb_pcount(skb); | 2075 | |
2044 | if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) | 2076 | if (tcp_is_fack(tp) || |
2077 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | ||
2078 | cnt += tcp_skb_pcount(skb); | ||
2079 | |||
2080 | if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) || | ||
2081 | after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) | ||
2045 | break; | 2082 | break; |
2046 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { | 2083 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { |
2047 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 2084 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
@@ -2054,17 +2091,22 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2054 | 2091 | ||
2055 | /* Account newly detected lost packet(s) */ | 2092 | /* Account newly detected lost packet(s) */ |
2056 | 2093 | ||
2057 | static void tcp_update_scoreboard(struct sock *sk) | 2094 | static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) |
2058 | { | 2095 | { |
2059 | struct tcp_sock *tp = tcp_sk(sk); | 2096 | struct tcp_sock *tp = tcp_sk(sk); |
2060 | 2097 | ||
2061 | if (tcp_is_fack(tp)) { | 2098 | if (tcp_is_reno(tp)) { |
2099 | tcp_mark_head_lost(sk, 1, fast_rexmit); | ||
2100 | } else if (tcp_is_fack(tp)) { | ||
2062 | int lost = tp->fackets_out - tp->reordering; | 2101 | int lost = tp->fackets_out - tp->reordering; |
2063 | if (lost <= 0) | 2102 | if (lost <= 0) |
2064 | lost = 1; | 2103 | lost = 1; |
2065 | tcp_mark_head_lost(sk, lost); | 2104 | tcp_mark_head_lost(sk, lost, fast_rexmit); |
2066 | } else { | 2105 | } else { |
2067 | tcp_mark_head_lost(sk, 1); | 2106 | int sacked_upto = tp->sacked_out - tp->reordering; |
2107 | if (sacked_upto < 0) | ||
2108 | sacked_upto = 0; | ||
2109 | tcp_mark_head_lost(sk, sacked_upto, fast_rexmit); | ||
2068 | } | 2110 | } |
2069 | 2111 | ||
2070 | /* New heuristics: it is possible only after we switched | 2112 | /* New heuristics: it is possible only after we switched |
@@ -2072,7 +2114,7 @@ static void tcp_update_scoreboard(struct sock *sk) | |||
2072 | * Hence, we can detect timed out packets during fast | 2114 | * Hence, we can detect timed out packets during fast |
2073 | * retransmit without falling to slow start. | 2115 | * retransmit without falling to slow start. |
2074 | */ | 2116 | */ |
2075 | if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { | 2117 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { |
2076 | struct sk_buff *skb; | 2118 | struct sk_buff *skb; |
2077 | 2119 | ||
2078 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint | 2120 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint |
@@ -2245,7 +2287,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2245 | { | 2287 | { |
2246 | struct tcp_sock *tp = tcp_sk(sk); | 2288 | struct tcp_sock *tp = tcp_sk(sk); |
2247 | /* Partial ACK arrived. Force Hoe's retransmit. */ | 2289 | /* Partial ACK arrived. Force Hoe's retransmit. */ |
2248 | int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; | 2290 | int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); |
2249 | 2291 | ||
2250 | if (tcp_may_undo(tp)) { | 2292 | if (tcp_may_undo(tp)) { |
2251 | /* Plain luck! Hole if filled with delayed | 2293 | /* Plain luck! Hole if filled with delayed |
@@ -2379,7 +2421,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) | |||
2379 | struct tcp_sock *tp = tcp_sk(sk); | 2421 | struct tcp_sock *tp = tcp_sk(sk); |
2380 | int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); | 2422 | int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); |
2381 | int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && | 2423 | int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && |
2382 | (tp->fackets_out > tp->reordering)); | 2424 | (tcp_fackets_out(tp) > tp->reordering)); |
2425 | int fast_rexmit = 0; | ||
2383 | 2426 | ||
2384 | /* Some technical things: | 2427 | /* Some technical things: |
2385 | * 1. Reno does not count dupacks (sacked_out) automatically. */ | 2428 | * 1. Reno does not count dupacks (sacked_out) automatically. */ |
@@ -2399,11 +2442,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) | |||
2399 | return; | 2442 | return; |
2400 | 2443 | ||
2401 | /* C. Process data loss notification, provided it is valid. */ | 2444 | /* C. Process data loss notification, provided it is valid. */ |
2402 | if ((flag&FLAG_DATA_LOST) && | 2445 | if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && |
2403 | before(tp->snd_una, tp->high_seq) && | 2446 | before(tp->snd_una, tp->high_seq) && |
2404 | icsk->icsk_ca_state != TCP_CA_Open && | 2447 | icsk->icsk_ca_state != TCP_CA_Open && |
2405 | tp->fackets_out > tp->reordering) { | 2448 | tp->fackets_out > tp->reordering) { |
2406 | tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); | 2449 | tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, 0); |
2407 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); | 2450 | NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); |
2408 | } | 2451 | } |
2409 | 2452 | ||
@@ -2522,10 +2565,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) | |||
2522 | tp->bytes_acked = 0; | 2565 | tp->bytes_acked = 0; |
2523 | tp->snd_cwnd_cnt = 0; | 2566 | tp->snd_cwnd_cnt = 0; |
2524 | tcp_set_ca_state(sk, TCP_CA_Recovery); | 2567 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
2568 | fast_rexmit = 1; | ||
2525 | } | 2569 | } |
2526 | 2570 | ||
2527 | if (do_lost || tcp_head_timedout(sk)) | 2571 | if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) |
2528 | tcp_update_scoreboard(sk); | 2572 | tcp_update_scoreboard(sk, fast_rexmit); |
2529 | tcp_cwnd_down(sk, flag); | 2573 | tcp_cwnd_down(sk, flag); |
2530 | tcp_xmit_retransmit_queue(sk); | 2574 | tcp_xmit_retransmit_queue(sk); |
2531 | } | 2575 | } |