diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 18 | ||||
-rw-r--r-- | include/linux/tcp.h | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 73 |
3 files changed, 68 insertions, 26 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8a977a0aaede..f98ca633b528 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -225,19 +225,13 @@ tcp_fin_timeout - INTEGER | |||
225 | Default: 60 seconds | 225 | Default: 60 seconds |
226 | 226 | ||
227 | tcp_frto - INTEGER | 227 | tcp_frto - INTEGER |
228 | Enables Forward RTO-Recovery (F-RTO) defined in RFC4138. | 228 | Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. |
229 | F-RTO is an enhanced recovery algorithm for TCP retransmission | 229 | F-RTO is an enhanced recovery algorithm for TCP retransmission |
230 | timeouts. It is particularly beneficial in wireless environments | 230 | timeouts. It is particularly beneficial in networks where the |
231 | where packet loss is typically due to random radio interference | 231 | RTT fluctuates (e.g., wireless). F-RTO is a sender-side only |
232 | rather than intermediate router congestion. F-RTO is sender-side | 232 | modification. It does not require any support from the peer. |
233 | only modification. Therefore it does not require any support from | 233 | |
234 | the peer. | 234 | By default it's enabled with a non-zero value. 0 disables F-RTO. |
235 | |||
236 | If set to 1, basic version is enabled. 2 enables SACK enhanced | ||
237 | F-RTO if flow uses SACK. The basic version can be used also when | ||
238 | SACK is in use though scenario(s) with it exists where F-RTO | ||
239 | interacts badly with the packet counting of the SACK enabled TCP | ||
240 | flow. | ||
241 | 235 | ||
242 | tcp_keepalive_time - INTEGER | 236 | tcp_keepalive_time - INTEGER |
243 | How often TCP sends out keepalive messages when keepalive is enabled. | 237 | How often TCP sends out keepalive messages when keepalive is enabled. |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f5f203b36379..5adbc33d1ab3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -192,7 +192,8 @@ struct tcp_sock { | |||
192 | u8 nonagle : 4,/* Disable Nagle algorithm? */ | 192 | u8 nonagle : 4,/* Disable Nagle algorithm? */ |
193 | thin_lto : 1,/* Use linear timeouts for thin streams */ | 193 | thin_lto : 1,/* Use linear timeouts for thin streams */ |
194 | thin_dupack : 1,/* Fast retransmit on first dupack */ | 194 | thin_dupack : 1,/* Fast retransmit on first dupack */ |
195 | repair : 1; | 195 | repair : 1, |
196 | frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ | ||
196 | u8 repair_queue; | 197 | u8 repair_queue; |
197 | u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ | 198 | u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ |
198 | syn_data:1, /* SYN includes data */ | 199 | syn_data:1, /* SYN includes data */ |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d821e45b917..b2b36196b342 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -107,6 +107,7 @@ int sysctl_tcp_early_retrans __read_mostly = 3; | |||
107 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ | 107 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ |
108 | #define FLAG_ECE 0x40 /* ECE in this ACK */ | 108 | #define FLAG_ECE 0x40 /* ECE in this ACK */ |
109 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ | 109 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ |
110 | #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ | ||
110 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ | 111 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ |
111 | #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ | 112 | #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ |
112 | #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ | 113 | #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ |
@@ -1155,6 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1155 | tcp_highest_sack_seq(tp))) | 1156 | tcp_highest_sack_seq(tp))) |
1156 | state->reord = min(fack_count, | 1157 | state->reord = min(fack_count, |
1157 | state->reord); | 1158 | state->reord); |
1159 | if (!after(end_seq, tp->high_seq)) | ||
1160 | state->flag |= FLAG_ORIG_SACK_ACKED; | ||
1158 | } | 1161 | } |
1159 | 1162 | ||
1160 | if (sacked & TCPCB_LOST) { | 1163 | if (sacked & TCPCB_LOST) { |
@@ -1835,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1835 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1838 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1836 | struct tcp_sock *tp = tcp_sk(sk); | 1839 | struct tcp_sock *tp = tcp_sk(sk); |
1837 | struct sk_buff *skb; | 1840 | struct sk_buff *skb; |
1841 | bool new_recovery = false; | ||
1838 | 1842 | ||
1839 | /* Reduce ssthresh if it has not yet been made inside this window. */ | 1843 | /* Reduce ssthresh if it has not yet been made inside this window. */ |
1840 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1844 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || |
1845 | !after(tp->high_seq, tp->snd_una) || | ||
1841 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { | 1846 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
1847 | new_recovery = true; | ||
1842 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | 1848 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1843 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | 1849 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1844 | tcp_ca_event(sk, CA_EVENT_LOSS); | 1850 | tcp_ca_event(sk, CA_EVENT_LOSS); |
@@ -1883,6 +1889,14 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1883 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1889 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1884 | tp->high_seq = tp->snd_nxt; | 1890 | tp->high_seq = tp->snd_nxt; |
1885 | TCP_ECN_queue_cwr(tp); | 1891 | TCP_ECN_queue_cwr(tp); |
1892 | |||
1893 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous | ||
1894 | * loss recovery is underway except recurring timeout(s) on | ||
1895 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing | ||
1896 | */ | ||
1897 | tp->frto = sysctl_tcp_frto && | ||
1898 | (new_recovery || icsk->icsk_retransmits) && | ||
1899 | !inet_csk(sk)->icsk_mtup.probe_size; | ||
1886 | } | 1900 | } |
1887 | 1901 | ||
1888 | /* If ACK arrived pointing to a remembered SACK, it means that our | 1902 | /* If ACK arrived pointing to a remembered SACK, it means that our |
@@ -2426,12 +2440,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2426 | return failed; | 2440 | return failed; |
2427 | } | 2441 | } |
2428 | 2442 | ||
2429 | /* Undo during loss recovery after partial ACK. */ | 2443 | /* Undo during loss recovery after partial ACK or using F-RTO. */ |
2430 | static bool tcp_try_undo_loss(struct sock *sk) | 2444 | static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) |
2431 | { | 2445 | { |
2432 | struct tcp_sock *tp = tcp_sk(sk); | 2446 | struct tcp_sock *tp = tcp_sk(sk); |
2433 | 2447 | ||
2434 | if (tcp_may_undo(tp)) { | 2448 | if (frto_undo || tcp_may_undo(tp)) { |
2435 | struct sk_buff *skb; | 2449 | struct sk_buff *skb; |
2436 | tcp_for_write_queue(skb, sk) { | 2450 | tcp_for_write_queue(skb, sk) { |
2437 | if (skb == tcp_send_head(sk)) | 2451 | if (skb == tcp_send_head(sk)) |
@@ -2445,9 +2459,12 @@ static bool tcp_try_undo_loss(struct sock *sk) | |||
2445 | tp->lost_out = 0; | 2459 | tp->lost_out = 0; |
2446 | tcp_undo_cwr(sk, true); | 2460 | tcp_undo_cwr(sk, true); |
2447 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); | 2461 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); |
2462 | if (frto_undo) | ||
2463 | NET_INC_STATS_BH(sock_net(sk), | ||
2464 | LINUX_MIB_TCPSPURIOUSRTOS); | ||
2448 | inet_csk(sk)->icsk_retransmits = 0; | 2465 | inet_csk(sk)->icsk_retransmits = 0; |
2449 | tp->undo_marker = 0; | 2466 | tp->undo_marker = 0; |
2450 | if (tcp_is_sack(tp)) | 2467 | if (frto_undo || tcp_is_sack(tp)) |
2451 | tcp_set_ca_state(sk, TCP_CA_Open); | 2468 | tcp_set_ca_state(sk, TCP_CA_Open); |
2452 | return true; | 2469 | return true; |
2453 | } | 2470 | } |
@@ -2667,24 +2684,52 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2667 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are | 2684 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
2668 | * recovered or spurious. Otherwise retransmits more on partial ACKs. | 2685 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
2669 | */ | 2686 | */ |
2670 | static void tcp_process_loss(struct sock *sk, int flag) | 2687 | static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) |
2671 | { | 2688 | { |
2672 | struct inet_connection_sock *icsk = inet_csk(sk); | 2689 | struct inet_connection_sock *icsk = inet_csk(sk); |
2673 | struct tcp_sock *tp = tcp_sk(sk); | 2690 | struct tcp_sock *tp = tcp_sk(sk); |
2691 | bool recovered = !before(tp->snd_una, tp->high_seq); | ||
2674 | 2692 | ||
2675 | if (!before(tp->snd_una, tp->high_seq)) { | 2693 | if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ |
2694 | if (flag & FLAG_ORIG_SACK_ACKED) { | ||
2695 | /* Step 3.b. A timeout is spurious if not all data are | ||
2696 | * lost, i.e., never-retransmitted data are (s)acked. | ||
2697 | */ | ||
2698 | tcp_try_undo_loss(sk, true); | ||
2699 | return; | ||
2700 | } | ||
2701 | if (after(tp->snd_nxt, tp->high_seq) && | ||
2702 | (flag & FLAG_DATA_SACKED || is_dupack)) { | ||
2703 | tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ | ||
2704 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { | ||
2705 | tp->high_seq = tp->snd_nxt; | ||
2706 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), | ||
2707 | TCP_NAGLE_OFF); | ||
2708 | if (after(tp->snd_nxt, tp->high_seq)) | ||
2709 | return; /* Step 2.b */ | ||
2710 | tp->frto = 0; | ||
2711 | } | ||
2712 | } | ||
2713 | |||
2714 | if (recovered) { | ||
2715 | /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ | ||
2676 | icsk->icsk_retransmits = 0; | 2716 | icsk->icsk_retransmits = 0; |
2677 | tcp_try_undo_recovery(sk); | 2717 | tcp_try_undo_recovery(sk); |
2678 | return; | 2718 | return; |
2679 | } | 2719 | } |
2680 | |||
2681 | if (flag & FLAG_DATA_ACKED) | 2720 | if (flag & FLAG_DATA_ACKED) |
2682 | icsk->icsk_retransmits = 0; | 2721 | icsk->icsk_retransmits = 0; |
2683 | if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) | 2722 | if (tcp_is_reno(tp)) { |
2684 | tcp_reset_reno_sack(tp); | 2723 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
2685 | if (tcp_try_undo_loss(sk)) | 2724 | * delivered. Lower inflight to clock out (re)transmissions. |
2725 | */ | ||
2726 | if (after(tp->snd_nxt, tp->high_seq) && is_dupack) | ||
2727 | tcp_add_reno_sack(sk); | ||
2728 | else if (flag & FLAG_SND_UNA_ADVANCED) | ||
2729 | tcp_reset_reno_sack(tp); | ||
2730 | } | ||
2731 | if (tcp_try_undo_loss(sk, false)) | ||
2686 | return; | 2732 | return; |
2687 | tcp_moderate_cwnd(tp); | ||
2688 | tcp_xmit_retransmit_queue(sk); | 2733 | tcp_xmit_retransmit_queue(sk); |
2689 | } | 2734 | } |
2690 | 2735 | ||
@@ -2764,7 +2809,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2764 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | 2809 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; |
2765 | break; | 2810 | break; |
2766 | case TCP_CA_Loss: | 2811 | case TCP_CA_Loss: |
2767 | tcp_process_loss(sk, flag); | 2812 | tcp_process_loss(sk, flag, is_dupack); |
2768 | if (icsk->icsk_ca_state != TCP_CA_Open) | 2813 | if (icsk->icsk_ca_state != TCP_CA_Open) |
2769 | return; | 2814 | return; |
2770 | /* Fall through to processing in Open state. */ | 2815 | /* Fall through to processing in Open state. */ |
@@ -3003,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3003 | } | 3048 | } |
3004 | if (!(sacked & TCPCB_SACKED_ACKED)) | 3049 | if (!(sacked & TCPCB_SACKED_ACKED)) |
3005 | reord = min(pkts_acked, reord); | 3050 | reord = min(pkts_acked, reord); |
3051 | if (!after(scb->end_seq, tp->high_seq)) | ||
3052 | flag |= FLAG_ORIG_SACK_ACKED; | ||
3006 | } | 3053 | } |
3007 | 3054 | ||
3008 | if (sacked & TCPCB_SACKED_ACKED) | 3055 | if (sacked & TCPCB_SACKED_ACKED) |