diff options
author | Yuchung Cheng <ycheng@google.com> | 2012-05-02 09:30:03 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-05-02 20:56:10 -0400 |
commit | eed530b6c67624db3f2cf477bac7c4d005d8f7ba (patch) | |
tree | c07096807ead2adb9d85e85d1a9cd1ada85755ac | |
parent | 1fbc340514fc3003514bd681b372e1f47ae6183f (diff) |
tcp: early retransmit
This patch implements RFC 5827 early retransmit (ER) for TCP.
It reduces DUPACK threshold (dupthresh) if outstanding packets are
less than 4 to recover losses by fast recovery instead of timeout.
While the algorithm is simple, small but frequent network reordering
makes this feature dangerous: the connection repeatedly enters
false recovery and degrades performance. Therefore we implement
a mitigation suggested in the appendix of the RFC that delays
entering fast recovery by a small interval, i.e., RTT/4. Currently
ER is conservative and is disabled for the rest of the connection
after the first reordering event. A large scale web server
experiment on the performance impact of ER is summarized in
section 6 of the paper "Proportional Rate Reduction for TCP",
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf
Note that Linux has a similar feature called THIN_DUPACK. The
differences are that THIN_DUPACK does not mitigate reorderings and is only
used after slow start. Currently ER is disabled if THIN_DUPACK is
enabled. I would be happy to merge THIN_DUPACK feature with ER if
people think it's a good idea.
ER is enabled by sysctl_tcp_early_retrans:
0: Disables ER
1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4.
2: (Default) reduce dupthresh like mode 1. In addition, delay
entering fast recovery by RTT/4.
Note: mode 2 is implemented in the third part of this patch series.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 14 | ||||
-rw-r--r-- | include/linux/tcp.h | 1 | ||||
-rw-r--r-- | include/net/tcp.h | 15 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 15 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 1 |
7 files changed, 59 insertions, 0 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9b569a2d9c60..34916e792d9d 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER | |||
190 | tcp_dsack - BOOLEAN | 190 | tcp_dsack - BOOLEAN |
191 | Allows TCP to send "duplicate" SACKs. | 191 | Allows TCP to send "duplicate" SACKs. |
192 | 192 | ||
193 | tcp_early_retrans - INTEGER | ||
194 | Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold | ||
195 | for triggering fast retransmit when the amount of outstanding data is | ||
196 | small and when no previously unsent data can be transmitted (such | ||
197 | that limited transmit could be used). | ||
198 | Possible values: | ||
199 | 0 disables ER | ||
200 | 1 enables ER | ||
201 | 2 enables ER but delays fast recovery and fast retransmit | ||
202 | by a fourth of RTT. This mitigates connection falsely | ||
203 | recovers when network has a small degree of reordering | ||
204 | (less than 3 packets). | ||
205 | Default: 2 | ||
206 | |||
193 | tcp_ecn - INTEGER | 207 | tcp_ecn - INTEGER |
194 | Enable Explicit Congestion Notification (ECN) in TCP. ECN is only | 208 | Enable Explicit Congestion Notification (ECN) in TCP. ECN is only |
195 | used when both ends of the TCP flow support it. It is useful to | 209 | used when both ends of the TCP flow support it. It is useful to |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 278af9ea42d4..7859b416d46e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -372,6 +372,7 @@ struct tcp_sock { | |||
372 | repair : 1, | 372 | repair : 1, |
373 | unused : 1; | 373 | unused : 1; |
374 | u8 repair_queue; | 374 | u8 repair_queue; |
375 | u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */ | ||
375 | 376 | ||
376 | /* RTT measurement */ | 377 | /* RTT measurement */ |
377 | u32 srtt; /* smoothed round trip time << 3 */ | 378 | u32 srtt; /* smoothed round trip time << 3 */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 0fb84de6da36..685437a16c97 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh; | |||
252 | extern int sysctl_tcp_cookie_size; | 252 | extern int sysctl_tcp_cookie_size; |
253 | extern int sysctl_tcp_thin_linear_timeouts; | 253 | extern int sysctl_tcp_thin_linear_timeouts; |
254 | extern int sysctl_tcp_thin_dupack; | 254 | extern int sysctl_tcp_thin_dupack; |
255 | extern int sysctl_tcp_early_retrans; | ||
255 | 256 | ||
256 | extern atomic_long_t tcp_memory_allocated; | 257 | extern atomic_long_t tcp_memory_allocated; |
257 | extern struct percpu_counter tcp_sockets_allocated; | 258 | extern struct percpu_counter tcp_sockets_allocated; |
@@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) | |||
797 | tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; | 798 | tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; |
798 | } | 799 | } |
799 | 800 | ||
801 | /* TCP early-retransmit (ER) is similar to but more conservative than | ||
802 | * the thin-dupack feature. Enable ER only if thin-dupack is disabled. | ||
803 | */ | ||
804 | static inline void tcp_enable_early_retrans(struct tcp_sock *tp) | ||
805 | { | ||
806 | tp->do_early_retrans = sysctl_tcp_early_retrans && | ||
807 | !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; | ||
808 | } | ||
809 | |||
810 | static inline void tcp_disable_early_retrans(struct tcp_sock *tp) | ||
811 | { | ||
812 | tp->do_early_retrans = 0; | ||
813 | } | ||
814 | |||
800 | static inline unsigned int tcp_left_out(const struct tcp_sock *tp) | 815 | static inline unsigned int tcp_left_out(const struct tcp_sock *tp) |
801 | { | 816 | { |
802 | return tp->sacked_out + tp->lost_out; | 817 | return tp->sacked_out + tp->lost_out; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 33417f84e07f..ef32956ed655 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <net/tcp_memcontrol.h> | 27 | #include <net/tcp_memcontrol.h> |
28 | 28 | ||
29 | static int zero; | 29 | static int zero; |
30 | static int two = 2; | ||
30 | static int tcp_retr1_max = 255; | 31 | static int tcp_retr1_max = 255; |
31 | static int ip_local_port_range_min[] = { 1, 1 }; | 32 | static int ip_local_port_range_min[] = { 1, 1 }; |
32 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 33 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = { | |||
677 | .proc_handler = proc_dointvec | 678 | .proc_handler = proc_dointvec |
678 | }, | 679 | }, |
679 | { | 680 | { |
681 | .procname = "tcp_early_retrans", | ||
682 | .data = &sysctl_tcp_early_retrans, | ||
683 | .maxlen = sizeof(int), | ||
684 | .mode = 0644, | ||
685 | .proc_handler = proc_dointvec_minmax, | ||
686 | .extra1 = &zero, | ||
687 | .extra2 = &two, | ||
688 | }, | ||
689 | { | ||
680 | .procname = "udp_mem", | 690 | .procname = "udp_mem", |
681 | .data = &sysctl_udp_mem, | 691 | .data = &sysctl_udp_mem, |
682 | .maxlen = sizeof(sysctl_udp_mem), | 692 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9670af341931..6802c89bc44d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk) | |||
395 | tp->mss_cache = TCP_MSS_DEFAULT; | 395 | tp->mss_cache = TCP_MSS_DEFAULT; |
396 | 396 | ||
397 | tp->reordering = sysctl_tcp_reordering; | 397 | tp->reordering = sysctl_tcp_reordering; |
398 | tcp_enable_early_retrans(tp); | ||
398 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | 399 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; |
399 | 400 | ||
400 | sk->sk_state = TCP_CLOSE; | 401 | sk->sk_state = TCP_CLOSE; |
@@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2495 | err = -EINVAL; | 2496 | err = -EINVAL; |
2496 | else | 2497 | else |
2497 | tp->thin_dupack = val; | 2498 | tp->thin_dupack = val; |
2499 | if (tp->thin_dupack) | ||
2500 | tcp_disable_early_retrans(tp); | ||
2498 | break; | 2501 | break; |
2499 | 2502 | ||
2500 | case TCP_REPAIR: | 2503 | case TCP_REPAIR: |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be8e09d2c6b1..e042cabb695e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly; | |||
99 | 99 | ||
100 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 100 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
101 | int sysctl_tcp_abc __read_mostly; | 101 | int sysctl_tcp_abc __read_mostly; |
102 | int sysctl_tcp_early_retrans __read_mostly = 2; | ||
102 | 103 | ||
103 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 104 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
104 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 105 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk) | |||
906 | if (dst_metric(dst, RTAX_REORDERING) && | 907 | if (dst_metric(dst, RTAX_REORDERING) && |
907 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | 908 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { |
908 | tcp_disable_fack(tp); | 909 | tcp_disable_fack(tp); |
910 | tcp_disable_early_retrans(tp); | ||
909 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | 911 | tp->reordering = dst_metric(dst, RTAX_REORDERING); |
910 | } | 912 | } |
911 | 913 | ||
@@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |||
988 | #endif | 990 | #endif |
989 | tcp_disable_fack(tp); | 991 | tcp_disable_fack(tp); |
990 | } | 992 | } |
993 | |||
994 | if (metric > 0) | ||
995 | tcp_disable_early_retrans(tp); | ||
991 | } | 996 | } |
992 | 997 | ||
993 | /* This must be called before lost_out is incremented */ | 998 | /* This must be called before lost_out is incremented */ |
@@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk) | |||
2492 | tcp_is_sack(tp) && !tcp_send_head(sk)) | 2497 | tcp_is_sack(tp) && !tcp_send_head(sk)) |
2493 | return 1; | 2498 | return 1; |
2494 | 2499 | ||
2500 | /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious | ||
2501 | * retransmissions due to small network reorderings, we implement | ||
2502 | * Mitigation A.3 in the RFC and delay the retransmission for a short | ||
2503 | * interval if appropriate. | ||
2504 | */ | ||
2505 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | ||
2506 | (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | ||
2507 | !tcp_may_send_now(sk)) | ||
2508 | return 1; | ||
2509 | |||
2495 | return 0; | 2510 | return 0; |
2496 | } | 2511 | } |
2497 | 2512 | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3cabafb5cdd1..6f6a91832826 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
482 | newtp->sacked_out = 0; | 482 | newtp->sacked_out = 0; |
483 | newtp->fackets_out = 0; | 483 | newtp->fackets_out = 0; |
484 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 484 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
485 | tcp_enable_early_retrans(newtp); | ||
485 | 486 | ||
486 | /* So many TCP implementations out there (incorrectly) count the | 487 | /* So many TCP implementations out there (incorrectly) count the |
487 | * initial SYN frame in their delayed-ACK and congestion control | 488 | * initial SYN frame in their delayed-ACK and congestion control |