aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Yuchung Cheng <ycheng@google.com> 2012-05-02 09:30:03 -0400
committer David S. Miller <davem@davemloft.net> 2012-05-02 20:56:10 -0400
commit eed530b6c67624db3f2cf477bac7c4d005d8f7ba (patch)
tree c07096807ead2adb9d85e85d1a9cd1ada85755ac
parent 1fbc340514fc3003514bd681b372e1f47ae6183f (diff)
tcp: early retransmit
This patch implements RFC 5827 early retransmit (ER) for TCP. It reduces the DUPACK threshold (dupthresh) if fewer than 4 packets are outstanding, so that losses are recovered by fast recovery instead of timeout. While the algorithm is simple, small but frequent network reordering makes this feature dangerous: the connection repeatedly enters false recovery and degrades performance. Therefore we implement a mitigation suggested in the appendix of the RFC that delays entering fast recovery by a small interval, i.e., RTT/4. Currently ER is conservative and is disabled for the rest of the connection after the first reordering event. A large scale web server experiment on the performance impact of ER is summarized in section 6 of the paper "Proportional Rate Reduction for TCP", IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf Note that Linux has a similar feature called THIN_DUPACK. The differences are that THIN_DUPACK does not mitigate reordering and is only used after slow start. Currently ER is disabled if THIN_DUPACK is enabled. I would be happy to merge the THIN_DUPACK feature with ER if people think it's a good idea. ER is enabled by sysctl_tcp_early_retrans: 0: Disables ER 1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4. 2: (Default) reduce dupthresh like mode 1. In addition, delay entering fast recovery by RTT/4. Note: mode 2 is implemented in the third part of this patch series. Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt14
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/tcp.h15
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_input.c15
-rw-r--r--net/ipv4/tcp_minisocks.c1
7 files changed, 59 insertions, 0 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 9b569a2d9c60..34916e792d9d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER
190tcp_dsack - BOOLEAN 190tcp_dsack - BOOLEAN
191 Allows TCP to send "duplicate" SACKs. 191 Allows TCP to send "duplicate" SACKs.
192 192
193tcp_early_retrans - INTEGER
194 Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
195 for triggering fast retransmit when the amount of outstanding data is
196 small and when no previously unsent data can be transmitted (such
197 that limited transmit could be used).
198 Possible values:
199 0 disables ER
200 1 enables ER
201 2 enables ER but delays fast recovery and fast retransmit
202 by a fourth of RTT. This mitigates connections falsely
203 entering recovery when the network has a small degree of reordering
204 (less than 3 packets).
205 Default: 2
206
193tcp_ecn - INTEGER 207tcp_ecn - INTEGER
194 Enable Explicit Congestion Notification (ECN) in TCP. ECN is only 208 Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
195 used when both ends of the TCP flow support it. It is useful to 209 used when both ends of the TCP flow support it. It is useful to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 278af9ea42d4..7859b416d46e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -372,6 +372,7 @@ struct tcp_sock {
372 repair : 1, 372 repair : 1,
373 unused : 1; 373 unused : 1;
374 u8 repair_queue; 374 u8 repair_queue;
375 u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */
375 376
376/* RTT measurement */ 377/* RTT measurement */
377 u32 srtt; /* smoothed round trip time << 3 */ 378 u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0fb84de6da36..685437a16c97 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh;
252extern int sysctl_tcp_cookie_size; 252extern int sysctl_tcp_cookie_size;
253extern int sysctl_tcp_thin_linear_timeouts; 253extern int sysctl_tcp_thin_linear_timeouts;
254extern int sysctl_tcp_thin_dupack; 254extern int sysctl_tcp_thin_dupack;
255extern int sysctl_tcp_early_retrans;
255 256
256extern atomic_long_t tcp_memory_allocated; 257extern atomic_long_t tcp_memory_allocated;
257extern struct percpu_counter tcp_sockets_allocated; 258extern struct percpu_counter tcp_sockets_allocated;
@@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
797 tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; 798 tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
798} 799}
799 800
801/* TCP early-retransmit (ER) is similar to but more conservative than
802 * the thin-dupack feature. Enable ER only if thin-dupack is disabled.
803 */
804static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
805{
806 tp->do_early_retrans = sysctl_tcp_early_retrans &&
807 !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
808}
809
810static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
811{
812 tp->do_early_retrans = 0;
813}
814
800static inline unsigned int tcp_left_out(const struct tcp_sock *tp) 815static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
801{ 816{
802 return tp->sacked_out + tp->lost_out; 817 return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 33417f84e07f..ef32956ed655 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
27#include <net/tcp_memcontrol.h> 27#include <net/tcp_memcontrol.h>
28 28
29static int zero; 29static int zero;
30static int two = 2;
30static int tcp_retr1_max = 255; 31static int tcp_retr1_max = 255;
31static int ip_local_port_range_min[] = { 1, 1 }; 32static int ip_local_port_range_min[] = { 1, 1 };
32static int ip_local_port_range_max[] = { 65535, 65535 }; 33static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = {
677 .proc_handler = proc_dointvec 678 .proc_handler = proc_dointvec
678 }, 679 },
679 { 680 {
681 .procname = "tcp_early_retrans",
682 .data = &sysctl_tcp_early_retrans,
683 .maxlen = sizeof(int),
684 .mode = 0644,
685 .proc_handler = proc_dointvec_minmax,
686 .extra1 = &zero,
687 .extra2 = &two,
688 },
689 {
680 .procname = "udp_mem", 690 .procname = "udp_mem",
681 .data = &sysctl_udp_mem, 691 .data = &sysctl_udp_mem,
682 .maxlen = sizeof(sysctl_udp_mem), 692 .maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9670af341931..6802c89bc44d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk)
395 tp->mss_cache = TCP_MSS_DEFAULT; 395 tp->mss_cache = TCP_MSS_DEFAULT;
396 396
397 tp->reordering = sysctl_tcp_reordering; 397 tp->reordering = sysctl_tcp_reordering;
398 tcp_enable_early_retrans(tp);
398 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 399 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
399 400
400 sk->sk_state = TCP_CLOSE; 401 sk->sk_state = TCP_CLOSE;
@@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2495 err = -EINVAL; 2496 err = -EINVAL;
2496 else 2497 else
2497 tp->thin_dupack = val; 2498 tp->thin_dupack = val;
2499 if (tp->thin_dupack)
2500 tcp_disable_early_retrans(tp);
2498 break; 2501 break;
2499 2502
2500 case TCP_REPAIR: 2503 case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be8e09d2c6b1..e042cabb695e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
99 99
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_abc __read_mostly; 101int sysctl_tcp_abc __read_mostly;
102int sysctl_tcp_early_retrans __read_mostly = 2;
102 103
103#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 104#define FLAG_DATA 0x01 /* Incoming frame contained data. */
104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 105#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
906 if (dst_metric(dst, RTAX_REORDERING) && 907 if (dst_metric(dst, RTAX_REORDERING) &&
907 tp->reordering != dst_metric(dst, RTAX_REORDERING)) { 908 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
908 tcp_disable_fack(tp); 909 tcp_disable_fack(tp);
910 tcp_disable_early_retrans(tp);
909 tp->reordering = dst_metric(dst, RTAX_REORDERING); 911 tp->reordering = dst_metric(dst, RTAX_REORDERING);
910 } 912 }
911 913
@@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
988#endif 990#endif
989 tcp_disable_fack(tp); 991 tcp_disable_fack(tp);
990 } 992 }
993
994 if (metric > 0)
995 tcp_disable_early_retrans(tp);
991} 996}
992 997
993/* This must be called before lost_out is incremented */ 998/* This must be called before lost_out is incremented */
@@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk)
2492 tcp_is_sack(tp) && !tcp_send_head(sk)) 2497 tcp_is_sack(tp) && !tcp_send_head(sk))
2493 return 1; 2498 return 1;
2494 2499
2500 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
2501 * retransmissions due to small network reorderings, we implement
2502 * Mitigation A.3 in the RFC and delay the retransmission for a short
2503 * interval if appropriate.
2504 */
2505 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2506 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
2507 !tcp_may_send_now(sk))
2508 return 1;
2509
2495 return 0; 2510 return 0;
2496} 2511}
2497 2512
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb5cdd1..6f6a91832826 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
482 newtp->sacked_out = 0; 482 newtp->sacked_out = 0;
483 newtp->fackets_out = 0; 483 newtp->fackets_out = 0;
484 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 484 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
485 tcp_enable_early_retrans(newtp);
485 486
486 /* So many TCP implementations out there (incorrectly) count the 487 /* So many TCP implementations out there (incorrectly) count the
487 * initial SYN frame in their delayed-ACK and congestion control 488 * initial SYN frame in their delayed-ACK and congestion control