aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndreas Petlund <apetlund@simula.no>2010-02-17 21:47:01 -0500
committerDavid S. Miller <davem@davemloft.net>2010-02-18 18:43:08 -0500
commit36e31b0af58728071e8023cf8e20c5166b700717 (patch)
tree8b4d251bf78965ac7501bea9011786b8255a3312
parent5aa4b32fc86408705337e941ed716880c63d1590 (diff)
net: TCP thin linear timeouts
This patch will make TCP use only linear timeouts if the stream is thin. This will help to avoid the very high latencies that thin stream suffer because of exponential backoff. This mechanism is only active if enabled by iocontrol or syscontrol and the stream is identified as thin. A maximum of 6 linear timeouts is tried before exponential backoff is resumed. Signed-off-by: Andreas Petlund <apetlund@simula.no> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt12
-rw-r--r--include/linux/tcp.h5
-rw-r--r--include/net/tcp.h4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c7
-rw-r--r--net/ipv4/tcp_timer.c21
6 files changed, 54 insertions, 2 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d97686..f147310d9af4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -487,6 +487,18 @@ tcp_dma_copybreak - INTEGER
487 and CONFIG_NET_DMA is enabled. 487 and CONFIG_NET_DMA is enabled.
488 Default: 4096 488 Default: 4096
489 489
490tcp_thin_linear_timeouts - BOOLEAN
491 Enable dynamic triggering of linear timeouts for thin streams.
492 If set, a check is performed upon retransmission by timeout to
493 determine if the stream is thin (less than 4 packets in flight).
494 As long as the stream is found to be thin, up to 6 linear
495 timeouts may be performed before exponential backoff mode is
496 initiated. This improves retransmission latency for
497 non-aggressive thin streams, often found to be time-dependent.
498 For more information on thin streams, see
499 Documentation/networking/tcp-thin.txt
500 Default: 0
501
490UDP variables: 502UDP variables:
491 503
492udp_mem - vector of 3 INTEGERs: min, pressure, max 504udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4df931..3ba8b074612f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
103#define TCP_CONGESTION 13 /* Congestion control algorithm */ 103#define TCP_CONGESTION 13 /* Congestion control algorithm */
104#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ 104#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
105#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ 105#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
106#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
106 107
107/* for TCP_INFO socket option */ 108/* for TCP_INFO socket option */
108#define TCPI_OPT_TIMESTAMPS 1 109#define TCPI_OPT_TIMESTAMPS 1
@@ -340,7 +341,9 @@ struct tcp_sock {
340 u32 frto_highmark; /* snd_nxt when RTO occurred */ 341 u32 frto_highmark; /* snd_nxt when RTO occurred */
341 u16 advmss; /* Advertised MSS */ 342 u16 advmss; /* Advertised MSS */
342 u8 frto_counter; /* Number of new acks after RTO */ 343 u8 frto_counter; /* Number of new acks after RTO */
343 u8 nonagle; /* Disable Nagle algorithm? */ 344 u8 nonagle : 4,/* Disable Nagle algorithm? */
345 thin_lto : 1,/* Use linear timeouts for thin streams */
346 unused : 3;
344 347
345/* RTT measurement */ 348/* RTT measurement */
346 u32 srtt; /* smoothed round trip time << 3 */ 349 u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0bdc3f640247..6278fc734abd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
196#define TCP_NAGLE_CORK 2 /* Socket is corked */ 196#define TCP_NAGLE_CORK 2 /* Socket is corked */
197#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ 197#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
198 198
199/* TCP thin-stream limits */
200#define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */
201
199extern struct inet_timewait_death_row tcp_death_row; 202extern struct inet_timewait_death_row tcp_death_row;
200 203
201/* sysctl variables for tcp */ 204/* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
241extern int sysctl_tcp_slow_start_after_idle; 244extern int sysctl_tcp_slow_start_after_idle;
242extern int sysctl_tcp_max_ssthresh; 245extern int sysctl_tcp_max_ssthresh;
243extern int sysctl_tcp_cookie_size; 246extern int sysctl_tcp_cookie_size;
247extern int sysctl_tcp_thin_linear_timeouts;
244 248
245extern atomic_t tcp_memory_allocated; 249extern atomic_t tcp_memory_allocated;
246extern struct percpu_counter tcp_sockets_allocated; 250extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712ce3994..e6a2460587d4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
576 .proc_handler = proc_dointvec 576 .proc_handler = proc_dointvec
577 }, 577 },
578 { 578 {
579 .procname = "tcp_thin_linear_timeouts",
580 .data = &sysctl_tcp_thin_linear_timeouts,
581 .maxlen = sizeof(int),
582 .mode = 0644,
583 .proc_handler = proc_dointvec
584 },
585 {
579 .procname = "udp_mem", 586 .procname = "udp_mem",
580 .data = &sysctl_udp_mem, 587 .data = &sysctl_udp_mem,
581 .maxlen = sizeof(sysctl_udp_mem), 588 .maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e471d037fcc9..21bae9afefea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2229 } 2229 }
2230 break; 2230 break;
2231 2231
2232 case TCP_THIN_LINEAR_TIMEOUTS:
2233 if (val < 0 || val > 1)
2234 err = -EINVAL;
2235 else
2236 tp->thin_lto = val;
2237 break;
2238
2232 case TCP_CORK: 2239 case TCP_CORK:
2233 /* When set indicates to always queue non-full frames. 2240 /* When set indicates to always queue non-full frames.
2234 * Later the user clears this option and we transmit 2241 * Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf9114f..a17629b8912e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
29int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; 29int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
30int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; 30int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
31int sysctl_tcp_orphan_retries __read_mostly; 31int sysctl_tcp_orphan_retries __read_mostly;
32int sysctl_tcp_thin_linear_timeouts __read_mostly;
32 33
33static void tcp_write_timer(unsigned long); 34static void tcp_write_timer(unsigned long);
34static void tcp_delack_timer(unsigned long); 35static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,25 @@ void tcp_retransmit_timer(struct sock *sk)
415 icsk->icsk_retransmits++; 416 icsk->icsk_retransmits++;
416 417
417out_reset_timer: 418out_reset_timer:
418 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 419 /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
420 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
421 * might be increased if the stream oscillates between thin and thick,
422 * thus the old value might already be too high compared to the value
423 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
424 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
425 * exponential backoff behaviour to avoid continue hammering
426 * linear-timeout retransmissions into a black hole
427 */
428 if (sk->sk_state == TCP_ESTABLISHED &&
429 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
430 tcp_stream_is_thin(tp) &&
431 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
432 icsk->icsk_backoff = 0;
433 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
434 } else {
435 /* Use normal (exponential) backoff */
436 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
437 }
419 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
420 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 439 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
421 __sk_dst_reset(sk); 440 __sk_dst_reset(sk);