diff options
author | Jerry Chu <hkchu@google.com> | 2010-08-27 15:13:28 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-08-30 16:23:33 -0400 |
commit | dca43c75e7e545694a9dd6288553f55c53e2a3a3 (patch) | |
tree | 4df6b0b295ecd571fa95004b486d9af1636d6a30 | |
parent | 409456b10f87b28303643fec37543103f9ada00c (diff) |
tcp: Add TCP_USER_TIMEOUT socket option.
This patch provides a "user timeout" support as described in RFC793. The
socket option is also needed for the local half of RFC5482 "TCP User
Timeout Option".
TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int,
when > 0, to specify the maximum amount of time in ms that transmitted
data may remain unacknowledged before TCP will forcefully close the
corresponding connection and return ETIMEDOUT to the application. If
0 is given, TCP will continue to use the system default.
Increasing the user timeouts allows a TCP connection to survive extended
periods without end-to-end connectivity. Decreasing the user timeouts
allows applications to "fail fast" if so desired. Otherwise it may take
up to 20 minutes with the current system defaults in a normal WAN
environment.
The socket option can be set during any state of a TCP connection, but
is only effective during the synchronized states of a connection
(ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, or LAST-ACK).
Moreover, when used with the TCP keepalive (SO_KEEPALIVE) option,
TCP_USER_TIMEOUT will override keepalive to determine when to close a
connection due to keepalive failure.
The option does not change in any way when TCP retransmits a packet, nor
when a keepalive probe will be sent.
This option, like many others, will be inherited by an acceptor from its
listener.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/tcp.h | 1 | ||||
-rw-r--r-- | include/net/inet_connection_sock.h | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 40 |
4 files changed, 37 insertions, 16 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a778ee02459..e64f4c67d0e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -105,6 +105,7 @@ enum { | |||
105 | #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ | 105 | #define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ |
106 | #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ | 106 | #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ |
107 | #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ | 107 | #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ |
108 | #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ | ||
108 | 109 | ||
109 | /* for TCP_INFO socket option */ | 110 | /* for TCP_INFO socket option */ |
110 | #define TCPI_OPT_TIMESTAMPS 1 | 111 | #define TCPI_OPT_TIMESTAMPS 1 |
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b6d3b55da19..e4f494b42e0 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h | |||
@@ -125,6 +125,7 @@ struct inet_connection_sock { | |||
125 | int probe_size; | 125 | int probe_size; |
126 | } icsk_mtup; | 126 | } icsk_mtup; |
127 | u32 icsk_ca_priv[16]; | 127 | u32 icsk_ca_priv[16]; |
128 | u32 icsk_user_timeout; | ||
128 | #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) | 129 | #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) |
129 | }; | 130 | }; |
130 | 131 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11aaea7..cf325452875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2391 | err = tp->af_specific->md5_parse(sk, optval, optlen); | 2391 | err = tp->af_specific->md5_parse(sk, optval, optlen); |
2392 | break; | 2392 | break; |
2393 | #endif | 2393 | #endif |
2394 | 2394 | case TCP_USER_TIMEOUT: | |
2395 | /* Cap the max timeout in ms TCP will retry/retrans | ||
2396 | * before giving up and aborting (ETIMEDOUT) a connection. | ||
2397 | */ | ||
2398 | icsk->icsk_user_timeout = msecs_to_jiffies(val); | ||
2399 | break; | ||
2395 | default: | 2400 | default: |
2396 | err = -ENOPROTOOPT; | 2401 | err = -ENOPROTOOPT; |
2397 | break; | 2402 | break; |
@@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2610 | case TCP_THIN_DUPACK: | 2615 | case TCP_THIN_DUPACK: |
2611 | val = tp->thin_dupack; | 2616 | val = tp->thin_dupack; |
2612 | break; | 2617 | break; |
2618 | |||
2619 | case TCP_USER_TIMEOUT: | ||
2620 | val = jiffies_to_msecs(icsk->icsk_user_timeout); | ||
2621 | break; | ||
2613 | default: | 2622 | default: |
2614 | return -ENOPROTOOPT; | 2623 | return -ENOPROTOOPT; |
2615 | } | 2624 | } |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb920c9f..11569deccbe 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -138,10 +138,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) | |||
138 | * retransmissions with an initial RTO of TCP_RTO_MIN. | 138 | * retransmissions with an initial RTO of TCP_RTO_MIN. |
139 | */ | 139 | */ |
140 | static bool retransmits_timed_out(struct sock *sk, | 140 | static bool retransmits_timed_out(struct sock *sk, |
141 | unsigned int boundary) | 141 | unsigned int boundary, |
142 | unsigned int timeout) | ||
142 | { | 143 | { |
143 | unsigned int timeout, linear_backoff_thresh; | 144 | unsigned int linear_backoff_thresh, start_ts; |
144 | unsigned int start_ts; | ||
145 | 145 | ||
146 | if (!inet_csk(sk)->icsk_retransmits) | 146 | if (!inet_csk(sk)->icsk_retransmits) |
147 | return false; | 147 | return false; |
@@ -151,14 +151,15 @@ static bool retransmits_timed_out(struct sock *sk, | |||
151 | else | 151 | else |
152 | start_ts = tcp_sk(sk)->retrans_stamp; | 152 | start_ts = tcp_sk(sk)->retrans_stamp; |
153 | 153 | ||
154 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); | 154 | if (likely(timeout == 0)) { |
155 | 155 | linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); | |
156 | if (boundary <= linear_backoff_thresh) | ||
157 | timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; | ||
158 | else | ||
159 | timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + | ||
160 | (boundary - linear_backoff_thresh) * TCP_RTO_MAX; | ||
161 | 156 | ||
157 | if (boundary <= linear_backoff_thresh) | ||
158 | timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; | ||
159 | else | ||
160 | timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + | ||
161 | (boundary - linear_backoff_thresh) * TCP_RTO_MAX; | ||
162 | } | ||
162 | return (tcp_time_stamp - start_ts) >= timeout; | 163 | return (tcp_time_stamp - start_ts) >= timeout; |
163 | } | 164 | } |
164 | 165 | ||
@@ -174,7 +175,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
174 | dst_negative_advice(sk); | 175 | dst_negative_advice(sk); |
175 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 176 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
176 | } else { | 177 | } else { |
177 | if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { | 178 | if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { |
178 | /* Black hole detection */ | 179 | /* Black hole detection */ |
179 | tcp_mtu_probing(icsk, sk); | 180 | tcp_mtu_probing(icsk, sk); |
180 | 181 | ||
@@ -187,14 +188,16 @@ static int tcp_write_timeout(struct sock *sk) | |||
187 | 188 | ||
188 | retry_until = tcp_orphan_retries(sk, alive); | 189 | retry_until = tcp_orphan_retries(sk, alive); |
189 | do_reset = alive || | 190 | do_reset = alive || |
190 | !retransmits_timed_out(sk, retry_until); | 191 | !retransmits_timed_out(sk, retry_until, 0); |
191 | 192 | ||
192 | if (tcp_out_of_resources(sk, do_reset)) | 193 | if (tcp_out_of_resources(sk, do_reset)) |
193 | return 1; | 194 | return 1; |
194 | } | 195 | } |
195 | } | 196 | } |
196 | 197 | ||
197 | if (retransmits_timed_out(sk, retry_until)) { | 198 | if (retransmits_timed_out(sk, retry_until, |
199 | (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 : | ||
200 | icsk->icsk_user_timeout)) { | ||
198 | /* Has it gone just too far? */ | 201 | /* Has it gone just too far? */ |
199 | tcp_write_err(sk); | 202 | tcp_write_err(sk); |
200 | return 1; | 203 | return 1; |
@@ -436,7 +439,7 @@ out_reset_timer: | |||
436 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); | 439 | icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
437 | } | 440 | } |
438 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); | 441 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); |
439 | if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) | 442 | if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) |
440 | __sk_dst_reset(sk); | 443 | __sk_dst_reset(sk); |
441 | 444 | ||
442 | out:; | 445 | out:; |
@@ -556,7 +559,14 @@ static void tcp_keepalive_timer (unsigned long data) | |||
556 | elapsed = keepalive_time_elapsed(tp); | 559 | elapsed = keepalive_time_elapsed(tp); |
557 | 560 | ||
558 | if (elapsed >= keepalive_time_when(tp)) { | 561 | if (elapsed >= keepalive_time_when(tp)) { |
559 | if (icsk->icsk_probes_out >= keepalive_probes(tp)) { | 562 | /* If the TCP_USER_TIMEOUT option is enabled, use that |
563 | * to determine when to timeout instead. | ||
564 | */ | ||
565 | if ((icsk->icsk_user_timeout != 0 && | ||
566 | elapsed >= icsk->icsk_user_timeout && | ||
567 | icsk->icsk_probes_out > 0) || | ||
568 | (icsk->icsk_user_timeout == 0 && | ||
569 | icsk->icsk_probes_out >= keepalive_probes(tp))) { | ||
560 | tcp_send_active_reset(sk, GFP_ATOMIC); | 570 | tcp_send_active_reset(sk, GFP_ATOMIC); |
561 | tcp_write_err(sk); | 571 | tcp_write_err(sk); |
562 | goto out; | 572 | goto out; |