diff options
author | Patrick McManus <mcmanus@ducksong.com> | 2008-03-21 19:33:01 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-03-21 19:33:01 -0400 |
commit | ec3c0982a2dd1e671bad8e9d26c28dcba0039d87 (patch) | |
tree | 11a3cd7c530e4225a4c3d4c3f3cc54eb7d2e0e4f | |
parent | e4c78840284f3f51b1896cf3936d60a6033c4d2c (diff) |
[TCP]: TCP_DEFER_ACCEPT updates - process as established
Change TCP_DEFER_ACCEPT implementation so that it transitions a
connection to ESTABLISHED after handshake is complete instead of
leaving it in SYN-RECV until some data arrives. Place connection in
accept queue when first data packet arrives from slow path.
Benefits:
- established connection is now reset if it never makes it
to the accept queue
- diagnostic state of established matches with the packet traces
showing completed handshake
- TCP_DEFER_ACCEPT timeouts are expressed in seconds and can now be
enforced with reasonable accuracy instead of rounding up to next
exponential back-off of syn-ack retry.
Signed-off-by: Patrick McManus <mcmanus@ducksong.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/tcp.h | 7 | ||||
-rw-r--r-- | include/net/request_sock.h | 4 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 46 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 5 |
9 files changed, 99 insertions, 33 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 08027f1d7f31..d96d9b122304 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -239,6 +239,11 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) | |||
239 | return (struct tcp_request_sock *)req; | 239 | return (struct tcp_request_sock *)req; |
240 | } | 240 | } |
241 | 241 | ||
242 | struct tcp_deferred_accept_info { | ||
243 | struct sock *listen_sk; | ||
244 | struct request_sock *request; | ||
245 | }; | ||
246 | |||
242 | struct tcp_sock { | 247 | struct tcp_sock { |
243 | /* inet_connection_sock has to be the first member of tcp_sock */ | 248 | /* inet_connection_sock has to be the first member of tcp_sock */ |
244 | struct inet_connection_sock inet_conn; | 249 | struct inet_connection_sock inet_conn; |
@@ -374,6 +379,8 @@ struct tcp_sock { | |||
374 | unsigned int keepalive_intvl; /* time interval between keep alive probes */ | 379 | unsigned int keepalive_intvl; /* time interval between keep alive probes */ |
375 | int linger2; | 380 | int linger2; |
376 | 381 | ||
382 | struct tcp_deferred_accept_info defer_tcp_accept; | ||
383 | |||
377 | unsigned long last_synq_overflow; | 384 | unsigned long last_synq_overflow; |
378 | 385 | ||
379 | u32 tso_deferred; | 386 | u32 tso_deferred; |
diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 040780add355..0369f98e9f3a 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h | |||
@@ -115,8 +115,8 @@ struct request_sock_queue { | |||
115 | struct request_sock *rskq_accept_head; | 115 | struct request_sock *rskq_accept_head; |
116 | struct request_sock *rskq_accept_tail; | 116 | struct request_sock *rskq_accept_tail; |
117 | rwlock_t syn_wait_lock; | 117 | rwlock_t syn_wait_lock; |
118 | u8 rskq_defer_accept; | 118 | u16 rskq_defer_accept; |
119 | /* 3 bytes hole, try to pack */ | 119 | /* 2 bytes hole, try to pack */ |
120 | struct listen_sock *listen_opt; | 120 | struct listen_sock *listen_opt; |
121 | }; | 121 | }; |
122 | 122 | ||
diff --git a/include/net/tcp.h b/include/net/tcp.h index 847e1634e1f4..67cc3956d29c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -139,6 +139,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); | |||
139 | #define MAX_TCP_KEEPINTVL 32767 | 139 | #define MAX_TCP_KEEPINTVL 32767 |
140 | #define MAX_TCP_KEEPCNT 127 | 140 | #define MAX_TCP_KEEPCNT 127 |
141 | #define MAX_TCP_SYNCNT 127 | 141 | #define MAX_TCP_SYNCNT 127 |
142 | #define MAX_TCP_ACCEPT_DEFERRED 65535 | ||
142 | 143 | ||
143 | #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ | 144 | #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ |
144 | 145 | ||
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8a45be988709..cc1a1859a61b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -414,8 +414,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
414 | struct inet_connection_sock *icsk = inet_csk(parent); | 414 | struct inet_connection_sock *icsk = inet_csk(parent); |
415 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | 415 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; |
416 | struct listen_sock *lopt = queue->listen_opt; | 416 | struct listen_sock *lopt = queue->listen_opt; |
417 | int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; | 417 | int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; |
418 | int thresh = max_retries; | ||
419 | unsigned long now = jiffies; | 418 | unsigned long now = jiffies; |
420 | struct request_sock **reqp, *req; | 419 | struct request_sock **reqp, *req; |
421 | int i, budget; | 420 | int i, budget; |
@@ -451,9 +450,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
451 | } | 450 | } |
452 | } | 451 | } |
453 | 452 | ||
454 | if (queue->rskq_defer_accept) | ||
455 | max_retries = queue->rskq_defer_accept; | ||
456 | |||
457 | budget = 2 * (lopt->nr_table_entries / (timeout / interval)); | 453 | budget = 2 * (lopt->nr_table_entries / (timeout / interval)); |
458 | i = lopt->clock_hand; | 454 | i = lopt->clock_hand; |
459 | 455 | ||
@@ -461,9 +457,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
461 | reqp=&lopt->syn_table[i]; | 457 | reqp=&lopt->syn_table[i]; |
462 | while ((req = *reqp) != NULL) { | 458 | while ((req = *reqp) != NULL) { |
463 | if (time_after_eq(now, req->expires)) { | 459 | if (time_after_eq(now, req->expires)) { |
464 | if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) && | 460 | if (req->retrans < thresh && |
465 | (inet_rsk(req)->acked || | 461 | !req->rsk_ops->rtx_syn_ack(parent, req)) { |
466 | !req->rsk_ops->rtx_syn_ack(parent, req))) { | ||
467 | unsigned long timeo; | 462 | unsigned long timeo; |
468 | 463 | ||
469 | if (req->retrans++ == 0) | 464 | if (req->retrans++ == 0) |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 071e83a894ad..e0fbc25ca816 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2105,15 +2105,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2105 | break; | 2105 | break; |
2106 | 2106 | ||
2107 | case TCP_DEFER_ACCEPT: | 2107 | case TCP_DEFER_ACCEPT: |
2108 | icsk->icsk_accept_queue.rskq_defer_accept = 0; | 2108 | if (val < 0) { |
2109 | if (val > 0) { | 2109 | err = -EINVAL; |
2110 | /* Translate value in seconds to number of | 2110 | } else { |
2111 | * retransmits */ | 2111 | if (val > MAX_TCP_ACCEPT_DEFERRED) |
2112 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && | 2112 | val = MAX_TCP_ACCEPT_DEFERRED; |
2113 | val > ((TCP_TIMEOUT_INIT / HZ) << | 2113 | icsk->icsk_accept_queue.rskq_defer_accept = val; |
2114 | icsk->icsk_accept_queue.rskq_defer_accept)) | ||
2115 | icsk->icsk_accept_queue.rskq_defer_accept++; | ||
2116 | icsk->icsk_accept_queue.rskq_defer_accept++; | ||
2117 | } | 2114 | } |
2118 | break; | 2115 | break; |
2119 | 2116 | ||
@@ -2295,8 +2292,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
2295 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 2292 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
2296 | break; | 2293 | break; |
2297 | case TCP_DEFER_ACCEPT: | 2294 | case TCP_DEFER_ACCEPT: |
2298 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : | 2295 | val = icsk->icsk_accept_queue.rskq_defer_accept; |
2299 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); | ||
2300 | break; | 2296 | break; |
2301 | case TCP_WINDOW_CLAMP: | 2297 | case TCP_WINDOW_CLAMP: |
2302 | val = tp->window_clamp; | 2298 | val = tp->window_clamp; |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9cf446427cc2..6e46b4c0f28c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -4451,6 +4451,49 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) | |||
4451 | } | 4451 | } |
4452 | } | 4452 | } |
4453 | 4453 | ||
4454 | static int tcp_defer_accept_check(struct sock *sk) | ||
4455 | { | ||
4456 | struct tcp_sock *tp = tcp_sk(sk); | ||
4457 | |||
4458 | if (tp->defer_tcp_accept.request) { | ||
4459 | int queued_data = tp->rcv_nxt - tp->copied_seq; | ||
4460 | int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? | ||
4461 | tcp_hdr((struct sk_buff *) | ||
4462 | sk->sk_receive_queue.prev)->fin : 0; | ||
4463 | |||
4464 | if (queued_data && hasfin) | ||
4465 | queued_data--; | ||
4466 | |||
4467 | if (queued_data && | ||
4468 | tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { | ||
4469 | if (sock_flag(sk, SOCK_KEEPOPEN)) { | ||
4470 | inet_csk_reset_keepalive_timer(sk, | ||
4471 | keepalive_time_when(tp)); | ||
4472 | } else { | ||
4473 | inet_csk_delete_keepalive_timer(sk); | ||
4474 | } | ||
4475 | |||
4476 | inet_csk_reqsk_queue_add( | ||
4477 | tp->defer_tcp_accept.listen_sk, | ||
4478 | tp->defer_tcp_accept.request, | ||
4479 | sk); | ||
4480 | |||
4481 | tp->defer_tcp_accept.listen_sk->sk_data_ready( | ||
4482 | tp->defer_tcp_accept.listen_sk, 0); | ||
4483 | |||
4484 | sock_put(tp->defer_tcp_accept.listen_sk); | ||
4485 | sock_put(sk); | ||
4486 | tp->defer_tcp_accept.listen_sk = NULL; | ||
4487 | tp->defer_tcp_accept.request = NULL; | ||
4488 | } else if (hasfin || | ||
4489 | tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { | ||
4490 | tcp_reset(sk); | ||
4491 | return -1; | ||
4492 | } | ||
4493 | } | ||
4494 | return 0; | ||
4495 | } | ||
4496 | |||
4454 | static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) | 4497 | static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) |
4455 | { | 4498 | { |
4456 | struct tcp_sock *tp = tcp_sk(sk); | 4499 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -4811,6 +4854,9 @@ step5: | |||
4811 | 4854 | ||
4812 | tcp_data_snd_check(sk); | 4855 | tcp_data_snd_check(sk); |
4813 | tcp_ack_snd_check(sk); | 4856 | tcp_ack_snd_check(sk); |
4857 | |||
4858 | if (tcp_defer_accept_check(sk)) | ||
4859 | return -1; | ||
4814 | return 0; | 4860 | return 0; |
4815 | 4861 | ||
4816 | csum_error: | 4862 | csum_error: |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0ba6e911c979..167a0f557531 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -1920,6 +1920,14 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
1920 | sk->sk_sndmsg_page = NULL; | 1920 | sk->sk_sndmsg_page = NULL; |
1921 | } | 1921 | } |
1922 | 1922 | ||
1923 | if (tp->defer_tcp_accept.request) { | ||
1924 | reqsk_free(tp->defer_tcp_accept.request); | ||
1925 | sock_put(tp->defer_tcp_accept.listen_sk); | ||
1926 | sock_put(sk); | ||
1927 | tp->defer_tcp_accept.listen_sk = NULL; | ||
1928 | tp->defer_tcp_accept.request = NULL; | ||
1929 | } | ||
1930 | |||
1923 | atomic_dec(&tcp_sockets_allocated); | 1931 | atomic_dec(&tcp_sockets_allocated); |
1924 | 1932 | ||
1925 | return 0; | 1933 | return 0; |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8245247a6ceb..019c8c16e5cc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -571,10 +571,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
571 | does sequence test, SYN is truncated, and thus we consider | 571 | does sequence test, SYN is truncated, and thus we consider |
572 | it a bare ACK. | 572 | it a bare ACK. |
573 | 573 | ||
574 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this | 574 | Both ends (listening sockets) accept the new incoming |
575 | bare ACK. Otherwise, we create an established connection. Both | 575 | connection and try to talk to each other. 8-) |
576 | ends (listening sockets) accept the new incoming connection and try | ||
577 | to talk to each other. 8-) | ||
578 | 576 | ||
579 | Note: This case is both harmless, and rare. Possibility is about the | 577 | Note: This case is both harmless, and rare. Possibility is about the |
580 | same as us discovering intelligent life on another plant tomorrow. | 578 | same as us discovering intelligent life on another plant tomorrow. |
@@ -642,13 +640,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
642 | if (!(flg & TCP_FLAG_ACK)) | 640 | if (!(flg & TCP_FLAG_ACK)) |
643 | return NULL; | 641 | return NULL; |
644 | 642 | ||
645 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | ||
646 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | ||
647 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
648 | inet_rsk(req)->acked = 1; | ||
649 | return NULL; | ||
650 | } | ||
651 | |||
652 | /* OK, ACK is valid, create big socket and | 643 | /* OK, ACK is valid, create big socket and |
653 | * feed this segment to it. It will repeat all | 644 | * feed this segment to it. It will repeat all |
654 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO | 645 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO |
@@ -687,7 +678,24 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
687 | inet_csk_reqsk_queue_unlink(sk, req, prev); | 678 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
688 | inet_csk_reqsk_queue_removed(sk, req); | 679 | inet_csk_reqsk_queue_removed(sk, req); |
689 | 680 | ||
690 | inet_csk_reqsk_queue_add(sk, req, child); | 681 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
682 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
683 | |||
684 | /* the accept queue handling is done is est recv slow | ||
685 | * path so lets make sure to start there | ||
686 | */ | ||
687 | tcp_sk(child)->pred_flags = 0; | ||
688 | sock_hold(sk); | ||
689 | sock_hold(child); | ||
690 | tcp_sk(child)->defer_tcp_accept.listen_sk = sk; | ||
691 | tcp_sk(child)->defer_tcp_accept.request = req; | ||
692 | |||
693 | inet_csk_reset_keepalive_timer(child, | ||
694 | inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); | ||
695 | } else { | ||
696 | inet_csk_reqsk_queue_add(sk, req, child); | ||
697 | } | ||
698 | |||
691 | return child; | 699 | return child; |
692 | 700 | ||
693 | listen_overflow: | 701 | listen_overflow: |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 803d758a2b12..160d16f9f4fc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data) | |||
481 | goto death; | 481 | goto death; |
482 | } | 482 | } |
483 | 483 | ||
484 | if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { | ||
485 | tcp_send_active_reset(sk, GFP_ATOMIC); | ||
486 | goto death; | ||
487 | } | ||
488 | |||
484 | if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) | 489 | if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) |
485 | goto out; | 490 | goto out; |
486 | 491 | ||