diff options
author | Jerry Chu <hkchu@google.com> | 2012-08-31 08:29:12 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-08-31 20:02:19 -0400 |
commit | 8336886f786fdacbc19b719c1f7ea91eb70706d4 (patch) | |
tree | c1fa912f7583ce0ffcb5ae673802da4a7dfb3b19 /net/ipv4/tcp_minisocks.c | |
parent | 1046716368979dee857a2b8a91c4a8833f21b9cb (diff) |
tcp: TCP Fast Open Server - support TFO listeners
This patch builds on top of the previous patch to add the support
for TFO listeners. This includes -
1. allocating, properly initializing, and managing the per listener
fastopen_queue structure when TFO is enabled
2. changes to the inet_csk_accept code to support TFO. E.g., the
request_sock can no longer be freed upon accept(), not until 3WHS
finishes
3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg()
if it's a TFO socket
4. properly closing a TFO listener, and a TFO socket before 3WHS
finishes
5. supporting TCP_FASTOPEN socket option
6. modifying tcp_check_req() to use to check a TFO socket as well
as request_sock
7. supporting TCP's TFO cookie option
8. adding a new SYN-ACK retransmit handler to use the timer directly
off the TFO socket rather than the listener socket. Note that TFO
server side will not retransmit anything other than SYN-ACK until
the 3WHS is completed.
The patch also contains an important function
"reqsk_fastopen_remove()" to manage the somewhat complex relation
between a listener, its request_sock, and the corresponding child
socket. See the comment above the function for the detail.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 61 |
1 files changed, 50 insertions, 11 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6ff7f10dce9d..e965319d610b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
508 | newtp->rx_opt.mss_clamp = req->mss; | 508 | newtp->rx_opt.mss_clamp = req->mss; |
509 | TCP_ECN_openreq_child(newtp, req); | 509 | TCP_ECN_openreq_child(newtp, req); |
510 | newtp->fastopen_rsk = NULL; | ||
510 | 511 | ||
511 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 512 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
512 | } | 513 | } |
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
515 | EXPORT_SYMBOL(tcp_create_openreq_child); | 516 | EXPORT_SYMBOL(tcp_create_openreq_child); |
516 | 517 | ||
517 | /* | 518 | /* |
518 | * Process an incoming packet for SYN_RECV sockets represented | 519 | * Process an incoming packet for SYN_RECV sockets represented as a |
519 | * as a request_sock. | 520 | * request_sock. Normally sk is the listener socket but for TFO it |
521 | * points to the child socket. | ||
522 | * | ||
523 | * XXX (TFO) - The current impl contains a special check for ack | ||
524 | * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? | ||
520 | */ | 525 | */ |
521 | 526 | ||
522 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | 527 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
523 | struct request_sock *req, | 528 | struct request_sock *req, |
524 | struct request_sock **prev) | 529 | struct request_sock **prev, |
530 | bool fastopen) | ||
525 | { | 531 | { |
526 | struct tcp_options_received tmp_opt; | 532 | struct tcp_options_received tmp_opt; |
527 | const u8 *hash_location; | 533 | const u8 *hash_location; |
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
530 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 536 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
531 | bool paws_reject = false; | 537 | bool paws_reject = false; |
532 | 538 | ||
539 | BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); | ||
540 | |||
533 | tmp_opt.saw_tstamp = 0; | 541 | tmp_opt.saw_tstamp = 0; |
534 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 542 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
535 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 543 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
565 | * | 573 | * |
566 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 574 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
567 | * of RFC793, fixed by RFC1122. | 575 | * of RFC793, fixed by RFC1122. |
576 | * | ||
577 | * Note that even if there is new data in the SYN packet | ||
578 | * they will be thrown away too. | ||
568 | */ | 579 | */ |
569 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); | 580 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); |
570 | return NULL; | 581 | return NULL; |
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
622 | * sent (the segment carries an unacceptable ACK) ... | 633 | * sent (the segment carries an unacceptable ACK) ... |
623 | * a reset is sent." | 634 | * a reset is sent." |
624 | * | 635 | * |
625 | * Invalid ACK: reset will be sent by listening socket | 636 | * Invalid ACK: reset will be sent by listening socket. |
637 | * Note that the ACK validity check for a Fast Open socket is done | ||
638 | * elsewhere and is checked directly against the child socket rather | ||
639 | * than req because user data may have been sent out. | ||
626 | */ | 640 | */ |
627 | if ((flg & TCP_FLAG_ACK) && | 641 | if ((flg & TCP_FLAG_ACK) && !fastopen && |
628 | (TCP_SKB_CB(skb)->ack_seq != | 642 | (TCP_SKB_CB(skb)->ack_seq != |
629 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) | 643 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) |
630 | return sk; | 644 | return sk; |
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
637 | /* RFC793: "first check sequence number". */ | 651 | /* RFC793: "first check sequence number". */ |
638 | 652 | ||
639 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 653 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
640 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { | 654 | tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { |
641 | /* Out of window: send ACK and drop. */ | 655 | /* Out of window: send ACK and drop. */ |
642 | if (!(flg & TCP_FLAG_RST)) | 656 | if (!(flg & TCP_FLAG_RST)) |
643 | req->rsk_ops->send_ack(sk, skb, req); | 657 | req->rsk_ops->send_ack(sk, skb, req); |
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
648 | 662 | ||
649 | /* In sequence, PAWS is OK. */ | 663 | /* In sequence, PAWS is OK. */ |
650 | 664 | ||
651 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) | 665 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) |
652 | req->ts_recent = tmp_opt.rcv_tsval; | 666 | req->ts_recent = tmp_opt.rcv_tsval; |
653 | 667 | ||
654 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 668 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
667 | 681 | ||
668 | /* ACK sequence verified above, just make sure ACK is | 682 | /* ACK sequence verified above, just make sure ACK is |
669 | * set. If ACK not set, just silently drop the packet. | 683 | * set. If ACK not set, just silently drop the packet. |
684 | * | ||
685 | * XXX (TFO) - if we ever allow "data after SYN", the | ||
686 | * following check needs to be removed. | ||
670 | */ | 687 | */ |
671 | if (!(flg & TCP_FLAG_ACK)) | 688 | if (!(flg & TCP_FLAG_ACK)) |
672 | return NULL; | 689 | return NULL; |
673 | 690 | ||
691 | /* For Fast Open no more processing is needed (sk is the | ||
692 | * child socket). | ||
693 | */ | ||
694 | if (fastopen) | ||
695 | return sk; | ||
696 | |||
674 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ | 697 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ |
675 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 698 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
676 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 699 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
@@ -706,11 +729,21 @@ listen_overflow: | |||
706 | } | 729 | } |
707 | 730 | ||
708 | embryonic_reset: | 731 | embryonic_reset: |
709 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 732 | if (!(flg & TCP_FLAG_RST)) { |
710 | if (!(flg & TCP_FLAG_RST)) | 733 | /* Received a bad SYN pkt - for TFO We try not to reset |
734 | * the local connection unless it's really necessary to | ||
735 | * avoid becoming vulnerable to outside attack aiming at | ||
736 | * resetting legit local connections. | ||
737 | */ | ||
711 | req->rsk_ops->send_reset(sk, skb); | 738 | req->rsk_ops->send_reset(sk, skb); |
712 | 739 | } else if (fastopen) { /* received a valid RST pkt */ | |
713 | inet_csk_reqsk_queue_drop(sk, req, prev); | 740 | reqsk_fastopen_remove(sk, req, true); |
741 | tcp_reset(sk); | ||
742 | } | ||
743 | if (!fastopen) { | ||
744 | inet_csk_reqsk_queue_drop(sk, req, prev); | ||
745 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | ||
746 | } | ||
714 | return NULL; | 747 | return NULL; |
715 | } | 748 | } |
716 | EXPORT_SYMBOL(tcp_check_req); | 749 | EXPORT_SYMBOL(tcp_check_req); |
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req); | |||
719 | * Queue segment on the new socket if the new socket is active, | 752 | * Queue segment on the new socket if the new socket is active, |
720 | * otherwise we just shortcircuit this and continue with | 753 | * otherwise we just shortcircuit this and continue with |
721 | * the new socket. | 754 | * the new socket. |
755 | * | ||
756 | * For the vast majority of cases child->sk_state will be TCP_SYN_RECV | ||
757 | * when entering. But other states are possible due to a race condition | ||
758 | * where after __inet_lookup_established() fails but before the listener | ||
759 | * locked is obtained, other packets cause the same connection to | ||
760 | * be created. | ||
722 | */ | 761 | */ |
723 | 762 | ||
724 | int tcp_child_process(struct sock *parent, struct sock *child, | 763 | int tcp_child_process(struct sock *parent, struct sock *child, |