aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_minisocks.c
diff options
context:
space:
mode:
authorJerry Chu <hkchu@google.com>2012-08-31 08:29:12 -0400
committerDavid S. Miller <davem@davemloft.net>2012-08-31 20:02:19 -0400
commit8336886f786fdacbc19b719c1f7ea91eb70706d4 (patch)
treec1fa912f7583ce0ffcb5ae673802da4a7dfb3b19 /net/ipv4/tcp_minisocks.c
parent1046716368979dee857a2b8a91c4a8833f21b9cb (diff)
tcp: TCP Fast Open Server - support TFO listeners
This patch builds on top of the previous patch to add support for TFO listeners. This includes: 1. allocating, properly initializing, and managing the per-listener fastopen_queue structure when TFO is enabled; 2. changes to the inet_csk_accept code to support TFO, e.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes; 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket; 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes; 5. supporting the TCP_FASTOPEN socket option; 6. modifying tcp_check_req() so that it can be used to check a TFO socket as well as a request_sock; 7. supporting TCP's TFO cookie option; 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that the TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for details. Signed-off-by: H.K. Jerry Chu <hkchu@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r--net/ipv4/tcp_minisocks.c61
1 files changed, 50 insertions, 11 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9..e965319d610 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 508 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 509 TCP_ECN_openreq_child(newtp, req);
510 newtp->fastopen_rsk = NULL;
510 511
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 512 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 513 }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 516EXPORT_SYMBOL(tcp_create_openreq_child);
516 517
517/* 518/*
518 * Process an incoming packet for SYN_RECV sockets represented 519 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 520 * request_sock. Normally sk is the listener socket but for TFO it
521 * points to the child socket.
522 *
523 * XXX (TFO) - The current impl contains a special check for ack
524 * validation here and another inside tcp_v4_reqsk_send_ack(). Can we do better?
520 */ 525 */
521 526
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 527struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 528 struct request_sock *req,
524 struct request_sock **prev) 529 struct request_sock **prev,
530 bool fastopen)
525{ 531{
526 struct tcp_options_received tmp_opt; 532 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 533 const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 536 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 537 bool paws_reject = false;
532 538
539 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
540
533 tmp_opt.saw_tstamp = 0; 541 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 542 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 543 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 573 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 574 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 575 * of RFC793, fixed by RFC1122.
576 *
577 * Note that even if there is new data in the SYN packet
578 * they will be thrown away too.
568 */ 579 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 580 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 581 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 633 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 634 * a reset is sent."
624 * 635 *
625 * Invalid ACK: reset will be sent by listening socket 636 * Invalid ACK: reset will be sent by listening socket.
637 * Note that the ACK validity check for a Fast Open socket is done
638 * elsewhere and is checked directly against the child socket rather
639 * than req because user data may have been sent out.
626 */ 640 */
627 if ((flg & TCP_FLAG_ACK) && 641 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 642 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 643 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 644 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 651 /* RFC793: "first check sequence number". */
638 652
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 653 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 654 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 655 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 656 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 657 req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 662
649 /* In sequence, PAWS is OK. */ 663 /* In sequence, PAWS is OK. */
650 664
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 665 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 666 req->ts_recent = tmp_opt.rcv_tsval;
653 667
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 668 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 681
668 /* ACK sequence verified above, just make sure ACK is 682 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 683 * set. If ACK not set, just silently drop the packet.
684 *
685 * XXX (TFO) - if we ever allow "data after SYN", the
686 * following check needs to be removed.
670 */ 687 */
671 if (!(flg & TCP_FLAG_ACK)) 688 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 689 return NULL;
673 690
691 /* For Fast Open no more processing is needed (sk is the
692 * child socket).
693 */
694 if (fastopen)
695 return sk;
696
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 697 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 698 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 699 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
706 } 729 }
707 730
708embryonic_reset: 731embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 732 if (!(flg & TCP_FLAG_RST)) {
710 if (!(flg & TCP_FLAG_RST)) 733 /* Received a bad SYN pkt - for TFO We try not to reset
734 * the local connection unless it's really necessary to
735 * avoid becoming vulnerable to outside attack aiming at
736 * resetting legit local connections.
737 */
711 req->rsk_ops->send_reset(sk, skb); 738 req->rsk_ops->send_reset(sk, skb);
712 739 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 740 reqsk_fastopen_remove(sk, req, true);
741 tcp_reset(sk);
742 }
743 if (!fastopen) {
744 inet_csk_reqsk_queue_drop(sk, req, prev);
745 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
746 }
714 return NULL; 747 return NULL;
715} 748}
716EXPORT_SYMBOL(tcp_check_req); 749EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 752 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 753 * otherwise we just shortcircuit this and continue with
721 * the new socket. 754 * the new socket.
755 *
756 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
757 * when entering. But other states are possible due to a race condition
758 * where after __inet_lookup_established() fails but before the listener
759 * lock is obtained, other packets cause the same connection to
760 * be created.
722 */ 761 */
723 762
724int tcp_child_process(struct sock *parent, struct sock *child, 763int tcp_child_process(struct sock *parent, struct sock *child,