Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 343
1 file changed, 324 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..33cd065cfbd8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; | |||
50 | */ | 50 | */ |
51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | 51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; |
52 | 52 | ||
53 | /* Default TSQ limit of two TSO segments */ | ||
54 | int sysctl_tcp_limit_output_bytes __read_mostly = 131072; | ||
55 | |||
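The 131072-byte default presumably corresponds to two maximum-size (64 KB) GSO/TSO segments; a minimal sketch of that arithmetic, using illustrative names that are not part of the patch:

/* Illustrative only: two full-size 64 KB TSO segments. */
#define EXAMPLE_TSO_SEGMENT_BYTES  65536                              /* assumption: 64 KB GSO segment */
#define EXAMPLE_TSQ_DEFAULT        (2 * EXAMPLE_TSO_SEGMENT_BYTES)    /* = 131072 */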
53 | /* This limits the percentage of the congestion window which we | 56 | /* This limits the percentage of the congestion window which we |
54 | * will allow a single TSO frame to consume. Building TSO frames | 57 | * will allow a single TSO frame to consume. Building TSO frames |
55 | * which are too large can cause TCP streams to be bursty. | 58 | * which are too large can cause TCP streams to be bursty. |
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | |||
65 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ | 68 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ |
66 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); | 69 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); |
67 | 70 | ||
71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | ||
72 | int push_one, gfp_t gfp); | ||
68 | 73 | ||
69 | /* Account for new data that has been sent to the network. */ | 74 | /* Account for new data that has been sent to the network. */ |
70 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | 75 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) |
@@ -380,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |||
380 | #define OPTION_MD5 (1 << 2) | 385 | #define OPTION_MD5 (1 << 2) |
381 | #define OPTION_WSCALE (1 << 3) | 386 | #define OPTION_WSCALE (1 << 3) |
382 | #define OPTION_COOKIE_EXTENSION (1 << 4) | 387 | #define OPTION_COOKIE_EXTENSION (1 << 4) |
388 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) | ||
383 | 389 | ||
384 | struct tcp_out_options { | 390 | struct tcp_out_options { |
385 | u8 options; /* bit field of OPTION_* */ | 391 | u16 options; /* bit field of OPTION_* */ |
392 | u16 mss; /* 0 to disable */ | ||
386 | u8 ws; /* window scale, 0 to disable */ | 393 | u8 ws; /* window scale, 0 to disable */ |
387 | u8 num_sack_blocks; /* number of SACK blocks to include */ | 394 | u8 num_sack_blocks; /* number of SACK blocks to include */ |
388 | u8 hash_size; /* bytes in hash_location */ | 395 | u8 hash_size; /* bytes in hash_location */ |
389 | u16 mss; /* 0 to disable */ | ||
390 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | ||
391 | __u8 *hash_location; /* temporary pointer, overloaded */ | 396 | __u8 *hash_location; /* temporary pointer, overloaded */ |
397 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | ||
398 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ | ||
392 | }; | 399 | }; |
393 | 400 | ||
394 | /* The sysctl int routines are generic, so check consistency here. | 401 | /* The sysctl int routines are generic, so check consistency here. |
@@ -437,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired) | |||
437 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | 444 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
438 | struct tcp_out_options *opts) | 445 | struct tcp_out_options *opts) |
439 | { | 446 | { |
440 | u8 options = opts->options; /* mungable copy */ | 447 | u16 options = opts->options; /* mungable copy */ |
441 | 448 | ||
442 | /* Having both authentication and cookies for security is redundant, | 449 | /* Having both authentication and cookies for security is redundant, |
443 | * and there's certainly not enough room. Instead, the cookie-less | 450 | * and there's certainly not enough room. Instead, the cookie-less |
@@ -559,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
559 | 566 | ||
560 | tp->rx_opt.dsack = 0; | 567 | tp->rx_opt.dsack = 0; |
561 | } | 568 | } |
569 | |||
570 | if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { | ||
571 | struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; | ||
572 | |||
573 | *ptr++ = htonl((TCPOPT_EXP << 24) | | ||
574 | ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | | ||
575 | TCPOPT_FASTOPEN_MAGIC); | ||
576 | |||
577 | memcpy(ptr, foc->val, foc->len); | ||
578 | if ((foc->len & 3) == 2) { | ||
579 | u8 *align = ((u8 *)ptr) + foc->len; | ||
580 | align[0] = align[1] = TCPOPT_NOP; | ||
581 | } | ||
582 | ptr += (foc->len + 3) >> 2; | ||
583 | } | ||
562 | } | 584 | } |
563 | 585 | ||
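For reference, the experimental Fast Open option written above should end up on the wire as follows (a sketch assuming the usual kernel constants TCPOPT_EXP = 254, TCPOPT_FASTOPEN_MAGIC = 0xF989 and TCPOLEN_EXP_FASTOPEN_BASE = 4; not part of this diff):

/*
 * Kind = 254 (experimental), Length = 4 + cookie length, Magic = 0xF989,
 * then the cookie itself, NOP-padded to a 32-bit boundary:
 *
 *   +------+---------+--------+--------------------+---------+
 *   | 254  | 4 + len | 0xF989 | cookie (len bytes) | NOP pad |
 *   +------+---------+--------+--------------------+---------+
 *
 * Worked example: an 8-byte cookie gives need = 4 + 8 = 12 bytes in
 * tcp_syn_options(), already 32-bit aligned, so no padding is emitted;
 * a 6-byte cookie gives 10 bytes and the two trailing bytes are filled
 * with TCPOPT_NOP, matching the (foc->len & 3) == 2 case above.
 */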
564 | /* Compute TCP options for SYN packets. This is not the final | 586 | /* Compute TCP options for SYN packets. This is not the final |
@@ -574,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
574 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? | 596 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? |
575 | tcp_cookie_size_check(cvp->cookie_desired) : | 597 | tcp_cookie_size_check(cvp->cookie_desired) : |
576 | 0; | 598 | 0; |
599 | struct tcp_fastopen_request *fastopen = tp->fastopen_req; | ||
577 | 600 | ||
578 | #ifdef CONFIG_TCP_MD5SIG | 601 | #ifdef CONFIG_TCP_MD5SIG |
579 | *md5 = tp->af_specific->md5_lookup(sk, sk); | 602 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
@@ -614,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
614 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 637 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
615 | } | 638 | } |
616 | 639 | ||
640 | if (fastopen && fastopen->cookie.len >= 0) { | ||
641 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; | ||
642 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
643 | if (remaining >= need) { | ||
644 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
645 | opts->fastopen_cookie = &fastopen->cookie; | ||
646 | remaining -= need; | ||
647 | tp->syn_fastopen = 1; | ||
648 | } | ||
649 | } | ||
617 | /* Note that timestamps are required by the specification. | 650 | /* Note that timestamps are required by the specification. |
618 | * | 651 | * |
619 | * Odd numbers of bytes are prohibited by the specification, ensuring | 652 | * Odd numbers of bytes are prohibited by the specification, ensuring |
@@ -783,6 +816,156 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
783 | return size; | 816 | return size; |
784 | } | 817 | } |
785 | 818 | ||
819 | |||
820 | /* TCP SMALL QUEUES (TSQ) | ||
821 | * | ||
822 | * The TSQ goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev) | ||
823 | * to reduce RTT and bufferbloat. | ||
824 | * We do this using a special skb destructor (tcp_wfree). | ||
825 | * | ||
826 | * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb | ||
827 | * needs to be reallocated in a driver. | ||
828 | * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc. | ||
829 | * | ||
830 | * Since transmit from skb destructor is forbidden, we use a tasklet | ||
831 | * to process all sockets that eventually need to send more skbs. | ||
832 | * We use one tasklet per cpu, with its own queue of sockets. | ||
833 | */ | ||
834 | struct tsq_tasklet { | ||
835 | struct tasklet_struct tasklet; | ||
836 | struct list_head head; /* queue of tcp sockets */ | ||
837 | }; | ||
838 | static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); | ||
839 | |||
840 | static void tcp_tsq_handler(struct sock *sk) | ||
841 | { | ||
842 | if ((1 << sk->sk_state) & | ||
843 | (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | | ||
844 | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) | ||
845 | tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); | ||
846 | } | ||
847 | /* | ||
848 | * One tasklet per cpu tries to send more skbs. | ||
849 | * We run in tasklet context but need to disable irqs when | ||
850 | * transferring tsq->head because tcp_wfree() might | ||
851 | * interrupt us (non-NAPI drivers). | ||
852 | */ | ||
853 | static void tcp_tasklet_func(unsigned long data) | ||
854 | { | ||
855 | struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; | ||
856 | LIST_HEAD(list); | ||
857 | unsigned long flags; | ||
858 | struct list_head *q, *n; | ||
859 | struct tcp_sock *tp; | ||
860 | struct sock *sk; | ||
861 | |||
862 | local_irq_save(flags); | ||
863 | list_splice_init(&tsq->head, &list); | ||
864 | local_irq_restore(flags); | ||
865 | |||
866 | list_for_each_safe(q, n, &list) { | ||
867 | tp = list_entry(q, struct tcp_sock, tsq_node); | ||
868 | list_del(&tp->tsq_node); | ||
869 | |||
870 | sk = (struct sock *)tp; | ||
871 | bh_lock_sock(sk); | ||
872 | |||
873 | if (!sock_owned_by_user(sk)) { | ||
874 | tcp_tsq_handler(sk); | ||
875 | } else { | ||
876 | /* defer the work to tcp_release_cb() */ | ||
877 | set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); | ||
878 | } | ||
879 | bh_unlock_sock(sk); | ||
880 | |||
881 | clear_bit(TSQ_QUEUED, &tp->tsq_flags); | ||
882 | sk_free(sk); | ||
883 | } | ||
884 | } | ||
885 | |||
886 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ | ||
887 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ | ||
888 | (1UL << TCP_DELACK_TIMER_DEFERRED) | \ | ||
889 | (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
890 | /** | ||
891 | * tcp_release_cb - tcp release_sock() callback | ||
892 | * @sk: socket | ||
893 | * | ||
894 | * called from release_sock() to perform protocol dependent | ||
895 | * actions before socket release. | ||
896 | */ | ||
897 | void tcp_release_cb(struct sock *sk) | ||
898 | { | ||
899 | struct tcp_sock *tp = tcp_sk(sk); | ||
900 | unsigned long flags, nflags; | ||
901 | |||
902 | /* perform an atomic operation only if at least one flag is set */ | ||
903 | do { | ||
904 | flags = tp->tsq_flags; | ||
905 | if (!(flags & TCP_DEFERRED_ALL)) | ||
906 | return; | ||
907 | nflags = flags & ~TCP_DEFERRED_ALL; | ||
908 | } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); | ||
909 | |||
910 | if (flags & (1UL << TCP_TSQ_DEFERRED)) | ||
911 | tcp_tsq_handler(sk); | ||
912 | |||
913 | if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) | ||
914 | tcp_write_timer_handler(sk); | ||
915 | |||
916 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) | ||
917 | tcp_delack_timer_handler(sk); | ||
918 | |||
919 | if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
920 | sk->sk_prot->mtu_reduced(sk); | ||
921 | } | ||
922 | EXPORT_SYMBOL(tcp_release_cb); | ||
923 | |||
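The deferred-work bits consumed by tcp_release_cb() are presumably set by the softirq handlers when they find the socket owned by user context; a rough sketch of that pattern (hypothetical function name, not part of this file):

/* Hypothetical sketch: how a softirq path defers work to tcp_release_cb(). */
static void example_write_timer(struct sock *sk)
{
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		tcp_write_timer_handler(sk);	/* safe to run right now */
	} else {
		/* user context holds the lock: let release_sock() ->
		 * tcp_release_cb() run the handler later
		 */
		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
	}
	bh_unlock_sock(sk);
}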
924 | void __init tcp_tasklet_init(void) | ||
925 | { | ||
926 | int i; | ||
927 | |||
928 | for_each_possible_cpu(i) { | ||
929 | struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); | ||
930 | |||
931 | INIT_LIST_HEAD(&tsq->head); | ||
932 | tasklet_init(&tsq->tasklet, | ||
933 | tcp_tasklet_func, | ||
934 | (unsigned long)tsq); | ||
935 | } | ||
936 | } | ||
937 | |||
938 | /* | ||
939 | * Write buffer destructor automatically called from kfree_skb. | ||
940 | * We can't xmit new skbs from this context, as we might already | ||
941 | * hold qdisc lock. | ||
942 | */ | ||
943 | void tcp_wfree(struct sk_buff *skb) | ||
944 | { | ||
945 | struct sock *sk = skb->sk; | ||
946 | struct tcp_sock *tp = tcp_sk(sk); | ||
947 | |||
948 | if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && | ||
949 | !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { | ||
950 | unsigned long flags; | ||
951 | struct tsq_tasklet *tsq; | ||
952 | |||
953 | /* Keep a ref on socket. | ||
954 | * This last ref will be released in tcp_tasklet_func() | ||
955 | */ | ||
956 | atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); | ||
957 | |||
958 | /* queue this socket to tasklet queue */ | ||
959 | local_irq_save(flags); | ||
960 | tsq = &__get_cpu_var(tsq_tasklet); | ||
961 | list_add(&tp->tsq_node, &tsq->head); | ||
962 | tasklet_schedule(&tsq->tasklet); | ||
963 | local_irq_restore(flags); | ||
964 | } else { | ||
965 | sock_wfree(skb); | ||
966 | } | ||
967 | } | ||
968 | |||
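Taken together with the destructor change in tcp_transmit_skb() below, the write-memory accounting is expected to follow this lifecycle (a summary sketch, not code from the patch):

/*
 * sk_wmem_alloc lifecycle assumed above:
 *
 *   tcp_transmit_skb():  atomic_add(skb->truesize, &sk->sk_wmem_alloc)
 *   tcp_wfree():         atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc),
 *                        keeping one unit as an implicit socket reference
 *   tcp_tasklet_func():  sk_free(sk) drops that last unit once the deferred
 *                        transmit has been attempted
 *
 * When the flow is not throttled, tcp_wfree() simply calls sock_wfree(),
 * which subtracts the full truesize and no tasklet work is queued.
 */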
786 | /* This routine actually transmits TCP packets queued in by | 969 | /* This routine actually transmits TCP packets queued in by |
787 | * tcp_do_sendmsg(). This is used by both the initial | 970 | * tcp_do_sendmsg(). This is used by both the initial |
788 | * transmission and possible later retransmissions. | 971 | * transmission and possible later retransmissions. |
@@ -844,7 +1027,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
844 | 1027 | ||
845 | skb_push(skb, tcp_header_size); | 1028 | skb_push(skb, tcp_header_size); |
846 | skb_reset_transport_header(skb); | 1029 | skb_reset_transport_header(skb); |
847 | skb_set_owner_w(skb, sk); | 1030 | |
1031 | skb_orphan(skb); | ||
1032 | skb->sk = sk; | ||
1033 | skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? | ||
1034 | tcp_wfree : sock_wfree; | ||
1035 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | ||
848 | 1036 | ||
849 | /* Build TCP header and checksum it. */ | 1037 | /* Build TCP header and checksum it. */ |
850 | th = tcp_hdr(skb); | 1038 | th = tcp_hdr(skb); |
@@ -1780,6 +1968,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1780 | while ((skb = tcp_send_head(sk))) { | 1968 | while ((skb = tcp_send_head(sk))) { |
1781 | unsigned int limit; | 1969 | unsigned int limit; |
1782 | 1970 | ||
1971 | |||
1783 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); | 1972 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); |
1784 | BUG_ON(!tso_segs); | 1973 | BUG_ON(!tso_segs); |
1785 | 1974 | ||
@@ -1800,6 +1989,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1800 | break; | 1989 | break; |
1801 | } | 1990 | } |
1802 | 1991 | ||
1992 | /* TSQ: sk_wmem_alloc accounts for skb truesize, | ||
1993 | * including skb overhead. But that's OK. | ||
1994 | */ | ||
1995 | if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { | ||
1996 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | ||
1997 | break; | ||
1998 | } | ||
1803 | limit = mss_now; | 1999 | limit = mss_now; |
1804 | if (tso_segs > 1 && !tcp_urg_mode(tp)) | 2000 | if (tso_segs > 1 && !tcp_urg_mode(tp)) |
1805 | limit = tcp_mss_split_point(sk, skb, mss_now, | 2001 | limit = tcp_mss_split_point(sk, skb, mss_now, |
@@ -2442,7 +2638,16 @@ int tcp_send_synack(struct sock *sk) | |||
2442 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2638 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2443 | } | 2639 | } |
2444 | 2640 | ||
2445 | /* Prepare a SYN-ACK. */ | 2641 | /** |
2642 | * tcp_make_synack - Prepare a SYN-ACK. | ||
2643 | * sk: listener socket | ||
2644 | * dst: dst entry attached to the SYNACK | ||
2645 | * req: request_sock pointer | ||
2646 | * rvp: request_values pointer | ||
2647 | * | ||
2648 | * Allocate one skb and build a SYNACK packet. | ||
2649 | * @dst is consumed : Caller should not use it again. | ||
2650 | */ | ||
2446 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2651 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2447 | struct request_sock *req, | 2652 | struct request_sock *req, |
2448 | struct request_values *rvp) | 2653 | struct request_values *rvp) |
@@ -2461,14 +2666,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2461 | 2666 | ||
2462 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2667 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) |
2463 | s_data_desired = cvp->s_data_desired; | 2668 | s_data_desired = cvp->s_data_desired; |
2464 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); | 2669 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); |
2465 | if (skb == NULL) | 2670 | if (unlikely(!skb)) { |
2671 | dst_release(dst); | ||
2466 | return NULL; | 2672 | return NULL; |
2467 | 2673 | } | |
2468 | /* Reserve space for headers. */ | 2674 | /* Reserve space for headers. */ |
2469 | skb_reserve(skb, MAX_TCP_HEADER); | 2675 | skb_reserve(skb, MAX_TCP_HEADER); |
2470 | 2676 | ||
2471 | skb_dst_set(skb, dst_clone(dst)); | 2677 | skb_dst_set(skb, dst); |
2472 | 2678 | ||
2473 | mss = dst_metric_advmss(dst); | 2679 | mss = dst_metric_advmss(dst); |
2474 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2680 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
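Since @dst is now consumed on both the success and failure paths, callers no longer release it themselves; a hypothetical caller sketch (illustrative names only):

/* Hypothetical caller: dst ownership passes to tcp_make_synack(). */
static int example_send_synack(struct sock *sk, struct dst_entry *dst,
			       struct request_sock *req,
			       struct request_values *rvp)
{
	struct sk_buff *skb = tcp_make_synack(sk, dst, req, rvp);

	if (!skb)
		return -ENOMEM;	/* dst was already released inside */
	/* ... hand skb to the IP layer; no dst_release() by the caller ... */
	return 0;
}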
@@ -2645,6 +2851,109 @@ void tcp_connect_init(struct sock *sk) | |||
2645 | tcp_clear_retrans(tp); | 2851 | tcp_clear_retrans(tp); |
2646 | } | 2852 | } |
2647 | 2853 | ||
2854 | static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) | ||
2855 | { | ||
2856 | struct tcp_sock *tp = tcp_sk(sk); | ||
2857 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | ||
2858 | |||
2859 | tcb->end_seq += skb->len; | ||
2860 | skb_header_release(skb); | ||
2861 | __tcp_add_write_queue_tail(sk, skb); | ||
2862 | sk->sk_wmem_queued += skb->truesize; | ||
2863 | sk_mem_charge(sk, skb->truesize); | ||
2864 | tp->write_seq = tcb->end_seq; | ||
2865 | tp->packets_out += tcp_skb_pcount(skb); | ||
2866 | } | ||
2867 | |||
2868 | /* Build and send a SYN with data and (cached) Fast Open cookie. However, | ||
2869 | * queue a data-only packet after the regular SYN, such that regular SYNs | ||
2870 | * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges | ||
2871 | * only the SYN sequence, the data are retransmitted in the first ACK. | ||
2872 | * If the cookie is not cached or another error occurs, fall back to sending a | ||
2873 | * regular SYN with the Fast Open cookie request option. | ||
2874 | */ | ||
2875 | static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | ||
2876 | { | ||
2877 | struct tcp_sock *tp = tcp_sk(sk); | ||
2878 | struct tcp_fastopen_request *fo = tp->fastopen_req; | ||
2879 | int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; | ||
2880 | struct sk_buff *syn_data = NULL, *data; | ||
2881 | unsigned long last_syn_loss = 0; | ||
2882 | |||
2883 | tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ | ||
2884 | tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, | ||
2885 | &syn_loss, &last_syn_loss); | ||
2886 | /* Recurring FO SYN losses: revert to regular handshake temporarily */ | ||
2887 | if (syn_loss > 1 && | ||
2888 | time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { | ||
2889 | fo->cookie.len = -1; | ||
2890 | goto fallback; | ||
2891 | } | ||
2892 | |||
2893 | if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) | ||
2894 | fo->cookie.len = -1; | ||
2895 | else if (fo->cookie.len <= 0) | ||
2896 | goto fallback; | ||
2897 | |||
2898 | /* MSS for SYN-data is based on cached MSS and bounded by PMTU and | ||
2899 | * user-MSS. Reserve maximum option space for middleboxes that add | ||
2900 | * private TCP options. The cost is reduced data space in SYN :( | ||
2901 | */ | ||
2902 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) | ||
2903 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | ||
2904 | space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - | ||
2905 | MAX_TCP_OPTION_SPACE; | ||
2906 | |||
2907 | syn_data = skb_copy_expand(syn, skb_headroom(syn), space, | ||
2908 | sk->sk_allocation); | ||
2909 | if (syn_data == NULL) | ||
2910 | goto fallback; | ||
2911 | |||
2912 | for (i = 0; i < iovlen && syn_data->len < space; ++i) { | ||
2913 | struct iovec *iov = &fo->data->msg_iov[i]; | ||
2914 | unsigned char __user *from = iov->iov_base; | ||
2915 | int len = iov->iov_len; | ||
2916 | |||
2917 | if (syn_data->len + len > space) | ||
2918 | len = space - syn_data->len; | ||
2919 | else if (i + 1 == iovlen) | ||
2920 | /* No more data pending in inet_wait_for_connect() */ | ||
2921 | fo->data = NULL; | ||
2922 | |||
2923 | if (skb_add_data(syn_data, from, len)) | ||
2924 | goto fallback; | ||
2925 | } | ||
2926 | |||
2927 | /* Queue a data-only packet after the regular SYN for retransmission */ | ||
2928 | data = pskb_copy(syn_data, sk->sk_allocation); | ||
2929 | if (data == NULL) | ||
2930 | goto fallback; | ||
2931 | TCP_SKB_CB(data)->seq++; | ||
2932 | TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; | ||
2933 | TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); | ||
2934 | tcp_connect_queue_skb(sk, data); | ||
2935 | fo->copied = data->len; | ||
2936 | |||
2937 | if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { | ||
2938 | tp->syn_data = (fo->copied > 0); | ||
2939 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); | ||
2940 | goto done; | ||
2941 | } | ||
2942 | syn_data = NULL; | ||
2943 | |||
2944 | fallback: | ||
2945 | /* Send a regular SYN with Fast Open cookie request option */ | ||
2946 | if (fo->cookie.len > 0) | ||
2947 | fo->cookie.len = 0; | ||
2948 | err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); | ||
2949 | if (err) | ||
2950 | tp->syn_fastopen = 0; | ||
2951 | kfree_skb(syn_data); | ||
2952 | done: | ||
2953 | fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ | ||
2954 | return err; | ||
2955 | } | ||
2956 | |||
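For context, tcp_send_syn_data() is reached when connect-time data has been queued on the socket; a hypothetical userspace sketch, assuming the MSG_FASTOPEN flag introduced by the companion patches:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000	/* assumption: flag value used by the TFO series */
#endif

/* sendto() with MSG_FASTOPEN implies the connect(); the data reaches
 * tcp_sendmsg() with tp->fastopen_req set, so tcp_connect() takes the
 * tcp_send_syn_data() path above instead of sending a bare SYN.
 */
static ssize_t tfo_connect_and_send(int fd, const struct sockaddr_in *dst,
				    const void *buf, size_t len)
{
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}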
2648 | /* Build a SYN and send it off. */ | 2957 | /* Build a SYN and send it off. */ |
2649 | int tcp_connect(struct sock *sk) | 2958 | int tcp_connect(struct sock *sk) |
2650 | { | 2959 | { |
@@ -2662,17 +2971,13 @@ int tcp_connect(struct sock *sk) | |||
2662 | skb_reserve(buff, MAX_TCP_HEADER); | 2971 | skb_reserve(buff, MAX_TCP_HEADER); |
2663 | 2972 | ||
2664 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); | 2973 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); |
2974 | tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; | ||
2975 | tcp_connect_queue_skb(sk, buff); | ||
2665 | TCP_ECN_send_syn(sk, buff); | 2976 | TCP_ECN_send_syn(sk, buff); |
2666 | 2977 | ||
2667 | /* Send it off. */ | 2978 | /* Send off SYN; include data in Fast Open. */ |
2668 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 2979 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : |
2669 | tp->retrans_stamp = TCP_SKB_CB(buff)->when; | 2980 | tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); |
2670 | skb_header_release(buff); | ||
2671 | __tcp_add_write_queue_tail(sk, buff); | ||
2672 | sk->sk_wmem_queued += buff->truesize; | ||
2673 | sk_mem_charge(sk, buff->truesize); | ||
2674 | tp->packets_out += tcp_skb_pcount(buff); | ||
2675 | err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | ||
2676 | if (err == -ECONNREFUSED) | 2981 | if (err == -ECONNREFUSED) |
2677 | return err; | 2982 | return err; |
2678 | 2983 | ||