Diffstat (limited to 'net/ipv4/tcp_output.c')

 -rw-r--r--   net/ipv4/tcp_output.c   353
 1 file changed, 330 insertions(+), 23 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..3f1bcff0b10b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
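The 131072-byte default matches the comment above it: two maximally sized TSO frames of 65536 bytes each (2 × 65536 = 131072), so a flow can have roughly two full TSO segments sitting in the qdisc and device queues before it is throttled.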
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
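The forward declaration is needed because the TSQ tasklet handler added further down this file calls tcp_write_xmit(), which is defined later in the file.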
@@ -380,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5		(1 << 2)
 #define OPTION_WSCALE		(1 << 3)
 #define OPTION_COOKIE_EXTENSION	(1 << 4)
+#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
 
 struct tcp_out_options {
-	u8 options;		/* bit field of OPTION_* */
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
 	u8 ws;			/* window scale, 0 to disable */
 	u8 num_sack_blocks;	/* number of SACK blocks to include */
 	u8 hash_size;		/* bytes in hash_location */
-	u16 mss;		/* 0 to disable */
-	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
 };
 
 /* The sysctl int routines are generic, so check consistency here.
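The options field grows from u8 to u16 because OPTION_FAST_OPEN_COOKIE is bit 8, which no longer fits in eight bits; mss, tsval/tsecr and hash_location are reordered at the same time to keep the structure packed, and a pointer to the Fast Open cookie is carried alongside the other per-packet option data.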
@@ -437,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 			      struct tcp_out_options *opts)
 {
-	u8 options = opts->options;	/* mungable copy */
+	u16 options = opts->options;	/* mungable copy */
 
 	/* Having both authentication and cookies for security is redundant,
 	 * and there's certainly not enough room. Instead, the cookie-less
@@ -559,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 		tp->rx_opt.dsack = 0;
 	}
+
+	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+		*ptr++ = htonl((TCPOPT_EXP << 24) |
+			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+			       TCPOPT_FASTOPEN_MAGIC);
+
+		memcpy(ptr, foc->val, foc->len);
+		if ((foc->len & 3) == 2) {
+			u8 *align = ((u8 *)ptr) + foc->len;
+			align[0] = align[1] = TCPOPT_NOP;
+		}
+		ptr += (foc->len + 3) >> 2;
+	}
 }
 
 /* Compute TCP options for SYN packets. This is not the final
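The Fast Open cookie rides in an experimental TCP option. As an illustration only (TCPOPT_EXP, TCPOPT_FASTOPEN_MAGIC and TCPOLEN_EXP_FASTOPEN_BASE come from header changes outside this file, so treat the concrete numbers as assumptions): for a 6-byte cookie the block above emits kind 254, length 10 (the 4-byte base plus the cookie), the two magic bytes, the cookie itself, and two NOPs of padding because length 10 leaves the option two bytes short of a 32-bit boundary.

```c
/* Illustrative wire layout only; the kind, magic and base-length values
 * are assumptions taken from the companion header change, not this file.
 */
static const unsigned char fastopen_opt_example[] = {
	254, 4 + 6,		/* TCPOPT_EXP, TCPOLEN_EXP_FASTOPEN_BASE + foc->len */
	0xf9, 0x89,		/* TCPOPT_FASTOPEN_MAGIC, network byte order */
	0x01, 0x02, 0x03, 0x04, 0x05, 0x06,	/* 6-byte cookie */
	1, 1,			/* two TCPOPT_NOPs: the (foc->len & 3) == 2 case */
};
```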
@@ -574,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
 			 tcp_cookie_size_check(cvp->cookie_desired) :
 			 0;
+	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -614,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
+	if (fastopen && fastopen->cookie.len >= 0) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = &fastopen->cookie;
+			remaining -= need;
+			tp->syn_fastopen = 1;
+		}
+	}
 	/* Note that timestamps are required by the specification.
 	 *
 	 * Odd numbers of bytes are prohibited by the specification, ensuring
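For the common 8-byte cookie the space check works out to need = 4 + 8 = 12 bytes (assuming TCPOLEN_EXP_FASTOPEN_BASE is 4, as in the matching header change), already a multiple of four, so the option is included whenever at least 12 bytes of option space remain after the other SYN options have been accounted for.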
@@ -783,6 +816,156 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small number of skbs per tcp flow in the tx queues
+ * (qdisc + dev), to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
+ * the skb needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+	if ((1 << sk->sk_state) &
+	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
+		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+}
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head, because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			tcp_tsq_handler(sk);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
+			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
+			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
+			  (1UL << TCP_MTU_REDUCED_DEFERRED))
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nflags;
+
+	/* perform an atomic operation only if at least one flag is set */
+	do {
+		flags = tp->tsq_flags;
+		if (!(flags & TCP_DEFERRED_ALL))
+			return;
+		nflags = flags & ~TCP_DEFERRED_ALL;
+	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+	if (flags & (1UL << TCP_TSQ_DEFERRED))
+		tcp_tsq_handler(sk);
+
+	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+		tcp_write_timer_handler(sk);
+
+	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+		tcp_delack_timer_handler(sk);
+
+	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+		sk->sk_prot->mtu_reduced(sk);
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
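tcp_release_cb() exists so that work which cannot run while the socket is owned by a process (the TSQ tasklet above, the write and delayed-ACK timers, MTU reduction) can be recorded as a bit in tp->tsq_flags and replayed when release_sock() drops the lock. A minimal sketch of that deferral pattern, loosely modelled on the companion tcp_timer.c change from the same series (the function name here is illustrative, not taken from this diff):

```c
/* Sketch only: how a bottom-half handler hands its work to tcp_release_cb()
 * when the socket is currently locked by a user-space caller.
 */
static void example_deferring_handler(struct sock *sk)
{
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		tcp_write_timer_handler(sk);	/* safe to run right away */
	} else {
		/* delegate the work; tcp_release_cb() replays it once
		 * release_sock() is reached.
		 */
		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
	}
	bh_unlock_sock(sk);
	sock_put(sk);	/* drop the reference the timer was holding */
}
```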
@@ -844,7 +1027,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
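This effectively open-codes what skb_set_owner_w() did (orphan the skb, attach the socket, charge truesize to sk_wmem_alloc), but selects tcp_wfree as the destructor whenever the output-bytes limit is enabled, so TSQ can observe packets leaving the qdisc and driver queues; setting the sysctl to 0 falls back to plain sock_wfree.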
@@ -1780,6 +1968,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1989,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
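sk_wmem_alloc counts the truesize of every skb handed to tcp_transmit_skb() and not yet freed, so once roughly sysctl_tcp_limit_output_bytes worth of data (128 KB by default, skb overhead included) sits in the qdisc and driver queues, the loop stops and marks the flow TSQ_THROTTLED; tcp_wfree() later clears the throttle and schedules the per-CPU tasklet to resume transmission.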
@@ -1849,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 	if (unlikely(sk->sk_state == TCP_CLOSE))
 		return;
 
-	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+			   sk_gfp_atomic(sk, GFP_ATOMIC)))
 		tcp_check_probe_timer(sk);
 }
 
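sk_gfp_atomic() is introduced elsewhere in this series (the swap-over-network work, not part of this file); for these callers it behaves like GFP_ATOMIC with the socket's __GFP_MEMALLOC flag added when the socket is allowed to dip into the emergency reserves.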
@@ -2442,7 +2639,16 @@ int tcp_send_synack(struct sock *sk)
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
-/* Prepare a SYN-ACK. */
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ * rvp: request_values pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct request_values *rvp)
@@ -2461,14 +2667,16 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
-	if (skb == NULL)
+	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
+			sk_gfp_atomic(sk, GFP_ATOMIC));
+	if (unlikely(!skb)) {
+		dst_release(dst);
 		return NULL;
-
+	}
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
-	skb_dst_set(skb, dst_clone(dst));
+	skb_dst_set(skb, dst);
 
 	mss = dst_metric_advmss(dst);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
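Two behavioural changes are folded in here: the SYN-ACK skb is no longer charged to the listener socket (alloc_skb() replaces sock_wmalloc()), and the function now owns the dst it is given, releasing it on allocation failure and attaching it without an extra dst_clone() on success, which is why the new kerneldoc above warns that @dst is consumed.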
@@ -2645,6 +2853,109 @@ void tcp_connect_init(struct sock *sk)
 	tcp_clear_retrans(tp);
 }
 
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	tcb->end_seq += skb->len;
+	skb_header_release(skb);
+	__tcp_add_write_queue_tail(sk, skb);
+	sk->sk_wmem_queued += skb->truesize;
+	sk_mem_charge(sk, skb->truesize);
+	tp->write_seq = tcb->end_seq;
+	tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_request *fo = tp->fastopen_req;
+	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	struct sk_buff *syn_data = NULL, *data;
+	unsigned long last_syn_loss = 0;
+
+	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+			       &syn_loss, &last_syn_loss);
+	/* Recurring FO SYN losses: revert to regular handshake temporarily */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		fo->cookie.len = -1;
+		goto fallback;
+	}
+
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+		fo->cookie.len = -1;
+	else if (fo->cookie.len <= 0)
+		goto fallback;
+
+	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+	 * user-MSS. Reserve maximum option space for middleboxes that add
+	 * private TCP options. The cost is reduced data space in SYN :(
+	 */
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+		MAX_TCP_OPTION_SPACE;
+
+	syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+				   sk->sk_allocation);
+	if (syn_data == NULL)
+		goto fallback;
+
+	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+		struct iovec *iov = &fo->data->msg_iov[i];
+		unsigned char __user *from = iov->iov_base;
+		int len = iov->iov_len;
+
+		if (syn_data->len + len > space)
+			len = space - syn_data->len;
+		else if (i + 1 == iovlen)
+			/* No more data pending in inet_wait_for_connect() */
+			fo->data = NULL;
+
+		if (skb_add_data(syn_data, from, len))
+			goto fallback;
+	}
+
+	/* Queue a data-only packet after the regular SYN for retransmission */
+	data = pskb_copy(syn_data, sk->sk_allocation);
+	if (data == NULL)
+		goto fallback;
+	TCP_SKB_CB(data)->seq++;
+	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+	tcp_connect_queue_skb(sk, data);
+	fo->copied = data->len;
+
+	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		tp->syn_data = (fo->copied > 0);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		goto done;
+	}
+	syn_data = NULL;
+
+fallback:
+	/* Send a regular SYN with Fast Open cookie request option */
+	if (fo->cookie.len > 0)
+		fo->cookie.len = 0;
+	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+	if (err)
+		tp->syn_fastopen = 0;
+	kfree_skb(syn_data);
+done:
+	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
+	return err;
+}
+
 /* Build a SYN and send it off. */
 int tcp_connect(struct sock *sk)
 {
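From user space, the SYN-data path above is reached by handing data to the kernel before the connection exists. A hedged sketch follows, assuming the companion MSG_FASTOPEN sendmsg() support from the same patch series and a kernel with client-side net.ipv4.tcp_fastopen enabled; the helper name is illustrative.

```c
/* Userspace sketch: trigger a TCP Fast Open SYN with data.
 * Instead of connect() + send(), one sendto() call hands the payload to
 * tcp_sendmsg(), which records it in tp->fastopen_req; tcp_connect() then
 * calls tcp_send_syn_data() to place it in the SYN when a valid cookie is
 * cached, or falls back to a cookie-request SYN otherwise.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int fastopen_request(const char *ip, unsigned short port,
		     const void *buf, size_t len)
{
	struct sockaddr_in daddr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	memset(&daddr, 0, sizeof(daddr));
	daddr.sin_family = AF_INET;
	daddr.sin_port = htons(port);
	inet_pton(AF_INET, ip, &daddr.sin_addr);

	if (sendto(fd, buf, len, MSG_FASTOPEN,
		   (struct sockaddr *)&daddr, sizeof(daddr)) < 0)
		return -1;	/* caller should close(fd) and report errno */
	return fd;
}
```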
@@ -2662,17 +2973,13 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_connect_queue_skb(sk, buff);
 	TCP_ECN_send_syn(sk, buff);
 
-	/* Send it off. */
-	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
-	skb_header_release(buff);
-	__tcp_add_write_queue_tail(sk, buff);
-	sk->sk_wmem_queued += buff->truesize;
-	sk_mem_charge(sk, buff->truesize);
-	tp->packets_out += tcp_skb_pcount(buff);
-	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	/* Send off SYN; include data in Fast Open. */
+	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
 	if (err == -ECONNREFUSED)
 		return err;
 
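Queueing of the SYN now goes through tcp_connect_queue_skb(), so the SYN itself and the optional Fast Open data skb built by tcp_send_syn_data() share the same write-queue and memory-accounting path, and tcp_connect() simply picks between the two transmit paths based on whether tp->fastopen_req was set up by the MSG_FASTOPEN sendmsg() path added elsewhere in the series.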
@@ -2759,7 +3066,7 @@ void tcp_send_ack(struct sock *sk)
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
 	 */
-	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 	if (buff == NULL) {
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -2774,7 +3081,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
 
 /* This routine sends a packet with an out of date sequence
@@ -2794,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	struct sk_buff *skb;
 
 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
-	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 	if (skb == NULL)
 		return -1;
 
