Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--   net/ipv4/tcp_output.c   335
1 file changed, 177 insertions(+), 158 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 896e9dfbdb5c..22548b5f05cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
| @@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | |||
| 76 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; | 76 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
| 77 | 77 | ||
| 78 | tp->packets_out += tcp_skb_pcount(skb); | 78 | tp->packets_out += tcp_skb_pcount(skb); |
| 79 | if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | 79 | if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) |
| 80 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | ||
| 81 | tcp_rearm_rto(sk); | 80 | tcp_rearm_rto(sk); |
| 82 | } | ||
| 83 | 81 | ||
| 84 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, | 82 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, |
| 85 | tcp_skb_pcount(skb)); | 83 | tcp_skb_pcount(skb)); |
| 86 | } | 84 | } |
| 87 | 85 | ||
| 88 | /* SND.NXT, if window was not shrunk. | 86 | /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one |
| 87 | * window scaling factor due to loss of precision. | ||
| 89 | * If window has been shrunk, what should we make? It is not clear at all. | 88 | * If window has been shrunk, what should we make? It is not clear at all. |
| 90 | * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( | 89 | * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( |
| 91 | * Anything in between SND.UNA...SND.UNA+SND.WND also can be already | 90 | * Anything in between SND.UNA...SND.UNA+SND.WND also can be already |
| @@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk) | |||
| 95 | { | 94 | { |
| 96 | const struct tcp_sock *tp = tcp_sk(sk); | 95 | const struct tcp_sock *tp = tcp_sk(sk); |
| 97 | 96 | ||
| 98 | if (!before(tcp_wnd_end(tp), tp->snd_nxt)) | 97 | if (!before(tcp_wnd_end(tp), tp->snd_nxt) || |
| 98 | (tp->rx_opt.wscale_ok && | ||
| 99 | ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) | ||
| 99 | return tp->snd_nxt; | 100 | return tp->snd_nxt; |
| 100 | else | 101 | else |
| 101 | return tcp_wnd_end(tp); | 102 | return tcp_wnd_end(tp); |
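The relaxed tcp_acceptable_seq() test above tolerates a small overshoot because a scaled window travels on the wire as win >> wscale, so up to (1 << wscale) - 1 bytes are rounded away. A standalone sketch of that arithmetic (illustrative values, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t wscale = 7;                                   /* example scale factor */
            uint32_t real_win = 100000;                            /* window before scaling */
            uint32_t advertised = (real_win >> wscale) << wscale;  /* value recovered from the wire */

            /* A sequence may sit past our idea of the window end by up to
             * (1 << wscale) - 1 bytes and still be inside the real window.
             */
            printf("real=%u advertised=%u lost=%u (< %u)\n",
                   real_win, advertised, real_win - advertised, 1u << wscale);
            return 0;
    }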
| @@ -640,7 +641,7 @@ static unsigned int tcp_synack_options(struct request_sock *req, | |||
| 640 | } | 641 | } |
| 641 | if (likely(ireq->tstamp_ok)) { | 642 | if (likely(ireq->tstamp_ok)) { |
| 642 | opts->options |= OPTION_TS; | 643 | opts->options |= OPTION_TS; |
| 643 | opts->tsval = tcp_skb_timestamp(skb); | 644 | opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; |
| 644 | opts->tsecr = req->ts_recent; | 645 | opts->tsecr = req->ts_recent; |
| 645 | remaining -= TCPOLEN_TSTAMP_ALIGNED; | 646 | remaining -= TCPOLEN_TSTAMP_ALIGNED; |
| 646 | } | 647 | } |
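Adding tcp_rsk(req)->ts_off to the TSval here means the timestamp placed on the wire is the local timestamp clock plus a per-connection offset, so raw clock values are not exposed directly. A minimal sketch of the idea, with made-up values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t tcp_clock = 123456;     /* stand-in for tcp_skb_timestamp(skb) */
            uint32_t ts_off    = 0x1b873593; /* stand-in for the per-connection offset */

            /* 32-bit wraparound is fine: the peer only echoes the value back. */
            printf("on-wire TSval = %u\n", tcp_clock + ts_off);
            return 0;
    }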
| @@ -769,25 +770,27 @@ static void tcp_tasklet_func(unsigned long data) | |||
| 769 | list_del(&tp->tsq_node); | 770 | list_del(&tp->tsq_node); |
| 770 | 771 | ||
| 771 | sk = (struct sock *)tp; | 772 | sk = (struct sock *)tp; |
| 772 | bh_lock_sock(sk); | 773 | smp_mb__before_atomic(); |
| 773 | 774 | clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); | |
| 774 | if (!sock_owned_by_user(sk)) { | 775 | |
| 775 | tcp_tsq_handler(sk); | 776 | if (!sk->sk_lock.owned && |
| 776 | } else { | 777 | test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { |
| 777 | /* defer the work to tcp_release_cb() */ | 778 | bh_lock_sock(sk); |
| 778 | set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); | 779 | if (!sock_owned_by_user(sk)) { |
| 780 | clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); | ||
| 781 | tcp_tsq_handler(sk); | ||
| 782 | } | ||
| 783 | bh_unlock_sock(sk); | ||
| 779 | } | 784 | } |
| 780 | bh_unlock_sock(sk); | ||
| 781 | 785 | ||
| 782 | clear_bit(TSQ_QUEUED, &tp->tsq_flags); | ||
| 783 | sk_free(sk); | 786 | sk_free(sk); |
| 784 | } | 787 | } |
| 785 | } | 788 | } |
| 786 | 789 | ||
| 787 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ | 790 | #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ |
| 788 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ | 791 | TCPF_WRITE_TIMER_DEFERRED | \ |
| 789 | (1UL << TCP_DELACK_TIMER_DEFERRED) | \ | 792 | TCPF_DELACK_TIMER_DEFERRED | \ |
| 790 | (1UL << TCP_MTU_REDUCED_DEFERRED)) | 793 | TCPF_MTU_REDUCED_DEFERRED) |
| 791 | /** | 794 | /** |
| 792 | * tcp_release_cb - tcp release_sock() callback | 795 | * tcp_release_cb - tcp release_sock() callback |
| 793 | * @sk: socket | 796 | * @sk: socket |
| @@ -797,18 +800,17 @@ static void tcp_tasklet_func(unsigned long data) | |||
| 797 | */ | 800 | */ |
| 798 | void tcp_release_cb(struct sock *sk) | 801 | void tcp_release_cb(struct sock *sk) |
| 799 | { | 802 | { |
| 800 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 801 | unsigned long flags, nflags; | 803 | unsigned long flags, nflags; |
| 802 | 804 | ||
| 803 | /* perform an atomic operation only if at least one flag is set */ | 805 | /* perform an atomic operation only if at least one flag is set */ |
| 804 | do { | 806 | do { |
| 805 | flags = tp->tsq_flags; | 807 | flags = sk->sk_tsq_flags; |
| 806 | if (!(flags & TCP_DEFERRED_ALL)) | 808 | if (!(flags & TCP_DEFERRED_ALL)) |
| 807 | return; | 809 | return; |
| 808 | nflags = flags & ~TCP_DEFERRED_ALL; | 810 | nflags = flags & ~TCP_DEFERRED_ALL; |
| 809 | } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); | 811 | } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); |
| 810 | 812 | ||
| 811 | if (flags & (1UL << TCP_TSQ_DEFERRED)) | 813 | if (flags & TCPF_TSQ_DEFERRED) |
| 812 | tcp_tsq_handler(sk); | 814 | tcp_tsq_handler(sk); |
| 813 | 815 | ||
| 814 | /* Here begins the tricky part : | 816 | /* Here begins the tricky part : |
| @@ -822,15 +824,15 @@ void tcp_release_cb(struct sock *sk) | |||
| 822 | */ | 824 | */ |
| 823 | sock_release_ownership(sk); | 825 | sock_release_ownership(sk); |
| 824 | 826 | ||
| 825 | if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { | 827 | if (flags & TCPF_WRITE_TIMER_DEFERRED) { |
| 826 | tcp_write_timer_handler(sk); | 828 | tcp_write_timer_handler(sk); |
| 827 | __sock_put(sk); | 829 | __sock_put(sk); |
| 828 | } | 830 | } |
| 829 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { | 831 | if (flags & TCPF_DELACK_TIMER_DEFERRED) { |
| 830 | tcp_delack_timer_handler(sk); | 832 | tcp_delack_timer_handler(sk); |
| 831 | __sock_put(sk); | 833 | __sock_put(sk); |
| 832 | } | 834 | } |
| 833 | if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { | 835 | if (flags & TCPF_MTU_REDUCED_DEFERRED) { |
| 834 | inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); | 836 | inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); |
| 835 | __sock_put(sk); | 837 | __sock_put(sk); |
| 836 | } | 838 | } |
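tcp_release_cb() keeps the same grab-and-clear pattern while switching from tp->tsq_flags to sk->sk_tsq_flags and from open-coded shifts to the TCPF_* masks. A userspace sketch of that pattern, with illustrative flag values rather than the kernel's:

    #include <stdio.h>

    #define F_TSQ    (1UL << 0)   /* illustrative values, not the kernel's TCPF_* bits */
    #define F_WRITE  (1UL << 1)
    #define F_DELACK (1UL << 2)
    #define F_MTU    (1UL << 3)
    #define F_ALL    (F_TSQ | F_WRITE | F_DELACK | F_MTU)

    static unsigned long tsq_flags = F_WRITE | F_DELACK;

    int main(void)
    {
            unsigned long flags, nflags;

            /* Grab and clear every deferred bit in one atomic step... */
            do {
                    flags = tsq_flags;
                    if (!(flags & F_ALL))
                            return 0;
                    nflags = flags & ~F_ALL;
            } while (__sync_val_compare_and_swap(&tsq_flags, flags, nflags) != flags);

            /* ...then run each deferred handler exactly once. */
            if (flags & F_WRITE)
                    printf("run the write-timer handler\n");
            if (flags & F_DELACK)
                    printf("run the delayed-ack handler\n");
            return 0;
    }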
| @@ -860,6 +862,7 @@ void tcp_wfree(struct sk_buff *skb) | |||
| 860 | { | 862 | { |
| 861 | struct sock *sk = skb->sk; | 863 | struct sock *sk = skb->sk; |
| 862 | struct tcp_sock *tp = tcp_sk(sk); | 864 | struct tcp_sock *tp = tcp_sk(sk); |
| 865 | unsigned long flags, nval, oval; | ||
| 863 | int wmem; | 866 | int wmem; |
| 864 | 867 | ||
| 865 | /* Keep one reference on sk_wmem_alloc. | 868 | /* Keep one reference on sk_wmem_alloc. |
| @@ -877,16 +880,25 @@ void tcp_wfree(struct sk_buff *skb) | |||
| 877 | if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) | 880 | if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) |
| 878 | goto out; | 881 | goto out; |
| 879 | 882 | ||
| 880 | if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && | 883 | for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { |
| 881 | !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { | ||
| 882 | unsigned long flags; | ||
| 883 | struct tsq_tasklet *tsq; | 884 | struct tsq_tasklet *tsq; |
| 885 | bool empty; | ||
| 886 | |||
| 887 | if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) | ||
| 888 | goto out; | ||
| 889 | |||
| 890 | nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; | ||
| 891 | nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); | ||
| 892 | if (nval != oval) | ||
| 893 | continue; | ||
| 884 | 894 | ||
| 885 | /* queue this socket to tasklet queue */ | 895 | /* queue this socket to tasklet queue */ |
| 886 | local_irq_save(flags); | 896 | local_irq_save(flags); |
| 887 | tsq = this_cpu_ptr(&tsq_tasklet); | 897 | tsq = this_cpu_ptr(&tsq_tasklet); |
| 898 | empty = list_empty(&tsq->head); | ||
| 888 | list_add(&tp->tsq_node, &tsq->head); | 899 | list_add(&tp->tsq_node, &tsq->head); |
| 889 | tasklet_schedule(&tsq->tasklet); | 900 | if (empty) |
| 901 | tasklet_schedule(&tsq->tasklet); | ||
| 890 | local_irq_restore(flags); | 902 | local_irq_restore(flags); |
| 891 | return; | 903 | return; |
| 892 | } | 904 | } |
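The rewritten tcp_wfree() replaces the test_and_clear_bit()/test_and_set_bit() pair with one cmpxchg loop that moves the socket from THROTTLED to QUEUED (and marks TSQ work deferred) in a single atomic transition, and it only schedules the tasklet when the per-CPU list was empty. A simplified userspace model of that transition (flag values and helpers are stand-ins, not kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    #define TSQF_THROTTLED    (1UL << 0)  /* illustrative bit positions */
    #define TSQF_QUEUED       (1UL << 1)
    #define TCPF_TSQ_DEFERRED (1UL << 2)

    static unsigned long sk_tsq_flags = TSQF_THROTTLED;

    static bool try_queue(void)
    {
            unsigned long oval, nval;

            for (oval = sk_tsq_flags;; oval = nval) {
                    /* Only a throttled, not-yet-queued socket is queued. */
                    if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
                            return false;

                    nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
                    nval = __sync_val_compare_and_swap(&sk_tsq_flags, oval, nval);
                    if (nval == oval)
                            return true;   /* we won the race; add to the tasklet list */
            }
    }

    int main(void)
    {
            printf("queued=%d, flags now %#lx\n", try_queue(), sk_tsq_flags);
            return 0;
    }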
| @@ -955,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 955 | */ | 967 | */ |
| 956 | skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); | 968 | skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); |
| 957 | 969 | ||
| 970 | /* If we had to use memory reserve to allocate this skb, | ||
| 971 | * this might cause drops if packet is looped back : | ||
| 972 | * Other socket might not have SOCK_MEMALLOC. | ||
| 973 | * Packets not looped back do not care about pfmemalloc. | ||
| 974 | */ | ||
| 975 | skb->pfmemalloc = 0; | ||
| 976 | |||
| 958 | skb_push(skb, tcp_header_size); | 977 | skb_push(skb, tcp_header_size); |
| 959 | skb_reset_transport_header(skb); | 978 | skb_reset_transport_header(skb); |
| 960 | 979 | ||
| @@ -964,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 964 | skb_set_hash_from_sk(skb, sk); | 983 | skb_set_hash_from_sk(skb, sk); |
| 965 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | 984 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); |
| 966 | 985 | ||
| 986 | skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); | ||
| 987 | |||
| 967 | /* Build TCP header and checksum it. */ | 988 | /* Build TCP header and checksum it. */ |
| 968 | th = (struct tcphdr *)skb->data; | 989 | th = (struct tcphdr *)skb->data; |
| 969 | th->source = inet->inet_sport; | 990 | th->source = inet->inet_sport; |
| @@ -1027,7 +1048,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1027 | skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); | 1048 | skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); |
| 1028 | 1049 | ||
| 1029 | /* Our usage of tstamp should remain private */ | 1050 | /* Our usage of tstamp should remain private */ |
| 1030 | skb->tstamp.tv64 = 0; | 1051 | skb->tstamp = 0; |
| 1031 | 1052 | ||
| 1032 | /* Cleanup our debris for IP stacks */ | 1053 | /* Cleanup our debris for IP stacks */ |
| 1033 | memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), | 1054 | memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), |
| @@ -1514,6 +1535,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) | |||
| 1514 | if (sysctl_tcp_slow_start_after_idle && | 1535 | if (sysctl_tcp_slow_start_after_idle && |
| 1515 | (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) | 1536 | (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) |
| 1516 | tcp_cwnd_application_limited(sk); | 1537 | tcp_cwnd_application_limited(sk); |
| 1538 | |||
| 1539 | /* The following conditions together indicate the starvation | ||
| 1540 | * is caused by insufficient sender buffer: | ||
| 1541 | * 1) just sent some data (see tcp_write_xmit) | ||
| 1542 | * 2) not cwnd limited (this else condition) | ||
| 1543 | * 3) no more data to send (null tcp_send_head ) | ||
| 1544 | * 4) application is hitting buffer limit (SOCK_NOSPACE) | ||
| 1545 | */ | ||
| 1546 | if (!tcp_send_head(sk) && sk->sk_socket && | ||
| 1547 | test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && | ||
| 1548 | (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | ||
| 1549 | tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); | ||
| 1517 | } | 1550 | } |
| 1518 | } | 1551 | } |
| 1519 | 1552 | ||
| @@ -1910,26 +1943,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) | |||
| 1910 | */ | 1943 | */ |
| 1911 | static int tcp_mtu_probe(struct sock *sk) | 1944 | static int tcp_mtu_probe(struct sock *sk) |
| 1912 | { | 1945 | { |
| 1913 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1914 | struct inet_connection_sock *icsk = inet_csk(sk); | 1946 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1947 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1915 | struct sk_buff *skb, *nskb, *next; | 1948 | struct sk_buff *skb, *nskb, *next; |
| 1916 | struct net *net = sock_net(sk); | 1949 | struct net *net = sock_net(sk); |
| 1917 | int len; | ||
| 1918 | int probe_size; | 1950 | int probe_size; |
| 1919 | int size_needed; | 1951 | int size_needed; |
| 1920 | int copy; | 1952 | int copy, len; |
| 1921 | int mss_now; | 1953 | int mss_now; |
| 1922 | int interval; | 1954 | int interval; |
| 1923 | 1955 | ||
| 1924 | /* Not currently probing/verifying, | 1956 | /* Not currently probing/verifying, |
| 1925 | * not in recovery, | 1957 | * not in recovery, |
| 1926 | * have enough cwnd, and | 1958 | * have enough cwnd, and |
| 1927 | * not SACKing (the variable headers throw things off) */ | 1959 | * not SACKing (the variable headers throw things off) |
| 1928 | if (!icsk->icsk_mtup.enabled || | 1960 | */ |
| 1929 | icsk->icsk_mtup.probe_size || | 1961 | if (likely(!icsk->icsk_mtup.enabled || |
| 1930 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open || | 1962 | icsk->icsk_mtup.probe_size || |
| 1931 | tp->snd_cwnd < 11 || | 1963 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open || |
| 1932 | tp->rx_opt.num_sacks || tp->rx_opt.dsack) | 1964 | tp->snd_cwnd < 11 || |
| 1965 | tp->rx_opt.num_sacks || tp->rx_opt.dsack)) | ||
| 1933 | return -1; | 1966 | return -1; |
| 1934 | 1967 | ||
| 1935 | /* Use binary search for probe_size between tcp_mss_base, | 1968 | /* Use binary search for probe_size between tcp_mss_base, |
| @@ -2069,7 +2102,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, | |||
| 2069 | limit <<= factor; | 2102 | limit <<= factor; |
| 2070 | 2103 | ||
| 2071 | if (atomic_read(&sk->sk_wmem_alloc) > limit) { | 2104 | if (atomic_read(&sk->sk_wmem_alloc) > limit) { |
| 2072 | set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); | 2105 | /* Always send the 1st or 2nd skb in write queue. |
| 2106 | * No need to wait for TX completion to call us back, | ||
| 2107 | * after softirq/tasklet schedule. | ||
| 2108 | * This helps when TX completions are delayed too much. | ||
| 2109 | */ | ||
| 2110 | if (skb == sk->sk_write_queue.next || | ||
| 2111 | skb->prev == sk->sk_write_queue.next) | ||
| 2112 | return false; | ||
| 2113 | |||
| 2114 | set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); | ||
| 2073 | /* It is possible TX completion already happened | 2115 | /* It is possible TX completion already happened |
| 2074 | * before we set TSQ_THROTTLED, so we must | 2116 | * before we set TSQ_THROTTLED, so we must |
| 2075 | * test again the condition. | 2117 | * test again the condition. |
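The new escape hatch above lets the first or second skb of the write queue bypass the TSQ limit, so a flow whose TX completions are delayed does not stall indefinitely. A small userspace sketch of the decision, using a simplified doubly linked list in place of sk_write_queue:

    #include <stdbool.h>
    #include <stdio.h>

    struct skb { struct skb *prev, *next; };

    /* head is the list anchor, so head->next is the first queued skb. */
    static bool tsq_would_defer(const struct skb *head, const struct skb *skb,
                                unsigned int wmem, unsigned int limit)
    {
            if (wmem <= limit)
                    return false;                 /* under the limit: send now */
            if (skb == head->next || skb->prev == head->next)
                    return false;                 /* 1st or 2nd skb: send anyway */
            return true;                          /* throttle and wait for TX completion */
    }

    int main(void)
    {
            struct skb head, s1, s2, s3;

            head.next = &s1; s1.prev = &head; s1.next = &s2;
            s2.prev = &s1;   s2.next = &s3;
            s3.prev = &s2;   s3.next = &head; head.prev = &s3;

            printf("first skb deferred: %d\n", tsq_would_defer(&head, &s1, 100000, 1000));
            printf("third skb deferred: %d\n", tsq_would_defer(&head, &s3, 100000, 1000));
            return 0;
    }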
| @@ -2081,6 +2123,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, | |||
| 2081 | return false; | 2123 | return false; |
| 2082 | } | 2124 | } |
| 2083 | 2125 | ||
| 2126 | static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) | ||
| 2127 | { | ||
| 2128 | const u32 now = tcp_time_stamp; | ||
| 2129 | |||
| 2130 | if (tp->chrono_type > TCP_CHRONO_UNSPEC) | ||
| 2131 | tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; | ||
| 2132 | tp->chrono_start = now; | ||
| 2133 | tp->chrono_type = new; | ||
| 2134 | } | ||
| 2135 | |||
| 2136 | void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) | ||
| 2137 | { | ||
| 2138 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2139 | |||
| 2140 | /* If there are multiple conditions worthy of tracking in a | ||
| 2141 | * chronograph then the highest priority enum takes precedence | ||
| 2142 | * over the other conditions. So that if something "more interesting" | ||
| 2143 | * starts happening, stop the previous chrono and start a new one. | ||
| 2144 | */ | ||
| 2145 | if (type > tp->chrono_type) | ||
| 2146 | tcp_chrono_set(tp, type); | ||
| 2147 | } | ||
| 2148 | |||
| 2149 | void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) | ||
| 2150 | { | ||
| 2151 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2152 | |||
| 2153 | |||
| 2154 | /* There are multiple conditions worthy of tracking in a | ||
| 2155 | * chronograph, so that the highest priority enum takes | ||
| 2156 | * precedence over the other conditions (see tcp_chrono_start). | ||
| 2157 | * If a condition stops, we only stop chrono tracking if | ||
| 2158 | * it's the "most interesting" or current chrono we are | ||
| 2159 | * tracking and starts busy chrono if we have pending data. | ||
| 2160 | */ | ||
| 2161 | if (tcp_write_queue_empty(sk)) | ||
| 2162 | tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); | ||
| 2163 | else if (type == tp->chrono_type) | ||
| 2164 | tcp_chrono_set(tp, TCP_CHRONO_BUSY); | ||
| 2165 | } | ||
| 2166 | |||
| 2084 | /* This routine writes packets to the network. It advances the | 2167 | /* This routine writes packets to the network. It advances the |
| 2085 | * send_head. This happens as incoming acks open up the remote | 2168 | * send_head. This happens as incoming acks open up the remote |
| 2086 | * window for us. | 2169 | * window for us. |
| @@ -2103,7 +2186,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 2103 | unsigned int tso_segs, sent_pkts; | 2186 | unsigned int tso_segs, sent_pkts; |
| 2104 | int cwnd_quota; | 2187 | int cwnd_quota; |
| 2105 | int result; | 2188 | int result; |
| 2106 | bool is_cwnd_limited = false; | 2189 | bool is_cwnd_limited = false, is_rwnd_limited = false; |
| 2107 | u32 max_segs; | 2190 | u32 max_segs; |
| 2108 | 2191 | ||
| 2109 | sent_pkts = 0; | 2192 | sent_pkts = 0; |
| @@ -2140,8 +2223,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 2140 | break; | 2223 | break; |
| 2141 | } | 2224 | } |
| 2142 | 2225 | ||
| 2143 | if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) | 2226 | if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { |
| 2227 | is_rwnd_limited = true; | ||
| 2144 | break; | 2228 | break; |
| 2229 | } | ||
| 2145 | 2230 | ||
| 2146 | if (tso_segs == 1) { | 2231 | if (tso_segs == 1) { |
| 2147 | if (unlikely(!tcp_nagle_test(tp, skb, mss_now, | 2232 | if (unlikely(!tcp_nagle_test(tp, skb, mss_now, |
| @@ -2167,6 +2252,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 2167 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | 2252 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) |
| 2168 | break; | 2253 | break; |
| 2169 | 2254 | ||
| 2255 | if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) | ||
| 2256 | clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); | ||
| 2170 | if (tcp_small_queue_check(sk, skb, 0)) | 2257 | if (tcp_small_queue_check(sk, skb, 0)) |
| 2171 | break; | 2258 | break; |
| 2172 | 2259 | ||
| @@ -2186,6 +2273,11 @@ repair: | |||
| 2186 | break; | 2273 | break; |
| 2187 | } | 2274 | } |
| 2188 | 2275 | ||
| 2276 | if (is_rwnd_limited) | ||
| 2277 | tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); | ||
| 2278 | else | ||
| 2279 | tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); | ||
| 2280 | |||
| 2189 | if (likely(sent_pkts)) { | 2281 | if (likely(sent_pkts)) { |
| 2190 | if (tcp_in_cwnd_reduction(sk)) | 2282 | if (tcp_in_cwnd_reduction(sk)) |
| 2191 | tp->prr_out += sent_pkts; | 2283 | tp->prr_out += sent_pkts; |
| @@ -2207,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
| 2207 | u32 timeout, tlp_time_stamp, rto_time_stamp; | 2299 | u32 timeout, tlp_time_stamp, rto_time_stamp; |
| 2208 | u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); | 2300 | u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); |
| 2209 | 2301 | ||
| 2210 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) | ||
| 2211 | return false; | ||
| 2212 | /* No consecutive loss probes. */ | 2302 | /* No consecutive loss probes. */ |
| 2213 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { | 2303 | if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { |
| 2214 | tcp_rearm_rto(sk); | 2304 | tcp_rearm_rto(sk); |
| @@ -2227,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
| 2227 | /* Schedule a loss probe in 2*RTT for SACK capable connections | 2317 | /* Schedule a loss probe in 2*RTT for SACK capable connections |
| 2228 | * in Open state, that are either limited by cwnd or application. | 2318 | * in Open state, that are either limited by cwnd or application. |
| 2229 | */ | 2319 | */ |
| 2230 | if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || | 2320 | if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || |
| 2231 | !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | 2321 | !tp->packets_out || !tcp_is_sack(tp) || |
| 2322 | icsk->icsk_ca_state != TCP_CA_Open) | ||
| 2232 | return false; | 2323 | return false; |
| 2233 | 2324 | ||
| 2234 | if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && | 2325 | if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && |
| @@ -2436,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk) | |||
| 2436 | int full_space = min_t(int, tp->window_clamp, allowed_space); | 2527 | int full_space = min_t(int, tp->window_clamp, allowed_space); |
| 2437 | int window; | 2528 | int window; |
| 2438 | 2529 | ||
| 2439 | if (mss > full_space) | 2530 | if (unlikely(mss > full_space)) { |
| 2440 | mss = full_space; | 2531 | mss = full_space; |
| 2441 | 2532 | if (mss <= 0) | |
| 2533 | return 0; | ||
| 2534 | } | ||
| 2442 | if (free_space < (full_space >> 1)) { | 2535 | if (free_space < (full_space >> 1)) { |
| 2443 | icsk->icsk_ack.quick = 0; | 2536 | icsk->icsk_ack.quick = 0; |
| 2444 | 2537 | ||
| @@ -2514,7 +2607,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, | |||
| 2514 | } | 2607 | } |
| 2515 | 2608 | ||
| 2516 | /* Collapses two adjacent SKB's during retransmission. */ | 2609 | /* Collapses two adjacent SKB's during retransmission. */ |
| 2517 | static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | 2610 | static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) |
| 2518 | { | 2611 | { |
| 2519 | struct tcp_sock *tp = tcp_sk(sk); | 2612 | struct tcp_sock *tp = tcp_sk(sk); |
| 2520 | struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); | 2613 | struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); |
| @@ -2525,13 +2618,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
| 2525 | 2618 | ||
| 2526 | BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); | 2619 | BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); |
| 2527 | 2620 | ||
| 2621 | if (next_skb_size) { | ||
| 2622 | if (next_skb_size <= skb_availroom(skb)) | ||
| 2623 | skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), | ||
| 2624 | next_skb_size); | ||
| 2625 | else if (!skb_shift(skb, next_skb, next_skb_size)) | ||
| 2626 | return false; | ||
| 2627 | } | ||
| 2528 | tcp_highest_sack_combine(sk, next_skb, skb); | 2628 | tcp_highest_sack_combine(sk, next_skb, skb); |
| 2529 | 2629 | ||
| 2530 | tcp_unlink_write_queue(next_skb, sk); | 2630 | tcp_unlink_write_queue(next_skb, sk); |
| 2531 | 2631 | ||
| 2532 | skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), | ||
| 2533 | next_skb_size); | ||
| 2534 | |||
| 2535 | if (next_skb->ip_summed == CHECKSUM_PARTIAL) | 2632 | if (next_skb->ip_summed == CHECKSUM_PARTIAL) |
| 2536 | skb->ip_summed = CHECKSUM_PARTIAL; | 2633 | skb->ip_summed = CHECKSUM_PARTIAL; |
| 2537 | 2634 | ||
| @@ -2560,6 +2657,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
| 2560 | tcp_skb_collapse_tstamp(skb, next_skb); | 2657 | tcp_skb_collapse_tstamp(skb, next_skb); |
| 2561 | 2658 | ||
| 2562 | sk_wmem_free_skb(sk, next_skb); | 2659 | sk_wmem_free_skb(sk, next_skb); |
| 2660 | return true; | ||
| 2563 | } | 2661 | } |
| 2564 | 2662 | ||
| 2565 | /* Check if coalescing SKBs is legal. */ | 2663 | /* Check if coalescing SKBs is legal. */ |
| @@ -2567,14 +2665,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) | |||
| 2567 | { | 2665 | { |
| 2568 | if (tcp_skb_pcount(skb) > 1) | 2666 | if (tcp_skb_pcount(skb) > 1) |
| 2569 | return false; | 2667 | return false; |
| 2570 | /* TODO: SACK collapsing could be used to remove this condition */ | ||
| 2571 | if (skb_shinfo(skb)->nr_frags != 0) | ||
| 2572 | return false; | ||
| 2573 | if (skb_cloned(skb)) | 2668 | if (skb_cloned(skb)) |
| 2574 | return false; | 2669 | return false; |
| 2575 | if (skb == tcp_send_head(sk)) | 2670 | if (skb == tcp_send_head(sk)) |
| 2576 | return false; | 2671 | return false; |
| 2577 | /* Some heurestics for collapsing over SACK'd could be invented */ | 2672 | /* Some heuristics for collapsing over SACK'd could be invented */ |
| 2578 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) | 2673 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
| 2579 | return false; | 2674 | return false; |
| 2580 | 2675 | ||
| @@ -2612,16 +2707,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
| 2612 | 2707 | ||
| 2613 | if (space < 0) | 2708 | if (space < 0) |
| 2614 | break; | 2709 | break; |
| 2615 | /* Punt if not enough space exists in the first SKB for | ||
| 2616 | * the data in the second | ||
| 2617 | */ | ||
| 2618 | if (skb->len > skb_availroom(to)) | ||
| 2619 | break; | ||
| 2620 | 2710 | ||
| 2621 | if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) | 2711 | if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) |
| 2622 | break; | 2712 | break; |
| 2623 | 2713 | ||
| 2624 | tcp_collapse_retrans(sk, to); | 2714 | if (!tcp_collapse_retrans(sk, to)) |
| 2715 | break; | ||
| 2625 | } | 2716 | } |
| 2626 | } | 2717 | } |
| 2627 | 2718 | ||
| @@ -2694,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
| 2694 | if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) | 2785 | if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) |
| 2695 | tcp_ecn_clear_syn(sk, skb); | 2786 | tcp_ecn_clear_syn(sk, skb); |
| 2696 | 2787 | ||
| 2788 | /* Update global and local TCP statistics. */ | ||
| 2789 | segs = tcp_skb_pcount(skb); | ||
| 2790 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); | ||
| 2791 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) | ||
| 2792 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); | ||
| 2793 | tp->total_retrans += segs; | ||
| 2794 | |||
| 2697 | /* make sure skb->data is aligned on arches that require it | 2795 | /* make sure skb->data is aligned on arches that require it |
| 2698 | * and check if ack-trimming & collapsing extended the headroom | 2796 | * and check if ack-trimming & collapsing extended the headroom |
| 2699 | * beyond what csum_start can cover. | 2797 | * beyond what csum_start can cover. |
| @@ -2711,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
| 2711 | } | 2809 | } |
| 2712 | 2810 | ||
| 2713 | if (likely(!err)) { | 2811 | if (likely(!err)) { |
| 2714 | segs = tcp_skb_pcount(skb); | ||
| 2715 | |||
| 2716 | TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; | 2812 | TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; |
| 2717 | /* Update global TCP statistics. */ | 2813 | } else if (err != -EBUSY) { |
| 2718 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); | 2814 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); |
| 2719 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) | ||
| 2720 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); | ||
| 2721 | tp->total_retrans += segs; | ||
| 2722 | } | 2815 | } |
| 2723 | return err; | 2816 | return err; |
| 2724 | } | 2817 | } |
| @@ -2741,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
| 2741 | if (!tp->retrans_stamp) | 2834 | if (!tp->retrans_stamp) |
| 2742 | tp->retrans_stamp = tcp_skb_timestamp(skb); | 2835 | tp->retrans_stamp = tcp_skb_timestamp(skb); |
| 2743 | 2836 | ||
| 2744 | } else if (err != -EBUSY) { | ||
| 2745 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); | ||
| 2746 | } | 2837 | } |
| 2747 | 2838 | ||
| 2748 | if (tp->undo_retrans < 0) | 2839 | if (tp->undo_retrans < 0) |
| @@ -2751,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
| 2751 | return err; | 2842 | return err; |
| 2752 | } | 2843 | } |
| 2753 | 2844 | ||
| 2754 | /* Check if we forward retransmits are possible in the current | ||
| 2755 | * window/congestion state. | ||
| 2756 | */ | ||
| 2757 | static bool tcp_can_forward_retransmit(struct sock *sk) | ||
| 2758 | { | ||
| 2759 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2760 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2761 | |||
| 2762 | /* Forward retransmissions are possible only during Recovery. */ | ||
| 2763 | if (icsk->icsk_ca_state != TCP_CA_Recovery) | ||
| 2764 | return false; | ||
| 2765 | |||
| 2766 | /* No forward retransmissions in Reno are possible. */ | ||
| 2767 | if (tcp_is_reno(tp)) | ||
| 2768 | return false; | ||
| 2769 | |||
| 2770 | /* Yeah, we have to make difficult choice between forward transmission | ||
| 2771 | * and retransmission... Both ways have their merits... | ||
| 2772 | * | ||
| 2773 | * For now we do not retransmit anything, while we have some new | ||
| 2774 | * segments to send. In the other cases, follow rule 3 for | ||
| 2775 | * NextSeg() specified in RFC3517. | ||
| 2776 | */ | ||
| 2777 | |||
| 2778 | if (tcp_may_send_now(sk)) | ||
| 2779 | return false; | ||
| 2780 | |||
| 2781 | return true; | ||
| 2782 | } | ||
| 2783 | |||
| 2784 | /* This gets called after a retransmit timeout, and the initially | 2845 | /* This gets called after a retransmit timeout, and the initially |
| 2785 | * retransmitted data is acknowledged. It tries to continue | 2846 | * retransmitted data is acknowledged. It tries to continue |
| 2786 | * resending the rest of the retransmit queue, until either | 2847 | * resending the rest of the retransmit queue, until either |
| @@ -2795,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 2795 | struct tcp_sock *tp = tcp_sk(sk); | 2856 | struct tcp_sock *tp = tcp_sk(sk); |
| 2796 | struct sk_buff *skb; | 2857 | struct sk_buff *skb; |
| 2797 | struct sk_buff *hole = NULL; | 2858 | struct sk_buff *hole = NULL; |
| 2798 | u32 max_segs, last_lost; | 2859 | u32 max_segs; |
| 2799 | int mib_idx; | 2860 | int mib_idx; |
| 2800 | int fwd_rexmitting = 0; | ||
| 2801 | 2861 | ||
| 2802 | if (!tp->packets_out) | 2862 | if (!tp->packets_out) |
| 2803 | return; | 2863 | return; |
| 2804 | 2864 | ||
| 2805 | if (!tp->lost_out) | ||
| 2806 | tp->retransmit_high = tp->snd_una; | ||
| 2807 | |||
| 2808 | if (tp->retransmit_skb_hint) { | 2865 | if (tp->retransmit_skb_hint) { |
| 2809 | skb = tp->retransmit_skb_hint; | 2866 | skb = tp->retransmit_skb_hint; |
| 2810 | last_lost = TCP_SKB_CB(skb)->end_seq; | ||
| 2811 | if (after(last_lost, tp->retransmit_high)) | ||
| 2812 | last_lost = tp->retransmit_high; | ||
| 2813 | } else { | 2867 | } else { |
| 2814 | skb = tcp_write_queue_head(sk); | 2868 | skb = tcp_write_queue_head(sk); |
| 2815 | last_lost = tp->snd_una; | ||
| 2816 | } | 2869 | } |
| 2817 | 2870 | ||
| 2818 | max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); | 2871 | max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); |
| @@ -2835,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 2835 | */ | 2888 | */ |
| 2836 | segs = min_t(int, segs, max_segs); | 2889 | segs = min_t(int, segs, max_segs); |
| 2837 | 2890 | ||
| 2838 | if (fwd_rexmitting) { | 2891 | if (tp->retrans_out >= tp->lost_out) { |
| 2839 | begin_fwd: | 2892 | break; |
| 2840 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | ||
| 2841 | break; | ||
| 2842 | mib_idx = LINUX_MIB_TCPFORWARDRETRANS; | ||
| 2843 | |||
| 2844 | } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { | ||
| 2845 | tp->retransmit_high = last_lost; | ||
| 2846 | if (!tcp_can_forward_retransmit(sk)) | ||
| 2847 | break; | ||
| 2848 | /* Backtrack if necessary to non-L'ed skb */ | ||
| 2849 | if (hole) { | ||
| 2850 | skb = hole; | ||
| 2851 | hole = NULL; | ||
| 2852 | } | ||
| 2853 | fwd_rexmitting = 1; | ||
| 2854 | goto begin_fwd; | ||
| 2855 | |||
| 2856 | } else if (!(sacked & TCPCB_LOST)) { | 2893 | } else if (!(sacked & TCPCB_LOST)) { |
| 2857 | if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) | 2894 | if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) |
| 2858 | hole = skb; | 2895 | hole = skb; |
| 2859 | continue; | 2896 | continue; |
| 2860 | 2897 | ||
| 2861 | } else { | 2898 | } else { |
| 2862 | last_lost = TCP_SKB_CB(skb)->end_seq; | ||
| 2863 | if (icsk->icsk_ca_state != TCP_CA_Loss) | 2899 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
| 2864 | mib_idx = LINUX_MIB_TCPFASTRETRANS; | 2900 | mib_idx = LINUX_MIB_TCPFASTRETRANS; |
| 2865 | else | 2901 | else |
| @@ -2880,7 +2916,8 @@ begin_fwd: | |||
| 2880 | if (tcp_in_cwnd_reduction(sk)) | 2916 | if (tcp_in_cwnd_reduction(sk)) |
| 2881 | tp->prr_out += tcp_skb_pcount(skb); | 2917 | tp->prr_out += tcp_skb_pcount(skb); |
| 2882 | 2918 | ||
| 2883 | if (skb == tcp_write_queue_head(sk)) | 2919 | if (skb == tcp_write_queue_head(sk) && |
| 2920 | icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) | ||
| 2884 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2921 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 2885 | inet_csk(sk)->icsk_rto, | 2922 | inet_csk(sk)->icsk_rto, |
| 2886 | TCP_RTO_MAX); | 2923 | TCP_RTO_MAX); |
| @@ -3037,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, | |||
| 3037 | struct sk_buff *skb; | 3074 | struct sk_buff *skb; |
| 3038 | int tcp_header_size; | 3075 | int tcp_header_size; |
| 3039 | struct tcphdr *th; | 3076 | struct tcphdr *th; |
| 3040 | u16 user_mss; | ||
| 3041 | int mss; | 3077 | int mss; |
| 3042 | 3078 | ||
| 3043 | skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); | 3079 | skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
| @@ -3067,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, | |||
| 3067 | } | 3103 | } |
| 3068 | skb_dst_set(skb, dst); | 3104 | skb_dst_set(skb, dst); |
| 3069 | 3105 | ||
| 3070 | mss = dst_metric_advmss(dst); | 3106 | mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); |
| 3071 | user_mss = READ_ONCE(tp->rx_opt.user_mss); | ||
| 3072 | if (user_mss && user_mss < mss) | ||
| 3073 | mss = user_mss; | ||
| 3074 | 3107 | ||
| 3075 | memset(&opts, 0, sizeof(opts)); | 3108 | memset(&opts, 0, sizeof(opts)); |
| 3076 | #ifdef CONFIG_SYN_COOKIES | 3109 | #ifdef CONFIG_SYN_COOKIES |
| @@ -3123,7 +3156,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, | |||
| 3123 | #endif | 3156 | #endif |
| 3124 | 3157 | ||
| 3125 | /* Do not fool tcpdump (if any), clean our debris */ | 3158 | /* Do not fool tcpdump (if any), clean our debris */ |
| 3126 | skb->tstamp.tv64 = 0; | 3159 | skb->tstamp = 0; |
| 3127 | return skb; | 3160 | return skb; |
| 3128 | } | 3161 | } |
| 3129 | EXPORT_SYMBOL(tcp_make_synack); | 3162 | EXPORT_SYMBOL(tcp_make_synack); |
| @@ -3176,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk) | |||
| 3176 | 3209 | ||
| 3177 | if (!tp->window_clamp) | 3210 | if (!tp->window_clamp) |
| 3178 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 3211 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
| 3179 | tp->advmss = dst_metric_advmss(dst); | 3212 | tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); |
| 3180 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) | ||
| 3181 | tp->advmss = tp->rx_opt.user_mss; | ||
| 3182 | 3213 | ||
| 3183 | tcp_initialize_rcv_mss(sk); | 3214 | tcp_initialize_rcv_mss(sk); |
| 3184 | 3215 | ||
| @@ -3244,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
| 3244 | { | 3275 | { |
| 3245 | struct tcp_sock *tp = tcp_sk(sk); | 3276 | struct tcp_sock *tp = tcp_sk(sk); |
| 3246 | struct tcp_fastopen_request *fo = tp->fastopen_req; | 3277 | struct tcp_fastopen_request *fo = tp->fastopen_req; |
| 3247 | int syn_loss = 0, space, err = 0; | 3278 | int space, err = 0; |
| 3248 | unsigned long last_syn_loss = 0; | ||
| 3249 | struct sk_buff *syn_data; | 3279 | struct sk_buff *syn_data; |
| 3250 | 3280 | ||
| 3251 | tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ | 3281 | tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ |
| 3252 | tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, | 3282 | if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) |
| 3253 | &syn_loss, &last_syn_loss); | ||
| 3254 | /* Recurring FO SYN losses: revert to regular handshake temporarily */ | ||
| 3255 | if (syn_loss > 1 && | ||
| 3256 | time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { | ||
| 3257 | fo->cookie.len = -1; | ||
| 3258 | goto fallback; | ||
| 3259 | } | ||
| 3260 | |||
| 3261 | if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) | ||
| 3262 | fo->cookie.len = -1; | ||
| 3263 | else if (fo->cookie.len <= 0) | ||
| 3264 | goto fallback; | 3283 | goto fallback; |
| 3265 | 3284 | ||
| 3266 | /* MSS for SYN-data is based on cached MSS and bounded by PMTU and | 3285 | /* MSS for SYN-data is based on cached MSS and bounded by PMTU and |
| 3267 | * user-MSS. Reserve maximum option space for middleboxes that add | 3286 | * user-MSS. Reserve maximum option space for middleboxes that add |
| 3268 | * private TCP options. The cost is reduced data space in SYN :( | 3287 | * private TCP options. The cost is reduced data space in SYN :( |
| 3269 | */ | 3288 | */ |
| 3270 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) | 3289 | tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); |
| 3271 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | 3290 | |
| 3272 | space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - | 3291 | space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - |
| 3273 | MAX_TCP_OPTION_SPACE; | 3292 | MAX_TCP_OPTION_SPACE; |
| 3274 | 3293 | ||
| @@ -3300,6 +3319,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
| 3300 | fo->copied = space; | 3319 | fo->copied = space; |
| 3301 | 3320 | ||
| 3302 | tcp_connect_queue_skb(sk, syn_data); | 3321 | tcp_connect_queue_skb(sk, syn_data); |
| 3322 | if (syn_data->len) | ||
| 3323 | tcp_chrono_start(sk, TCP_CHRONO_BUSY); | ||
| 3303 | 3324 | ||
| 3304 | err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); | 3325 | err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); |
| 3305 | 3326 | ||
| @@ -3464,8 +3485,6 @@ void tcp_send_ack(struct sock *sk) | |||
| 3464 | /* We do not want pure acks influencing TCP Small Queues or fq/pacing | 3485 | /* We do not want pure acks influencing TCP Small Queues or fq/pacing |
| 3465 | * too much. | 3486 | * too much. |
| 3466 | * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 | 3487 | * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 |
| 3467 | * We also avoid tcp_wfree() overhead (cache line miss accessing | ||
| 3468 | * tp->tsq_flags) by using regular sock_wfree() | ||
| 3469 | */ | 3488 | */ |
| 3470 | skb_set_tcp_pure_ack(buff); | 3489 | skb_set_tcp_pure_ack(buff); |
| 3471 | 3490 | ||
