Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 404
1 file changed, 198 insertions(+), 206 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a906e0200ff2..a12b455928e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -68,12 +68,12 @@ | |||
68 | #include <linux/module.h> | 68 | #include <linux/module.h> |
69 | #include <linux/sysctl.h> | 69 | #include <linux/sysctl.h> |
70 | #include <linux/kernel.h> | 70 | #include <linux/kernel.h> |
71 | #include <linux/prefetch.h> | ||
71 | #include <net/dst.h> | 72 | #include <net/dst.h> |
72 | #include <net/tcp.h> | 73 | #include <net/tcp.h> |
73 | #include <net/inet_common.h> | 74 | #include <net/inet_common.h> |
74 | #include <linux/ipsec.h> | 75 | #include <linux/ipsec.h> |
75 | #include <asm/unaligned.h> | 76 | #include <asm/unaligned.h> |
76 | #include <net/netdma.h> | ||
77 | #include <linux/errqueue.h> | 77 | #include <linux/errqueue.h> |
78 | 78 | ||
79 | int sysctl_tcp_timestamps __read_mostly = 1; | 79 | int sysctl_tcp_timestamps __read_mostly = 1; |
@@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) | |||
201 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | 201 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; |
202 | } | 202 | } |
203 | 203 | ||
204 | static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) | 204 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
205 | { | 205 | { |
206 | if (tp->ecn_flags & TCP_ECN_OK) | 206 | if (tp->ecn_flags & TCP_ECN_OK) |
207 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; | 207 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; |
208 | } | 208 | } |
209 | 209 | ||
210 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) | 210 | static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) |
211 | { | 211 | { |
212 | if (tcp_hdr(skb)->cwr) | 212 | if (tcp_hdr(skb)->cwr) |
213 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 213 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
214 | } | 214 | } |
215 | 215 | ||
216 | static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) | 216 | static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) |
217 | { | 217 | { |
218 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 218 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
219 | } | 219 | } |
220 | 220 | ||
221 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) | 221 | static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) |
222 | { | 222 | { |
223 | if (!(tp->ecn_flags & TCP_ECN_OK)) | ||
224 | return; | ||
225 | |||
226 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { | 223 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { |
227 | case INET_ECN_NOT_ECT: | 224 | case INET_ECN_NOT_ECT: |
228 | /* Funny extension: if ECT is not set on a segment, | 225 | /* Funny extension: if ECT is not set on a segment, |
@@ -233,30 +230,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s | |||
233 | tcp_enter_quickack_mode((struct sock *)tp); | 230 | tcp_enter_quickack_mode((struct sock *)tp); |
234 | break; | 231 | break; |
235 | case INET_ECN_CE: | 232 | case INET_ECN_CE: |
233 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
234 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); | ||
235 | |||
236 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | 236 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { |
237 | /* Better not delay acks, sender can have a very low cwnd */ | 237 | /* Better not delay acks, sender can have a very low cwnd */ |
238 | tcp_enter_quickack_mode((struct sock *)tp); | 238 | tcp_enter_quickack_mode((struct sock *)tp); |
239 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | 239 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
240 | } | 240 | } |
241 | /* fallinto */ | 241 | tp->ecn_flags |= TCP_ECN_SEEN; |
242 | break; | ||
242 | default: | 243 | default: |
244 | if (tcp_ca_needs_ecn((struct sock *)tp)) | ||
245 | tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); | ||
243 | tp->ecn_flags |= TCP_ECN_SEEN; | 246 | tp->ecn_flags |= TCP_ECN_SEEN; |
247 | break; | ||
244 | } | 248 | } |
245 | } | 249 | } |
246 | 250 | ||
247 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | 251 | static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) |
252 | { | ||
253 | if (tp->ecn_flags & TCP_ECN_OK) | ||
254 | __tcp_ecn_check_ce(tp, skb); | ||
255 | } | ||
256 | |||
257 | static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | ||
248 | { | 258 | { |
249 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) | 259 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) |
250 | tp->ecn_flags &= ~TCP_ECN_OK; | 260 | tp->ecn_flags &= ~TCP_ECN_OK; |
251 | } | 261 | } |
252 | 262 | ||
253 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) | 263 | static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) |
254 | { | 264 | { |
255 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) | 265 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) |
256 | tp->ecn_flags &= ~TCP_ECN_OK; | 266 | tp->ecn_flags &= ~TCP_ECN_OK; |
257 | } | 267 | } |
258 | 268 | ||
259 | static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) | 269 | static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) |
260 | { | 270 | { |
261 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) | 271 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) |
262 | return true; | 272 | return true; |
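
The hunks above lower-case the old TCP_ECN_* inline helpers to tcp_ecn_*, split the CE check into a tcp_ecn_check_ce() wrapper that tests TCP_ECN_OK plus an unconditional __tcp_ecn_check_ce(), and, for congestion controls that ask for ECN via tcp_ca_needs_ecn() (DCTCP is the intended user), raise CA_EVENT_ECN_IS_CE on CE-marked segments and CA_EVENT_ECN_NO_CE on other ECT segments. The old fall-through (the misspelled /* fallinto */ comment) becomes an explicit break, with both branches setting TCP_ECN_SEEN. A minimal userspace sketch of the wrapper-plus-worker shape; the flag values and struct are simplified stand-ins, not the kernel definitions, and the quickack / CA-event side effects are omitted:

    #include <stdio.h>

    /* Simplified stand-ins for the kernel's ecn_flags bits. */
    #define TCP_ECN_OK         1
    #define TCP_ECN_DEMAND_CWR 2
    #define TCP_ECN_SEEN       4

    struct toy_tp { unsigned int ecn_flags; };

    /* Worker: assumes the caller already verified ECN was negotiated. */
    static void __toy_ecn_check_ce(struct toy_tp *tp, int ce_marked)
    {
        if (ce_marked)
            tp->ecn_flags |= TCP_ECN_DEMAND_CWR; /* echo ECE until peer sends CWR */
        tp->ecn_flags |= TCP_ECN_SEEN;           /* both branches record ECT traffic */
    }

    /* Wrapper: the cheap TCP_ECN_OK test stays at every call site. */
    static void toy_ecn_check_ce(struct toy_tp *tp, int ce_marked)
    {
        if (tp->ecn_flags & TCP_ECN_OK)
            __toy_ecn_check_ce(tp, ce_marked);
    }

    int main(void)
    {
        struct toy_tp tp = { .ecn_flags = TCP_ECN_OK };

        toy_ecn_check_ce(&tp, 1); /* CE-marked segment */
        printf("demand_cwr=%d\n", !!(tp.ecn_flags & TCP_ECN_DEMAND_CWR)); /* 1 */
        return 0;
    }
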
@@ -653,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) | |||
653 | } | 663 | } |
654 | icsk->icsk_ack.lrcvtime = now; | 664 | icsk->icsk_ack.lrcvtime = now; |
655 | 665 | ||
656 | TCP_ECN_check_ce(tp, skb); | 666 | tcp_ecn_check_ce(tp, skb); |
657 | 667 | ||
658 | if (skb->len >= 128) | 668 | if (skb->len >= 128) |
659 | tcp_grow_window(sk, skb); | 669 | tcp_grow_window(sk, skb); |
@@ -1295,9 +1305,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1295 | TCP_SKB_CB(prev)->end_seq += shifted; | 1305 | TCP_SKB_CB(prev)->end_seq += shifted; |
1296 | TCP_SKB_CB(skb)->seq += shifted; | 1306 | TCP_SKB_CB(skb)->seq += shifted; |
1297 | 1307 | ||
1298 | skb_shinfo(prev)->gso_segs += pcount; | 1308 | tcp_skb_pcount_add(prev, pcount); |
1299 | BUG_ON(skb_shinfo(skb)->gso_segs < pcount); | 1309 | BUG_ON(tcp_skb_pcount(skb) < pcount); |
1300 | skb_shinfo(skb)->gso_segs -= pcount; | 1310 | tcp_skb_pcount_add(skb, -pcount); |
1301 | 1311 | ||
1302 | /* When we're adding to gso_segs == 1, gso_size will be zero, | 1312 | /* When we're adding to gso_segs == 1, gso_size will be zero, |
1303 | * in theory this shouldn't be necessary but as long as DSACK | 1313 | * in theory this shouldn't be necessary but as long as DSACK |
@@ -1310,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1310 | } | 1320 | } |
1311 | 1321 | ||
1312 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ | 1322 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ |
1313 | if (skb_shinfo(skb)->gso_segs <= 1) { | 1323 | if (tcp_skb_pcount(skb) <= 1) { |
1314 | skb_shinfo(skb)->gso_size = 0; | 1324 | skb_shinfo(skb)->gso_size = 0; |
1315 | skb_shinfo(skb)->gso_type = 0; | 1325 | skb_shinfo(skb)->gso_type = 0; |
1316 | } | 1326 | } |
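
This hunk routes the segment-count bookkeeping through tcp_skb_pcount() and the new tcp_skb_pcount_add() (both in include/net/tcp.h, not in this file) instead of poking skb_shinfo(skb)->gso_segs directly, presumably so the count's storage can change later without touching these call sites. A toy sketch of what the accessor pair amounts to, with the skb reduced to a plain struct:

    #include <assert.h>

    /* Toy stand-in for the skb/gso bookkeeping; not the kernel structures. */
    struct toy_skb { int gso_segs; };

    static int toy_skb_pcount(const struct toy_skb *skb)
    {
        return skb->gso_segs;
    }

    static void toy_skb_pcount_add(struct toy_skb *skb, int segs)
    {
        skb->gso_segs += segs;
    }

    int main(void)
    {
        struct toy_skb prev = { .gso_segs = 3 }, skb = { .gso_segs = 5 };
        int pcount = 2;

        /* Mirrors the shifted-skb accounting above: move pcount segments. */
        toy_skb_pcount_add(&prev, pcount);
        assert(toy_skb_pcount(&skb) >= pcount);
        toy_skb_pcount_add(&skb, -pcount);
        assert(toy_skb_pcount(&prev) == 5 && toy_skb_pcount(&skb) == 3);
        return 0;
    }
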
@@ -1888,21 +1898,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) | |||
1888 | tp->sacked_out = 0; | 1898 | tp->sacked_out = 0; |
1889 | } | 1899 | } |
1890 | 1900 | ||
1891 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) | 1901 | void tcp_clear_retrans(struct tcp_sock *tp) |
1892 | { | 1902 | { |
1893 | tp->retrans_out = 0; | 1903 | tp->retrans_out = 0; |
1894 | tp->lost_out = 0; | 1904 | tp->lost_out = 0; |
1895 | |||
1896 | tp->undo_marker = 0; | 1905 | tp->undo_marker = 0; |
1897 | tp->undo_retrans = -1; | 1906 | tp->undo_retrans = -1; |
1907 | tp->fackets_out = 0; | ||
1908 | tp->sacked_out = 0; | ||
1898 | } | 1909 | } |
1899 | 1910 | ||
1900 | void tcp_clear_retrans(struct tcp_sock *tp) | 1911 | static inline void tcp_init_undo(struct tcp_sock *tp) |
1901 | { | 1912 | { |
1902 | tcp_clear_retrans_partial(tp); | 1913 | tp->undo_marker = tp->snd_una; |
1903 | 1914 | /* Retransmission still in flight may cause DSACKs later. */ | |
1904 | tp->fackets_out = 0; | 1915 | tp->undo_retrans = tp->retrans_out ? : -1; |
1905 | tp->sacked_out = 0; | ||
1906 | } | 1916 | } |
1907 | 1917 | ||
1908 | /* Enter Loss state. If we detect SACK reneging, forget all SACK information | 1918 | /* Enter Loss state. If we detect SACK reneging, forget all SACK information |
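
tcp_clear_retrans_partial() is folded into tcp_clear_retrans(), and the undo bookkeeping that tcp_enter_loss() and tcp_enter_recovery() used to duplicate becomes tcp_init_undo(). Its tp->retrans_out ? : -1 relies on the GNU C conditional with an omitted middle operand: the expression is retrans_out when that is non-zero, else -1, with retrans_out evaluated only once. A short illustration (gcc/clang extension, mirroring the kernel's u32/int types):

    #include <stdio.h>

    int main(void)
    {
        unsigned int retrans_out = 0;
        int undo_retrans;

        /* GNU extension: "a ?: b" is "a ? a : b" with a evaluated once. */
        undo_retrans = retrans_out ? : -1;
        printf("%d\n", undo_retrans);   /* -1: nothing in flight, no DSACKs expected */

        retrans_out = 4;
        undo_retrans = retrans_out ? : -1;
        printf("%d\n", undo_retrans);   /* 4 */
        return 0;
    }
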
@@ -1925,18 +1935,18 @@ void tcp_enter_loss(struct sock *sk) | |||
1925 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | 1935 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
1926 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | 1936 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
1927 | tcp_ca_event(sk, CA_EVENT_LOSS); | 1937 | tcp_ca_event(sk, CA_EVENT_LOSS); |
1938 | tcp_init_undo(tp); | ||
1928 | } | 1939 | } |
1929 | tp->snd_cwnd = 1; | 1940 | tp->snd_cwnd = 1; |
1930 | tp->snd_cwnd_cnt = 0; | 1941 | tp->snd_cwnd_cnt = 0; |
1931 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1942 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1932 | 1943 | ||
1933 | tcp_clear_retrans_partial(tp); | 1944 | tp->retrans_out = 0; |
1945 | tp->lost_out = 0; | ||
1934 | 1946 | ||
1935 | if (tcp_is_reno(tp)) | 1947 | if (tcp_is_reno(tp)) |
1936 | tcp_reset_reno_sack(tp); | 1948 | tcp_reset_reno_sack(tp); |
1937 | 1949 | ||
1938 | tp->undo_marker = tp->snd_una; | ||
1939 | |||
1940 | skb = tcp_write_queue_head(sk); | 1950 | skb = tcp_write_queue_head(sk); |
1941 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); | 1951 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); |
1942 | if (is_reneg) { | 1952 | if (is_reneg) { |
@@ -1950,9 +1960,6 @@ void tcp_enter_loss(struct sock *sk) | |||
1950 | if (skb == tcp_send_head(sk)) | 1960 | if (skb == tcp_send_head(sk)) |
1951 | break; | 1961 | break; |
1952 | 1962 | ||
1953 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) | ||
1954 | tp->undo_marker = 0; | ||
1955 | |||
1956 | TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; | 1963 | TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; |
1957 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { | 1964 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { |
1958 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; | 1965 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; |
@@ -1972,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk) | |||
1972 | sysctl_tcp_reordering); | 1979 | sysctl_tcp_reordering); |
1973 | tcp_set_ca_state(sk, TCP_CA_Loss); | 1980 | tcp_set_ca_state(sk, TCP_CA_Loss); |
1974 | tp->high_seq = tp->snd_nxt; | 1981 | tp->high_seq = tp->snd_nxt; |
1975 | TCP_ECN_queue_cwr(tp); | 1982 | tcp_ecn_queue_cwr(tp); |
1976 | 1983 | ||
1977 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous | 1984 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous |
1978 | * loss recovery is underway except recurring timeout(s) on | 1985 | * loss recovery is underway except recurring timeout(s) on |
@@ -2364,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) | |||
2364 | 2371 | ||
2365 | if (tp->prior_ssthresh > tp->snd_ssthresh) { | 2372 | if (tp->prior_ssthresh > tp->snd_ssthresh) { |
2366 | tp->snd_ssthresh = tp->prior_ssthresh; | 2373 | tp->snd_ssthresh = tp->prior_ssthresh; |
2367 | TCP_ECN_withdraw_cwr(tp); | 2374 | tcp_ecn_withdraw_cwr(tp); |
2368 | } | 2375 | } |
2369 | } else { | 2376 | } else { |
2370 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); | 2377 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); |
@@ -2494,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) | |||
2494 | tp->prr_delivered = 0; | 2501 | tp->prr_delivered = 0; |
2495 | tp->prr_out = 0; | 2502 | tp->prr_out = 0; |
2496 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | 2503 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); |
2497 | TCP_ECN_queue_cwr(tp); | 2504 | tcp_ecn_queue_cwr(tp); |
2498 | } | 2505 | } |
2499 | 2506 | ||
2500 | static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, | 2507 | static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, |
@@ -2671,8 +2678,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2671 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 2678 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
2672 | 2679 | ||
2673 | tp->prior_ssthresh = 0; | 2680 | tp->prior_ssthresh = 0; |
2674 | tp->undo_marker = tp->snd_una; | 2681 | tcp_init_undo(tp); |
2675 | tp->undo_retrans = tp->retrans_out ? : -1; | ||
2676 | 2682 | ||
2677 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | 2683 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { |
2678 | if (!ece_ack) | 2684 | if (!ece_ack) |
@@ -2971,7 +2977,8 @@ void tcp_rearm_rto(struct sock *sk) | |||
2971 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | 2977 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
2972 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | 2978 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { |
2973 | struct sk_buff *skb = tcp_write_queue_head(sk); | 2979 | struct sk_buff *skb = tcp_write_queue_head(sk); |
2974 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | 2980 | const u32 rto_time_stamp = |
2981 | tcp_skb_timestamp(skb) + rto; | ||
2975 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | 2982 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); |
2976 | /* delta may not be positive if the socket is locked | 2983 | /* delta may not be positive if the socket is locked |
2977 | * when the retrans timer fires and is rescheduled. | 2984 | * when the retrans timer fires and is rescheduled. |
@@ -3023,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) | |||
3023 | return packets_acked; | 3030 | return packets_acked; |
3024 | } | 3031 | } |
3025 | 3032 | ||
3033 | static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, | ||
3034 | u32 prior_snd_una) | ||
3035 | { | ||
3036 | const struct skb_shared_info *shinfo; | ||
3037 | |||
3038 | /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ | ||
3039 | if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) | ||
3040 | return; | ||
3041 | |||
3042 | shinfo = skb_shinfo(skb); | ||
3043 | if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && | ||
3044 | between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) | ||
3045 | __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); | ||
3046 | } | ||
3047 | |||
3026 | /* Remove acknowledged frames from the retransmission queue. If our packet | 3048 | /* Remove acknowledged frames from the retransmission queue. If our packet |
3027 | * is before the ack sequence we can discard it as it's confirmed to have | 3049 | * is before the ack sequence we can discard it as it's confirmed to have |
3028 | * arrived at the other end. | 3050 | * arrived at the other end. |
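
tcp_ack_tstamp() hoists the TX ACK timestamp work out of the per-skb loop in tcp_clean_rtx_queue() and gates it on sk->sk_tsflags first, so the common case never touches skb_shinfo(). The between(tskey, prior_snd_una, snd_una - 1) test is the usual wraparound-safe range check on 32-bit sequence numbers; a userspace sketch modelled on the kernel's between() helper:

    #include <stdint.h>
    #include <stdio.h>

    /* Wraparound-safe "seq2 <= seq1 <= seq3" on 32-bit sequence space,
     * modelled on the kernel helper of the same name. */
    static int between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
    {
        return seq3 - seq2 >= seq1 - seq2;
    }

    int main(void)
    {
        uint32_t prior_snd_una = 0xfffffff0u;  /* just before wraparound */
        uint32_t snd_una       = 0x00000010u;  /* after wraparound */
        uint32_t tskey         = 0x00000004u;  /* timestamped byte in range */

        printf("%d\n", between(tskey,  prior_snd_una, snd_una - 1)); /* 1 */
        printf("%d\n", between(0x100u, prior_snd_una, snd_una - 1)); /* 0 */
        return 0;
    }
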
@@ -3046,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3046 | first_ackt.v64 = 0; | 3068 | first_ackt.v64 = 0; |
3047 | 3069 | ||
3048 | while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { | 3070 | while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { |
3049 | struct skb_shared_info *shinfo = skb_shinfo(skb); | ||
3050 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); | 3071 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); |
3051 | u8 sacked = scb->sacked; | 3072 | u8 sacked = scb->sacked; |
3052 | u32 acked_pcount; | 3073 | u32 acked_pcount; |
3053 | 3074 | ||
3054 | if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && | 3075 | tcp_ack_tstamp(sk, skb, prior_snd_una); |
3055 | between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) | ||
3056 | __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); | ||
3057 | 3076 | ||
3058 | /* Determine how many packets and what bytes were acked, tso and else */ | 3077 | /* Determine how many packets and what bytes were acked, tso and else */ |
3059 | if (after(scb->end_seq, tp->snd_una)) { | 3078 | if (after(scb->end_seq, tp->snd_una)) { |
@@ -3067,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3067 | 3086 | ||
3068 | fully_acked = false; | 3087 | fully_acked = false; |
3069 | } else { | 3088 | } else { |
3089 | /* Speedup tcp_unlink_write_queue() and next loop */ | ||
3090 | prefetchw(skb->next); | ||
3070 | acked_pcount = tcp_skb_pcount(skb); | 3091 | acked_pcount = tcp_skb_pcount(skb); |
3071 | } | 3092 | } |
3072 | 3093 | ||
3073 | if (sacked & TCPCB_RETRANS) { | 3094 | if (unlikely(sacked & TCPCB_RETRANS)) { |
3074 | if (sacked & TCPCB_SACKED_RETRANS) | 3095 | if (sacked & TCPCB_SACKED_RETRANS) |
3075 | tp->retrans_out -= acked_pcount; | 3096 | tp->retrans_out -= acked_pcount; |
3076 | flag |= FLAG_RETRANS_DATA_ACKED; | 3097 | flag |= FLAG_RETRANS_DATA_ACKED; |
@@ -3101,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3101 | * connection startup slow start one packet too | 3122 | * connection startup slow start one packet too |
3102 | * quickly. This is severely frowned upon behavior. | 3123 | * quickly. This is severely frowned upon behavior. |
3103 | */ | 3124 | */ |
3104 | if (!(scb->tcp_flags & TCPHDR_SYN)) { | 3125 | if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { |
3105 | flag |= FLAG_DATA_ACKED; | 3126 | flag |= FLAG_DATA_ACKED; |
3106 | } else { | 3127 | } else { |
3107 | flag |= FLAG_SYN_ACKED; | 3128 | flag |= FLAG_SYN_ACKED; |
@@ -3113,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3113 | 3134 | ||
3114 | tcp_unlink_write_queue(skb, sk); | 3135 | tcp_unlink_write_queue(skb, sk); |
3115 | sk_wmem_free_skb(sk, skb); | 3136 | sk_wmem_free_skb(sk, skb); |
3116 | if (skb == tp->retransmit_skb_hint) | 3137 | if (unlikely(skb == tp->retransmit_skb_hint)) |
3117 | tp->retransmit_skb_hint = NULL; | 3138 | tp->retransmit_skb_hint = NULL; |
3118 | if (skb == tp->lost_skb_hint) | 3139 | if (unlikely(skb == tp->lost_skb_hint)) |
3119 | tp->lost_skb_hint = NULL; | 3140 | tp->lost_skb_hint = NULL; |
3120 | } | 3141 | } |
3121 | 3142 | ||
@@ -3126,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3126 | flag |= FLAG_SACK_RENEGING; | 3147 | flag |= FLAG_SACK_RENEGING; |
3127 | 3148 | ||
3128 | skb_mstamp_get(&now); | 3149 | skb_mstamp_get(&now); |
3129 | if (first_ackt.v64) { | 3150 | if (likely(first_ackt.v64)) { |
3130 | seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); | 3151 | seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); |
3131 | ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); | 3152 | ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); |
3132 | } | 3153 | } |
@@ -3211,9 +3232,10 @@ static void tcp_ack_probe(struct sock *sk) | |||
3211 | * This function is not for random using! | 3232 | * This function is not for random using! |
3212 | */ | 3233 | */ |
3213 | } else { | 3234 | } else { |
3235 | unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); | ||
3236 | |||
3214 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | 3237 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
3215 | min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), | 3238 | when, TCP_RTO_MAX); |
3216 | TCP_RTO_MAX); | ||
3217 | } | 3239 | } |
3218 | } | 3240 | } |
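
tcp_ack_probe() now takes the zero-window probe timeout from inet_csk_rto_backoff() instead of open-coding min(icsk_rto << icsk_backoff, TCP_RTO_MAX); the helper also widens the shift to 64 bits before clamping, so a large backoff cannot wrap. A standalone sketch of that calculation (the HZ value below is an illustrative assumption):

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the clamped exponential backoff used for the zero-window
     * probe timer: rto << backoff, computed in 64 bits, capped at max. */
    static unsigned long rto_backoff(unsigned long rto, unsigned int backoff,
                                     unsigned long max_when)
    {
        uint64_t when = (uint64_t)rto << backoff;

        return when > max_when ? max_when : (unsigned long)when;
    }

    int main(void)
    {
        unsigned long hz = 1000;                /* assume HZ=1000 jiffies/s */
        unsigned long tcp_rto_max = 120 * hz;   /* TCP_RTO_MAX is 120 s */

        printf("%lu\n", rto_backoff(hz / 5, 0,  tcp_rto_max)); /* 200 */
        printf("%lu\n", rto_backoff(hz / 5, 12, tcp_rto_max)); /* clamped to 120000 */
        return 0;
    }
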
3219 | 3241 | ||
@@ -3364,6 +3386,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | |||
3364 | } | 3386 | } |
3365 | } | 3387 | } |
3366 | 3388 | ||
3389 | static inline void tcp_in_ack_event(struct sock *sk, u32 flags) | ||
3390 | { | ||
3391 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3392 | |||
3393 | if (icsk->icsk_ca_ops->in_ack_event) | ||
3394 | icsk->icsk_ca_ops->in_ack_event(sk, flags); | ||
3395 | } | ||
3396 | |||
3367 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3397 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3368 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3398 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
3369 | { | 3399 | { |
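
tcp_in_ack_event() is a thin dispatcher to an optional in_ack_event() hook on the congestion control ops; the later hunks in tcp_ack() build an ack_ev_flags word out of CA_ACK_SLOWPATH, CA_ACK_WIN_UPDATE and CA_ACK_ECE and pass every incoming ACK through it, replacing the old CA_EVENT_FAST_ACK / CA_EVENT_SLOW_ACK calls. The sketch below models the optional-callback pattern only; the ops struct and flag values are simplified stand-ins for the kernel's tcp_congestion_ops:

    #include <stdio.h>

    /* Simplified stand-ins for the ack event flags. */
    #define CA_ACK_SLOWPATH   0x1
    #define CA_ACK_WIN_UPDATE 0x2
    #define CA_ACK_ECE        0x4

    struct toy_ca_ops {
        const char *name;
        /* Optional: congestion controls that do not care leave it NULL. */
        void (*in_ack_event)(unsigned int flags);
    };

    static void dctcp_like_in_ack_event(unsigned int flags)
    {
        if (flags & CA_ACK_ECE)
            printf("ECE seen: update ECN byte counters\n");
    }

    static void toy_in_ack_event(const struct toy_ca_ops *ops, unsigned int flags)
    {
        if (ops->in_ack_event)
            ops->in_ack_event(flags);
    }

    int main(void)
    {
        struct toy_ca_ops reno_like  = { .name = "reno",  .in_ack_event = NULL };
        struct toy_ca_ops dctcp_like = { .name = "dctcp",
                                         .in_ack_event = dctcp_like_in_ack_event };

        toy_in_ack_event(&reno_like,  CA_ACK_WIN_UPDATE);             /* no-op */
        toy_in_ack_event(&dctcp_like, CA_ACK_SLOWPATH | CA_ACK_ECE);  /* prints */
        return 0;
    }
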
@@ -3379,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3379 | int acked = 0; /* Number of packets newly acked */ | 3409 | int acked = 0; /* Number of packets newly acked */ |
3380 | long sack_rtt_us = -1L; | 3410 | long sack_rtt_us = -1L; |
3381 | 3411 | ||
3412 | /* We very likely will need to access write queue head. */ | ||
3413 | prefetchw(sk->sk_write_queue.next); | ||
3414 | |||
3382 | /* If the ack is older than previous acks | 3415 | /* If the ack is older than previous acks |
3383 | * then we can probably ignore it. | 3416 | * then we can probably ignore it. |
3384 | */ | 3417 | */ |
@@ -3423,10 +3456,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3423 | tp->snd_una = ack; | 3456 | tp->snd_una = ack; |
3424 | flag |= FLAG_WIN_UPDATE; | 3457 | flag |= FLAG_WIN_UPDATE; |
3425 | 3458 | ||
3426 | tcp_ca_event(sk, CA_EVENT_FAST_ACK); | 3459 | tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); |
3427 | 3460 | ||
3428 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); | 3461 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); |
3429 | } else { | 3462 | } else { |
3463 | u32 ack_ev_flags = CA_ACK_SLOWPATH; | ||
3464 | |||
3430 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 3465 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
3431 | flag |= FLAG_DATA; | 3466 | flag |= FLAG_DATA; |
3432 | else | 3467 | else |
@@ -3438,10 +3473,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3438 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3473 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3439 | &sack_rtt_us); | 3474 | &sack_rtt_us); |
3440 | 3475 | ||
3441 | if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) | 3476 | if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { |
3442 | flag |= FLAG_ECE; | 3477 | flag |= FLAG_ECE; |
3478 | ack_ev_flags |= CA_ACK_ECE; | ||
3479 | } | ||
3480 | |||
3481 | if (flag & FLAG_WIN_UPDATE) | ||
3482 | ack_ev_flags |= CA_ACK_WIN_UPDATE; | ||
3443 | 3483 | ||
3444 | tcp_ca_event(sk, CA_EVENT_SLOW_ACK); | 3484 | tcp_in_ack_event(sk, ack_ev_flags); |
3445 | } | 3485 | } |
3446 | 3486 | ||
3447 | /* We passed data and got it acked, remove any soft error | 3487 | /* We passed data and got it acked, remove any soft error |
@@ -4063,6 +4103,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) | |||
4063 | tp->rx_opt.num_sacks = num_sacks; | 4103 | tp->rx_opt.num_sacks = num_sacks; |
4064 | } | 4104 | } |
4065 | 4105 | ||
4106 | /** | ||
4107 | * tcp_try_coalesce - try to merge skb to prior one | ||
4108 | * @sk: socket | ||
4109 | * @to: prior buffer | ||
4110 | * @from: buffer to add in queue | ||
4111 | * @fragstolen: pointer to boolean | ||
4112 | * | ||
4113 | * Before queueing skb @from after @to, try to merge them | ||
4114 | * to reduce overall memory use and queue lengths, if cost is small. | ||
4115 | * Packets in ofo or receive queues can stay a long time. | ||
4116 | * Better try to coalesce them right now to avoid future collapses. | ||
4117 | * Returns true if caller should free @from instead of queueing it | ||
4118 | */ | ||
4119 | static bool tcp_try_coalesce(struct sock *sk, | ||
4120 | struct sk_buff *to, | ||
4121 | struct sk_buff *from, | ||
4122 | bool *fragstolen) | ||
4123 | { | ||
4124 | int delta; | ||
4125 | |||
4126 | *fragstolen = false; | ||
4127 | |||
4128 | /* Its possible this segment overlaps with prior segment in queue */ | ||
4129 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
4130 | return false; | ||
4131 | |||
4132 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
4133 | return false; | ||
4134 | |||
4135 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
4136 | sk_mem_charge(sk, delta); | ||
4137 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
4138 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
4139 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
4140 | TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; | ||
4141 | return true; | ||
4142 | } | ||
4143 | |||
4066 | /* This one checks to see if we can put data from the | 4144 | /* This one checks to see if we can put data from the |
4067 | * out_of_order queue into the receive_queue. | 4145 | * out_of_order queue into the receive_queue. |
4068 | */ | 4146 | */ |
@@ -4070,7 +4148,8 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4070 | { | 4148 | { |
4071 | struct tcp_sock *tp = tcp_sk(sk); | 4149 | struct tcp_sock *tp = tcp_sk(sk); |
4072 | __u32 dsack_high = tp->rcv_nxt; | 4150 | __u32 dsack_high = tp->rcv_nxt; |
4073 | struct sk_buff *skb; | 4151 | struct sk_buff *skb, *tail; |
4152 | bool fragstolen, eaten; | ||
4074 | 4153 | ||
4075 | while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { | 4154 | while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { |
4076 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) | 4155 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
@@ -4083,9 +4162,9 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4083 | tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); | 4162 | tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); |
4084 | } | 4163 | } |
4085 | 4164 | ||
4165 | __skb_unlink(skb, &tp->out_of_order_queue); | ||
4086 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { | 4166 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
4087 | SOCK_DEBUG(sk, "ofo packet was already received\n"); | 4167 | SOCK_DEBUG(sk, "ofo packet was already received\n"); |
4088 | __skb_unlink(skb, &tp->out_of_order_queue); | ||
4089 | __kfree_skb(skb); | 4168 | __kfree_skb(skb); |
4090 | continue; | 4169 | continue; |
4091 | } | 4170 | } |
@@ -4093,11 +4172,15 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4093 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, | 4172 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
4094 | TCP_SKB_CB(skb)->end_seq); | 4173 | TCP_SKB_CB(skb)->end_seq); |
4095 | 4174 | ||
4096 | __skb_unlink(skb, &tp->out_of_order_queue); | 4175 | tail = skb_peek_tail(&sk->sk_receive_queue); |
4097 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 4176 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
4098 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4177 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4099 | if (tcp_hdr(skb)->fin) | 4178 | if (!eaten) |
4179 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
4180 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) | ||
4100 | tcp_fin(sk); | 4181 | tcp_fin(sk); |
4182 | if (eaten) | ||
4183 | kfree_skb_partial(skb, fragstolen); | ||
4101 | } | 4184 | } |
4102 | } | 4185 | } |
4103 | 4186 | ||
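
With tcp_try_coalesce() moved above tcp_ofo_queue() (and its tcp_hdr(from)->fin test replaced by OR-ing tcp_flags into the destination), the out-of-order drain can append a segment to the tail of sk_receive_queue by coalescing rather than queueing a separate skb. A condensed restatement of that coalesce-or-queue step, with the skb machinery reduced to a toy segment type:

    #include <stdio.h>

    /* Toy segment; stands in for an skb with TCP_SKB_CB() seq bookkeeping. */
    struct seg { unsigned int seq, end_seq; };

    /* Merge @from into @to when it starts exactly where @to ends
     * (the same precondition tcp_try_coalesce() checks before calling
     * skb_try_coalesce()). */
    static int try_coalesce(struct seg *to, const struct seg *from)
    {
        if (!to || from->seq != to->end_seq)
            return 0;
        to->end_seq = from->end_seq;
        return 1;
    }

    int main(void)
    {
        struct seg rcvq[8];            /* stands in for sk_receive_queue */
        int qlen = 0;
        unsigned int rcv_nxt = 0;
        struct seg in[] = { {0, 1000}, {1000, 2448}, {2448, 3000} };

        for (int i = 0; i < 3; i++) {
            struct seg *tail = qlen ? &rcvq[qlen - 1] : NULL;

            rcv_nxt = in[i].end_seq;
            if (try_coalesce(tail, &in[i]))
                continue;              /* "eaten": the source skb would be freed */
            rcvq[qlen++] = in[i];      /* otherwise queue it as the new tail */
        }
        printf("queue length %d, rcv_nxt %u\n", qlen, rcv_nxt); /* 1, 3000 */
        return 0;
    }
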
@@ -4124,53 +4207,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, | |||
4124 | return 0; | 4207 | return 0; |
4125 | } | 4208 | } |
4126 | 4209 | ||
4127 | /** | ||
4128 | * tcp_try_coalesce - try to merge skb to prior one | ||
4129 | * @sk: socket | ||
4130 | * @to: prior buffer | ||
4131 | * @from: buffer to add in queue | ||
4132 | * @fragstolen: pointer to boolean | ||
4133 | * | ||
4134 | * Before queueing skb @from after @to, try to merge them | ||
4135 | * to reduce overall memory use and queue lengths, if cost is small. | ||
4136 | * Packets in ofo or receive queues can stay a long time. | ||
4137 | * Better try to coalesce them right now to avoid future collapses. | ||
4138 | * Returns true if caller should free @from instead of queueing it | ||
4139 | */ | ||
4140 | static bool tcp_try_coalesce(struct sock *sk, | ||
4141 | struct sk_buff *to, | ||
4142 | struct sk_buff *from, | ||
4143 | bool *fragstolen) | ||
4144 | { | ||
4145 | int delta; | ||
4146 | |||
4147 | *fragstolen = false; | ||
4148 | |||
4149 | if (tcp_hdr(from)->fin) | ||
4150 | return false; | ||
4151 | |||
4152 | /* Its possible this segment overlaps with prior segment in queue */ | ||
4153 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
4154 | return false; | ||
4155 | |||
4156 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
4157 | return false; | ||
4158 | |||
4159 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
4160 | sk_mem_charge(sk, delta); | ||
4161 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
4162 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
4163 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
4164 | return true; | ||
4165 | } | ||
4166 | |||
4167 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | 4210 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) |
4168 | { | 4211 | { |
4169 | struct tcp_sock *tp = tcp_sk(sk); | 4212 | struct tcp_sock *tp = tcp_sk(sk); |
4170 | struct sk_buff *skb1; | 4213 | struct sk_buff *skb1; |
4171 | u32 seq, end_seq; | 4214 | u32 seq, end_seq; |
4172 | 4215 | ||
4173 | TCP_ECN_check_ce(tp, skb); | 4216 | tcp_ecn_check_ce(tp, skb); |
4174 | 4217 | ||
4175 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { | 4218 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { |
4176 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); | 4219 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); |
@@ -4309,24 +4352,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int | |||
4309 | 4352 | ||
4310 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | 4353 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) |
4311 | { | 4354 | { |
4312 | struct sk_buff *skb = NULL; | 4355 | struct sk_buff *skb; |
4313 | struct tcphdr *th; | ||
4314 | bool fragstolen; | 4356 | bool fragstolen; |
4315 | 4357 | ||
4316 | if (size == 0) | 4358 | if (size == 0) |
4317 | return 0; | 4359 | return 0; |
4318 | 4360 | ||
4319 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | 4361 | skb = alloc_skb(size, sk->sk_allocation); |
4320 | if (!skb) | 4362 | if (!skb) |
4321 | goto err; | 4363 | goto err; |
4322 | 4364 | ||
4323 | if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | 4365 | if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) |
4324 | goto err_free; | 4366 | goto err_free; |
4325 | 4367 | ||
4326 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); | ||
4327 | skb_reset_transport_header(skb); | ||
4328 | memset(th, 0, sizeof(*th)); | ||
4329 | |||
4330 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) | 4368 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) |
4331 | goto err_free; | 4369 | goto err_free; |
4332 | 4370 | ||
@@ -4334,7 +4372,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | |||
4334 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; | 4372 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
4335 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; | 4373 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
4336 | 4374 | ||
4337 | if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { | 4375 | if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
4338 | WARN_ON_ONCE(fragstolen); /* should not happen */ | 4376 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
4339 | __kfree_skb(skb); | 4377 | __kfree_skb(skb); |
4340 | } | 4378 | } |
@@ -4348,7 +4386,6 @@ err: | |||
4348 | 4386 | ||
4349 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | 4387 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) |
4350 | { | 4388 | { |
4351 | const struct tcphdr *th = tcp_hdr(skb); | ||
4352 | struct tcp_sock *tp = tcp_sk(sk); | 4389 | struct tcp_sock *tp = tcp_sk(sk); |
4353 | int eaten = -1; | 4390 | int eaten = -1; |
4354 | bool fragstolen = false; | 4391 | bool fragstolen = false; |
@@ -4357,9 +4394,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
4357 | goto drop; | 4394 | goto drop; |
4358 | 4395 | ||
4359 | skb_dst_drop(skb); | 4396 | skb_dst_drop(skb); |
4360 | __skb_pull(skb, th->doff * 4); | 4397 | __skb_pull(skb, tcp_hdr(skb)->doff * 4); |
4361 | 4398 | ||
4362 | TCP_ECN_accept_cwr(tp, skb); | 4399 | tcp_ecn_accept_cwr(tp, skb); |
4363 | 4400 | ||
4364 | tp->rx_opt.dsack = 0; | 4401 | tp->rx_opt.dsack = 0; |
4365 | 4402 | ||
@@ -4401,7 +4438,7 @@ queue_and_out: | |||
4401 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4438 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4402 | if (skb->len) | 4439 | if (skb->len) |
4403 | tcp_event_data_recv(sk, skb); | 4440 | tcp_event_data_recv(sk, skb); |
4404 | if (th->fin) | 4441 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
4405 | tcp_fin(sk); | 4442 | tcp_fin(sk); |
4406 | 4443 | ||
4407 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4444 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
@@ -4516,7 +4553,7 @@ restart: | |||
4516 | * - bloated or contains data before "start" or | 4553 | * - bloated or contains data before "start" or |
4517 | * overlaps to the next one. | 4554 | * overlaps to the next one. |
4518 | */ | 4555 | */ |
4519 | if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && | 4556 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
4520 | (tcp_win_from_space(skb->truesize) > skb->len || | 4557 | (tcp_win_from_space(skb->truesize) > skb->len || |
4521 | before(TCP_SKB_CB(skb)->seq, start))) { | 4558 | before(TCP_SKB_CB(skb)->seq, start))) { |
4522 | end_of_skbs = false; | 4559 | end_of_skbs = false; |
@@ -4535,30 +4572,18 @@ restart: | |||
4535 | /* Decided to skip this, advance start seq. */ | 4572 | /* Decided to skip this, advance start seq. */ |
4536 | start = TCP_SKB_CB(skb)->end_seq; | 4573 | start = TCP_SKB_CB(skb)->end_seq; |
4537 | } | 4574 | } |
4538 | if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) | 4575 | if (end_of_skbs || |
4576 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) | ||
4539 | return; | 4577 | return; |
4540 | 4578 | ||
4541 | while (before(start, end)) { | 4579 | while (before(start, end)) { |
4580 | int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); | ||
4542 | struct sk_buff *nskb; | 4581 | struct sk_buff *nskb; |
4543 | unsigned int header = skb_headroom(skb); | ||
4544 | int copy = SKB_MAX_ORDER(header, 0); | ||
4545 | 4582 | ||
4546 | /* Too big header? This can happen with IPv6. */ | 4583 | nskb = alloc_skb(copy, GFP_ATOMIC); |
4547 | if (copy < 0) | ||
4548 | return; | ||
4549 | if (end - start < copy) | ||
4550 | copy = end - start; | ||
4551 | nskb = alloc_skb(copy + header, GFP_ATOMIC); | ||
4552 | if (!nskb) | 4584 | if (!nskb) |
4553 | return; | 4585 | return; |
4554 | 4586 | ||
4555 | skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); | ||
4556 | skb_set_network_header(nskb, (skb_network_header(skb) - | ||
4557 | skb->head)); | ||
4558 | skb_set_transport_header(nskb, (skb_transport_header(skb) - | ||
4559 | skb->head)); | ||
4560 | skb_reserve(nskb, header); | ||
4561 | memcpy(nskb->head, skb->head, header); | ||
4562 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); | 4587 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
4563 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; | 4588 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
4564 | __skb_queue_before(list, skb, nskb); | 4589 | __skb_queue_before(list, skb, nskb); |
@@ -4582,8 +4607,7 @@ restart: | |||
4582 | skb = tcp_collapse_one(sk, skb, list); | 4607 | skb = tcp_collapse_one(sk, skb, list); |
4583 | if (!skb || | 4608 | if (!skb || |
4584 | skb == tail || | 4609 | skb == tail || |
4585 | tcp_hdr(skb)->syn || | 4610 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
4586 | tcp_hdr(skb)->fin) | ||
4587 | return; | 4611 | return; |
4588 | } | 4612 | } |
4589 | } | 4613 | } |
@@ -4951,53 +4975,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, | |||
4951 | __tcp_checksum_complete_user(sk, skb); | 4975 | __tcp_checksum_complete_user(sk, skb); |
4952 | } | 4976 | } |
4953 | 4977 | ||
4954 | #ifdef CONFIG_NET_DMA | ||
4955 | static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, | ||
4956 | int hlen) | ||
4957 | { | ||
4958 | struct tcp_sock *tp = tcp_sk(sk); | ||
4959 | int chunk = skb->len - hlen; | ||
4960 | int dma_cookie; | ||
4961 | bool copied_early = false; | ||
4962 | |||
4963 | if (tp->ucopy.wakeup) | ||
4964 | return false; | ||
4965 | |||
4966 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | ||
4967 | tp->ucopy.dma_chan = net_dma_find_channel(); | ||
4968 | |||
4969 | if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { | ||
4970 | |||
4971 | dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, | ||
4972 | skb, hlen, | ||
4973 | tp->ucopy.iov, chunk, | ||
4974 | tp->ucopy.pinned_list); | ||
4975 | |||
4976 | if (dma_cookie < 0) | ||
4977 | goto out; | ||
4978 | |||
4979 | tp->ucopy.dma_cookie = dma_cookie; | ||
4980 | copied_early = true; | ||
4981 | |||
4982 | tp->ucopy.len -= chunk; | ||
4983 | tp->copied_seq += chunk; | ||
4984 | tcp_rcv_space_adjust(sk); | ||
4985 | |||
4986 | if ((tp->ucopy.len == 0) || | ||
4987 | (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || | ||
4988 | (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { | ||
4989 | tp->ucopy.wakeup = 1; | ||
4990 | sk->sk_data_ready(sk); | ||
4991 | } | ||
4992 | } else if (chunk > 0) { | ||
4993 | tp->ucopy.wakeup = 1; | ||
4994 | sk->sk_data_ready(sk); | ||
4995 | } | ||
4996 | out: | ||
4997 | return copied_early; | ||
4998 | } | ||
4999 | #endif /* CONFIG_NET_DMA */ | ||
5000 | |||
5001 | /* Does PAWS and seqno based validation of an incoming segment, flags will | 4978 | /* Does PAWS and seqno based validation of an incoming segment, flags will |
5002 | * play significant role here. | 4979 | * play significant role here. |
5003 | */ | 4980 | */ |
@@ -5177,27 +5154,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5177 | } | 5154 | } |
5178 | } else { | 5155 | } else { |
5179 | int eaten = 0; | 5156 | int eaten = 0; |
5180 | int copied_early = 0; | ||
5181 | bool fragstolen = false; | 5157 | bool fragstolen = false; |
5182 | 5158 | ||
5183 | if (tp->copied_seq == tp->rcv_nxt && | 5159 | if (tp->ucopy.task == current && |
5184 | len - tcp_header_len <= tp->ucopy.len) { | 5160 | tp->copied_seq == tp->rcv_nxt && |
5185 | #ifdef CONFIG_NET_DMA | 5161 | len - tcp_header_len <= tp->ucopy.len && |
5186 | if (tp->ucopy.task == current && | 5162 | sock_owned_by_user(sk)) { |
5187 | sock_owned_by_user(sk) && | 5163 | __set_current_state(TASK_RUNNING); |
5188 | tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { | ||
5189 | copied_early = 1; | ||
5190 | eaten = 1; | ||
5191 | } | ||
5192 | #endif | ||
5193 | if (tp->ucopy.task == current && | ||
5194 | sock_owned_by_user(sk) && !copied_early) { | ||
5195 | __set_current_state(TASK_RUNNING); | ||
5196 | 5164 | ||
5197 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) | 5165 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { |
5198 | eaten = 1; | ||
5199 | } | ||
5200 | if (eaten) { | ||
5201 | /* Predicted packet is in window by definition. | 5166 | /* Predicted packet is in window by definition. |
5202 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. | 5167 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. |
5203 | * Hence, check seq<=rcv_wup reduces to: | 5168 | * Hence, check seq<=rcv_wup reduces to: |
@@ -5213,9 +5178,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5213 | __skb_pull(skb, tcp_header_len); | 5178 | __skb_pull(skb, tcp_header_len); |
5214 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 5179 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
5215 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); | 5180 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); |
5181 | eaten = 1; | ||
5216 | } | 5182 | } |
5217 | if (copied_early) | ||
5218 | tcp_cleanup_rbuf(sk, skb->len); | ||
5219 | } | 5183 | } |
5220 | if (!eaten) { | 5184 | if (!eaten) { |
5221 | if (tcp_checksum_complete_user(sk, skb)) | 5185 | if (tcp_checksum_complete_user(sk, skb)) |
@@ -5252,14 +5216,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5252 | goto no_ack; | 5216 | goto no_ack; |
5253 | } | 5217 | } |
5254 | 5218 | ||
5255 | if (!copied_early || tp->rcv_nxt != tp->rcv_wup) | 5219 | __tcp_ack_snd_check(sk, 0); |
5256 | __tcp_ack_snd_check(sk, 0); | ||
5257 | no_ack: | 5220 | no_ack: |
5258 | #ifdef CONFIG_NET_DMA | ||
5259 | if (copied_early) | ||
5260 | __skb_queue_tail(&sk->sk_async_wait_queue, skb); | ||
5261 | else | ||
5262 | #endif | ||
5263 | if (eaten) | 5221 | if (eaten) |
5264 | kfree_skb_partial(skb, fragstolen); | 5222 | kfree_skb_partial(skb, fragstolen); |
5265 | sk->sk_data_ready(sk); | 5223 | sk->sk_data_ready(sk); |
@@ -5453,7 +5411,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5453 | * state to ESTABLISHED..." | 5411 | * state to ESTABLISHED..." |
5454 | */ | 5412 | */ |
5455 | 5413 | ||
5456 | TCP_ECN_rcv_synack(tp, th); | 5414 | tcp_ecn_rcv_synack(tp, th); |
5457 | 5415 | ||
5458 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5416 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
5459 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 5417 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
@@ -5572,7 +5530,7 @@ discard: | |||
5572 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; | 5530 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; |
5573 | tp->max_window = tp->snd_wnd; | 5531 | tp->max_window = tp->snd_wnd; |
5574 | 5532 | ||
5575 | TCP_ECN_rcv_syn(tp, th); | 5533 | tcp_ecn_rcv_syn(tp, th); |
5576 | 5534 | ||
5577 | tcp_mtup_init(sk); | 5535 | tcp_mtup_init(sk); |
5578 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 5536 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
@@ -5902,6 +5860,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) | |||
5902 | #endif | 5860 | #endif |
5903 | } | 5861 | } |
5904 | 5862 | ||
5863 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | ||
5864 | * | ||
5865 | * If we receive a SYN packet with these bits set, it means a | ||
5866 | * network is playing bad games with TOS bits. In order to | ||
5867 | * avoid possible false congestion notifications, we disable | ||
5868 | * TCP ECN negociation. | ||
5869 | * | ||
5870 | * Exception: tcp_ca wants ECN. This is required for DCTCP | ||
5871 | * congestion control; it requires setting ECT on all packets, | ||
5872 | * including SYN. We inverse the test in this case: If our | ||
5873 | * local socket wants ECN, but peer only set ece/cwr (but not | ||
5874 | * ECT in IP header) its probably a non-DCTCP aware sender. | ||
5875 | */ | ||
5876 | static void tcp_ecn_create_request(struct request_sock *req, | ||
5877 | const struct sk_buff *skb, | ||
5878 | const struct sock *listen_sk) | ||
5879 | { | ||
5880 | const struct tcphdr *th = tcp_hdr(skb); | ||
5881 | const struct net *net = sock_net(listen_sk); | ||
5882 | bool th_ecn = th->ece && th->cwr; | ||
5883 | bool ect, need_ecn; | ||
5884 | |||
5885 | if (!th_ecn) | ||
5886 | return; | ||
5887 | |||
5888 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | ||
5889 | need_ecn = tcp_ca_needs_ecn(listen_sk); | ||
5890 | |||
5891 | if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) | ||
5892 | inet_rsk(req)->ecn_ok = 1; | ||
5893 | else if (ect && need_ecn) | ||
5894 | inet_rsk(req)->ecn_ok = 1; | ||
5895 | } | ||
5896 | |||
5905 | int tcp_conn_request(struct request_sock_ops *rsk_ops, | 5897 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
5906 | const struct tcp_request_sock_ops *af_ops, | 5898 | const struct tcp_request_sock_ops *af_ops, |
5907 | struct sock *sk, struct sk_buff *skb) | 5899 | struct sock *sk, struct sk_buff *skb) |
@@ -5910,7 +5902,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5910 | struct request_sock *req; | 5902 | struct request_sock *req; |
5911 | struct tcp_sock *tp = tcp_sk(sk); | 5903 | struct tcp_sock *tp = tcp_sk(sk); |
5912 | struct dst_entry *dst = NULL; | 5904 | struct dst_entry *dst = NULL; |
5913 | __u32 isn = TCP_SKB_CB(skb)->when; | 5905 | __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; |
5914 | bool want_cookie = false, fastopen; | 5906 | bool want_cookie = false, fastopen; |
5915 | struct flowi fl; | 5907 | struct flowi fl; |
5916 | struct tcp_fastopen_cookie foc = { .len = -1 }; | 5908 | struct tcp_fastopen_cookie foc = { .len = -1 }; |
@@ -5962,7 +5954,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
5962 | goto drop_and_free; | 5954 | goto drop_and_free; |
5963 | 5955 | ||
5964 | if (!want_cookie || tmp_opt.tstamp_ok) | 5956 | if (!want_cookie || tmp_opt.tstamp_ok) |
5965 | TCP_ECN_create_request(req, skb, sock_net(sk)); | 5957 | tcp_ecn_create_request(req, skb, sk); |
5966 | 5958 | ||
5967 | if (want_cookie) { | 5959 | if (want_cookie) { |
5968 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); | 5960 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
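
tcp_ecn_create_request(), added above and wired into tcp_conn_request() in place of the old TCP_ECN_create_request(), derives ecn_ok from three inputs: whether the SYN carried ECE+CWR, whether the IP header carried ECT, and whether the listener's congestion control requires ECN (tcp_ca_needs_ecn(), e.g. DCTCP). The same decision restated as a standalone predicate, with the sysctl and helpers reduced to plain booleans:

    #include <stdio.h>
    #include <stdbool.h>

    /* Same decision as tcp_ecn_create_request(), reduced to plain inputs:
     *   th_ecn     - SYN had both ECE and CWR set
     *   ect        - IP header carried ECT/CE (RFC 3168 6.1.1 says a SYN
     *                must not; if it does, a classic peer is misbehaving)
     *   need_ecn   - listener's congestion control requires ECN (e.g. DCTCP)
     *   sysctl_ecn - net.ipv4.tcp_ecn enabled
     */
    static bool ecn_ok(bool th_ecn, bool ect, bool need_ecn, bool sysctl_ecn)
    {
        if (!th_ecn)
            return false;
        if (!ect && !need_ecn && sysctl_ecn)
            return true;    /* classic RFC 3168 negotiation */
        if (ect && need_ecn)
            return true;    /* DCTCP-style peer: ECT even on the SYN */
        return false;
    }

    int main(void)
    {
        printf("%d\n", ecn_ok(true, false, false, true)); /* 1: classic */
        printf("%d\n", ecn_ok(true, true,  false, true)); /* 0: bogus ECT on SYN */
        printf("%d\n", ecn_ok(true, true,  true,  true)); /* 1: DCTCP-style */
        return 0;
    }
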