Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 404
1 file changed, 198 insertions(+), 206 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a906e0200ff2..a12b455928e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -68,12 +68,12 @@
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/kernel.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
-#include <net/netdma.h>
 #include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
@@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
 	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 }
 
-static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
 {
 	if (tp->ecn_flags & TCP_ECN_OK)
 		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
 }
 
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
 {
 	if (tcp_hdr(skb)->cwr)
 		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 {
 	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 {
-	if (!(tp->ecn_flags & TCP_ECN_OK))
-		return;
-
 	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 	case INET_ECN_NOT_ECT:
 		/* Funny extension: if ECT is not set on a segment,
@@ -233,30 +230,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 		tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
 
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	if (tp->ecn_flags & TCP_ECN_OK)
+		__tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
 		tp->ecn_flags &= ~TCP_ECN_OK;
 }
 
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
 		tp->ecn_flags &= ~TCP_ECN_OK;
 }
 
-static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
 		return true;
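Editor's note on the hunk above: the CE case now raises CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE so congestion-control modules that opted in (DCTCP-style controllers merged in the same cycle) can observe CE transitions on received segments. A minimal sketch of how the tcp_ca_needs_ecn() predicate is presumably wired up, based on include/net/tcp.h of this era (verify against your tree):

	static inline bool tcp_ca_needs_ecn(const struct sock *sk)
	{
		/* Set by modules such as DCTCP via TCP_CONG_NEEDS_ECN in ca_ops->flags */
		return inet_csk(sk)->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
	}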
@@ -653,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 	}
 	icsk->icsk_ack.lrcvtime = now;
 
-	TCP_ECN_check_ce(tp, skb);
+	tcp_ecn_check_ce(tp, skb);
 
 	if (skb->len >= 128)
 		tcp_grow_window(sk, skb);
@@ -1295,9 +1305,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	TCP_SKB_CB(prev)->end_seq += shifted;
 	TCP_SKB_CB(skb)->seq += shifted;
 
-	skb_shinfo(prev)->gso_segs += pcount;
-	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
-	skb_shinfo(skb)->gso_segs -= pcount;
+	tcp_skb_pcount_add(prev, pcount);
+	BUG_ON(tcp_skb_pcount(skb) < pcount);
+	tcp_skb_pcount_add(skb, -pcount);
 
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
@@ -1310,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
-	if (skb_shinfo(skb)->gso_segs <= 1) {
+	if (tcp_skb_pcount(skb) <= 1) {
 		skb_shinfo(skb)->gso_size = 0;
 		skb_shinfo(skb)->gso_type = 0;
 	}
@@ -1888,21 +1898,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-static void tcp_clear_retrans_partial(struct tcp_sock *tp)
+void tcp_clear_retrans(struct tcp_sock *tp)
 {
 	tp->retrans_out = 0;
 	tp->lost_out = 0;
-
 	tp->undo_marker = 0;
 	tp->undo_retrans = -1;
+	tp->fackets_out = 0;
+	tp->sacked_out = 0;
 }
 
-void tcp_clear_retrans(struct tcp_sock *tp)
+static inline void tcp_init_undo(struct tcp_sock *tp)
 {
-	tcp_clear_retrans_partial(tp);
-
-	tp->fackets_out = 0;
-	tp->sacked_out = 0;
+	tp->undo_marker = tp->snd_una;
+	/* Retransmission still in flight may cause DSACKs later. */
+	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
 /* Enter Loss state. If we detect SACK reneging, forget all SACK information
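The "? :" in tcp_init_undo() is the GCC conditional-with-omitted-operand extension, common in kernel code. As a worked example (same names as the patch), the two assignments below are equivalent: undo_retrans becomes -1 only when nothing is currently in retransmission.

	tp->undo_retrans = tp->retrans_out ? : -1;                  /* as written above */
	tp->undo_retrans = tp->retrans_out ? tp->retrans_out : -1;  /* spelled out */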
@@ -1925,18 +1935,18 @@ void tcp_enter_loss(struct sock *sk)
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
+		tcp_init_undo(tp);
 	}
 	tp->snd_cwnd = 1;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 
-	tcp_clear_retrans_partial(tp);
+	tp->retrans_out = 0;
+	tp->lost_out = 0;
 
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
-	tp->undo_marker = tp->snd_una;
-
 	skb = tcp_write_queue_head(sk);
 	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
 	if (is_reneg) {
@@ -1950,9 +1960,6 @@ void tcp_enter_loss(struct sock *sk)
 		if (skb == tcp_send_head(sk))
 			break;
 
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
-			tp->undo_marker = 0;
-
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1972,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk)
 					   sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
-	TCP_ECN_queue_cwr(tp);
+	tcp_ecn_queue_cwr(tp);
 
 	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
 	 * loss recovery is underway except recurring timeout(s) on
@@ -2364,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 
 		if (tp->prior_ssthresh > tp->snd_ssthresh) {
 			tp->snd_ssthresh = tp->prior_ssthresh;
-			TCP_ECN_withdraw_cwr(tp);
+			tcp_ecn_withdraw_cwr(tp);
 		}
 	} else {
 		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2494,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tp->prr_delivered = 0;
 	tp->prr_out = 0;
 	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-	TCP_ECN_queue_cwr(tp);
+	tcp_ecn_queue_cwr(tp);
 }
 
 static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -2671,8 +2678,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
 	tp->prior_ssthresh = 0;
-	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = tp->retrans_out ? : -1;
+	tcp_init_undo(tp);
 
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
@@ -2971,7 +2977,8 @@ void tcp_rearm_rto(struct sock *sk)
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		struct sk_buff *skb = tcp_write_queue_head(sk);
-		const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+		const u32 rto_time_stamp =
+			tcp_skb_timestamp(skb) + rto;
 		s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
 		/* delta may not be positive if the socket is locked
 		 * when the retrans timer fires and is rescheduled.
@@ -3023,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 	return packets_acked;
 }
 
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+			   u32 prior_snd_una)
+{
+	const struct skb_shared_info *shinfo;
+
+	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+	if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+		return;
+
+	shinfo = skb_shinfo(skb);
+	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+	    between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
 /* Remove acknowledged frames from the retransmission queue. If our packet
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
@@ -3046,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
-		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
-		if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-		    between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
-			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		tcp_ack_tstamp(sk, skb, prior_snd_una);
 
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
@@ -3067,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 			fully_acked = false;
 		} else {
+			/* Speedup tcp_unlink_write_queue() and next loop */
+			prefetchw(skb->next);
 			acked_pcount = tcp_skb_pcount(skb);
 		}
 
-		if (sacked & TCPCB_RETRANS) {
+		if (unlikely(sacked & TCPCB_RETRANS)) {
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
@@ -3101,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			 * connection startup slow start one packet too
 			 * quickly.  This is severely frowned upon behavior.
 			 */
-			if (!(scb->tcp_flags & TCPHDR_SYN)) {
+			if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
 				flag |= FLAG_DATA_ACKED;
 			} else {
 				flag |= FLAG_SYN_ACKED;
@@ -3113,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tcp_unlink_write_queue(skb, sk);
 		sk_wmem_free_skb(sk, skb);
-		if (skb == tp->retransmit_skb_hint)
+		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
-		if (skb == tp->lost_skb_hint)
+		if (unlikely(skb == tp->lost_skb_hint))
 			tp->lost_skb_hint = NULL;
 	}
 
@@ -3126,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		flag |= FLAG_SACK_RENEGING;
 
 	skb_mstamp_get(&now);
-	if (first_ackt.v64) {
+	if (likely(first_ackt.v64)) {
 		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
 		ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
 	}
@@ -3211,9 +3232,10 @@ static void tcp_ack_probe(struct sock *sk)
 		 * This function is not for random using!
 		 */
 	} else {
+		unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
-					  TCP_RTO_MAX);
+					  when, TCP_RTO_MAX);
 	}
 }
 
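The open-coded min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX) can overflow an unsigned long once the backoff grows large; inet_csk_rto_backoff() performs the same clamp in 64 bits. A sketch of what the helper presumably looks like, based on include/net/inet_connection_sock.h of this era (not verified here):

	static inline unsigned long
	inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
			     unsigned long max_when)
	{
		/* Widen before shifting so a large backoff cannot wrap */
		u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff;

		return (unsigned long)min_t(u64, when, max_when);
	}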
@@ -3364,6 +3386,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }
 
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->in_ack_event)
+		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
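tcp_in_ack_event() gives congestion-control modules a per-ACK hook that replaces the coarser CA_EVENT_FAST_ACK / CA_EVENT_SLOW_ACK events (see the later hunks in tcp_ack()). A hedged sketch of a module consuming it; the struct fields match tcp_congestion_ops as extended in this cycle, but the "foo" module and foo_update_alpha() are hypothetical:

	static void foo_in_ack_event(struct sock *sk, u32 flags)
	{
		if (flags & CA_ACK_ECE)		/* peer echoed a CE mark */
			foo_update_alpha(sk);	/* hypothetical helper */
	}

	static struct tcp_congestion_ops foo_cc __read_mostly = {
		.flags		= TCP_CONG_NEEDS_ECN,	/* keep ECT set, even on SYN */
		.in_ack_event	= foo_in_ack_event,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
		.name		= "foo",
		.owner		= THIS_MODULE,
	};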
@@ -3379,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int acked = 0; /* Number of packets newly acked */
 	long sack_rtt_us = -1L;
 
+	/* We very likely will need to access write queue head. */
+	prefetchw(sk->sk_write_queue.next);
+
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
 	 */
@@ -3423,10 +3456,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3438,10 +3473,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -4063,6 +4103,44 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	tp->rx_opt.num_sacks = num_sacks;
 }
 
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	int delta;
+
+	*fragstolen = false;
+
+	/* Its possible this segment overlaps with prior segment in queue */
+	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+		return false;
+
+	if (!skb_try_coalesce(to, from, fragstolen, &delta))
+		return false;
+
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+	return true;
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4070,7 +4148,8 @@ static void tcp_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 dsack_high = tp->rcv_nxt;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *tail;
+	bool fragstolen, eaten;
 
 	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
@@ -4083,9 +4162,9 @@ static void tcp_ofo_queue(struct sock *sk)
 			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
 		}
 
+		__skb_unlink(skb, &tp->out_of_order_queue);
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
-			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
 			continue;
 		}
@@ -4093,11 +4172,15 @@ static void tcp_ofo_queue(struct sock *sk)
 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(skb)->end_seq);
 
-		__skb_unlink(skb, &tp->out_of_order_queue);
-		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tail = skb_peek_tail(&sk->sk_receive_queue);
+		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
-		if (tcp_hdr(skb)->fin)
+		if (!eaten)
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
+		if (eaten)
+			kfree_skb_partial(skb, fragstolen);
 	}
 }
 
@@ -4124,53 +4207,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
-/**
- * tcp_try_coalesce - try to merge skb to prior one
- * @sk: socket
- * @to: prior buffer
- * @from: buffer to add in queue
- * @fragstolen: pointer to boolean
- *
- * Before queueing skb @from after @to, try to merge them
- * to reduce overall memory use and queue lengths, if cost is small.
- * Packets in ofo or receive queues can stay a long time.
- * Better try to coalesce them right now to avoid future collapses.
- * Returns true if caller should free @from instead of queueing it
- */
-static bool tcp_try_coalesce(struct sock *sk,
-			     struct sk_buff *to,
-			     struct sk_buff *from,
-			     bool *fragstolen)
-{
-	int delta;
-
-	*fragstolen = false;
-
-	if (tcp_hdr(from)->fin)
-		return false;
-
-	/* Its possible this segment overlaps with prior segment in queue */
-	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
-		return false;
-
-	if (!skb_try_coalesce(to, from, fragstolen, &delta))
-		return false;
-
-	atomic_add(delta, &sk->sk_rmem_alloc);
-	sk_mem_charge(sk, delta);
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
-	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
-	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
-	return true;
-}
-
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 
-	TCP_ECN_check_ce(tp, skb);
+	tcp_ecn_check_ce(tp, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4309,24 +4352,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
-	struct sk_buff *skb = NULL;
-	struct tcphdr *th;
+	struct sk_buff *skb;
 	bool fragstolen;
 
 	if (size == 0)
 		return 0;
 
-	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+	skb = alloc_skb(size, sk->sk_allocation);
 	if (!skb)
 		goto err;
 
-	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
-	skb_reset_transport_header(skb);
-	memset(th, 0, sizeof(*th));
-
 	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
 		goto err_free;
 
@@ -4334,7 +4372,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
 	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
 
-	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+	if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
 		WARN_ON_ONCE(fragstolen); /* should not happen */
 		__kfree_skb(skb);
 	}
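tcp_send_rcvq() is reached through the TCP_REPAIR path (tcp_sendmsg() with the receive queue selected), so dropping the fake TCP header only affects repair-mode injection, not the normal receive path. A hedged user-space sketch of that path, in the style of checkpoint/restore tools such as CRIU; constants come from linux/tcp.h, CAP_NET_ADMIN is required, and error handling is omitted:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tcp.h>

	static void restore_recv_queue(int fd, const char *buf, size_t len)
	{
		int on = 1;
		int q = TCP_RECV_QUEUE;

		setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
		setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
		send(fd, buf, len, 0);		/* queued via tcp_send_rcvq() */
		q = TCP_NO_QUEUE;
		setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
	}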
@@ -4348,7 +4386,6 @@ err:
 
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
-	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int eaten = -1;
 	bool fragstolen = false;
@@ -4357,9 +4394,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 
 	skb_dst_drop(skb);
-	__skb_pull(skb, th->doff * 4);
+	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
-	TCP_ECN_accept_cwr(tp, skb);
+	tcp_ecn_accept_cwr(tp, skb);
 
 	tp->rx_opt.dsack = 0;
 
@@ -4401,7 +4438,7 @@ queue_and_out:
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (skb->len)
 			tcp_event_data_recv(sk, skb);
-		if (th->fin)
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
 
 		if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -4516,7 +4553,7 @@ restart:
 		 *   - bloated or contains data before "start" or
 		 *     overlaps to the next one.
 		 */
-		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
+		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
 		    (tcp_win_from_space(skb->truesize) > skb->len ||
 		     before(TCP_SKB_CB(skb)->seq, start))) {
 			end_of_skbs = false;
@@ -4535,30 +4572,18 @@ restart:
 		/* Decided to skip this, advance start seq. */
 		start = TCP_SKB_CB(skb)->end_seq;
 	}
-	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+	if (end_of_skbs ||
+	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 		return;
 
 	while (before(start, end)) {
+		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
 		struct sk_buff *nskb;
-		unsigned int header = skb_headroom(skb);
-		int copy = SKB_MAX_ORDER(header, 0);
 
-		/* Too big header? This can happen with IPv6. */
-		if (copy < 0)
-			return;
-		if (end - start < copy)
-			copy = end - start;
-		nskb = alloc_skb(copy + header, GFP_ATOMIC);
+		nskb = alloc_skb(copy, GFP_ATOMIC);
 		if (!nskb)
 			return;
 
-		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
-		skb_set_network_header(nskb, (skb_network_header(skb) -
-					      skb->head));
-		skb_set_transport_header(nskb, (skb_transport_header(skb) -
-						skb->head));
-		skb_reserve(nskb, header);
-		memcpy(nskb->head, skb->head, header);
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		__skb_queue_before(list, skb, nskb);
@@ -4582,8 +4607,7 @@ restart:
 			skb = tcp_collapse_one(sk, skb, list);
 			if (!skb ||
 			    skb == tail ||
-			    tcp_hdr(skb)->syn ||
-			    tcp_hdr(skb)->fin)
+			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 				return;
 		}
 	}
@@ -4951,53 +4975,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
 		__tcp_checksum_complete_user(sk, skb);
 }
 
-#ifdef CONFIG_NET_DMA
-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
-				   int hlen)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int chunk = skb->len - hlen;
-	int dma_cookie;
-	bool copied_early = false;
-
-	if (tp->ucopy.wakeup)
-		return false;
-
-	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-		tp->ucopy.dma_chan = net_dma_find_channel();
-
-	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
-
-		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
-							 skb, hlen,
-							 tp->ucopy.iov, chunk,
-							 tp->ucopy.pinned_list);
-
-		if (dma_cookie < 0)
-			goto out;
-
-		tp->ucopy.dma_cookie = dma_cookie;
-		copied_early = true;
-
-		tp->ucopy.len -= chunk;
-		tp->copied_seq += chunk;
-		tcp_rcv_space_adjust(sk);
-
-		if ((tp->ucopy.len == 0) ||
-		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
-		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
-			tp->ucopy.wakeup = 1;
-			sk->sk_data_ready(sk);
-		}
-	} else if (chunk > 0) {
-		tp->ucopy.wakeup = 1;
-		sk->sk_data_ready(sk);
-	}
-out:
-	return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5177,27 +5154,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 		}
 	} else {
 		int eaten = 0;
-		int copied_early = 0;
 		bool fragstolen = false;
 
-		if (tp->copied_seq == tp->rcv_nxt &&
-		    len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
-			if (tp->ucopy.task == current &&
-			    sock_owned_by_user(sk) &&
-			    tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
-				copied_early = 1;
-				eaten = 1;
-			}
-#endif
-			if (tp->ucopy.task == current &&
-			    sock_owned_by_user(sk) && !copied_early) {
-				__set_current_state(TASK_RUNNING);
+		if (tp->ucopy.task == current &&
+		    tp->copied_seq == tp->rcv_nxt &&
+		    len - tcp_header_len <= tp->ucopy.len &&
+		    sock_owned_by_user(sk)) {
+			__set_current_state(TASK_RUNNING);
 
-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
-					eaten = 1;
-			}
-			if (eaten) {
+			if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
 				/* Predicted packet is in window by definition.
 				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 				 * Hence, check seq<=rcv_wup reduces to:
@@ -5213,9 +5178,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				__skb_pull(skb, tcp_header_len);
 				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+				eaten = 1;
 			}
-			if (copied_early)
-				tcp_cleanup_rbuf(sk, skb->len);
 		}
 		if (!eaten) {
 			if (tcp_checksum_complete_user(sk, skb))
@@ -5252,14 +5216,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			goto no_ack;
 		}
 
-		if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
-			__tcp_ack_snd_check(sk, 0);
+		__tcp_ack_snd_check(sk, 0);
 no_ack:
-#ifdef CONFIG_NET_DMA
-		if (copied_early)
-			__skb_queue_tail(&sk->sk_async_wait_queue, skb);
-		else
-#endif
 		if (eaten)
 			kfree_skb_partial(skb, fragstolen);
 		sk->sk_data_ready(sk);
@@ -5453,7 +5411,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *    state to ESTABLISHED..."
 		 */
 
-		TCP_ECN_rcv_synack(tp, th);
+		tcp_ecn_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5572,7 +5530,7 @@ discard:
 		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 		tp->max_window = tp->snd_wnd;
 
-		TCP_ECN_rcv_syn(tp, th);
+		tcp_ecn_rcv_syn(tp, th);
 
 		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5902,6 +5860,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
 #endif
 }
 
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+				   const struct sk_buff *skb,
+				   const struct sock *listen_sk)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct net *net = sock_net(listen_sk);
+	bool th_ecn = th->ece && th->cwr;
+	bool ect, need_ecn;
+
+	if (!th_ecn)
+		return;
+
+	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+	else if (ect && need_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+}
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
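The negotiation rules in tcp_ecn_create_request() compress into a small decision table; this is an illustrative summary of the code just added, not an authoritative specification:

	/* th_ecn = SYN carries ECE+CWR;  ect = ECT set in the IP header;
	 * need_ecn = tcp_ca_needs_ecn(listen_sk);  sysctl = net.ipv4.tcp_ecn
	 *
	 *   th_ecn  ect  need_ecn  sysctl  ->  ecn_ok
	 *     0      -      -        -         0   (peer did not request ECN)
	 *     1      0      0        1         1   (classic RFC3168 negotiation)
	 *     1      0      0        0         0
	 *     1      1      1        -         1   (DCTCP-style: ECT on SYN expected)
	 *     1      1      0        -         0   (ECT on SYN, no ca support: ignore)
	 *     1      0      1        -         0   (ca wants ECN, peer not DCTCP-aware)
	 */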
@@ -5910,7 +5902,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct request_sock *req;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = NULL;
-	__u32 isn = TCP_SKB_CB(skb)->when;
+	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	bool want_cookie = false, fastopen;
 	struct flowi fl;
 	struct tcp_fastopen_cookie foc = { .len = -1 };
@@ -5962,7 +5954,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;
 
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
+		tcp_ecn_create_request(req, skb, sk);
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);