path: root/net/ipv4/tcp_input.c
author     Linus Torvalds <torvalds@linux-foundation.org>   2014-10-08 21:40:54 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-10-08 21:40:54 -0400
commit     35a9ad8af0bb0fa3525e6d0d20e32551d226f38e (patch)
tree       15b4b33206818886d9cff371fd2163e073b70568 /net/ipv4/tcp_input.c
parent     d5935b07da53f74726e2a65dd4281d0f2c70e5d4 (diff)
parent     64b1f00a0830e1c53874067273a096b228d83d36 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Most notable changes in here:

  1) By far the biggest accomplishment, thanks to a large range of contributors, is the addition of multi-send for transmit. This is the result of discussions back in Chicago, and the hard work of several individuals. Now, when the ->ndo_start_xmit() method of a driver sees skb->xmit_more as true, it can choose to defer the doorbell telling the device to start processing the new TX queue entries. skb->xmit_more means that the generic networking is guaranteed to call the driver immediately with another SKB to send. There is logic added to the qdisc layer to dequeue multiple packets at a time, and the handling of mis-predicted offloads in software is now done with no locks held. Finally, pktgen is extended to have a "burst" parameter that can be used to test a multi-send implementation. Several drivers have xmit_more support: i40e, igb, ixgbe, mlx4, virtio_net. Adding support is almost trivial, so expect more drivers to support this optimization soon. I want to thank, in no particular or implied order, Jesper Dangaard Brouer, Eric Dumazet, Alexander Duyck, Tom Herbert, Jamal Hadi Salim, John Fastabend, Florian Westphal, Daniel Borkmann, David Tat, Hannes Frederic Sowa, and Rusty Russell.

  2) PTP and timestamping support in bnx2x, from Michal Kalderon.

  3) Allow adjusting the rx_copybreak threshold for a driver via ethtool, and add rx_copybreak support to the enic driver. From Govindarajulu Varadarajan.

  4) Significant enhancements to the generic PHY layer and the bcm7xxx driver in particular (EEE support, auto power down, etc.), from Florian Fainelli.

  5) Allow raw buffers to be used for flow dissection, allowing drivers to determine the optimal "linear pull" size for devices that DMA into pools of pages. The objective is to get exactly the necessary amount of headers into the linear SKB area pre-pulled, but no more. The new interface drivers use is eth_get_headlen(). From WANG Cong, with driver conversions (several had their own by-hand duplicated implementations) by Alexander Duyck and Eric Dumazet.

  6) Support checksumming more smoothly and efficiently for encapsulations, and add a "foo over UDP" facility. From Tom Herbert.

  7) Add Broadcom SF2 switch driver to the DSA layer, from Florian Fainelli.

  8) eBPF can now load programs via a system call and has an extensive testsuite. From Alexei Starovoitov and Daniel Borkmann.

  9) Major overhaul of the packet scheduler to use RCU in several major areas such as the classifiers and rate estimators. From John Fastabend.

 10) Add driver for the Intel FM10000 Ethernet Switch, from Alexander Duyck.

 11) Rearrange TCP_SKB_CB() to reduce cache line misses, from Eric Dumazet.

 12) Add Datacenter TCP congestion control algorithm support, from Florian Westphal.

 13) Reorganize sk_buff so that __copy_skb_header() is significantly faster. From Eric Dumazet"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1558 commits)
  netlabel: directly return netlbl_unlabel_genl_init()
  net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers
  net: description of dma_cookie cause make xmldocs warning
  cxgb4: clean up a type issue
  cxgb4: potential shift wrapping bug
  i40e: skb->xmit_more support
  net: fs_enet: Add NAPI TX
  net: fs_enet: Remove non NAPI RX
  r8169:add support for RTL8168EP
  net_sched: copy exts->type in tcf_exts_change()
  wimax: convert printk to pr_foo()
  af_unix: remove 0 assignment on static
  ipv6: Do not warn for informational ICMP messages, regardless of type.
  Update Intel Ethernet Driver maintainers list
  bridge: Save frag_max_size between PRE_ROUTING and POST_ROUTING
  tipc: fix bug in multicast congestion handling
  net: better IFF_XMIT_DST_RELEASE support
  net/mlx4_en: remove NETDEV_TX_BUSY
  3c59x: fix bad split of cpu_to_le32(pci_map_single())
  net: bcmgenet: fix Tx ring priority programming
  ...
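To illustrate the xmit_more mechanism described in point 1, here is a minimal sketch of the pattern a driver's ->ndo_start_xmit() can follow. It is not code from this merge: the mydrv_* names, the descriptor helper, and the doorbell register layout are hypothetical, while skb->xmit_more, netif_xmit_stopped() and the other kernel symbols are real interfaces available as of this release.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/io.h>

/* Hypothetical per-device state, only here to keep the sketch self-contained. */
struct mydrv_priv {
        void __iomem *doorbell_reg;     /* device TX tail/doorbell register */
        u32 tx_tail;                    /* next-to-use TX descriptor index */
};

/* Hypothetical helper: post one TX descriptor for @skb (body not shown). */
static void mydrv_post_tx_descriptor(struct mydrv_priv *priv, struct sk_buff *skb);

static netdev_tx_t mydrv_ndo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct mydrv_priv *priv = netdev_priv(dev);
        struct netdev_queue *txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

        mydrv_post_tx_descriptor(priv, skb);

        /* Defer the doorbell while the stack promises another skb right away
         * (skb->xmit_more is true), but always flush if the queue is stopped,
         * since no further call would come to ring it for us.
         */
        if (!skb->xmit_more || netif_xmit_stopped(txq))
                writel(priv->tx_tail, priv->doorbell_reg);

        return NETDEV_TX_OK;
}

The point of the pattern is that a burst of queued packets costs a single doorbell (MMIO) write; the drivers listed in point 1 apply roughly this idea to their own TX paths.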
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 285
1 file changed, 163 insertions(+), 122 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0185eea59342..00a41499d52c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
 	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 }
 
-static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
 {
 	if (tp->ecn_flags & TCP_ECN_OK)
 		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
 }
 
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
 {
 	if (tcp_hdr(skb)->cwr)
 		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 {
 	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 {
-	if (!(tp->ecn_flags & TCP_ECN_OK))
-		return;
-
 	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 	case INET_ECN_NOT_ECT:
 		/* Funny extension: if ECT is not set on a segment,
@@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 		tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
 
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	if (tp->ecn_flags & TCP_ECN_OK)
+		__tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
 		tp->ecn_flags &= ~TCP_ECN_OK;
 }
 
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
 		tp->ecn_flags &= ~TCP_ECN_OK;
 }
 
-static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
 {
 	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
 		return true;
@@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 	}
 	icsk->icsk_ack.lrcvtime = now;
 
-	TCP_ECN_check_ce(tp, skb);
+	tcp_ecn_check_ce(tp, skb);
 
 	if (skb->len >= 128)
 		tcp_grow_window(sk, skb);
@@ -1294,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	TCP_SKB_CB(prev)->end_seq += shifted;
 	TCP_SKB_CB(skb)->seq += shifted;
 
-	skb_shinfo(prev)->gso_segs += pcount;
-	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
-	skb_shinfo(skb)->gso_segs -= pcount;
+	tcp_skb_pcount_add(prev, pcount);
+	BUG_ON(tcp_skb_pcount(skb) < pcount);
+	tcp_skb_pcount_add(skb, -pcount);
 
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
@@ -1309,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
-	if (skb_shinfo(skb)->gso_segs <= 1) {
+	if (tcp_skb_pcount(skb) <= 1) {
 		skb_shinfo(skb)->gso_size = 0;
 		skb_shinfo(skb)->gso_type = 0;
 	}
@@ -1887,21 +1897,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-static void tcp_clear_retrans_partial(struct tcp_sock *tp)
+void tcp_clear_retrans(struct tcp_sock *tp)
 {
 	tp->retrans_out = 0;
 	tp->lost_out = 0;
-
 	tp->undo_marker = 0;
 	tp->undo_retrans = -1;
+	tp->fackets_out = 0;
+	tp->sacked_out = 0;
 }
 
-void tcp_clear_retrans(struct tcp_sock *tp)
+static inline void tcp_init_undo(struct tcp_sock *tp)
 {
-	tcp_clear_retrans_partial(tp);
-
-	tp->fackets_out = 0;
-	tp->sacked_out = 0;
+	tp->undo_marker = tp->snd_una;
+	/* Retransmission still in flight may cause DSACKs later. */
+	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
 /* Enter Loss state. If we detect SACK reneging, forget all SACK information
@@ -1924,18 +1934,18 @@ void tcp_enter_loss(struct sock *sk)
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
+		tcp_init_undo(tp);
 	}
 	tp->snd_cwnd = 1;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 
-	tcp_clear_retrans_partial(tp);
+	tp->retrans_out = 0;
+	tp->lost_out = 0;
 
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
-	tp->undo_marker = tp->snd_una;
-
 	skb = tcp_write_queue_head(sk);
 	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
 	if (is_reneg) {
@@ -1949,9 +1959,6 @@ void tcp_enter_loss(struct sock *sk)
 		if (skb == tcp_send_head(sk))
 			break;
 
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
-			tp->undo_marker = 0;
-
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1971,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk)
 				       sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
-	TCP_ECN_queue_cwr(tp);
+	tcp_ecn_queue_cwr(tp);
 
 	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
 	 * loss recovery is underway except recurring timeout(s) on
@@ -2363,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 
 		if (tp->prior_ssthresh > tp->snd_ssthresh) {
 			tp->snd_ssthresh = tp->prior_ssthresh;
-			TCP_ECN_withdraw_cwr(tp);
+			tcp_ecn_withdraw_cwr(tp);
 		}
 	} else {
 		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2493,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tp->prr_delivered = 0;
 	tp->prr_out = 0;
 	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-	TCP_ECN_queue_cwr(tp);
+	tcp_ecn_queue_cwr(tp);
 }
 
 static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -2670,8 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
 	tp->prior_ssthresh = 0;
-	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = tp->retrans_out ? : -1;
+	tcp_init_undo(tp);
 
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
@@ -2970,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk)
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		struct sk_buff *skb = tcp_write_queue_head(sk);
-		const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+		const u32 rto_time_stamp =
+			tcp_skb_timestamp(skb) + rto;
 		s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
 		/* delta may not be positive if the socket is locked
 		 * when the retrans timer fires and is rescheduled.
@@ -3210,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk)
 	 * This function is not for random using!
 	 */
 	} else {
+		unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
-					  TCP_RTO_MAX);
+					  when, TCP_RTO_MAX);
 	}
 }
 
@@ -3363,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }
 
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->in_ack_event)
+		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3422,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3437,10 +3455,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -4062,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	tp->rx_opt.num_sacks = num_sacks;
 }
 
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	int delta;
+
+	*fragstolen = false;
+
+	/* Its possible this segment overlaps with prior segment in queue */
+	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+		return false;
+
+	if (!skb_try_coalesce(to, from, fragstolen, &delta))
+		return false;
+
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+	return true;
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4069,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 dsack_high = tp->rcv_nxt;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *tail;
+	bool fragstolen, eaten;
 
 	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
@@ -4082,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk)
 			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
 		}
 
+		__skb_unlink(skb, &tp->out_of_order_queue);
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
-			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
 			continue;
 		}
@@ -4092,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk)
 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(skb)->end_seq);
 
-		__skb_unlink(skb, &tp->out_of_order_queue);
-		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tail = skb_peek_tail(&sk->sk_receive_queue);
+		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
-		if (tcp_hdr(skb)->fin)
+		if (!eaten)
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
+		if (eaten)
+			kfree_skb_partial(skb, fragstolen);
 	}
 }
 
@@ -4123,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
-/**
- * tcp_try_coalesce - try to merge skb to prior one
- * @sk: socket
- * @to: prior buffer
- * @from: buffer to add in queue
- * @fragstolen: pointer to boolean
- *
- * Before queueing skb @from after @to, try to merge them
- * to reduce overall memory use and queue lengths, if cost is small.
- * Packets in ofo or receive queues can stay a long time.
- * Better try to coalesce them right now to avoid future collapses.
- * Returns true if caller should free @from instead of queueing it
- */
-static bool tcp_try_coalesce(struct sock *sk,
-			     struct sk_buff *to,
-			     struct sk_buff *from,
-			     bool *fragstolen)
-{
-	int delta;
-
-	*fragstolen = false;
-
-	if (tcp_hdr(from)->fin)
-		return false;
-
-	/* Its possible this segment overlaps with prior segment in queue */
-	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
-		return false;
-
-	if (!skb_try_coalesce(to, from, fragstolen, &delta))
-		return false;
-
-	atomic_add(delta, &sk->sk_rmem_alloc);
-	sk_mem_charge(sk, delta);
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
-	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
-	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
-	return true;
-}
-
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 
-	TCP_ECN_check_ce(tp, skb);
+	tcp_ecn_check_ce(tp, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4308,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
-	struct sk_buff *skb = NULL;
-	struct tcphdr *th;
+	struct sk_buff *skb;
 	bool fragstolen;
 
 	if (size == 0)
 		return 0;
 
-	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+	skb = alloc_skb(size, sk->sk_allocation);
 	if (!skb)
 		goto err;
 
-	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
-	skb_reset_transport_header(skb);
-	memset(th, 0, sizeof(*th));
-
 	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
 		goto err_free;
 
@@ -4333,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
 	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
 
-	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+	if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
 		WARN_ON_ONCE(fragstolen); /* should not happen */
 		__kfree_skb(skb);
 	}
@@ -4347,7 +4368,6 @@ err:
 
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
-	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int eaten = -1;
 	bool fragstolen = false;
@@ -4356,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 
 	skb_dst_drop(skb);
-	__skb_pull(skb, th->doff * 4);
+	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
-	TCP_ECN_accept_cwr(tp, skb);
+	tcp_ecn_accept_cwr(tp, skb);
 
 	tp->rx_opt.dsack = 0;
 
@@ -4400,7 +4420,7 @@ queue_and_out:
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (skb->len)
 			tcp_event_data_recv(sk, skb);
-		if (th->fin)
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
 
 		if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -4515,7 +4535,7 @@ restart:
 		 * - bloated or contains data before "start" or
 		 *   overlaps to the next one.
 		 */
-		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
+		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
 		    (tcp_win_from_space(skb->truesize) > skb->len ||
 		     before(TCP_SKB_CB(skb)->seq, start))) {
 			end_of_skbs = false;
@@ -4534,30 +4554,18 @@ restart:
 		/* Decided to skip this, advance start seq. */
 		start = TCP_SKB_CB(skb)->end_seq;
 	}
-	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+	if (end_of_skbs ||
+	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 		return;
 
 	while (before(start, end)) {
+		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
 		struct sk_buff *nskb;
-		unsigned int header = skb_headroom(skb);
-		int copy = SKB_MAX_ORDER(header, 0);
 
-		/* Too big header? This can happen with IPv6. */
-		if (copy < 0)
-			return;
-		if (end - start < copy)
-			copy = end - start;
-		nskb = alloc_skb(copy + header, GFP_ATOMIC);
+		nskb = alloc_skb(copy, GFP_ATOMIC);
 		if (!nskb)
 			return;
 
-		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
-		skb_set_network_header(nskb, (skb_network_header(skb) -
-					      skb->head));
-		skb_set_transport_header(nskb, (skb_transport_header(skb) -
-						skb->head));
-		skb_reserve(nskb, header);
-		memcpy(nskb->head, skb->head, header);
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		__skb_queue_before(list, skb, nskb);
@@ -4581,8 +4589,7 @@ restart:
 			skb = tcp_collapse_one(sk, skb, list);
 			if (!skb ||
 			    skb == tail ||
-			    tcp_hdr(skb)->syn ||
-			    tcp_hdr(skb)->fin)
+			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 				return;
 		}
 	}
@@ -5386,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	 * state to ESTABLISHED..."
 	 */
 
-	TCP_ECN_rcv_synack(tp, th);
+	tcp_ecn_rcv_synack(tp, th);
 
 	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 	tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5505,7 +5512,7 @@ discard:
 		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 		tp->max_window = tp->snd_wnd;
 
-		TCP_ECN_rcv_syn(tp, th);
+		tcp_ecn_rcv_syn(tp, th);
 
 		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5835,6 +5842,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
 #endif
 }
 
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+				   const struct sk_buff *skb,
+				   const struct sock *listen_sk)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct net *net = sock_net(listen_sk);
+	bool th_ecn = th->ece && th->cwr;
+	bool ect, need_ecn;
+
+	if (!th_ecn)
+		return;
+
+	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+	else if (ect && need_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+}
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
@@ -5843,7 +5884,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct request_sock *req;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = NULL;
-	__u32 isn = TCP_SKB_CB(skb)->when;
+	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	bool want_cookie = false, fastopen;
 	struct flowi fl;
 	struct tcp_fastopen_cookie foc = { .len = -1 };
@@ -5895,7 +5936,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;
 
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
+		tcp_ecn_create_request(req, skb, sk);
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);