path: root/net/ipv4/tcp_input.c
author     Florian Westphal <fw@strlen.de>          2017-08-30 13:24:58 -0400
committer  David S. Miller <davem@davemloft.net>    2017-08-30 14:20:09 -0400
commit     31770e34e43d6c8dee129bfee77e56c34e61f0e5 (patch)
tree       a4635a632732b39ef560a8f72eb3a81d04f01605 /net/ipv4/tcp_input.c
parent     c1d2b4c3e204e602c97680335d082b8d012d08cd (diff)
tcp: Revert "tcp: remove header prediction"
This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78.

Eric Dumazet says:
    We found at Google a significant regression caused by
    45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 ("tcp: remove header prediction").

    In typical RPC (TCP_RR), when a TCP socket receives data, we now call
    tcp_ack() while we used to not call it.

    This touches enough cache lines to cause a slowdown.

So the problem does not seem to be the header prediction (HP) removal itself,
but the tcp_ack() call. Therefore, it might be possible to remove HP after
all, provided one finds a way to elide tcp_ack() for most cases.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  188
1 file changed, 183 insertions(+), 5 deletions(-)
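For readers unfamiliar with the mechanism being restored: header prediction lets tcp_rcv_established() compare one 32-bit word of the incoming header (data offset, ACK flag, window) against a precomputed tp->pred_flags and, when it matches an in-order segment, skip most of the slow-path RFC checks. Below is a rough, self-contained user-space sketch of that predicate; the struct, helper names and constants are simplified stand-ins written for this note (not kernel APIs), and the word layout merely follows the "0xS?10 << 16 + snd_wnd" description in the comment the patch restores.

/* toy_pred.c - user-space model of the header-prediction check
 * (illustrative only; everything here is a simplified stand-in,
 * not kernel code).
 */
#include <stdint.h>
#include <stdio.h>

struct toy_sock {
        uint32_t pred_flags;  /* expected doff/ACK/window word, 0 = fast path off */
        uint32_t rcv_nxt;     /* next in-order sequence number expected */
        uint32_t snd_nxt;     /* highest sequence number sent so far */
};

/* Pack the expected header word, loosely following the layout the restored
 * comment describes: 0xS?10 << 16 + snd_wnd, where S is the data offset in
 * 32-bit words and 0x10 is the ACK flag.
 */
static uint32_t toy_fast_path_on(uint8_t doff_words, uint16_t snd_wnd)
{
        return ((uint32_t)doff_words << 28) | (0x10u << 16) | snd_wnd;
}

/* Fast-path predicate: the header word matches exactly (PSH ignored), the
 * segment is the next one in order, and it does not ACK data beyond what
 * was sent.
 */
static int toy_predicted(const struct toy_sock *tp, uint32_t hdr_word,
                         uint32_t seq, uint32_t ack_seq)
{
        const uint32_t psh = 0x08u << 16;  /* PSH bit within the flag word */

        return tp->pred_flags &&
               (hdr_word & ~psh) == tp->pred_flags &&
               seq == tp->rcv_nxt &&
               (int32_t)(ack_seq - tp->snd_nxt) <= 0;
}

int main(void)
{
        struct toy_sock tp = {
                .pred_flags = toy_fast_path_on(5, 65535),  /* plain 20-byte header */
                .rcv_nxt    = 1000,
                .snd_nxt    = 500,
        };
        uint32_t hdr = toy_fast_path_on(5, 65535) | (0x08u << 16);  /* same word + PSH */

        printf("in-order segment predicted: %d\n", toy_predicted(&tp, hdr, 1000, 500));
        printf("out-of-order segment:       %d\n", toy_predicted(&tp, hdr, 2000, 500));
        tp.pred_flags = 0;  /* what tp->pred_flags = 0 does in the patch below */
        printf("prediction disabled:        %d\n", toy_predicted(&tp, hdr, 1000, 500));
        return 0;
}

Whenever one of the conditions listed in the restored comment block holds (out-of-order data, urgent data, buffer pressure, a zero window), the kernel simply clears tp->pred_flags, so the comparison fails and the segment takes the slow path - which is exactly what several hunks below do.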
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a0e436366d31..c5d7656beeee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_DATA_SACKED 0x20 /* New SACK. */
 #define FLAG_ECE 0x40 /* ECE in this ACK */
 #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
                 if (tp->snd_wnd != nwin) {
                         tp->snd_wnd = nwin;
 
+                        /* Note, it is the only place, where
+                         * fast path is recovered for sending TCP.
+                         */
+                        tp->pred_flags = 0;
+                        tcp_fast_path_check(sk);
+
                         if (tcp_send_head(sk))
                                 tcp_slow_start_after_idle_check(sk);
 
@@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         if (flag & FLAG_UPDATE_TS_RECENT)
                 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
-        {
+        if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+                /* Window is constant, pure forward advance.
+                 * No more checks are required.
+                 * Note, we use the fact that SND.UNA>=SND.WL2.
+                 */
+                tcp_update_wl(tp, ack_seq);
+                tcp_snd_una_update(tp, ack);
+                flag |= FLAG_WIN_UPDATE;
+
+                tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+        } else {
                 u32 ack_ev_flags = CA_ACK_SLOWPATH;
 
                 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
         if (TCP_SKB_CB(skb)->has_rxtstamp)
                 TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
 
+        /* Disable header prediction. */
+        tp->pred_flags = 0;
         inet_csk_schedule_ack(sk);
 
         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4647,6 +4668,8 @@ queue_and_out:
         if (tp->rx_opt.num_sacks)
                 tcp_sack_remove(tp);
 
+        tcp_fast_path_check(sk);
+
         if (eaten > 0)
                 kfree_skb_partial(skb, fragstolen);
         if (!sock_flag(sk, SOCK_DEAD))
@@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
         NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
 
         /* Massive buffer overcommit. */
+        tp->pred_flags = 0;
         return -1;
 }
 
@@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
 
         tp->urg_data = TCP_URG_NOTYET;
         tp->urg_seq = ptr;
+
+        /* Disable header prediction. */
+        tp->pred_flags = 0;
 }
 
 /* This is the 'fast' part of urgent handling. */
@@ -5301,6 +5328,26 @@ discard:
 
 /*
  * TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ *   is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ *   (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ *   or pure receivers (this means either the sequence number or the ack
+ *   value must stay constant)
+ * - Unexpected TCP option.
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
  */
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                          const struct tcphdr *th)
@@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
         tcp_mstamp_refresh(tp);
         if (unlikely(!sk->sk_rx_dst))
                 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+        /*
+         * Header prediction.
+         * The code loosely follows the one in the famous
+         * "30 instruction TCP receive" Van Jacobson mail.
+         *
+         * Van's trick is to deposit buffers into socket queue
+         * on a device interrupt, to call tcp_recv function
+         * on the receive process context and checksum and copy
+         * the buffer to user space. smart...
+         *
+         * Our current scheme is not silly either but we take the
+         * extra cost of the net_bh soft interrupt processing...
+         * We do checksum and copy also but from device to kernel.
+         */
 
         tp->rx_opt.saw_tstamp = 0;
 
+        /* pred_flags is 0xS?10 << 16 + snd_wnd
+         * if header_prediction is to be made
+         * 'S' will always be tp->tcp_header_len >> 2
+         * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+         * turn it off (when there are holes in the receive
+         * space for instance)
+         * PSH flag is ignored.
+         */
+
+        if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+            TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+            !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+                int tcp_header_len = tp->tcp_header_len;
+
+                /* Timestamp header prediction: tcp_header_len
+                 * is automatically equal to th->doff*4 due to pred_flags
+                 * match.
+                 */
+
+                /* Check timestamp */
+                if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+                        /* No? Slow path! */
+                        if (!tcp_parse_aligned_timestamp(tp, th))
+                                goto slow_path;
+
+                        /* If PAWS failed, check it more carefully in slow path */
+                        if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+                                goto slow_path;
+
+                        /* DO NOT update ts_recent here, if checksum fails
+                         * and timestamp was corrupted part, it will result
+                         * in a hung connection since we will drop all
+                         * future packets due to the PAWS test.
+                         */
+                }
+
+                if (len <= tcp_header_len) {
+                        /* Bulk data transfer: sender */
+                        if (len == tcp_header_len) {
+                                /* Predicted packet is in window by definition.
+                                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+                                 * Hence, check seq<=rcv_wup reduces to:
+                                 */
+                                if (tcp_header_len ==
+                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+                                    tp->rcv_nxt == tp->rcv_wup)
+                                        tcp_store_ts_recent(tp);
+
+                                /* We know that such packets are checksummed
+                                 * on entry.
+                                 */
+                                tcp_ack(sk, skb, 0);
+                                __kfree_skb(skb);
+                                tcp_data_snd_check(sk);
+                                return;
+                        } else { /* Header too small */
+                                TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+                                goto discard;
+                        }
+                } else {
+                        int eaten = 0;
+                        bool fragstolen = false;
+
+                        if (tcp_checksum_complete(skb))
+                                goto csum_error;
+
+                        if ((int)skb->truesize > sk->sk_forward_alloc)
+                                goto step5;
+
+                        /* Predicted packet is in window by definition.
+                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+                         * Hence, check seq<=rcv_wup reduces to:
+                         */
+                        if (tcp_header_len ==
+                            (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+                            tp->rcv_nxt == tp->rcv_wup)
+                                tcp_store_ts_recent(tp);
+
+                        tcp_rcv_rtt_measure_ts(sk, skb);
+
+                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+                        /* Bulk data transfer: receiver */
+                        eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+                                              &fragstolen);
+
+                        tcp_event_data_recv(sk, skb);
+
+                        if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+                                /* Well, only one small jumplet in fast path... */
+                                tcp_ack(sk, skb, FLAG_DATA);
+                                tcp_data_snd_check(sk);
+                                if (!inet_csk_ack_scheduled(sk))
+                                        goto no_ack;
+                        }
+
+                        __tcp_ack_snd_check(sk, 0);
+no_ack:
+                        if (eaten)
+                                kfree_skb_partial(skb, fragstolen);
+                        sk->sk_data_ready(sk);
+                        return;
+                }
+        }
+
+slow_path:
         if (len < (th->doff << 2) || tcp_checksum_complete(skb))
                 goto csum_error;
 
         if (!th->ack && !th->rst && !th->syn)
                 goto discard;
 
+        /*
+         * Standard slow path.
+         */
+
         if (!tcp_validate_incoming(sk, skb, th, 1))
                 return;
 
-        if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
+step5:
+        if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
                 goto discard;
 
         tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
         if (sock_flag(sk, SOCK_KEEPOPEN))
                 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+        if (!tp->rx_opt.snd_wscale)
+                __tcp_fast_path_on(tp, tp->snd_wnd);
+        else
+                tp->pred_flags = 0;
 }
 
 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 tcp_ecn_rcv_synack(tp, th);
 
                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-                tcp_ack(sk, skb, 0);
+                tcp_ack(sk, skb, FLAG_SLOWPATH);
 
                 /* Ok.. it's good. Set up sequence numbers and
                  * move to established.
@@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                 return 0;
 
         /* step 5: check the ACK field */
-
-        acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
+        acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+                             FLAG_UPDATE_TS_RECENT |
                              FLAG_NO_CHALLENGE_ACK) > 0;
 
         if (!acceptable) {
@@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                 tp->lsndtime = tcp_jiffies32;
 
                 tcp_initialize_rcv_mss(sk);
+                tcp_fast_path_on(tp);
                 break;
 
         case TCP_FIN_WAIT1: {
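A closing note on the tcp_ack() hunk above: the restored fast window-update branch is guarded by after(ack, prior_snd_una), a modulo-2^32 comparison that stays correct across sequence-number wraparound, and only then does the cheap tcp_update_wl()/tcp_snd_una_update() path run and a LINUX_MIB_TCPHPACKS hit get counted. A minimal stand-alone sketch of that comparison follows; the helper mirrors the kernel's before()/after() idiom but is written here purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Sequence-space comparison in the style of the kernel's before()/after()
 * helpers: valid as long as the two values are within 2^31 of each other.
 */
static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;   /* true if a is "later" than b */
}

int main(void)
{
        uint32_t prior_snd_una = 0xFFFFFFF0u;  /* just before wraparound */
        uint32_t ack           = 0x00000010u;  /* 32 bytes later, after the wrap */

        /* A cumulative ACK that really advances snd_una qualifies for the
         * fast window-update path; a stale or duplicate ACK does not.
         */
        printf("forward advance: %d\n", seq_after(ack, prior_snd_una));  /* prints 1 */
        printf("stale ack:       %d\n", seq_after(prior_snd_una, ack));  /* prints 0 */
        return 0;
}

Anything that fails this test, or that arrives with FLAG_SLOWPATH set, still goes through the full RFC-style checks in the else branch of tcp_ack().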