author     Florian Westphal <fw@strlen.de>          2017-08-30 13:24:58 -0400
committer  David S. Miller <davem@davemloft.net>    2017-08-30 14:20:09 -0400
commit     31770e34e43d6c8dee129bfee77e56c34e61f0e5 (patch)
tree       a4635a632732b39ef560a8f72eb3a81d04f01605 /net/ipv4/tcp_input.c
parent     c1d2b4c3e204e602c97680335d082b8d012d08cd (diff)
tcp: Revert "tcp: remove header prediction"
This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78.
Eric Dumazet says:

    We found at Google a significant regression caused by
    45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 ("tcp: remove header prediction").

    In typical RPC workloads (TCP_RR), when a TCP socket receives data, we
    now call tcp_ack() where we previously did not.

    This touches enough cache lines to cause a slowdown.

So the problem does not seem to be the header-prediction (HP) removal
itself but the tcp_ack() call. Therefore, it might be possible to remove
HP after all, provided one finds a way to elide tcp_ack() for most cases.
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
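
For context: header prediction works by precomputing the expected fourth
32-bit word of the TCP header (data offset, ACK flag, and window) into
tp->pred_flags, so an in-sequence segment can be validated with a single
masked compare. The helper that builds that word is restored on the
include/net/tcp.h side of this revert and therefore does not appear in the
diff below (the diffstat is limited to net/ipv4/tcp_input.c). A sketch,
modelled on __tcp_fast_path_on() as it looked around v4.13:

/* Sketch of __tcp_fast_path_on() (include/net/tcp.h). The predictor
 * word packs the data offset (tcp_header_len is in bytes, so << 26
 * lands header_len/4 in the doff field), the ACK flag, and the
 * expected window exactly as it appears on the wire.
 */
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
			       ntohl(TCP_FLAG_ACK) |
			       snd_wnd);
}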
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 188
1 file changed, 183 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a0e436366d31..c5d7656beeee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
 #define FLAG_ECE		0x40 /* ECE in this ACK				*/
 #define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 	if (tp->snd_wnd != nwin) {
 		tp->snd_wnd = nwin;
 
+		/* Note, it is the only place, where
+		 * fast path is recovered for sending TCP.
+		 */
+		tp->pred_flags = 0;
+		tcp_fast_path_check(sk);
+
 		if (tcp_send_head(sk))
 			tcp_slow_start_after_idle_check(sk);
 
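
The tcp_fast_path_check() call restored here only re-arms the predictor
when that is actually safe. A sketch of its gating logic, roughly as it
stood in include/net/tcp.h around v4.13 (not part of this hunk):

/* Sketch of tcp_fast_path_check(): re-enable header prediction only
 * when no out-of-order data is queued, the receive window is open,
 * receive buffer space remains, and no urgent data is pending --
 * the same conditions enumerated in the comment block restored in
 * tcp_rcv_established() below.
 */
static inline void tcp_fast_path_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
	    tp->rcv_wnd &&
	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
	    !tp->urg_data)
		tcp_fast_path_on(tp);
}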
@@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (flag & FLAG_UPDATE_TS_RECENT)
 		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
-	{
+	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+		/* Window is constant, pure forward advance.
+		 * No more checks are required.
+		 * Note, we use the fact that SND.UNA>=SND.WL2.
+		 */
+		tcp_update_wl(tp, ack_seq);
+		tcp_snd_una_update(tp, ack);
+		flag |= FLAG_WIN_UPDATE;
+
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+	} else {
 		u32 ack_ev_flags = CA_ACK_SLOWPATH;
 
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
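
The fast branch above may skip the RFC 793 window-update test because a
pure forward advance of SND.UNA satisfies it by construction (the code
relies on SND.UNA >= SND.WL2, as the restored comment notes). For
comparison, the slow-path predicate in this same file looks like this
(sketch):

/* tcp_may_update_window() (sketch): RFC 793's
 * "SND.WL1 < SEG.SEQ || (SND.WL1 == SEG.SEQ && SND.WL2 <= SEG.ACK)"
 * check. The first disjunct -- the ACK advancing snd_una -- is
 * exactly the case the fast path handles without re-checking.
 */
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
					 const u32 ack, const u32 ack_seq,
					 const u32 nwin)
{
	return	after(ack, tp->snd_una) ||
		after(ack_seq, tp->snd_wl1) ||
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}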
@@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->has_rxtstamp)
 		TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
 
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4647,6 +4668,8 @@ queue_and_out:
 	if (tp->rx_opt.num_sacks)
 		tcp_sack_remove(tp);
 
+	tcp_fast_path_check(sk);
+
 	if (eaten > 0)
 		kfree_skb_partial(skb, fragstolen);
 	if (!sock_flag(sk, SOCK_DEAD))
@@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
 
 	/* Massive buffer overcommit. */
+	tp->pred_flags = 0;
 	return -1;
 }
 
@@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
 
 	tp->urg_data = TCP_URG_NOTYET;
 	tp->urg_seq = ptr;
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
 }
 
 /* This is the 'fast' part of urgent handling. */
@@ -5301,6 +5328,26 @@ discard:
 
 /*
  *	TCP receive function for the ESTABLISHED state.
+ *
+ *	It is split into a fast path and a slow path. The fast path is
+ *	disabled when:
+ *	- A zero window was announced from us - zero window probing
+ *	  is only handled properly in the slow path.
+ *	- Out of order segments arrived.
+ *	- Urgent data is expected.
+ *	- There is no buffer space left
+ *	- Unexpected TCP flags/window values/header lengths are received
+ *	  (detected by checking the TCP header against pred_flags)
+ *	- Data is sent in both directions. Fast path only supports pure senders
+ *	  or pure receivers (this means either the sequence number or the ack
+ *	  value must stay constant)
+ *	- Unexpected TCP option.
+ *
+ *	When these conditions are not satisfied it drops into a standard
+ *	receive procedure patterned after RFC793 to handle all cases.
+ *	The first three cases are guaranteed by proper pred_flags setting,
+ *	the rest is checked inline. Fast processing is turned on in
+ *	tcp_data_queue when everything is OK.
 */
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 const struct tcphdr *th)
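
The "checking the TCP header against pred_flags" step mentioned above is a
single masked compare of the header's fourth 32-bit word
(tcp_flag_word(th) & TCP_HP_BITS) against tp->pred_flags. The masks sit
near the top of this file; as a sketch (roughly as of v4.13):

/* PSH and the historically reserved bits are masked out of the
 * predictor compare, so a predicted segment may carry PSH freely;
 * any other flag difference falls back to the slow path.
 */
#define TCP_RESERVED_BITS (TCP_FLAG_RES1|TCP_FLAG_CWR|TCP_FLAG_ECE)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))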
@@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+	/*
+	 *	Header prediction.
+	 *	The code loosely follows the one in the famous
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *
+	 *	Van's trick is to deposit buffers into socket queue
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
 
 	tp->rx_opt.saw_tstamp = 0;
 
+	/*	pred_flags is 0xS?10 << 16 + snd_wnd
+	 *	if header_prediction is to be made
+	 *	'S' will always be tp->tcp_header_len >> 2
+	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
+	 *	turn it off (when there are holes in the receive
+	 *	space for instance)
+	 *	PSH flag is ignored.
+	 */
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+		int tcp_header_len = tp->tcp_header_len;
+
+		/* Timestamp header prediction: tcp_header_len
+		 * is automatically equal to th->doff*4 due to pred_flags
+		 * match.
+		 */
+
+		/* Check timestamp */
+		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+			/* No? Slow path! */
+			if (!tcp_parse_aligned_timestamp(tp, th))
+				goto slow_path;
+
+			/* If PAWS failed, check it more carefully in slow path */
+			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+				goto slow_path;
+
+			/* DO NOT update ts_recent here, if checksum fails
+			 * and timestamp was corrupted part, it will result
+			 * in a hung connection since we will drop all
+			 * future packets due to the PAWS test.
+			 */
+		}
+
+		if (len <= tcp_header_len) {
+			/* Bulk data transfer: sender */
+			if (len == tcp_header_len) {
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				/* We know that such packets are checksummed
+				 * on entry.
+				 */
+				tcp_ack(sk, skb, 0);
+				__kfree_skb(skb);
+				tcp_data_snd_check(sk);
+				return;
+			} else { /* Header too small */
+				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+				goto discard;
+			}
+		} else {
+			int eaten = 0;
+			bool fragstolen = false;
+
+			if (tcp_checksum_complete(skb))
+				goto csum_error;
+
+			if ((int)skb->truesize > sk->sk_forward_alloc)
+				goto step5;
+
+			/* Predicted packet is in window by definition.
+			 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+			 * Hence, check seq<=rcv_wup reduces to:
+			 */
+			if (tcp_header_len ==
+			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+			    tp->rcv_nxt == tp->rcv_wup)
+				tcp_store_ts_recent(tp);
+
+			tcp_rcv_rtt_measure_ts(sk, skb);
+
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+			/* Bulk data transfer: receiver */
+			eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+					      &fragstolen);
+
+			tcp_event_data_recv(sk, skb);
+
+			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+				/* Well, only one small jumplet in fast path... */
+				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_data_snd_check(sk);
+				if (!inet_csk_ack_scheduled(sk))
+					goto no_ack;
+			}
+
+			__tcp_ack_snd_check(sk, 0);
+no_ack:
+			if (eaten)
+				kfree_skb_partial(skb, fragstolen);
+			sk->sk_data_ready(sk);
+			return;
+		}
+	}
+
+slow_path:
 	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
 		goto csum_error;
 
 	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
+	/*
+	 *	Standard slow path.
+	 */
+
 	if (!tcp_validate_incoming(sk, skb, th, 1))
 		return;
 
-	if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
+step5:
+	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
 		goto discard;
 
 	tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	if (sock_flag(sk, SOCK_KEEPOPEN))
 		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+	if (!tp->rx_opt.snd_wscale)
+		__tcp_fast_path_on(tp, tp->snd_wnd);
+	else
+		tp->pred_flags = 0;
 }
 
 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
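
Why the snd_wscale special case: the window field of a SYN or SYN-ACK is
never scaled (RFC 7323), so at connect time pred_flags can only be armed
with the raw window, and only when no window scale was negotiated. Once
the connection is established, tcp_fast_path_on() folds the scale back in;
a sketch of that wrapper (include/net/tcp.h, not part of this hunk):

/* Sketch of tcp_fast_path_on(): pred_flags stores the window exactly
 * as it appears on the wire, so the locally tracked (scaled) snd_wnd
 * is shifted back down by the peer's window-scale factor.
 */
static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}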
@@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_ecn_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_ack(sk, skb, 0);
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
 		 * move to established.
@@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		return 0;
 
 	/* step 5: check the ACK field */
-
-	acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
+	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+				      FLAG_UPDATE_TS_RECENT |
 				      FLAG_NO_CHALLENGE_ACK) > 0;
 
 	if (!acceptable) {
@@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->lsndtime = tcp_jiffies32;
 
 		tcp_initialize_rcv_mss(sk);
+		tcp_fast_path_on(tp);
 		break;
 
 	case TCP_FIN_WAIT1: {