diff options
author | Eric Dumazet <edumazet@google.com> | 2013-12-17 12:58:30 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-12-20 17:56:25 -0500 |
commit | a181ceb501b31b4bf8812a5c84c716cc31d82c2d (patch) | |
tree | 8233fdcc125262b4985b26ff46d1c58bf2592d1a /net/ipv4/tcp.c | |
parent | a792866ad2dafb8f272e4fdfb98a93fdbfff2277 (diff) |
tcp: autocork should not hold first packet in write queue
Willem noticed a TCP_RR regression caused by TCP autocorking
on a Mellanox test bed. MLX4_EN_TX_COAL_TIME is 16 us, which can be
right above RTT between hosts.
We can receive an ACK for a packet still in the NIC TX ring buffer or in a
softnet completion queue.
Fix this by always pushing the skb if it is at the head of the write queue.
Also, as TX completion is lockless, it's safer to perform sk_wmem_alloc
test after setting TSQ_THROTTLED.
erd:~# MIB="MIN_LATENCY,MEAN_LATENCY,MAX_LATENCY,P99_LATENCY,STDDEV_LATENCY"
erd:~# ./netperf -H remote -t TCP_RR -- -o $MIB | tail -n 1
(repeat 3 times)
Before patch :
18,1049.87,41004,39631,6295.47
17,239.52,40804,48,2912.79
18,348.40,40877,54,3573.39
After patch :
18,22.84,4606,38,16.39
17,21.56,2871,36,13.51
17,22.46,2705,37,11.83
Reported-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: f54b311142a9 ("tcp: auto corking")
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 14 |
1 files changed, 10 insertions, 4 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0ca87547becb..d099f9a055c6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -622,19 +622,21 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) | |||
622 | } | 622 | } |
623 | 623 | ||
624 | /* If a not yet filled skb is pushed, do not send it if | 624 | /* If a not yet filled skb is pushed, do not send it if |
625 | * we have packets in Qdisc or NIC queues : | 625 | * we have data packets in Qdisc or NIC queues : |
626 | * Because TX completion will happen shortly, it gives a chance | 626 | * Because TX completion will happen shortly, it gives a chance |
627 | * to coalesce future sendmsg() payload into this skb, without | 627 | * to coalesce future sendmsg() payload into this skb, without |
628 | * need for a timer, and with no latency trade off. | 628 | * need for a timer, and with no latency trade off. |
629 | * As packets containing data payload have a bigger truesize | 629 | * As packets containing data payload have a bigger truesize |
630 | * than pure acks (dataless) packets, the last check prevents | 630 | * than pure acks (dataless) packets, the last checks prevent |
631 | * autocorking if we only have an ACK in Qdisc/NIC queues. | 631 | * autocorking if we only have an ACK in Qdisc/NIC queues, |
632 | * or if TX completion was delayed after we processed ACK packet. | ||
632 | */ | 633 | */ |
633 | static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, | 634 | static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, |
634 | int size_goal) | 635 | int size_goal) |
635 | { | 636 | { |
636 | return skb->len < size_goal && | 637 | return skb->len < size_goal && |
637 | sysctl_tcp_autocorking && | 638 | sysctl_tcp_autocorking && |
639 | skb != tcp_write_queue_head(sk) && | ||
638 | atomic_read(&sk->sk_wmem_alloc) > skb->truesize; | 640 | atomic_read(&sk->sk_wmem_alloc) > skb->truesize; |
639 | } | 641 | } |
640 | 642 | ||
@@ -660,7 +662,11 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, | |||
660 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); | 662 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); |
661 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | 663 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); |
662 | } | 664 | } |
663 | return; | 665 | /* It is possible TX completion already happened |
666 | * before we set TSQ_THROTTLED. | ||
667 | */ | ||
668 | if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) | ||
669 | return; | ||
664 | } | 670 | } |
665 | 671 | ||
666 | if (flags & MSG_MORE) | 672 | if (flags & MSG_MORE) |