summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2013-12-17 12:58:30 -0500
committerDavid S. Miller <davem@davemloft.net>2013-12-20 17:56:25 -0500
commita181ceb501b31b4bf8812a5c84c716cc31d82c2d (patch)
tree8233fdcc125262b4985b26ff46d1c58bf2592d1a /net/ipv4/tcp.c
parenta792866ad2dafb8f272e4fdfb98a93fdbfff2277 (diff)
tcp: autocork should not hold first packet in write queue
Willem noticed a TCP_RR regression caused by TCP autocorking on a Mellanox test bed. MLX4_EN_TX_COAL_TIME is 16 us, which can be right above RTT between hosts. We can receive an ACK for a packet still in NIC TX ring buffer or in a softnet completion queue. Fix this by always pushing the skb if it is at the head of write queue. Also, as TX completion is lockless, it's safer to perform sk_wmem_alloc test after setting TSQ_THROTTLED. erd:~# MIB="MIN_LATENCY,MEAN_LATENCY,MAX_LATENCY,P99_LATENCY,STDDEV_LATENCY" erd:~# ./netperf -H remote -t TCP_RR -- -o $MIB | tail -n 1 (repeat 3 times) Before patch : 18,1049.87,41004,39631,6295.47 17,239.52,40804,48,2912.79 18,348.40,40877,54,3573.39 After patch : 18,22.84,4606,38,16.39 17,21.56,2871,36,13.51 17,22.46,2705,37,11.83 Reported-by: Willem de Bruijn <willemb@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Fixes: f54b311142a9 ("tcp: auto corking") Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c14
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0ca87547becb..d099f9a055c6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -622,19 +622,21 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 }
 
 /* If a not yet filled skb is pushed, do not send it if
- * we have packets in Qdisc or NIC queues :
+ * we have data packets in Qdisc or NIC queues :
  * Because TX completion will happen shortly, it gives a chance
  * to coalesce future sendmsg() payload into this skb, without
  * need for a timer, and with no latency trade off.
  * As packets containing data payload have a bigger truesize
- * than pure acks (dataless) packets, the last check prevents
- * autocorking if we only have an ACK in Qdisc/NIC queues.
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
  */
 static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 				int size_goal)
 {
 	return skb->len < size_goal &&
 	       sysctl_tcp_autocorking &&
+	       skb != tcp_write_queue_head(sk) &&
 	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
 }
 
@@ -660,7 +662,11 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
 		}
-		return;
+		/* It is possible TX completion already happened
+		 * before we set TSQ_THROTTLED.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+			return;
 	}
 
 	if (flags & MSG_MORE)