author    Eric Dumazet <edumazet@google.com>        2015-02-03 21:31:53 -0500
committer David S. Miller <davem@davemloft.net>     2015-02-04 23:36:31 -0500
commit    9878196578286c5ed494778ada01da094377a686 (patch)
tree      6705fd6c9ed1cafa5b5b458a13aa9978e632a41e /net/ipv4
parent    2a356207ae8acf600de7f0a08640659ac71cdf6d (diff)
tcp: do not pace pure ack packets
When we added pacing to TCP, we decided to let sch_fq take care of the actual pacing. All TCP had to do was compute sk->pacing_rate using a simple formula:

    sk->pacing_rate = 2 * cwnd * mss / rtt

This works well for senders (bulk flows), but not so well for receivers or RPC workloads: cwnd on the receiver can be less than 10 and rtt can be around 100 ms, so we can end up pacing ACK packets and slowing down the sender. Really, only the sender should pace, according to its own logic.

Instead of adding a new bit in the skb, or calling yet another flow dissection, we tweak skb->truesize to a small value (2), and we instruct sch_fq to use a new helper and not pace pure acks.

Note this also helps TCP Small Queues, as ack packets present in the qdisc/NIC do not prevent sending a data packet (RPC workload). It also reduces tx completion overhead: ack packets can use regular sock_wfree() instead of tcp_wfree(), which is a bit more expensive.

This has no impact when packets are sent over the loopback interface, as we do not coalesce ack packets there (where we would otherwise detect the skb->truesize lie).

In case netem (with a delay) is used, skb_orphan_partial() also sets skb->truesize to 1.

This patch is a combination of two patches we used for about one year at Google.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
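To make the receiver problem concrete: with cwnd = 10, mss = 1460 bytes and rtt = 100 ms, the formula above yields 2 * 10 * 1460 / 0.1 ≈ 292 kB/s, so even pure acks would be spread out at roughly 2.3 Mbit/s, throttling the remote sender. The diff below is limited to net/ipv4, so the helpers it references are not shown; the following is a minimal sketch of skb_set_tcp_pure_ack() and skb_is_tcp_pure_ack() (assumed to live in include/net/tcp.h), based on the truesize == 2 encoding described in the message above:

/* Sketch of the helpers referenced by the diff below (assumed to be added
 * to include/net/tcp.h by the same series).  A pure ack carries no payload,
 * so its truesize can be hijacked as a marker; 2 is used because no real
 * skb can ever have such a small truesize.
 */
static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
{
	skb->truesize = 2;
}

static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
{
	return skb->truesize == 2;
}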
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp_output.c  10
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20ab06b228ac..1b326ed46f7b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -948,7 +948,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = tcp_wfree;
+	skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
@@ -3265,6 +3265,14 @@ void tcp_send_ack(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
 
+	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
+	 * too much.
+	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+	 * We also avoid tcp_wfree() overhead (cache line miss accessing
+	 * tp->tsq_flags) by using regular sock_wfree()
+	 */
+	skb_set_tcp_pure_ack(buff);
+
 	/* Send it off, this clears delayed acks for us. */
 	skb_mstamp_get(&buff->skb_mstamp);
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
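The companion change on the qdisc side falls outside the net/ipv4 diffstat and is not shown above. As a rough illustration only (the exact sch_fq code may differ, and the function name fq_example_pacing_delay() is made up for this sketch), a pacing qdisc that derives a per-packet delay from sk->sk_pacing_rate could simply bypass that computation for pure acks:

#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp.h>

/* Illustrative sketch, not the upstream sch_fq code: compute the delay (in
 * nanoseconds) a pacing qdisc would insert before releasing this skb, and
 * skip pacing entirely for locally generated pure acks.
 */
static u64 fq_example_pacing_delay(const struct sk_buff *skb,
				   const struct sock *sk)
{
	u64 rate;

	/* Do not pace pure ack packets; only the sender should pace. */
	if (skb_is_tcp_pure_ack(skb))
		return 0;

	rate = sk ? sk->sk_pacing_rate : 0;
	if (!rate)
		return 0;

	/* Time needed for skb->len bytes to fit under the pacing rate. */
	return div64_u64((u64)skb->len * NSEC_PER_SEC, rate);
}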