author	Eric Dumazet <edumazet@google.com>	2019-03-22 11:56:40 -0400
committer	David S. Miller <davem@davemloft.net>	2019-03-23 21:57:38 -0400
commit	8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c (patch)
tree	0e6f2cfd66715d2234acda3ae48d1543facc5303
parent	472c2e07eef045145bc1493cc94a01c87140780a (diff)
tcp: add one skb cache for rx
Often, recvmsg() system calls and BH handling for a particular TCP socket are done on different CPUs. This means the incoming skb had to be allocated on one CPU but freed on another, which incurs high spinlock contention in the slab layer for small RPCs, and a high number of cache line ping-pongs for larger packets. A full-size GRO packet might use 45 page fragments, meaning that up to 45 put_page() calls can be involved.

Moreover, performing the __kfree_skb() in the recvmsg() context adds latency for user applications and increases the probability of trapping them in backlog processing, since the BH handler might find the socket owned by the user.

This patch, combined with the prior one, increases RPC performance by about 10% on servers with a large number of cores (a tcp_rr workload with 10,000 flows and 112 threads reaches 9 Mpps instead of 8 Mpps).

This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two CPUs working in tandem:

- the CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed;
- the CPU in the recvmsg() system call, essentially 100% busy copying data out to user space.

Having at most one skb in a per-socket cache carries very little risk of memory exhaustion, and since the cache is protected by the socket lock, its management is essentially free.

Note that if RPS/RFS is used, we do not enable this feature, because there is a high chance that the same CPU is handling both the recvmsg() system call and the TCP rx path, while another CPU did the skb allocation in the device driver right before the RPS/RFS logic ran. To properly handle this case, we would need to record which CPU allocated each skb and use a different channel to give skbs back to that CPU.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	include/net/sock.h	| 10
-rw-r--r--	net/ipv4/af_inet.c	|  4
-rw-r--r--	net/ipv4/tcp.c		|  4
-rw-r--r--	net/ipv4/tcp_ipv4.c	| 11
-rw-r--r--	net/ipv6/tcp_ipv6.c	| 12
5 files changed, 36 insertions(+), 5 deletions(-)
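To make the mechanism concrete, here is a minimal userspace sketch of the one-slot cache idea described in the commit message above. It is not kernel code: struct conn, conn_get_buf() and conn_put_buf() are invented names standing in for the socket, the rx allocation path and sk_eat_skb(), and malloc()/free() stand in for the skb allocator.

/*
 * Illustrative userspace sketch only; names are made up for the example.
 * Build with: cc -o oneslot oneslot.c
 */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	char data[2048];
};

struct conn {
	struct buf *buf_cache;	/* at most one cached buffer; in the kernel
				 * patch the equivalent slot is protected by
				 * the socket lock */
};

/* rx path: prefer the cached buffer over a fresh allocation */
static struct buf *conn_get_buf(struct conn *c)
{
	struct buf *b = c->buf_cache;

	if (b) {
		c->buf_cache = NULL;
		return b;		/* reuse, no allocator round trip */
	}
	return malloc(sizeof(*b));
}

/* consume path: park the buffer in the slot instead of freeing it */
static void conn_put_buf(struct conn *c, struct buf *b)
{
	if (!c->buf_cache) {
		c->buf_cache = b;	/* keep exactly one buffer around */
		return;
	}
	free(b);
}

int main(void)
{
	struct conn c = { .buf_cache = NULL };
	struct buf *b;

	b = conn_get_buf(&c);		/* first call hits malloc() */
	conn_put_buf(&c, b);		/* parked in the one-slot cache */
	b = conn_get_buf(&c);		/* second call reuses the same buffer */
	printf("reused buffer at %p\n", (void *)b);
	free(b);
	return 0;
}

Keeping at most one buffer per connection bounds the memory held by the cache, which is why the patch can afford to manage sk_rx_skb_cache with no counters or shrinker: the slot is filled and emptied under the socket lock, as the diffs below show.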
diff --git a/include/net/sock.h b/include/net/sock.h
index 314c47a8f5d1..577d91fb5626 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eab3ebde981e..7f3a984ad618 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);
 
 	sk_mem_reclaim(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f0b5a5999145..29b94edf05f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2583,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 277d71239d75..3979939804b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
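The hunk above also shows a second pattern worth noting: the previously cached skb is detached from sk_rx_skb_cache while the socket is locked, but the actual __kfree_skb() only runs after bh_unlock_sock(), so the potentially expensive free does not lengthen the locked section. Below is a rough userspace analogy of that "free after unlock" pattern using a pthread mutex; struct conn and conn_process() are illustrative names, not kernel APIs.

/*
 * Illustrative userspace sketch only; build with: cc -pthread defer.c
 */
#include <pthread.h>
#include <stdlib.h>

struct conn {
	pthread_mutex_t lock;
	void *cached_buf;	/* one-slot cache, protected by lock */
};

static void conn_process(struct conn *c, void *new_buf)
{
	void *to_free;

	pthread_mutex_lock(&c->lock);
	/* Detach the parked buffer while holding the lock ... */
	to_free = c->cached_buf;
	c->cached_buf = NULL;
	/* ... the real work on new_buf would happen here ... */
	pthread_mutex_unlock(&c->lock);

	/* ... but pay for the free() only after the lock is released,
	 * so other threads waiting on the lock are not held up by it. */
	free(to_free);		/* free(NULL) is a no-op */
	(void)new_buf;
}

int main(void)
{
	struct conn c;

	pthread_mutex_init(&c.lock, NULL);
	c.cached_buf = malloc(2048);

	conn_process(&c, NULL);

	pthread_mutex_destroy(&c.lock);
	return 0;
}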
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 983ad7a75102..77d723bbe050 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
 		sock_put(sk);