aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <eric.dumazet@gmail.com>2009-06-11 05:55:43 -0400
committerDavid S. Miller <davem@davemloft.net>2009-06-11 05:55:43 -0400
commit2b85a34e911bf483c27cfdd124aeb1605145dc80 (patch)
tree3cea3e8a27b62de2f92e759641c27200d8bde421
parentf2333a014c1e13ac8e1b73a6fd77731c524eff78 (diff)
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses a pair of sock_hold()/sock_put() for each transmitted packet. This slows down bidirectional flows because the receive path also needs to take a refcount on the socket and might use a different cpu than the transmit path or transmit completion path. So these two atomic operations also trigger cache line bounces. We can see this in tx or tx/rx workloads (media gateways for example), where sock_wfree() can be in the top five functions in profiles. We use this sock_hold()/sock_put() so that sock freeing is delayed until all tx packets are completed. As we also update sk_wmem_alloc, we could offset sk_wmem_alloc by one unit at init time, until sk_free() is called. Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc) to decrement the initial offset and atomically check if any packets are in flight. skb_set_owner_w() doesn't call sock_hold() anymore. sock_wfree() doesn't call sock_put() anymore, but checks whether sk_wmem_alloc reached 0 to perform the final freeing. The drawback is that an skb->truesize error could lead to unfreeable sockets, or even worse, prematurely calling __sk_free() on a live socket. Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s on my 8 cpu dev machine, even if tbench was not really hitting the sk_refcnt contention point. 5 % speedup on a UDP transmit workload (depends on number of flows), lowering TX completion cpu usage. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/sock.h6
-rw-r--r--net/core/sock.c29
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv6/ip6_output.c1
4 files changed, 30 insertions, 7 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9fd15b..010e14a93c92 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1217,9 +1217,13 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
1217 1217
1218static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1218static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1219{ 1219{
1220 sock_hold(sk);
1221 skb->sk = sk; 1220 skb->sk = sk;
1222 skb->destructor = sock_wfree; 1221 skb->destructor = sock_wfree;
1222 /*
1223 * We used to take a refcount on sk, but following operation
1224 * is enough to guarantee sk_free() won't free this sock until
1225 * all in-flight packets are completed
1226 */
1223 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 1227 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1224} 1228}
1225 1229
diff --git a/net/core/sock.c b/net/core/sock.c
index 04e35eb2e736..06e26b77ad9e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1008,7 +1008,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1008} 1008}
1009EXPORT_SYMBOL(sk_alloc); 1009EXPORT_SYMBOL(sk_alloc);
1010 1010
1011void sk_free(struct sock *sk) 1011static void __sk_free(struct sock *sk)
1012{ 1012{
1013 struct sk_filter *filter; 1013 struct sk_filter *filter;
1014 1014
@@ -1031,6 +1031,17 @@ void sk_free(struct sock *sk)
1031 put_net(sock_net(sk)); 1031 put_net(sock_net(sk));
1032 sk_prot_free(sk->sk_prot_creator, sk); 1032 sk_prot_free(sk->sk_prot_creator, sk);
1033} 1033}
1034
1035void sk_free(struct sock *sk)
1036{
1037 /*
1038 * We subtract one from sk_wmem_alloc and can know if
1039 * some packets are still in some tx queue.
1040 * If not null, sock_wfree() will call __sk_free(sk) later
1041 */
1042 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1043 __sk_free(sk);
1044}
1034EXPORT_SYMBOL(sk_free); 1045EXPORT_SYMBOL(sk_free);
1035 1046
1036/* 1047/*
@@ -1071,7 +1082,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1071 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1082 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1072 1083
1073 atomic_set(&newsk->sk_rmem_alloc, 0); 1084 atomic_set(&newsk->sk_rmem_alloc, 0);
1074 atomic_set(&newsk->sk_wmem_alloc, 0); 1085 /*
1086 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1087 */
1088 atomic_set(&newsk->sk_wmem_alloc, 1);
1075 atomic_set(&newsk->sk_omem_alloc, 0); 1089 atomic_set(&newsk->sk_omem_alloc, 0);
1076 skb_queue_head_init(&newsk->sk_receive_queue); 1090 skb_queue_head_init(&newsk->sk_receive_queue);
1077 skb_queue_head_init(&newsk->sk_write_queue); 1091 skb_queue_head_init(&newsk->sk_write_queue);
@@ -1175,12 +1189,18 @@ void __init sk_init(void)
1175void sock_wfree(struct sk_buff *skb) 1189void sock_wfree(struct sk_buff *skb)
1176{ 1190{
1177 struct sock *sk = skb->sk; 1191 struct sock *sk = skb->sk;
1192 int res;
1178 1193
1179 /* In case it might be waiting for more memory. */ 1194 /* In case it might be waiting for more memory. */
1180 atomic_sub(skb->truesize, &sk->sk_wmem_alloc); 1195 res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1181 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) 1196 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1182 sk->sk_write_space(sk); 1197 sk->sk_write_space(sk);
1183 sock_put(sk); 1198 /*
1199 * if sk_wmem_alloc reached 0, we are last user and should
1200 * free this sock, as sk_free() call could not do it.
1201 */
1202 if (res == 0)
1203 __sk_free(sk);
1184} 1204}
1185EXPORT_SYMBOL(sock_wfree); 1205EXPORT_SYMBOL(sock_wfree);
1186 1206
@@ -1819,6 +1839,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1819 sk->sk_stamp = ktime_set(-1L, 0); 1839 sk->sk_stamp = ktime_set(-1L, 0);
1820 1840
1821 atomic_set(&sk->sk_refcnt, 1); 1841 atomic_set(&sk->sk_refcnt, 1);
1842 atomic_set(&sk->sk_wmem_alloc, 1);
1822 atomic_set(&sk->sk_drops, 0); 1843 atomic_set(&sk->sk_drops, 0);
1823} 1844}
1824EXPORT_SYMBOL(sock_init_data); 1845EXPORT_SYMBOL(sock_init_data);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9248d2807ba6..247026282669 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -498,7 +498,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
498 498
499 BUG_ON(frag->sk); 499 BUG_ON(frag->sk);
500 if (skb->sk) { 500 if (skb->sk) {
501 sock_hold(skb->sk);
502 frag->sk = skb->sk; 501 frag->sk = skb->sk;
503 frag->destructor = sock_wfree; 502 frag->destructor = sock_wfree;
504 truesizes += frag->truesize; 503 truesizes += frag->truesize;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index db6c7224a862..7c76e3d18215 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -680,7 +680,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
680 680
681 BUG_ON(frag->sk); 681 BUG_ON(frag->sk);
682 if (skb->sk) { 682 if (skb->sk) {
683 sock_hold(skb->sk);
684 frag->sk = skb->sk; 683 frag->sk = skb->sk;
685 frag->destructor = sock_wfree; 684 frag->destructor = sock_wfree;
686 truesizes += frag->truesize; 685 truesizes += frag->truesize;