author		Eric Dumazet <eric.dumazet@gmail.com>	2009-06-11 05:55:43 -0400
committer	David S. Miller <davem@davemloft.net>	2009-06-11 05:55:43 -0400
commit		2b85a34e911bf483c27cfdd124aeb1605145dc80 (patch)
tree		3cea3e8a27b62de2f92e759641c27200d8bde421
parent		f2333a014c1e13ac8e1b73a6fd77731c524eff78 (diff)
net: No more expensive sock_hold()/sock_put() on each tx
One of the problems with sock memory accounting is that it uses
a pair of sock_hold()/sock_put() calls for each transmitted packet.
This slows down bidirectional flows because the receive path
also needs to take a refcount on the socket, and may run on a different
CPU than the transmit path or the transmit-completion path, so these
two atomic operations also trigger cache line bounces.
We can see this in tx or tx/rx workloads (media gateways, for example),
where sock_wfree() can be among the top five functions in profiles.
This sock_hold()/sock_put() pair exists so that socket freeing
is delayed until all tx packets have completed.
Since we already update sk_wmem_alloc for each packet, we can instead
offset sk_wmem_alloc by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to remove the initial offset and atomically check whether any packets
are still in flight.
skb_set_owner_w() doesn't call sock_hold() anymore.
sock_wfree() doesn't call sock_put() anymore, but checks whether sk_wmem_alloc
reached 0 to perform the final freeing.
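To make the lifetime rule concrete, here is a minimal user-space sketch
(not the kernel code) of the biased-counter pattern described above,
using C11 atomics. All names (my_sock, my_sk_free(), my_sock_wfree(), ...)
are hypothetical stand-ins for sk_wmem_alloc, sk_free() and sock_wfree():

/*
 * Hypothetical sketch of the biased-counter scheme above: the counter
 * starts at 1 (the socket's own "bias"), each in-flight packet adds its
 * truesize, and whichever path brings the counter to zero performs the
 * final free.  Illustrative names only, not kernel API.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct my_sock {
	atomic_int wmem_alloc;		/* plays the role of sk_wmem_alloc */
};

static void my_final_free(struct my_sock *sk)
{
	printf("last user gone, freeing socket\n");	/* mirrors __sk_free() */
	free(sk);
}

static struct my_sock *my_sk_alloc(void)
{
	struct my_sock *sk = malloc(sizeof(*sk));

	atomic_init(&sk->wmem_alloc, 1);	/* offset by one at init time */
	return sk;
}

static void my_skb_set_owner_w(struct my_sock *sk, int truesize)
{
	/* tx path: charge the packet, no refcount taken */
	atomic_fetch_add(&sk->wmem_alloc, truesize);
}

static void my_sock_wfree(struct my_sock *sk, int truesize)
{
	/* tx completion: uncharge; reaching zero means the close path
	 * already dropped its bias, so we are the last user */
	if (atomic_fetch_sub(&sk->wmem_alloc, truesize) == truesize)
		my_final_free(sk);
}

static void my_sk_free(struct my_sock *sk)
{
	/* close path: drop the bias; free only if nothing is in flight */
	if (atomic_fetch_sub(&sk->wmem_alloc, 1) == 1)
		my_final_free(sk);
}

int main(void)
{
	struct my_sock *sk = my_sk_alloc();

	my_skb_set_owner_w(sk, 1500);	/* one packet queued for tx */
	my_sk_free(sk);			/* socket closed while packet in flight */
	my_sock_wfree(sk, 1500);	/* completion performs the final free */
	return 0;
}

(atomic_fetch_sub() returns the value before the subtraction, so comparing
it to the amount subtracted is the user-space analogue of the kernel's
atomic_dec_and_test()/atomic_sub_return() checks in the patch below.)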
The drawback is that an skb->truesize accounting error could lead to
unfreeable sockets, or even worse, to __sk_free() being called prematurely
on a live socket.
Nice speedups on SMP: tbench, for example, goes from 2691 MB/s to 2711 MB/s
on my 8-CPU dev machine, even though tbench was not really hitting the
sk_refcnt contention point. A UDP transmit workload sees a 5% speedup
(depending on the number of flows), lowering TX-completion CPU usage.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	include/net/sock.h	|  6
-rw-r--r--	net/core/sock.c		| 29
-rw-r--r--	net/ipv4/ip_output.c	|  1
-rw-r--r--	net/ipv6/ip6_output.c	|  1
4 files changed, 30 insertions, 7 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9fd15b..010e14a93c92 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1217,9 +1217,13 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
 
 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
-	sock_hold(sk);
 	skb->sk = sk;
 	skb->destructor = sock_wfree;
+	/*
+	 * We used to take a refcount on sk, but following operation
+	 * is enough to guarantee sk_free() wont free this sock until
+	 * all in-flight packets are completed
+	 */
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 04e35eb2e736..06e26b77ad9e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1008,7 +1008,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
 
-void sk_free(struct sock *sk)
+static void __sk_free(struct sock *sk)
 {
 	struct sk_filter *filter;
 
@@ -1031,6 +1031,17 @@ void sk_free(struct sock *sk)
 	put_net(sock_net(sk));
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
+
+void sk_free(struct sock *sk)
+{
+	/*
+	 * We substract one from sk_wmem_alloc and can know if
+	 * some packets are still in some tx queue.
+	 * If not null, sock_wfree() will call __sk_free(sk) later
+	 */
+	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+		__sk_free(sk);
+}
 EXPORT_SYMBOL(sk_free);
 
 /*
@@ -1071,7 +1082,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
 
 	atomic_set(&newsk->sk_rmem_alloc, 0);
-	atomic_set(&newsk->sk_wmem_alloc, 0);
+	/*
+	 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+	 */
+	atomic_set(&newsk->sk_wmem_alloc, 1);
 	atomic_set(&newsk->sk_omem_alloc, 0);
 	skb_queue_head_init(&newsk->sk_receive_queue);
 	skb_queue_head_init(&newsk->sk_write_queue);
@@ -1175,12 +1189,18 @@ void __init sk_init(void)
 void sock_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
+	int res;
 
 	/* In case it might be waiting for more memory. */
-	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+	res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
 		sk->sk_write_space(sk);
+	/*
+	 * if sk_wmem_alloc reached 0, we are last user and should
+	 * free this sock, as sk_free() call could not do it.
+	 */
+	if (res == 0)
+		__sk_free(sk);
 }
 EXPORT_SYMBOL(sock_wfree);
 
@@ -1819,6 +1839,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_stamp = ktime_set(-1L, 0);
 
 	atomic_set(&sk->sk_refcnt, 1);
+	atomic_set(&sk->sk_wmem_alloc, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
 EXPORT_SYMBOL(sock_init_data);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9248d2807ba6..247026282669 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -498,7 +498,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			BUG_ON(frag->sk);
 			if (skb->sk) {
-				sock_hold(skb->sk);
 				frag->sk = skb->sk;
 				frag->destructor = sock_wfree;
 				truesizes += frag->truesize;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index db6c7224a862..7c76e3d18215 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -680,7 +680,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			BUG_ON(frag->sk);
 			if (skb->sk) {
-				sock_hold(skb->sk);
 				frag->sk = skb->sk;
 				frag->destructor = sock_wfree;
 				truesizes += frag->truesize;