aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-10-29 05:11:14 -0400
committerDavid S. Miller <davem@davemloft.net>2008-10-29 05:11:14 -0400
commit271b72c7fa82c2c7a795bc16896149933110672d (patch)
tree5634b95c04b4a7ac9babf2d8ac34cfb6c38a8f83
parent645ca708f936b2fbeb79e52d7823e3eb2c0905f8 (diff)
udp: RCU handling for Unicast packets.
Goals are: 1) Optimizing handling of incoming Unicast UDP frames, so that no memory writes should happen in the fast path. Note: Multicasts and broadcasts will still need to take a lock, because doing a full lockless lookup in this case is difficult. 2) No expensive operations in the socket bind/unhash phases: - No expensive synchronize_rcu() calls. - No added rcu_head in the socket structure, which would increase memory needs but, more importantly, force us to use call_rcu() calls, which have the bad property of making socket structures cold. (The RCU grace period between a socket being freed and its potential reuse makes this socket cold in the CPU cache.) David did a previous patch using call_rcu() and noticed a 20% impact on TCP connection rates. Quoting Christoph Lameter: "Right. That results in cacheline cooldown. You'd want to recycle the object as they are cache hot on a per cpu basis. That is screwed up by the delayed regular rcu processing. We have seen multiple regressions due to cacheline cooldown. The only choice in cacheline hot sensitive areas is to deal with the complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU." - Because UDP sockets are allocated from a dedicated kmem_cache, use of SLAB_DESTROY_BY_RCU can help here. Theory of operation: --------------------- As the lookup is lock-free (using rcu_read_lock()/rcu_read_unlock()), special attention must be taken by readers and writers. Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed, reused, and inserted in a different chain, or in the worst case in the same chain, while readers could be doing lookups at the same time. In order to avoid loops, a reader must check that each socket found in a chain really belongs to the chain the reader was traversing. If it finds a mismatch, the lookup must start again at the beginning. This *restart* loop is the reason we had to use rdlock for the multicast case, because we don't want to send the same message several times to the same socket. We use RCU only for the fast path.
Thus, /proc/net/udp still takes spinlocks. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/sock.h37
-rw-r--r--net/core/sock.c3
-rw-r--r--net/ipv4/udp.c35
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv6/udp.c31
-rw-r--r--net/ipv6/udplite.c1
6 files changed, 90 insertions, 18 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index d200dfbe1ef6..0bea25db5471 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -363,6 +363,27 @@ static __inline__ int sk_del_node_init(struct sock *sk)
363 return rc; 363 return rc;
364} 364}
365 365
366static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
367{
368 if (sk_hashed(sk)) {
369 hlist_del_init_rcu(&sk->sk_node);
370 return 1;
371 }
372 return 0;
373}
374
375static __inline__ int sk_del_node_init_rcu(struct sock *sk)
376{
377 int rc = __sk_del_node_init_rcu(sk);
378
379 if (rc) {
380 /* paranoid for a while -acme */
381 WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
382 __sock_put(sk);
383 }
384 return rc;
385}
386
366static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) 387static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
367{ 388{
368 hlist_add_head(&sk->sk_node, list); 389 hlist_add_head(&sk->sk_node, list);
@@ -374,6 +395,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
374 __sk_add_node(sk, list); 395 __sk_add_node(sk, list);
375} 396}
376 397
398static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
399{
400 hlist_add_head_rcu(&sk->sk_node, list);
401}
402
403static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
404{
405 sock_hold(sk);
406 __sk_add_node_rcu(sk, list);
407}
408
377static __inline__ void __sk_del_bind_node(struct sock *sk) 409static __inline__ void __sk_del_bind_node(struct sock *sk)
378{ 410{
379 __hlist_del(&sk->sk_bind_node); 411 __hlist_del(&sk->sk_bind_node);
@@ -387,6 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
387 419
388#define sk_for_each(__sk, node, list) \ 420#define sk_for_each(__sk, node, list) \
389 hlist_for_each_entry(__sk, node, list, sk_node) 421 hlist_for_each_entry(__sk, node, list, sk_node)
422#define sk_for_each_rcu(__sk, node, list) \
423 hlist_for_each_entry_rcu(__sk, node, list, sk_node)
390#define sk_for_each_from(__sk, node) \ 424#define sk_for_each_from(__sk, node) \
391 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ 425 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
392 hlist_for_each_entry_from(__sk, node, sk_node) 426 hlist_for_each_entry_from(__sk, node, sk_node)
@@ -589,8 +623,9 @@ struct proto {
589 int *sysctl_rmem; 623 int *sysctl_rmem;
590 int max_header; 624 int max_header;
591 625
592 struct kmem_cache *slab; 626 struct kmem_cache *slab;
593 unsigned int obj_size; 627 unsigned int obj_size;
628 int slab_flags;
594 629
595 atomic_t *orphan_count; 630 atomic_t *orphan_count;
596 631
diff --git a/net/core/sock.c b/net/core/sock.c
index 5e2a3132a8c9..ded1eb5d2fd4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab)
2042 2042
2043 if (alloc_slab) { 2043 if (alloc_slab) {
2044 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, 2044 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2045 SLAB_HWCACHE_ALIGN, NULL); 2045 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2046 NULL);
2046 2047
2047 if (prot->slab == NULL) { 2048 if (prot->slab == NULL) {
2048 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", 2049 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2a6c491f97d7..0ea974bf7962 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
187 inet_sk(sk)->num = snum; 187 inet_sk(sk)->num = snum;
188 sk->sk_hash = snum; 188 sk->sk_hash = snum;
189 if (sk_unhashed(sk)) { 189 if (sk_unhashed(sk)) {
190 sk_add_node(sk, &hslot->head); 190 sk_add_node_rcu(sk, &hslot->head);
191 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 191 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
192 } 192 }
193 error = 0; 193 error = 0;
@@ -253,15 +253,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
253 __be16 sport, __be32 daddr, __be16 dport, 253 __be16 sport, __be32 daddr, __be16 dport,
254 int dif, struct udp_table *udptable) 254 int dif, struct udp_table *udptable)
255{ 255{
256 struct sock *sk, *result = NULL; 256 struct sock *sk, *result;
257 struct hlist_node *node; 257 struct hlist_node *node;
258 unsigned short hnum = ntohs(dport); 258 unsigned short hnum = ntohs(dport);
259 unsigned int hash = udp_hashfn(net, hnum); 259 unsigned int hash = udp_hashfn(net, hnum);
260 struct udp_hslot *hslot = &udptable->hash[hash]; 260 struct udp_hslot *hslot = &udptable->hash[hash];
261 int score, badness = -1; 261 int score, badness;
262 262
263 spin_lock(&hslot->lock); 263 rcu_read_lock();
264 sk_for_each(sk, node, &hslot->head) { 264begin:
265 result = NULL;
266 badness = -1;
267 sk_for_each_rcu(sk, node, &hslot->head) {
268 /*
269 * lockless reader, and SLAB_DESTROY_BY_RCU items:
270 * We must check this item was not moved to another chain
271 */
272 if (udp_hashfn(net, sk->sk_hash) != hash)
273 goto begin;
265 score = compute_score(sk, net, saddr, hnum, sport, 274 score = compute_score(sk, net, saddr, hnum, sport,
266 daddr, dport, dif); 275 daddr, dport, dif);
267 if (score > badness) { 276 if (score > badness) {
@@ -269,9 +278,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
269 badness = score; 278 badness = score;
270 } 279 }
271 } 280 }
272 if (result) 281 if (result) {
273 sock_hold(result); 282 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
274 spin_unlock(&hslot->lock); 283 result = NULL;
284 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
285 daddr, dport, dif) < badness)) {
286 sock_put(result);
287 goto begin;
288 }
289 }
290 rcu_read_unlock();
275 return result; 291 return result;
276} 292}
277 293
@@ -953,7 +969,7 @@ void udp_lib_unhash(struct sock *sk)
953 struct udp_hslot *hslot = &udptable->hash[hash]; 969 struct udp_hslot *hslot = &udptable->hash[hash];
954 970
955 spin_lock(&hslot->lock); 971 spin_lock(&hslot->lock);
956 if (sk_del_node_init(sk)) { 972 if (sk_del_node_init_rcu(sk)) {
957 inet_sk(sk)->num = 0; 973 inet_sk(sk)->num = 0;
958 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 974 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
959 } 975 }
@@ -1517,6 +1533,7 @@ struct proto udp_prot = {
1517 .sysctl_wmem = &sysctl_udp_wmem_min, 1533 .sysctl_wmem = &sysctl_udp_wmem_min,
1518 .sysctl_rmem = &sysctl_udp_rmem_min, 1534 .sysctl_rmem = &sysctl_udp_rmem_min,
1519 .obj_size = sizeof(struct udp_sock), 1535 .obj_size = sizeof(struct udp_sock),
1536 .slab_flags = SLAB_DESTROY_BY_RCU,
1520 .h.udp_table = &udp_table, 1537 .h.udp_table = &udp_table,
1521#ifdef CONFIG_COMPAT 1538#ifdef CONFIG_COMPAT
1522 .compat_setsockopt = compat_udp_setsockopt, 1539 .compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index d8ea8e5f5ea3..c784891cb7e5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,6 +51,7 @@ struct proto udplite_prot = {
51 .unhash = udp_lib_unhash, 51 .unhash = udp_lib_unhash,
52 .get_port = udp_v4_get_port, 52 .get_port = udp_v4_get_port,
53 .obj_size = sizeof(struct udp_sock), 53 .obj_size = sizeof(struct udp_sock),
54 .slab_flags = SLAB_DESTROY_BY_RCU,
54 .h.udp_table = &udplite_table, 55 .h.udp_table = &udplite_table,
55#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
56 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ccee7244ca0f..1d9790e43dfc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -97,24 +97,40 @@ static struct sock *__udp6_lib_lookup(struct net *net,
97 struct in6_addr *daddr, __be16 dport, 97 struct in6_addr *daddr, __be16 dport,
98 int dif, struct udp_table *udptable) 98 int dif, struct udp_table *udptable)
99{ 99{
100 struct sock *sk, *result = NULL; 100 struct sock *sk, *result;
101 struct hlist_node *node; 101 struct hlist_node *node;
102 unsigned short hnum = ntohs(dport); 102 unsigned short hnum = ntohs(dport);
103 unsigned int hash = udp_hashfn(net, hnum); 103 unsigned int hash = udp_hashfn(net, hnum);
104 struct udp_hslot *hslot = &udptable->hash[hash]; 104 struct udp_hslot *hslot = &udptable->hash[hash];
105 int score, badness = -1; 105 int score, badness;
106 106
107 spin_lock(&hslot->lock); 107 rcu_read_lock();
108 sk_for_each(sk, node, &hslot->head) { 108begin:
109 result = NULL;
110 badness = -1;
111 sk_for_each_rcu(sk, node, &hslot->head) {
112 /*
113 * lockless reader, and SLAB_DESTROY_BY_RCU items:
114 * We must check this item was not moved to another chain
115 */
116 if (udp_hashfn(net, sk->sk_hash) != hash)
117 goto begin;
109 score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); 118 score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
110 if (score > badness) { 119 if (score > badness) {
111 result = sk; 120 result = sk;
112 badness = score; 121 badness = score;
113 } 122 }
114 } 123 }
115 if (result) 124 if (result) {
116 sock_hold(result); 125 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
117 spin_unlock(&hslot->lock); 126 result = NULL;
127 else if (unlikely(compute_score(result, net, hnum, saddr, sport,
128 daddr, dport, dif) < badness)) {
129 sock_put(result);
130 goto begin;
131 }
132 }
133 rcu_read_unlock();
118 return result; 134 return result;
119} 135}
120 136
@@ -1062,6 +1078,7 @@ struct proto udpv6_prot = {
1062 .sysctl_wmem = &sysctl_udp_wmem_min, 1078 .sysctl_wmem = &sysctl_udp_wmem_min,
1063 .sysctl_rmem = &sysctl_udp_rmem_min, 1079 .sysctl_rmem = &sysctl_udp_rmem_min,
1064 .obj_size = sizeof(struct udp6_sock), 1080 .obj_size = sizeof(struct udp6_sock),
1081 .slab_flags = SLAB_DESTROY_BY_RCU,
1065 .h.udp_table = &udp_table, 1082 .h.udp_table = &udp_table,
1066#ifdef CONFIG_COMPAT 1083#ifdef CONFIG_COMPAT
1067 .compat_setsockopt = compat_udpv6_setsockopt, 1084 .compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index f1e892a99e05..ba162a824585 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
49 .unhash = udp_lib_unhash, 49 .unhash = udp_lib_unhash,
50 .get_port = udp_v6_get_port, 50 .get_port = udp_v6_get_port,
51 .obj_size = sizeof(struct udp6_sock), 51 .obj_size = sizeof(struct udp6_sock),
52 .slab_flags = SLAB_DESTROY_BY_RCU,
52 .h.udp_table = &udplite_table, 53 .h.udp_table = &udplite_table,
53#ifdef CONFIG_COMPAT 54#ifdef CONFIG_COMPAT
54 .compat_setsockopt = compat_udpv6_setsockopt, 55 .compat_setsockopt = compat_udpv6_setsockopt,