diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-16 22:40:17 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-16 22:40:17 -0500 |
commit | 3ab5aee7fe840b5b1b35a8d1ac11c3de5281e611 (patch) | |
tree | 468296b7be813643248d4ca67497d6ddb6934fc6 | |
parent | 88ab1932eac721c6e7336708558fa5ed02c85c80 (diff) |
net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
RCU was added to UDP lookups, using a fast infrastructure:
- sockets kmem_cache use SLAB_DESTROY_BY_RCU and don't pay the
price of call_rcu() at freeing time.
- hlist_nulls permits the use of fewer memory barriers.
This patch uses the same infrastructure for TCP/DCCP established
and timewait sockets.
Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
using short-lived TCP connections. A followup patch, converting
rwlocks to spinlocks, will speed up this case even further.
__inet_lookup_established() is pretty fast now that we don't have to
dirty a contended cache line (read_lock/read_unlock).
Only the established and timewait hashtables are converted to RCU
(the bind table and listen table are still using traditional locking).
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet_hashtables.h | 4 | ||||
-rw-r--r-- | include/net/inet_timewait_sock.h | 10 | ||||
-rw-r--r-- | net/core/sock.c | 4 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 1 | ||||
-rw-r--r-- | net/dccp/ipv6.c | 1 | ||||
-rw-r--r-- | net/dccp/proto.c | 4 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 7 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 78 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 26 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 25 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 70 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
13 files changed, 151 insertions, 84 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index cb31fbf8ae2a..481896045111 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h | |||
@@ -41,8 +41,8 @@ | |||
41 | * I'll experiment with dynamic table growth later. | 41 | * I'll experiment with dynamic table growth later. |
42 | */ | 42 | */ |
43 | struct inet_ehash_bucket { | 43 | struct inet_ehash_bucket { |
44 | struct hlist_head chain; | 44 | struct hlist_nulls_head chain; |
45 | struct hlist_head twchain; | 45 | struct hlist_nulls_head twchain; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | /* There are a few simple rules, which allow for local port reuse by | 48 | /* There are a few simple rules, which allow for local port reuse by |
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 80e4977631b8..4b8ece22b8e9 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h | |||
@@ -110,7 +110,7 @@ struct inet_timewait_sock { | |||
110 | #define tw_state __tw_common.skc_state | 110 | #define tw_state __tw_common.skc_state |
111 | #define tw_reuse __tw_common.skc_reuse | 111 | #define tw_reuse __tw_common.skc_reuse |
112 | #define tw_bound_dev_if __tw_common.skc_bound_dev_if | 112 | #define tw_bound_dev_if __tw_common.skc_bound_dev_if |
113 | #define tw_node __tw_common.skc_node | 113 | #define tw_node __tw_common.skc_nulls_node |
114 | #define tw_bind_node __tw_common.skc_bind_node | 114 | #define tw_bind_node __tw_common.skc_bind_node |
115 | #define tw_refcnt __tw_common.skc_refcnt | 115 | #define tw_refcnt __tw_common.skc_refcnt |
116 | #define tw_hash __tw_common.skc_hash | 116 | #define tw_hash __tw_common.skc_hash |
@@ -137,10 +137,10 @@ struct inet_timewait_sock { | |||
137 | struct hlist_node tw_death_node; | 137 | struct hlist_node tw_death_node; |
138 | }; | 138 | }; |
139 | 139 | ||
140 | static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, | 140 | static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, |
141 | struct hlist_head *list) | 141 | struct hlist_nulls_head *list) |
142 | { | 142 | { |
143 | hlist_add_head(&tw->tw_node, list); | 143 | hlist_nulls_add_head_rcu(&tw->tw_node, list); |
144 | } | 144 | } |
145 | 145 | ||
146 | static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, | 146 | static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, |
@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) | |||
175 | } | 175 | } |
176 | 176 | ||
177 | #define inet_twsk_for_each(tw, node, head) \ | 177 | #define inet_twsk_for_each(tw, node, head) \ |
178 | hlist_for_each_entry(tw, node, head, tw_node) | 178 | hlist_nulls_for_each_entry(tw, node, head, tw_node) |
179 | 179 | ||
180 | #define inet_twsk_for_each_inmate(tw, node, jail) \ | 180 | #define inet_twsk_for_each_inmate(tw, node, jail) \ |
181 | hlist_for_each_entry(tw, node, jail, tw_death_node) | 181 | hlist_for_each_entry(tw, node, jail, tw_death_node) |
diff --git a/net/core/sock.c b/net/core/sock.c index ded1eb5d2fd4..38de9c3f563b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab) | |||
2082 | prot->twsk_prot->twsk_slab = | 2082 | prot->twsk_prot->twsk_slab = |
2083 | kmem_cache_create(timewait_sock_slab_name, | 2083 | kmem_cache_create(timewait_sock_slab_name, |
2084 | prot->twsk_prot->twsk_obj_size, | 2084 | prot->twsk_prot->twsk_obj_size, |
2085 | 0, SLAB_HWCACHE_ALIGN, | 2085 | 0, |
2086 | SLAB_HWCACHE_ALIGN | | ||
2087 | prot->slab_flags, | ||
2086 | NULL); | 2088 | NULL); |
2087 | if (prot->twsk_prot->twsk_slab == NULL) | 2089 | if (prot->twsk_prot->twsk_slab == NULL) |
2088 | goto out_free_timewait_sock_slab_name; | 2090 | goto out_free_timewait_sock_slab_name; |
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 528baa2e5be4..d1dd95289b89 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = { | |||
938 | .orphan_count = &dccp_orphan_count, | 938 | .orphan_count = &dccp_orphan_count, |
939 | .max_header = MAX_DCCP_HEADER, | 939 | .max_header = MAX_DCCP_HEADER, |
940 | .obj_size = sizeof(struct dccp_sock), | 940 | .obj_size = sizeof(struct dccp_sock), |
941 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
941 | .rsk_prot = &dccp_request_sock_ops, | 942 | .rsk_prot = &dccp_request_sock_ops, |
942 | .twsk_prot = &dccp_timewait_sock_ops, | 943 | .twsk_prot = &dccp_timewait_sock_ops, |
943 | .h.hashinfo = &dccp_hashinfo, | 944 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4aa1148cdb20..f033e845bb07 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c | |||
@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = { | |||
1140 | .orphan_count = &dccp_orphan_count, | 1140 | .orphan_count = &dccp_orphan_count, |
1141 | .max_header = MAX_DCCP_HEADER, | 1141 | .max_header = MAX_DCCP_HEADER, |
1142 | .obj_size = sizeof(struct dccp6_sock), | 1142 | .obj_size = sizeof(struct dccp6_sock), |
1143 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
1143 | .rsk_prot = &dccp6_request_sock_ops, | 1144 | .rsk_prot = &dccp6_request_sock_ops, |
1144 | .twsk_prot = &dccp6_timewait_sock_ops, | 1145 | .twsk_prot = &dccp6_timewait_sock_ops, |
1145 | .h.hashinfo = &dccp_hashinfo, | 1146 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 46cb3490d48e..1117d4d8c8f1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -1090,8 +1090,8 @@ static int __init dccp_init(void) | |||
1090 | } | 1090 | } |
1091 | 1091 | ||
1092 | for (i = 0; i < dccp_hashinfo.ehash_size; i++) { | 1092 | for (i = 0; i < dccp_hashinfo.ehash_size; i++) { |
1093 | INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); | 1093 | INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); |
1094 | INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); | 1094 | INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i); |
1095 | } | 1095 | } |
1096 | 1096 | ||
1097 | if (inet_ehash_locks_alloc(&dccp_hashinfo)) | 1097 | if (inet_ehash_locks_alloc(&dccp_hashinfo)) |
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 564230dabcb8..41b36720e977 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -778,18 +778,19 @@ skip_listen_ht: | |||
778 | struct inet_ehash_bucket *head = &hashinfo->ehash[i]; | 778 | struct inet_ehash_bucket *head = &hashinfo->ehash[i]; |
779 | rwlock_t *lock = inet_ehash_lockp(hashinfo, i); | 779 | rwlock_t *lock = inet_ehash_lockp(hashinfo, i); |
780 | struct sock *sk; | 780 | struct sock *sk; |
781 | struct hlist_node *node; | 781 | struct hlist_nulls_node *node; |
782 | 782 | ||
783 | num = 0; | 783 | num = 0; |
784 | 784 | ||
785 | if (hlist_empty(&head->chain) && hlist_empty(&head->twchain)) | 785 | if (hlist_nulls_empty(&head->chain) && |
786 | hlist_nulls_empty(&head->twchain)) | ||
786 | continue; | 787 | continue; |
787 | 788 | ||
788 | if (i > s_i) | 789 | if (i > s_i) |
789 | s_num = 0; | 790 | s_num = 0; |
790 | 791 | ||
791 | read_lock_bh(lock); | 792 | read_lock_bh(lock); |
792 | sk_for_each(sk, node, &head->chain) { | 793 | sk_nulls_for_each(sk, node, &head->chain) { |
793 | struct inet_sock *inet = inet_sk(sk); | 794 | struct inet_sock *inet = inet_sk(sk); |
794 | 795 | ||
795 | if (num < s_num) | 796 | if (num < s_num) |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index be41ebbec4eb..fd269cfef0ec 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net, | |||
223 | INET_ADDR_COOKIE(acookie, saddr, daddr) | 223 | INET_ADDR_COOKIE(acookie, saddr, daddr) |
224 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); | 224 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); |
225 | struct sock *sk; | 225 | struct sock *sk; |
226 | const struct hlist_node *node; | 226 | const struct hlist_nulls_node *node; |
227 | /* Optimize here for direct hit, only listening connections can | 227 | /* Optimize here for direct hit, only listening connections can |
228 | * have wildcards anyways. | 228 | * have wildcards anyways. |
229 | */ | 229 | */ |
230 | unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); | 230 | unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); |
231 | struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); | 231 | unsigned int slot = hash & (hashinfo->ehash_size - 1); |
232 | rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); | 232 | struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; |
233 | 233 | ||
234 | prefetch(head->chain.first); | 234 | rcu_read_lock(); |
235 | read_lock(lock); | 235 | begin: |
236 | sk_for_each(sk, node, &head->chain) { | 236 | sk_nulls_for_each_rcu(sk, node, &head->chain) { |
237 | if (INET_MATCH(sk, net, hash, acookie, | 237 | if (INET_MATCH(sk, net, hash, acookie, |
238 | saddr, daddr, ports, dif)) | 238 | saddr, daddr, ports, dif)) { |
239 | goto hit; /* You sunk my battleship! */ | 239 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) |
240 | goto begintw; | ||
241 | if (unlikely(!INET_MATCH(sk, net, hash, acookie, | ||
242 | saddr, daddr, ports, dif))) { | ||
243 | sock_put(sk); | ||
244 | goto begin; | ||
245 | } | ||
246 | goto out; | ||
247 | } | ||
240 | } | 248 | } |
249 | /* | ||
250 | * if the nulls value we got at the end of this lookup is | ||
251 | * not the expected one, we must restart lookup. | ||
252 | * We probably met an item that was moved to another chain. | ||
253 | */ | ||
254 | if (get_nulls_value(node) != slot) | ||
255 | goto begin; | ||
241 | 256 | ||
257 | begintw: | ||
242 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | 258 | /* Must check for a TIME_WAIT'er before going to listener hash. */ |
243 | sk_for_each(sk, node, &head->twchain) { | 259 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { |
244 | if (INET_TW_MATCH(sk, net, hash, acookie, | 260 | if (INET_TW_MATCH(sk, net, hash, acookie, |
245 | saddr, daddr, ports, dif)) | 261 | saddr, daddr, ports, dif)) { |
246 | goto hit; | 262 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { |
263 | sk = NULL; | ||
264 | goto out; | ||
265 | } | ||
266 | if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie, | ||
267 | saddr, daddr, ports, dif))) { | ||
268 | sock_put(sk); | ||
269 | goto begintw; | ||
270 | } | ||
271 | goto out; | ||
272 | } | ||
247 | } | 273 | } |
274 | /* | ||
275 | * if the nulls value we got at the end of this lookup is | ||
276 | * not the expected one, we must restart lookup. | ||
277 | * We probably met an item that was moved to another chain. | ||
278 | */ | ||
279 | if (get_nulls_value(node) != slot) | ||
280 | goto begintw; | ||
248 | sk = NULL; | 281 | sk = NULL; |
249 | out: | 282 | out: |
250 | read_unlock(lock); | 283 | rcu_read_unlock(); |
251 | return sk; | 284 | return sk; |
252 | hit: | ||
253 | sock_hold(sk); | ||
254 | goto out; | ||
255 | } | 285 | } |
256 | EXPORT_SYMBOL_GPL(__inet_lookup_established); | 286 | EXPORT_SYMBOL_GPL(__inet_lookup_established); |
257 | 287 | ||
@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, | |||
272 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); | 302 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); |
273 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); | 303 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); |
274 | struct sock *sk2; | 304 | struct sock *sk2; |
275 | const struct hlist_node *node; | 305 | const struct hlist_nulls_node *node; |
276 | struct inet_timewait_sock *tw; | 306 | struct inet_timewait_sock *tw; |
277 | 307 | ||
278 | prefetch(head->chain.first); | 308 | prefetch(head->chain.first); |
279 | write_lock(lock); | 309 | write_lock(lock); |
280 | 310 | ||
281 | /* Check TIME-WAIT sockets first. */ | 311 | /* Check TIME-WAIT sockets first. */ |
282 | sk_for_each(sk2, node, &head->twchain) { | 312 | sk_nulls_for_each(sk2, node, &head->twchain) { |
283 | tw = inet_twsk(sk2); | 313 | tw = inet_twsk(sk2); |
284 | 314 | ||
285 | if (INET_TW_MATCH(sk2, net, hash, acookie, | 315 | if (INET_TW_MATCH(sk2, net, hash, acookie, |
@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, | |||
293 | tw = NULL; | 323 | tw = NULL; |
294 | 324 | ||
295 | /* And established part... */ | 325 | /* And established part... */ |
296 | sk_for_each(sk2, node, &head->chain) { | 326 | sk_nulls_for_each(sk2, node, &head->chain) { |
297 | if (INET_MATCH(sk2, net, hash, acookie, | 327 | if (INET_MATCH(sk2, net, hash, acookie, |
298 | saddr, daddr, ports, dif)) | 328 | saddr, daddr, ports, dif)) |
299 | goto not_unique; | 329 | goto not_unique; |
@@ -306,7 +336,7 @@ unique: | |||
306 | inet->sport = htons(lport); | 336 | inet->sport = htons(lport); |
307 | sk->sk_hash = hash; | 337 | sk->sk_hash = hash; |
308 | WARN_ON(!sk_unhashed(sk)); | 338 | WARN_ON(!sk_unhashed(sk)); |
309 | __sk_add_node(sk, &head->chain); | 339 | __sk_nulls_add_node_rcu(sk, &head->chain); |
310 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 340 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
311 | write_unlock(lock); | 341 | write_unlock(lock); |
312 | 342 | ||
@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk) | |||
338 | void __inet_hash_nolisten(struct sock *sk) | 368 | void __inet_hash_nolisten(struct sock *sk) |
339 | { | 369 | { |
340 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 370 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
341 | struct hlist_head *list; | 371 | struct hlist_nulls_head *list; |
342 | rwlock_t *lock; | 372 | rwlock_t *lock; |
343 | struct inet_ehash_bucket *head; | 373 | struct inet_ehash_bucket *head; |
344 | 374 | ||
@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk) | |||
350 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | 380 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
351 | 381 | ||
352 | write_lock(lock); | 382 | write_lock(lock); |
353 | __sk_add_node(sk, list); | 383 | __sk_nulls_add_node_rcu(sk, list); |
354 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 384 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
355 | write_unlock(lock); | 385 | write_unlock(lock); |
356 | } | 386 | } |
@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk) | |||
400 | local_bh_disable(); | 430 | local_bh_disable(); |
401 | inet_listen_wlock(hashinfo); | 431 | inet_listen_wlock(hashinfo); |
402 | lock = &hashinfo->lhash_lock; | 432 | lock = &hashinfo->lhash_lock; |
433 | if (__sk_del_node_init(sk)) | ||
434 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
403 | } else { | 435 | } else { |
404 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | 436 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
405 | write_lock_bh(lock); | 437 | write_lock_bh(lock); |
438 | if (__sk_nulls_del_node_init_rcu(sk)) | ||
439 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
406 | } | 440 | } |
407 | 441 | ||
408 | if (__sk_del_node_init(sk)) | ||
409 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
410 | write_unlock_bh(lock); | 442 | write_unlock_bh(lock); |
411 | out: | 443 | out: |
412 | if (sk->sk_state == TCP_LISTEN) | 444 | if (sk->sk_state == TCP_LISTEN) |
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1c5fd38f8824..60689951ecdb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c | |||
@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, | |||
23 | rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); | 23 | rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); |
24 | 24 | ||
25 | write_lock(lock); | 25 | write_lock(lock); |
26 | if (hlist_unhashed(&tw->tw_node)) { | 26 | if (hlist_nulls_unhashed(&tw->tw_node)) { |
27 | write_unlock(lock); | 27 | write_unlock(lock); |
28 | return; | 28 | return; |
29 | } | 29 | } |
30 | __hlist_del(&tw->tw_node); | 30 | hlist_nulls_del_rcu(&tw->tw_node); |
31 | sk_node_init(&tw->tw_node); | 31 | sk_nulls_node_init(&tw->tw_node); |
32 | write_unlock(lock); | 32 | write_unlock(lock); |
33 | 33 | ||
34 | /* Disassociate with bind bucket. */ | 34 | /* Disassociate with bind bucket. */ |
@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | |||
92 | 92 | ||
93 | write_lock(lock); | 93 | write_lock(lock); |
94 | 94 | ||
95 | /* Step 2: Remove SK from established hash. */ | 95 | /* |
96 | if (__sk_del_node_init(sk)) | 96 | * Step 2: Hash TW into TIMEWAIT chain. |
97 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 97 | * Should be done before removing sk from established chain |
98 | 98 | * because readers are lockless and search established first. | |
99 | /* Step 3: Hash TW into TIMEWAIT chain. */ | 99 | */ |
100 | inet_twsk_add_node(tw, &ehead->twchain); | ||
101 | atomic_inc(&tw->tw_refcnt); | 100 | atomic_inc(&tw->tw_refcnt); |
101 | inet_twsk_add_node_rcu(tw, &ehead->twchain); | ||
102 | |||
103 | /* Step 3: Remove SK from established hash. */ | ||
104 | if (__sk_nulls_del_node_init_rcu(sk)) | ||
105 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
102 | 106 | ||
103 | write_unlock(lock); | 107 | write_unlock(lock); |
104 | } | 108 | } |
@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, | |||
416 | { | 420 | { |
417 | struct inet_timewait_sock *tw; | 421 | struct inet_timewait_sock *tw; |
418 | struct sock *sk; | 422 | struct sock *sk; |
419 | struct hlist_node *node; | 423 | struct hlist_nulls_node *node; |
420 | int h; | 424 | int h; |
421 | 425 | ||
422 | local_bh_disable(); | 426 | local_bh_disable(); |
@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, | |||
426 | rwlock_t *lock = inet_ehash_lockp(hashinfo, h); | 430 | rwlock_t *lock = inet_ehash_lockp(hashinfo, h); |
427 | restart: | 431 | restart: |
428 | write_lock(lock); | 432 | write_lock(lock); |
429 | sk_for_each(sk, node, &head->twchain) { | 433 | sk_nulls_for_each(sk, node, &head->twchain) { |
430 | 434 | ||
431 | tw = inet_twsk(sk); | 435 | tw = inet_twsk(sk); |
432 | if (!net_eq(twsk_net(tw), net) || | 436 | if (!net_eq(twsk_net(tw), net) || |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f60a5917e54d..044224a341eb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2707,8 +2707,8 @@ void __init tcp_init(void) | |||
2707 | thash_entries ? 0 : 512 * 1024); | 2707 | thash_entries ? 0 : 512 * 1024); |
2708 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; | 2708 | tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; |
2709 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { | 2709 | for (i = 0; i < tcp_hashinfo.ehash_size; i++) { |
2710 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); | 2710 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); |
2711 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); | 2711 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); |
2712 | } | 2712 | } |
2713 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) | 2713 | if (inet_ehash_locks_alloc(&tcp_hashinfo)) |
2714 | panic("TCP: failed to alloc ehash_locks"); | 2714 | panic("TCP: failed to alloc ehash_locks"); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d49233f409b5..b2e3ab2287ba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); | |||
1857 | #ifdef CONFIG_PROC_FS | 1857 | #ifdef CONFIG_PROC_FS |
1858 | /* Proc filesystem TCP sock list dumping. */ | 1858 | /* Proc filesystem TCP sock list dumping. */ |
1859 | 1859 | ||
1860 | static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) | 1860 | static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) |
1861 | { | 1861 | { |
1862 | return hlist_empty(head) ? NULL : | 1862 | return hlist_nulls_empty(head) ? NULL : |
1863 | list_entry(head->first, struct inet_timewait_sock, tw_node); | 1863 | list_entry(head->first, struct inet_timewait_sock, tw_node); |
1864 | } | 1864 | } |
1865 | 1865 | ||
1866 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) | 1866 | static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) |
1867 | { | 1867 | { |
1868 | return tw->tw_node.next ? | 1868 | return !is_a_nulls(tw->tw_node.next) ? |
1869 | hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; | 1869 | hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; |
1870 | } | 1870 | } |
1871 | 1871 | ||
1872 | static void *listening_get_next(struct seq_file *seq, void *cur) | 1872 | static void *listening_get_next(struct seq_file *seq, void *cur) |
@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) | |||
1954 | 1954 | ||
1955 | static inline int empty_bucket(struct tcp_iter_state *st) | 1955 | static inline int empty_bucket(struct tcp_iter_state *st) |
1956 | { | 1956 | { |
1957 | return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && | 1957 | return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && |
1958 | hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); | 1958 | hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); |
1959 | } | 1959 | } |
1960 | 1960 | ||
1961 | static void *established_get_first(struct seq_file *seq) | 1961 | static void *established_get_first(struct seq_file *seq) |
@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq) | |||
1966 | 1966 | ||
1967 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { | 1967 | for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { |
1968 | struct sock *sk; | 1968 | struct sock *sk; |
1969 | struct hlist_node *node; | 1969 | struct hlist_nulls_node *node; |
1970 | struct inet_timewait_sock *tw; | 1970 | struct inet_timewait_sock *tw; |
1971 | rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); | 1971 | rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); |
1972 | 1972 | ||
@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq) | |||
1975 | continue; | 1975 | continue; |
1976 | 1976 | ||
1977 | read_lock_bh(lock); | 1977 | read_lock_bh(lock); |
1978 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { | 1978 | sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { |
1979 | if (sk->sk_family != st->family || | 1979 | if (sk->sk_family != st->family || |
1980 | !net_eq(sock_net(sk), net)) { | 1980 | !net_eq(sock_net(sk), net)) { |
1981 | continue; | 1981 | continue; |
@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) | |||
2004 | { | 2004 | { |
2005 | struct sock *sk = cur; | 2005 | struct sock *sk = cur; |
2006 | struct inet_timewait_sock *tw; | 2006 | struct inet_timewait_sock *tw; |
2007 | struct hlist_node *node; | 2007 | struct hlist_nulls_node *node; |
2008 | struct tcp_iter_state *st = seq->private; | 2008 | struct tcp_iter_state *st = seq->private; |
2009 | struct net *net = seq_file_net(seq); | 2009 | struct net *net = seq_file_net(seq); |
2010 | 2010 | ||
@@ -2032,11 +2032,11 @@ get_tw: | |||
2032 | return NULL; | 2032 | return NULL; |
2033 | 2033 | ||
2034 | read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2034 | read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); |
2035 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); | 2035 | sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); |
2036 | } else | 2036 | } else |
2037 | sk = sk_next(sk); | 2037 | sk = sk_nulls_next(sk); |
2038 | 2038 | ||
2039 | sk_for_each_from(sk, node) { | 2039 | sk_nulls_for_each_from(sk, node) { |
2040 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) | 2040 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) |
2041 | goto found; | 2041 | goto found; |
2042 | } | 2042 | } |
@@ -2375,6 +2375,7 @@ struct proto tcp_prot = { | |||
2375 | .sysctl_rmem = sysctl_tcp_rmem, | 2375 | .sysctl_rmem = sysctl_tcp_rmem, |
2376 | .max_header = MAX_TCP_HEADER, | 2376 | .max_header = MAX_TCP_HEADER, |
2377 | .obj_size = sizeof(struct tcp_sock), | 2377 | .obj_size = sizeof(struct tcp_sock), |
2378 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
2378 | .twsk_prot = &tcp_timewait_sock_ops, | 2379 | .twsk_prot = &tcp_timewait_sock_ops, |
2379 | .rsk_prot = &tcp_request_sock_ops, | 2380 | .rsk_prot = &tcp_request_sock_ops, |
2380 | .h.hashinfo = &tcp_hashinfo, | 2381 | .h.hashinfo = &tcp_hashinfo, |
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 1646a5658255..c1b4d401fd95 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c | |||
@@ -25,24 +25,28 @@ | |||
25 | void __inet6_hash(struct sock *sk) | 25 | void __inet6_hash(struct sock *sk) |
26 | { | 26 | { |
27 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 27 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
28 | struct hlist_head *list; | ||
29 | rwlock_t *lock; | 28 | rwlock_t *lock; |
30 | 29 | ||
31 | WARN_ON(!sk_unhashed(sk)); | 30 | WARN_ON(!sk_unhashed(sk)); |
32 | 31 | ||
33 | if (sk->sk_state == TCP_LISTEN) { | 32 | if (sk->sk_state == TCP_LISTEN) { |
33 | struct hlist_head *list; | ||
34 | |||
34 | list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 35 | list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
35 | lock = &hashinfo->lhash_lock; | 36 | lock = &hashinfo->lhash_lock; |
36 | inet_listen_wlock(hashinfo); | 37 | inet_listen_wlock(hashinfo); |
38 | __sk_add_node(sk, list); | ||
37 | } else { | 39 | } else { |
38 | unsigned int hash; | 40 | unsigned int hash; |
41 | struct hlist_nulls_head *list; | ||
42 | |||
39 | sk->sk_hash = hash = inet6_sk_ehashfn(sk); | 43 | sk->sk_hash = hash = inet6_sk_ehashfn(sk); |
40 | list = &inet_ehash_bucket(hashinfo, hash)->chain; | 44 | list = &inet_ehash_bucket(hashinfo, hash)->chain; |
41 | lock = inet_ehash_lockp(hashinfo, hash); | 45 | lock = inet_ehash_lockp(hashinfo, hash); |
42 | write_lock(lock); | 46 | write_lock(lock); |
47 | __sk_nulls_add_node_rcu(sk, list); | ||
43 | } | 48 | } |
44 | 49 | ||
45 | __sk_add_node(sk, list); | ||
46 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 50 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
47 | write_unlock(lock); | 51 | write_unlock(lock); |
48 | } | 52 | } |
@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net, | |||
63 | const int dif) | 67 | const int dif) |
64 | { | 68 | { |
65 | struct sock *sk; | 69 | struct sock *sk; |
66 | const struct hlist_node *node; | 70 | const struct hlist_nulls_node *node; |
67 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); | 71 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); |
68 | /* Optimize here for direct hit, only listening connections can | 72 | /* Optimize here for direct hit, only listening connections can |
69 | * have wildcards anyways. | 73 | * have wildcards anyways. |
70 | */ | 74 | */ |
71 | unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); | 75 | unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); |
72 | struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); | 76 | unsigned int slot = hash & (hashinfo->ehash_size - 1); |
73 | rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); | 77 | struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; |
74 | 78 | ||
75 | prefetch(head->chain.first); | 79 | |
76 | read_lock(lock); | 80 | rcu_read_lock(); |
77 | sk_for_each(sk, node, &head->chain) { | 81 | begin: |
82 | sk_nulls_for_each_rcu(sk, node, &head->chain) { | ||
78 | /* For IPV6 do the cheaper port and family tests first. */ | 83 | /* For IPV6 do the cheaper port and family tests first. */ |
79 | if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) | 84 | if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { |
80 | goto hit; /* You sunk my battleship! */ | 85 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) |
86 | goto begintw; | ||
87 | if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { | ||
88 | sock_put(sk); | ||
89 | goto begin; | ||
90 | } | ||
91 | goto out; | ||
92 | } | ||
81 | } | 93 | } |
94 | if (get_nulls_value(node) != slot) | ||
95 | goto begin; | ||
96 | |||
97 | begintw: | ||
82 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | 98 | /* Must check for a TIME_WAIT'er before going to listener hash. */ |
83 | sk_for_each(sk, node, &head->twchain) { | 99 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { |
84 | if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) | 100 | if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { |
85 | goto hit; | 101 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { |
102 | sk = NULL; | ||
103 | goto out; | ||
104 | } | ||
105 | if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { | ||
106 | sock_put(sk); | ||
107 | goto begintw; | ||
108 | } | ||
109 | goto out; | ||
110 | } | ||
86 | } | 111 | } |
87 | read_unlock(lock); | 112 | if (get_nulls_value(node) != slot) |
88 | return NULL; | 113 | goto begintw; |
89 | 114 | sk = NULL; | |
90 | hit: | 115 | out: |
91 | sock_hold(sk); | 116 | rcu_read_unlock(); |
92 | read_unlock(lock); | ||
93 | return sk; | 117 | return sk; |
94 | } | 118 | } |
95 | EXPORT_SYMBOL(__inet6_lookup_established); | 119 | EXPORT_SYMBOL(__inet6_lookup_established); |
@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, | |||
172 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); | 196 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); |
173 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); | 197 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); |
174 | struct sock *sk2; | 198 | struct sock *sk2; |
175 | const struct hlist_node *node; | 199 | const struct hlist_nulls_node *node; |
176 | struct inet_timewait_sock *tw; | 200 | struct inet_timewait_sock *tw; |
177 | 201 | ||
178 | prefetch(head->chain.first); | 202 | prefetch(head->chain.first); |
179 | write_lock(lock); | 203 | write_lock(lock); |
180 | 204 | ||
181 | /* Check TIME-WAIT sockets first. */ | 205 | /* Check TIME-WAIT sockets first. */ |
182 | sk_for_each(sk2, node, &head->twchain) { | 206 | sk_nulls_for_each(sk2, node, &head->twchain) { |
183 | tw = inet_twsk(sk2); | 207 | tw = inet_twsk(sk2); |
184 | 208 | ||
185 | if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { | 209 | if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { |
@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, | |||
192 | tw = NULL; | 216 | tw = NULL; |
193 | 217 | ||
194 | /* And established part... */ | 218 | /* And established part... */ |
195 | sk_for_each(sk2, node, &head->chain) { | 219 | sk_nulls_for_each(sk2, node, &head->chain) { |
196 | if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) | 220 | if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) |
197 | goto not_unique; | 221 | goto not_unique; |
198 | } | 222 | } |
@@ -203,7 +227,7 @@ unique: | |||
203 | inet->num = lport; | 227 | inet->num = lport; |
204 | inet->sport = htons(lport); | 228 | inet->sport = htons(lport); |
205 | WARN_ON(!sk_unhashed(sk)); | 229 | WARN_ON(!sk_unhashed(sk)); |
206 | __sk_add_node(sk, &head->chain); | 230 | __sk_nulls_add_node_rcu(sk, &head->chain); |
207 | sk->sk_hash = hash; | 231 | sk->sk_hash = hash; |
208 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 232 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
209 | write_unlock(lock); | 233 | write_unlock(lock); |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 984276463a8d..b35787056313 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = { | |||
2043 | .sysctl_rmem = sysctl_tcp_rmem, | 2043 | .sysctl_rmem = sysctl_tcp_rmem, |
2044 | .max_header = MAX_TCP_HEADER, | 2044 | .max_header = MAX_TCP_HEADER, |
2045 | .obj_size = sizeof(struct tcp6_sock), | 2045 | .obj_size = sizeof(struct tcp6_sock), |
2046 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
2046 | .twsk_prot = &tcp6_timewait_sock_ops, | 2047 | .twsk_prot = &tcp6_timewait_sock_ops, |
2047 | .rsk_prot = &tcp6_request_sock_ops, | 2048 | .rsk_prot = &tcp6_request_sock_ops, |
2048 | .h.hashinfo = &tcp_hashinfo, | 2049 | .h.hashinfo = &tcp_hashinfo, |