author    Eric Dumazet <dada1@cosmosbay.com>      2008-11-16 22:40:17 -0500
committer David S. Miller <davem@davemloft.net>   2008-11-16 22:40:17 -0500
commit    3ab5aee7fe840b5b1b35a8d1ac11c3de5281e611
tree      468296b7be813643248d4ca67497d6ddb6934fc6
parent    88ab1932eac721c6e7336708558fa5ed02c85c80
net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
RCU was added to UDP lookups, using a fast infrastructure:

- socket kmem_caches use SLAB_DESTROY_BY_RCU and don't pay the price of
  call_rcu() at freeing time.
- hlist_nulls lets lookups run with only a few memory barriers.

This patch uses the same infrastructure for TCP/DCCP established and
timewait sockets.

Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
using short-lived TCP connections. A followup patch, converting rwlocks
to spinlocks, will speed up this case even further.

__inet_lookup_established() is pretty fast now that we don't have to
dirty a contended cache line (read_lock/read_unlock).

Only the established and timewait hash tables are converted to RCU;
the bind and listen tables still use traditional locking.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
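What makes the lockless walk safe is the "nulls" terminator: each chain ends in an odd-tagged value encoding the bucket number, so a reader whose cursor was moved to another chain mid-walk (possible because socket memory is reused in place) notices that the terminator it reached names the wrong bucket and restarts. That is exactly what the get_nulls_value() checks in the lookup hunks below do. A minimal userspace sketch of the idea follows; NULLS_MARKER, struct bucket, and lookup() are simplified stand-ins invented here, not the kernel's hlist_nulls API (which lives in include/linux/list_nulls.h and adds RCU-aware accessors):

    /* Sketch: a chain terminated by a tagged "nulls" value, not NULL. */
    #include <stdint.h>
    #include <stdio.h>

    #define NULLS_MARKER(v) ((void *)((((uintptr_t)(v)) << 1) | 1))
    #define IS_NULLS(p)     (((uintptr_t)(p)) & 1)
    #define NULLS_VALUE(p)  (((uintptr_t)(p)) >> 1)

    struct node {
            int key;
            struct node *next;      /* a real node or a nulls marker */
    };

    struct bucket {
            struct node *first;
    };

    static struct node *lookup(struct bucket *tbl, unsigned int slot, int key)
    {
    begin:
            struct node *n;

            for (n = tbl[slot].first; !IS_NULLS(n); n = n->next)
                    if (n->key == key)
                            return n;   /* real code revalidates + takes a ref */
            /*
             * We ended on a nulls value. If it does not name the bucket we
             * started from, a concurrent writer moved us to another chain:
             * restart from our own bucket head.
             */
            if (NULLS_VALUE(n) != slot)
                    goto begin;
            return NULL;
    }

    int main(void)
    {
            struct bucket tbl[2] = {
                    { .first = NULLS_MARKER(0) },
                    { .first = NULLS_MARKER(1) },
            };
            struct node a = { .key = 42, .next = NULLS_MARKER(1) };

            tbl[1].first = &a;
            printf("found:   %p\n", (void *)lookup(tbl, 1, 42));
            printf("missing: %p\n", (void *)lookup(tbl, 1, 7));
            return 0;
    }

In the single-threaded demo the restart never fires; under concurrent rehashing it is the mechanism that keeps an unsynchronized reader from silently finishing its walk in the wrong bucket.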
 include/net/inet_hashtables.h    |  4
 include/net/inet_timewait_sock.h | 10
 net/core/sock.c                  |  4
 net/dccp/ipv4.c                  |  1
 net/dccp/ipv6.c                  |  1
 net/dccp/proto.c                 |  4
 net/ipv4/inet_diag.c             |  7
 net/ipv4/inet_hashtables.c       | 78
 net/ipv4/inet_timewait_sock.c    | 26
 net/ipv4/tcp.c                   |  4
 net/ipv4/tcp_ipv4.c              | 25
 net/ipv6/inet6_hashtables.c      | 70
 net/ipv6/tcp_ipv6.c              |  1
 13 files changed, 151 insertions(+), 84 deletions(-)
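A second pattern worth noting before the diff: the new lookup paths match, take a reference, and then match again. With SLAB_DESTROY_BY_RCU a socket freed on one CPU can be immediately reused for a different connection, with no grace period in between, so a lockless reader may only trust a candidate after atomic_inc_not_zero() succeeds and the keys still match. A hedged userspace sketch of that take-then-revalidate step, with C11 atomics standing in for the kernel's atomic_t (struct obj, get_ref_not_zero, and lookup_one are illustrative names, not kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj {
            atomic_int refcnt;      /* 0 means the object is being freed */
            int key;
    };

    /* Equivalent of the kernel's atomic_inc_not_zero(): take a reference
     * only if the count has not already dropped to zero. */
    static bool get_ref_not_zero(struct obj *o)
    {
            int c = atomic_load(&o->refcnt);

            while (c != 0)
                    if (atomic_compare_exchange_weak(&o->refcnt, &c, c + 1))
                            return true;
            return false;
    }

    static struct obj *lookup_one(struct obj *candidate, int key)
    {
            if (candidate->key != key)
                    return NULL;
            if (!get_ref_not_zero(candidate))
                    return NULL;            /* lost the race with free */
            if (candidate->key != key) {    /* memory reused for another key? */
                    atomic_fetch_sub(&candidate->refcnt, 1);
                    return NULL;            /* real code restarts the walk */
            }
            return candidate;               /* holding a stable reference */
    }

    int main(void)
    {
            struct obj live  = { .refcnt = 1, .key = 42 };
            struct obj dying = { .refcnt = 0, .key = 42 };

            printf("live:  %s\n", lookup_one(&live, 42)  ? "got ref" : "miss");
            printf("dying: %s\n", lookup_one(&dying, 42) ? "got ref" : "miss");
            return 0;
    }

The second key check is the part that is easy to miss: the first match, the refcount bump, and the reuse of the memory for a new flow can all interleave, so only a match that survives the reference grab is meaningful.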
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index cb31fbf8ae2a..481896045111 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -41,8 +41,8 @@
  * I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-        struct hlist_head chain;
-        struct hlist_head twchain;
+        struct hlist_nulls_head chain;
+        struct hlist_nulls_head twchain;
 };
 
 /* There are a few simple rules, which allow for local port reuse by
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 80e4977631b8..4b8ece22b8e9 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -110,7 +110,7 @@ struct inet_timewait_sock {
 #define tw_state                __tw_common.skc_state
 #define tw_reuse                __tw_common.skc_reuse
 #define tw_bound_dev_if         __tw_common.skc_bound_dev_if
-#define tw_node                 __tw_common.skc_node
+#define tw_node                 __tw_common.skc_nulls_node
 #define tw_bind_node            __tw_common.skc_bind_node
 #define tw_refcnt               __tw_common.skc_refcnt
 #define tw_hash                 __tw_common.skc_hash
@@ -137,10 +137,10 @@ struct inet_timewait_sock {
         struct hlist_node       tw_death_node;
 };
 
-static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
-                                      struct hlist_head *list)
+static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+                                          struct hlist_nulls_head *list)
 {
-        hlist_add_head(&tw->tw_node, list);
+        hlist_nulls_add_head_rcu(&tw->tw_node, list);
 }
 
 static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
 }
 
 #define inet_twsk_for_each(tw, node, head) \
-        hlist_for_each_entry(tw, node, head, tw_node)
+        hlist_nulls_for_each_entry(tw, node, head, tw_node)
 
 #define inet_twsk_for_each_inmate(tw, node, jail) \
         hlist_for_each_entry(tw, node, jail, tw_death_node)
diff --git a/net/core/sock.c b/net/core/sock.c
index ded1eb5d2fd4..38de9c3f563b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab)
                 prot->twsk_prot->twsk_slab =
                         kmem_cache_create(timewait_sock_slab_name,
                                           prot->twsk_prot->twsk_obj_size,
-                                          0, SLAB_HWCACHE_ALIGN,
+                                          0,
+                                          SLAB_HWCACHE_ALIGN |
+                                                prot->slab_flags,
                                           NULL);
                 if (prot->twsk_prot->twsk_slab == NULL)
                         goto out_free_timewait_sock_slab_name;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 528baa2e5be4..d1dd95289b89 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = {
         .orphan_count           = &dccp_orphan_count,
         .max_header             = MAX_DCCP_HEADER,
         .obj_size               = sizeof(struct dccp_sock),
+        .slab_flags             = SLAB_DESTROY_BY_RCU,
         .rsk_prot               = &dccp_request_sock_ops,
         .twsk_prot              = &dccp_timewait_sock_ops,
         .h.hashinfo             = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4aa1148cdb20..f033e845bb07 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = {
         .orphan_count           = &dccp_orphan_count,
         .max_header             = MAX_DCCP_HEADER,
         .obj_size               = sizeof(struct dccp6_sock),
+        .slab_flags             = SLAB_DESTROY_BY_RCU,
         .rsk_prot               = &dccp6_request_sock_ops,
         .twsk_prot              = &dccp6_timewait_sock_ops,
         .h.hashinfo             = &dccp_hashinfo,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 46cb3490d48e..1117d4d8c8f1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1090,8 +1090,8 @@ static int __init dccp_init(void)
         }
 
         for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
-                INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
-                INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
+                INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+                INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
         }
 
         if (inet_ehash_locks_alloc(&dccp_hashinfo))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230dabcb8..41b36720e977 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -778,18 +778,19 @@ skip_listen_ht:
                 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
                 rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
                 struct sock *sk;
-                struct hlist_node *node;
+                struct hlist_nulls_node *node;
 
                 num = 0;
 
-                if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+                if (hlist_nulls_empty(&head->chain) &&
+                    hlist_nulls_empty(&head->twchain))
                         continue;
 
                 if (i > s_i)
                         s_num = 0;
 
                 read_lock_bh(lock);
-                sk_for_each(sk, node, &head->chain) {
+                sk_nulls_for_each(sk, node, &head->chain) {
                         struct inet_sock *inet = inet_sk(sk);
 
                         if (num < s_num)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index be41ebbec4eb..fd269cfef0ec 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net,
         INET_ADDR_COOKIE(acookie, saddr, daddr)
         const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
         struct sock *sk;
-        const struct hlist_node *node;
+        const struct hlist_nulls_node *node;
         /* Optimize here for direct hit, only listening connections can
          * have wildcards anyways.
          */
         unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-        struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-        rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+        unsigned int slot = hash & (hashinfo->ehash_size - 1);
+        struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-        prefetch(head->chain.first);
-        read_lock(lock);
-        sk_for_each(sk, node, &head->chain) {
+        rcu_read_lock();
+begin:
+        sk_nulls_for_each_rcu(sk, node, &head->chain) {
                 if (INET_MATCH(sk, net, hash, acookie,
-                                        saddr, daddr, ports, dif))
-                        goto hit; /* You sunk my battleship! */
+                               saddr, daddr, ports, dif)) {
+                        if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+                                goto begintw;
+                        if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+                                                 saddr, daddr, ports, dif))) {
+                                sock_put(sk);
+                                goto begin;
+                        }
+                        goto out;
+                }
         }
+        /*
+         * if the nulls value we got at the end of this lookup is
+         * not the expected one, we must restart lookup.
+         * We probably met an item that was moved to another chain.
+         */
+        if (get_nulls_value(node) != slot)
+                goto begin;
 
+begintw:
         /* Must check for a TIME_WAIT'er before going to listener hash. */
-        sk_for_each(sk, node, &head->twchain) {
+        sk_nulls_for_each_rcu(sk, node, &head->twchain) {
                 if (INET_TW_MATCH(sk, net, hash, acookie,
-                                        saddr, daddr, ports, dif))
-                        goto hit;
+                                  saddr, daddr, ports, dif)) {
+                        if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+                                sk = NULL;
+                                goto out;
+                        }
+                        if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+                                                    saddr, daddr, ports, dif))) {
+                                sock_put(sk);
+                                goto begintw;
+                        }
+                        goto out;
+                }
         }
+        /*
+         * if the nulls value we got at the end of this lookup is
+         * not the expected one, we must restart lookup.
+         * We probably met an item that was moved to another chain.
+         */
+        if (get_nulls_value(node) != slot)
+                goto begintw;
         sk = NULL;
 out:
-        read_unlock(lock);
+        rcu_read_unlock();
         return sk;
-hit:
-        sock_hold(sk);
-        goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
         struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
         rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
         struct sock *sk2;
-        const struct hlist_node *node;
+        const struct hlist_nulls_node *node;
         struct inet_timewait_sock *tw;
 
         prefetch(head->chain.first);
         write_lock(lock);
 
         /* Check TIME-WAIT sockets first. */
-        sk_for_each(sk2, node, &head->twchain) {
+        sk_nulls_for_each(sk2, node, &head->twchain) {
                 tw = inet_twsk(sk2);
 
                 if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
         tw = NULL;
 
         /* And established part... */
-        sk_for_each(sk2, node, &head->chain) {
+        sk_nulls_for_each(sk2, node, &head->chain) {
                 if (INET_MATCH(sk2, net, hash, acookie,
                                saddr, daddr, ports, dif))
                         goto not_unique;
@@ -306,7 +336,7 @@ unique:
         inet->sport = htons(lport);
         sk->sk_hash = hash;
         WARN_ON(!sk_unhashed(sk));
-        __sk_add_node(sk, &head->chain);
+        __sk_nulls_add_node_rcu(sk, &head->chain);
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
         write_unlock(lock);
 
@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
         struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-        struct hlist_head *list;
+        struct hlist_nulls_head *list;
         rwlock_t *lock;
         struct inet_ehash_bucket *head;
 
@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk)
         lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
         write_lock(lock);
-        __sk_add_node(sk, list);
+        __sk_nulls_add_node_rcu(sk, list);
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
         write_unlock(lock);
 }
@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk)
                 local_bh_disable();
                 inet_listen_wlock(hashinfo);
                 lock = &hashinfo->lhash_lock;
+                if (__sk_del_node_init(sk))
+                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
         } else {
                 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
                 write_lock_bh(lock);
+                if (__sk_nulls_del_node_init_rcu(sk))
+                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
         }
 
-        if (__sk_del_node_init(sk))
-                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
         write_unlock_bh(lock);
 out:
         if (sk->sk_state == TCP_LISTEN)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38f8824..60689951ecdb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
         rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
         write_lock(lock);
-        if (hlist_unhashed(&tw->tw_node)) {
+        if (hlist_nulls_unhashed(&tw->tw_node)) {
                 write_unlock(lock);
                 return;
         }
-        __hlist_del(&tw->tw_node);
-        sk_node_init(&tw->tw_node);
+        hlist_nulls_del_rcu(&tw->tw_node);
+        sk_nulls_node_init(&tw->tw_node);
         write_unlock(lock);
 
         /* Disassociate with bind bucket. */
@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
         write_lock(lock);
 
-        /* Step 2: Remove SK from established hash. */
-        if (__sk_del_node_init(sk))
-                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-        /* Step 3: Hash TW into TIMEWAIT chain. */
-        inet_twsk_add_node(tw, &ehead->twchain);
+        /*
+         * Step 2: Hash TW into TIMEWAIT chain.
+         * Should be done before removing sk from established chain
+         * because readers are lockless and search established first.
+         */
         atomic_inc(&tw->tw_refcnt);
+        inet_twsk_add_node_rcu(tw, &ehead->twchain);
+
+        /* Step 3: Remove SK from established hash. */
+        if (__sk_nulls_del_node_init_rcu(sk))
+                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
         write_unlock(lock);
 }
@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
         struct inet_timewait_sock *tw;
         struct sock *sk;
-        struct hlist_node *node;
+        struct hlist_nulls_node *node;
         int h;
 
         local_bh_disable();
@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
                 rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
 restart:
                 write_lock(lock);
-                sk_for_each(sk, node, &head->twchain) {
+                sk_nulls_for_each(sk, node, &head->twchain) {
 
                         tw = inet_twsk(sk);
                         if (!net_eq(twsk_net(tw), net) ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f60a5917e54d..044224a341eb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2707,8 +2707,8 @@ void __init tcp_init(void)
                                         thash_entries ? 0 : 512 * 1024);
         tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
         for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
-                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
+                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
         }
         if (inet_ehash_locks_alloc(&tcp_hashinfo))
                 panic("TCP: failed to alloc ehash_locks");
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d49233f409b5..b2e3ab2287ba 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
 {
-        return hlist_empty(head) ? NULL :
+        return hlist_nulls_empty(head) ? NULL :
                list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
-        return tw->tw_node.next ?
-                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+        return !is_a_nulls(tw->tw_node.next) ?
+                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 
 static inline int empty_bucket(struct tcp_iter_state *st)
 {
-        return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
-                hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
 static void *established_get_first(struct seq_file *seq)
@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq)
 
         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
                 struct sock *sk;
-                struct hlist_node *node;
+                struct hlist_nulls_node *node;
                 struct inet_timewait_sock *tw;
                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq)
                         continue;
 
                 read_lock_bh(lock);
-                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                         if (sk->sk_family != st->family ||
                             !net_eq(sock_net(sk), net)) {
                                 continue;
@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 {
         struct sock *sk = cur;
         struct inet_timewait_sock *tw;
-        struct hlist_node *node;
+        struct hlist_nulls_node *node;
         struct tcp_iter_state *st = seq->private;
         struct net *net = seq_file_net(seq);
 
@@ -2032,11 +2032,11 @@ get_tw:
                         return NULL;
 
                 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-                sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
+                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
         } else
-                sk = sk_next(sk);
+                sk = sk_nulls_next(sk);
 
-        sk_for_each_from(sk, node) {
+        sk_nulls_for_each_from(sk, node) {
                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                         goto found;
         }
@@ -2375,6 +2375,7 @@ struct proto tcp_prot = {
         .sysctl_rmem            = sysctl_tcp_rmem,
         .max_header             = MAX_TCP_HEADER,
         .obj_size               = sizeof(struct tcp_sock),
+        .slab_flags             = SLAB_DESTROY_BY_RCU,
         .twsk_prot              = &tcp_timewait_sock_ops,
         .rsk_prot               = &tcp_request_sock_ops,
         .h.hashinfo             = &tcp_hashinfo,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 1646a5658255..c1b4d401fd95 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -25,24 +25,28 @@
 void __inet6_hash(struct sock *sk)
 {
         struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-        struct hlist_head *list;
         rwlock_t *lock;
 
         WARN_ON(!sk_unhashed(sk));
 
         if (sk->sk_state == TCP_LISTEN) {
+                struct hlist_head *list;
+
                 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
                 lock = &hashinfo->lhash_lock;
                 inet_listen_wlock(hashinfo);
+                __sk_add_node(sk, list);
         } else {
                 unsigned int hash;
+                struct hlist_nulls_head *list;
+
                 sk->sk_hash = hash = inet6_sk_ehashfn(sk);
                 list = &inet_ehash_bucket(hashinfo, hash)->chain;
                 lock = inet_ehash_lockp(hashinfo, hash);
                 write_lock(lock);
+                __sk_nulls_add_node_rcu(sk, list);
         }
 
-        __sk_add_node(sk, list);
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
         write_unlock(lock);
 }
@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net,
                                            const int dif)
 {
         struct sock *sk;
-        const struct hlist_node *node;
+        const struct hlist_nulls_node *node;
         const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
         /* Optimize here for direct hit, only listening connections can
          * have wildcards anyways.
          */
         unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
-        struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-        rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+        unsigned int slot = hash & (hashinfo->ehash_size - 1);
+        struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-        prefetch(head->chain.first);
-        read_lock(lock);
-        sk_for_each(sk, node, &head->chain) {
+        rcu_read_lock();
+begin:
+        sk_nulls_for_each_rcu(sk, node, &head->chain) {
                 /* For IPV6 do the cheaper port and family tests first. */
-                if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-                        goto hit; /* You sunk my battleship! */
+                if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                        if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+                                goto begintw;
+                        if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                                sock_put(sk);
+                                goto begin;
+                        }
+                        goto out;
+                }
         }
+        if (get_nulls_value(node) != slot)
+                goto begin;
 
+begintw:
         /* Must check for a TIME_WAIT'er before going to listener hash. */
-        sk_for_each(sk, node, &head->twchain) {
-                if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-                        goto hit;
+        sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+                if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                        if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+                                sk = NULL;
+                                goto out;
+                        }
+                        if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                                sock_put(sk);
+                                goto begintw;
+                        }
+                        goto out;
+                }
         }
-        read_unlock(lock);
-        return NULL;
-
-hit:
-        sock_hold(sk);
-        read_unlock(lock);
+        if (get_nulls_value(node) != slot)
+                goto begintw;
+        sk = NULL;
+out:
+        rcu_read_unlock();
         return sk;
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
         struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
         rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
         struct sock *sk2;
-        const struct hlist_node *node;
+        const struct hlist_nulls_node *node;
         struct inet_timewait_sock *tw;
 
         prefetch(head->chain.first);
         write_lock(lock);
 
         /* Check TIME-WAIT sockets first. */
-        sk_for_each(sk2, node, &head->twchain) {
+        sk_nulls_for_each(sk2, node, &head->twchain) {
                 tw = inet_twsk(sk2);
 
                 if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
         tw = NULL;
 
         /* And established part... */
-        sk_for_each(sk2, node, &head->chain) {
+        sk_nulls_for_each(sk2, node, &head->chain) {
                 if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
                         goto not_unique;
         }
@@ -203,7 +227,7 @@ unique:
         inet->num = lport;
         inet->sport = htons(lport);
         WARN_ON(!sk_unhashed(sk));
-        __sk_add_node(sk, &head->chain);
+        __sk_nulls_add_node_rcu(sk, &head->chain);
         sk->sk_hash = hash;
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
         write_unlock(lock);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 984276463a8d..b35787056313 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = {
         .sysctl_rmem            = sysctl_tcp_rmem,
         .max_header             = MAX_TCP_HEADER,
         .obj_size               = sizeof(struct tcp6_sock),
+        .slab_flags             = SLAB_DESTROY_BY_RCU,
         .twsk_prot              = &tcp6_timewait_sock_ops,
         .rsk_prot               = &tcp6_request_sock_ops,
         .h.hashinfo             = &tcp_hashinfo,