aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Dumazet <edumazet@google.com> 2013-10-03 03:22:02 -0400
committer: David S. Miller <davem@davemloft.net> 2013-10-08 23:19:24 -0400
commit: 05dbc7b59481ca891bbcfe6799a562d48159fbf7 (patch)
tree: f398ddbc5d2a72b3c3b7b16aed8a34b153491341
parent: 53af53ae83fe960ceb9ef74cac7915e9088f4266 (diff)
tcp/dccp: remove twchain
TCP listener refactoring, part 3:

Our goal is to hash SYN_RECV sockets into the main ehash for fast lookup and parallel SYN processing.

The current inet_ehash_bucket contains two chains: one for ESTABLISHED (and related states) sockets, and another for TIME_WAIT sockets only.

As the hash table is sized to get at most one socket per bucket, it makes little sense to have a separate twchain, as it makes the lookup slightly more complicated and doubles hash table memory usage.

If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4.

[ INET_TW_MATCH() is no longer needed ]

I'll provide a follow-up to factorize the IPv6 lookup as well, to remove INET6_TW_MATCH().

This way, SYN_RECV pseudo sockets will be supported the same way.

A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ].

Note this helper should only be called in the real slow path, when an RCU lookup found a socket that was moved to another identity (freed/reused immediately), but it could eventually be used in other contexts, like sock_edemux().

Before patch:

dmesg | grep "TCP established"

TCP established hash table entries: 524288 (order: 11, 8388608 bytes)

After patch:

TCP established hash table entries: 524288 (order: 10, 4194304 bytes)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/inet_hashtables.h | 9
-rw-r--r-- include/net/inet_timewait_sock.h | 13
-rw-r--r-- include/net/sock.h | 8
-rw-r--r-- include/net/tcp.h | 1
-rw-r--r-- net/dccp/proto.c | 4
-rw-r--r-- net/ipv4/inet_diag.c | 48
-rw-r--r-- net/ipv4/inet_hashtables.c | 83
-rw-r--r-- net/ipv4/inet_timewait_sock.c | 55
-rw-r--r-- net/ipv4/tcp.c | 5
-rw-r--r-- net/ipv4/tcp_ipv4.c | 83
-rw-r--r-- net/ipv6/inet6_hashtables.c | 75
-rw-r--r-- net/ipv6/tcp_ipv6.c | 9
12 files changed, 132 insertions, 261 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 10d6838378c3..1bdb47715def 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -37,12 +37,11 @@
37#include <asm/byteorder.h> 37#include <asm/byteorder.h>
38 38
39/* This is for all connections with a full identity, no wildcards. 39/* This is for all connections with a full identity, no wildcards.
40 * One chain is dedicated to TIME_WAIT sockets. 40 * The 'e' prefix stands for Establish, but we really put all sockets
41 * I'll experiment with dynamic table growth later. 41 * but LISTEN ones.
42 */ 42 */
43struct inet_ehash_bucket { 43struct inet_ehash_bucket {
44 struct hlist_nulls_head chain; 44 struct hlist_nulls_head chain;
45 struct hlist_nulls_head twchain;
46}; 45};
47 46
48/* There are a few simple rules, which allow for local port reuse by 47/* There are a few simple rules, which allow for local port reuse by
@@ -123,7 +122,6 @@ struct inet_hashinfo {
123 * 122 *
124 * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE 123 * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
125 * 124 *
126 * TIME_WAIT sockets use a separate chain (twchain).
127 */ 125 */
128 struct inet_ehash_bucket *ehash; 126 struct inet_ehash_bucket *ehash;
129 spinlock_t *ehash_locks; 127 spinlock_t *ehash_locks;
@@ -318,9 +316,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
318 net_eq(sock_net(__sk), (__net))) 316 net_eq(sock_net(__sk), (__net)))
319#endif /* 64-bit arch */ 317#endif /* 64-bit arch */
320 318
321#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
322 INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)
323
324/* 319/*
325 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need 320 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
326 * not check it for lookups anymore, thanks Alexey. -DaveM 321 * not check it for lookups anymore, thanks Alexey. -DaveM
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f528d1b0ac95..de9e3ab7d43d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -141,18 +141,6 @@ struct inet_timewait_sock {
141}; 141};
142#define tw_tclass tw_tos 142#define tw_tclass tw_tos
143 143
144static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
145 struct hlist_nulls_head *list)
146{
147 hlist_nulls_add_head_rcu(&tw->tw_node, list);
148}
149
150static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
151 struct hlist_head *list)
152{
153 hlist_add_head(&tw->tw_bind_node, list);
154}
155
156static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) 144static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
157{ 145{
158 return !hlist_unhashed(&tw->tw_death_node); 146 return !hlist_unhashed(&tw->tw_death_node);
@@ -192,6 +180,7 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
192 return (struct inet_timewait_sock *)sk; 180 return (struct inet_timewait_sock *)sk;
193} 181}
194 182
183void inet_twsk_free(struct inet_timewait_sock *tw);
195void inet_twsk_put(struct inet_timewait_sock *tw); 184void inet_twsk_put(struct inet_timewait_sock *tw);
196 185
197int inet_twsk_unhash(struct inet_timewait_sock *tw); 186int inet_twsk_unhash(struct inet_timewait_sock *tw);
diff --git a/include/net/sock.h b/include/net/sock.h
index 7cf8d2331afb..3f3e48c4704d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -156,7 +156,7 @@ typedef __u64 __bitwise __addrpair;
156 */ 156 */
157struct sock_common { 157struct sock_common {
158 /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned 158 /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
159 * address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH() 159 * address on 64bit arches : cf INET_MATCH()
160 */ 160 */
161 union { 161 union {
162 __addrpair skc_addrpair; 162 __addrpair skc_addrpair;
@@ -301,6 +301,8 @@ struct sock {
301#define sk_dontcopy_end __sk_common.skc_dontcopy_end 301#define sk_dontcopy_end __sk_common.skc_dontcopy_end
302#define sk_hash __sk_common.skc_hash 302#define sk_hash __sk_common.skc_hash
303#define sk_portpair __sk_common.skc_portpair 303#define sk_portpair __sk_common.skc_portpair
304#define sk_num __sk_common.skc_num
305#define sk_dport __sk_common.skc_dport
304#define sk_addrpair __sk_common.skc_addrpair 306#define sk_addrpair __sk_common.skc_addrpair
305#define sk_daddr __sk_common.skc_daddr 307#define sk_daddr __sk_common.skc_daddr
306#define sk_rcv_saddr __sk_common.skc_rcv_saddr 308#define sk_rcv_saddr __sk_common.skc_rcv_saddr
@@ -1653,6 +1655,10 @@ static inline void sock_put(struct sock *sk)
1653 if (atomic_dec_and_test(&sk->sk_refcnt)) 1655 if (atomic_dec_and_test(&sk->sk_refcnt))
1654 sk_free(sk); 1656 sk_free(sk);
1655} 1657}
1658/* Generic version of sock_put(), dealing with all sockets
1659 * (TCP_TIMEWAIT, ESTABLISHED...)
1660 */
1661void sock_gen_put(struct sock *sk);
1656 1662
1657int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested); 1663int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested);
1658 1664
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de870ee5582d..39bbfa1602b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1519,7 +1519,6 @@ enum tcp_seq_states {
1519 TCP_SEQ_STATE_LISTENING, 1519 TCP_SEQ_STATE_LISTENING,
1520 TCP_SEQ_STATE_OPENREQ, 1520 TCP_SEQ_STATE_OPENREQ,
1521 TCP_SEQ_STATE_ESTABLISHED, 1521 TCP_SEQ_STATE_ESTABLISHED,
1522 TCP_SEQ_STATE_TIME_WAIT,
1523}; 1522};
1524 1523
1525int tcp_seq_open(struct inode *inode, struct file *file); 1524int tcp_seq_open(struct inode *inode, struct file *file);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ba64750f0387..eb892b4f4814 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1158,10 +1158,8 @@ static int __init dccp_init(void)
1158 goto out_free_bind_bucket_cachep; 1158 goto out_free_bind_bucket_cachep;
1159 } 1159 }
1160 1160
1161 for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) { 1161 for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
1162 INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); 1162 INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
1163 INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
1164 }
1165 1163
1166 if (inet_ehash_locks_alloc(&dccp_hashinfo)) 1164 if (inet_ehash_locks_alloc(&dccp_hashinfo))
1167 goto out_free_dccp_ehash; 1165 goto out_free_dccp_ehash;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 22000279efc8..8e1e40653357 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -635,12 +635,14 @@ static int inet_csk_diag_dump(struct sock *sk,
635 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 635 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
636} 636}
637 637
638static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, 638static int inet_twsk_diag_dump(struct sock *sk,
639 struct sk_buff *skb, 639 struct sk_buff *skb,
640 struct netlink_callback *cb, 640 struct netlink_callback *cb,
641 struct inet_diag_req_v2 *r, 641 struct inet_diag_req_v2 *r,
642 const struct nlattr *bc) 642 const struct nlattr *bc)
643{ 643{
644 struct inet_timewait_sock *tw = inet_twsk(sk);
645
644 if (bc != NULL) { 646 if (bc != NULL) {
645 struct inet_diag_entry entry; 647 struct inet_diag_entry entry;
646 648
@@ -911,8 +913,7 @@ skip_listen_ht:
911 913
912 num = 0; 914 num = 0;
913 915
914 if (hlist_nulls_empty(&head->chain) && 916 if (hlist_nulls_empty(&head->chain))
915 hlist_nulls_empty(&head->twchain))
916 continue; 917 continue;
917 918
918 if (i > s_i) 919 if (i > s_i)
@@ -920,7 +921,7 @@ skip_listen_ht:
920 921
921 spin_lock_bh(lock); 922 spin_lock_bh(lock);
922 sk_nulls_for_each(sk, node, &head->chain) { 923 sk_nulls_for_each(sk, node, &head->chain) {
923 struct inet_sock *inet = inet_sk(sk); 924 int res;
924 925
925 if (!net_eq(sock_net(sk), net)) 926 if (!net_eq(sock_net(sk), net))
926 continue; 927 continue;
@@ -929,15 +930,19 @@ skip_listen_ht:
929 if (!(r->idiag_states & (1 << sk->sk_state))) 930 if (!(r->idiag_states & (1 << sk->sk_state)))
930 goto next_normal; 931 goto next_normal;
931 if (r->sdiag_family != AF_UNSPEC && 932 if (r->sdiag_family != AF_UNSPEC &&
932 sk->sk_family != r->sdiag_family) 933 sk->sk_family != r->sdiag_family)
933 goto next_normal; 934 goto next_normal;
934 if (r->id.idiag_sport != inet->inet_sport && 935 if (r->id.idiag_sport != htons(sk->sk_num) &&
935 r->id.idiag_sport) 936 r->id.idiag_sport)
936 goto next_normal; 937 goto next_normal;
937 if (r->id.idiag_dport != inet->inet_dport && 938 if (r->id.idiag_dport != sk->sk_dport &&
938 r->id.idiag_dport) 939 r->id.idiag_dport)
939 goto next_normal; 940 goto next_normal;
940 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 941 if (sk->sk_state == TCP_TIME_WAIT)
942 res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
943 else
944 res = inet_csk_diag_dump(sk, skb, cb, r, bc);
945 if (res < 0) {
941 spin_unlock_bh(lock); 946 spin_unlock_bh(lock);
942 goto done; 947 goto done;
943 } 948 }
@@ -945,33 +950,6 @@ next_normal:
945 ++num; 950 ++num;
946 } 951 }
947 952
948 if (r->idiag_states & TCPF_TIME_WAIT) {
949 struct inet_timewait_sock *tw;
950
951 inet_twsk_for_each(tw, node,
952 &head->twchain) {
953 if (!net_eq(twsk_net(tw), net))
954 continue;
955
956 if (num < s_num)
957 goto next_dying;
958 if (r->sdiag_family != AF_UNSPEC &&
959 tw->tw_family != r->sdiag_family)
960 goto next_dying;
961 if (r->id.idiag_sport != tw->tw_sport &&
962 r->id.idiag_sport)
963 goto next_dying;
964 if (r->id.idiag_dport != tw->tw_dport &&
965 r->id.idiag_dport)
966 goto next_dying;
967 if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
968 spin_unlock_bh(lock);
969 goto done;
970 }
971next_dying:
972 ++num;
973 }
974 }
975 spin_unlock_bh(lock); 953 spin_unlock_bh(lock);
976 } 954 }
977 955
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ae199596b9b0..a4b66bbe4f21 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -230,6 +230,19 @@ begin:
230} 230}
231EXPORT_SYMBOL_GPL(__inet_lookup_listener); 231EXPORT_SYMBOL_GPL(__inet_lookup_listener);
232 232
233/* All sockets share common refcount, but have different destructors */
234void sock_gen_put(struct sock *sk)
235{
236 if (!atomic_dec_and_test(&sk->sk_refcnt))
237 return;
238
239 if (sk->sk_state == TCP_TIME_WAIT)
240 inet_twsk_free(inet_twsk(sk));
241 else
242 sk_free(sk);
243}
244EXPORT_SYMBOL_GPL(sock_gen_put);
245
233struct sock *__inet_lookup_established(struct net *net, 246struct sock *__inet_lookup_established(struct net *net,
234 struct inet_hashinfo *hashinfo, 247 struct inet_hashinfo *hashinfo,
235 const __be32 saddr, const __be16 sport, 248 const __be32 saddr, const __be16 sport,
@@ -255,13 +268,13 @@ begin:
255 if (likely(INET_MATCH(sk, net, acookie, 268 if (likely(INET_MATCH(sk, net, acookie,
256 saddr, daddr, ports, dif))) { 269 saddr, daddr, ports, dif))) {
257 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) 270 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
258 goto begintw; 271 goto out;
259 if (unlikely(!INET_MATCH(sk, net, acookie, 272 if (unlikely(!INET_MATCH(sk, net, acookie,
260 saddr, daddr, ports, dif))) { 273 saddr, daddr, ports, dif))) {
261 sock_put(sk); 274 sock_gen_put(sk);
262 goto begin; 275 goto begin;
263 } 276 }
264 goto out; 277 goto found;
265 } 278 }
266 } 279 }
267 /* 280 /*
@@ -271,37 +284,9 @@ begin:
271 */ 284 */
272 if (get_nulls_value(node) != slot) 285 if (get_nulls_value(node) != slot)
273 goto begin; 286 goto begin;
274
275begintw:
276 /* Must check for a TIME_WAIT'er before going to listener hash. */
277 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
278 if (sk->sk_hash != hash)
279 continue;
280 if (likely(INET_TW_MATCH(sk, net, acookie,
281 saddr, daddr, ports,
282 dif))) {
283 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
284 sk = NULL;
285 goto out;
286 }
287 if (unlikely(!INET_TW_MATCH(sk, net, acookie,
288 saddr, daddr, ports,
289 dif))) {
290 inet_twsk_put(inet_twsk(sk));
291 goto begintw;
292 }
293 goto out;
294 }
295 }
296 /*
297 * if the nulls value we got at the end of this lookup is
298 * not the expected one, we must restart lookup.
299 * We probably met an item that was moved to another chain.
300 */
301 if (get_nulls_value(node) != slot)
302 goto begintw;
303 sk = NULL;
304out: 287out:
288 sk = NULL;
289found:
305 rcu_read_unlock(); 290 rcu_read_unlock();
306 return sk; 291 return sk;
307} 292}
@@ -326,39 +311,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
326 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 311 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
327 struct sock *sk2; 312 struct sock *sk2;
328 const struct hlist_nulls_node *node; 313 const struct hlist_nulls_node *node;
329 struct inet_timewait_sock *tw; 314 struct inet_timewait_sock *tw = NULL;
330 int twrefcnt = 0; 315 int twrefcnt = 0;
331 316
332 spin_lock(lock); 317 spin_lock(lock);
333 318
334 /* Check TIME-WAIT sockets first. */
335 sk_nulls_for_each(sk2, node, &head->twchain) {
336 if (sk2->sk_hash != hash)
337 continue;
338
339 if (likely(INET_TW_MATCH(sk2, net, acookie,
340 saddr, daddr, ports, dif))) {
341 tw = inet_twsk(sk2);
342 if (twsk_unique(sk, sk2, twp))
343 goto unique;
344 else
345 goto not_unique;
346 }
347 }
348 tw = NULL;
349
350 /* And established part... */
351 sk_nulls_for_each(sk2, node, &head->chain) { 319 sk_nulls_for_each(sk2, node, &head->chain) {
352 if (sk2->sk_hash != hash) 320 if (sk2->sk_hash != hash)
353 continue; 321 continue;
322
354 if (likely(INET_MATCH(sk2, net, acookie, 323 if (likely(INET_MATCH(sk2, net, acookie,
355 saddr, daddr, ports, dif))) 324 saddr, daddr, ports, dif))) {
325 if (sk2->sk_state == TCP_TIME_WAIT) {
326 tw = inet_twsk(sk2);
327 if (twsk_unique(sk, sk2, twp))
328 break;
329 }
356 goto not_unique; 330 goto not_unique;
331 }
357 } 332 }
358 333
359unique:
360 /* Must record num and sport now. Otherwise we will see 334 /* Must record num and sport now. Otherwise we will see
361 * in hash table socket with a funny identity. */ 335 * in hash table socket with a funny identity.
336 */
362 inet->inet_num = lport; 337 inet->inet_num = lport;
363 inet->inet_sport = htons(lport); 338 inet->inet_sport = htons(lport);
364 sk->sk_hash = hash; 339 sk->sk_hash = hash;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 9bcd8f7234ec..6d592f8555fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -87,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
87 refcnt += inet_twsk_bind_unhash(tw, hashinfo); 87 refcnt += inet_twsk_bind_unhash(tw, hashinfo);
88 spin_unlock(&bhead->lock); 88 spin_unlock(&bhead->lock);
89 89
90#ifdef SOCK_REFCNT_DEBUG 90 BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
91 if (atomic_read(&tw->tw_refcnt) != 1) { 91 atomic_sub(refcnt, &tw->tw_refcnt);
92 pr_debug("%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 }
95#endif
96 while (refcnt) {
97 inet_twsk_put(tw);
98 refcnt--;
99 }
100} 92}
101 93
102static noinline void inet_twsk_free(struct inet_timewait_sock *tw) 94void inet_twsk_free(struct inet_timewait_sock *tw)
103{ 95{
104 struct module *owner = tw->tw_prot->owner; 96 struct module *owner = tw->tw_prot->owner;
105 twsk_destructor((struct sock *)tw); 97 twsk_destructor((struct sock *)tw);
@@ -118,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
118} 110}
119EXPORT_SYMBOL_GPL(inet_twsk_put); 111EXPORT_SYMBOL_GPL(inet_twsk_put);
120 112
113static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
114 struct hlist_nulls_head *list)
115{
116 hlist_nulls_add_head_rcu(&tw->tw_node, list);
117}
118
119static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
120 struct hlist_head *list)
121{
122 hlist_add_head(&tw->tw_bind_node, list);
123}
124
121/* 125/*
122 * Enter the time wait state. This is called with locally disabled BH. 126 * Enter the time wait state. This is called with locally disabled BH.
123 * Essentially we whip up a timewait bucket, copy the relevant info into it 127 * Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -146,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
146 spin_lock(lock); 150 spin_lock(lock);
147 151
148 /* 152 /*
149 * Step 2: Hash TW into TIMEWAIT chain. 153 * Step 2: Hash TW into tcp ehash chain.
150 * Should be done before removing sk from established chain 154 * Notes :
151 * because readers are lockless and search established first. 155 * - tw_refcnt is set to 3 because :
156 * - We have one reference from bhash chain.
157 * - We have one reference from ehash chain.
158 * We can use atomic_set() because prior spin_lock()/spin_unlock()
159 * committed into memory all tw fields.
152 */ 160 */
153 inet_twsk_add_node_rcu(tw, &ehead->twchain); 161 atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
162 inet_twsk_add_node_rcu(tw, &ehead->chain);
154 163
155 /* Step 3: Remove SK from established hash. */ 164 /* Step 3: Remove SK from hash chain */
156 if (__sk_nulls_del_node_init_rcu(sk)) 165 if (__sk_nulls_del_node_init_rcu(sk))
157 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 166 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
158 167
159 /*
160 * Notes :
161 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
162 * - We add one reference for the bhash link
163 * - We add one reference for the ehash link
164 * - We want this refcnt update done before allowing other
165 * threads to find this tw in ehash chain.
166 */
167 atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168
169 spin_unlock(lock); 168 spin_unlock(lock);
170} 169}
171EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 170EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -490,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
490restart_rcu: 489restart_rcu:
491 rcu_read_lock(); 490 rcu_read_lock();
492restart: 491restart:
493 sk_nulls_for_each_rcu(sk, node, &head->twchain) { 492 sk_nulls_for_each_rcu(sk, node, &head->chain) {
493 if (sk->sk_state != TCP_TIME_WAIT)
494 continue;
494 tw = inet_twsk(sk); 495 tw = inet_twsk(sk);
495 if ((tw->tw_family != family) || 496 if ((tw->tw_family != family) ||
496 atomic_read(&twsk_net(tw)->count)) 497 atomic_read(&twsk_net(tw)->count))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6e5617b9f9db..be4b161802e8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3137,10 +3137,9 @@ void __init tcp_init(void)
3137 &tcp_hashinfo.ehash_mask, 3137 &tcp_hashinfo.ehash_mask,
3138 0, 3138 0,
3139 thash_entries ? 0 : 512 * 1024); 3139 thash_entries ? 0 : 512 * 1024);
3140 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { 3140 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3141 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3141 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3142 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); 3142
3143 }
3144 if (inet_ehash_locks_alloc(&tcp_hashinfo)) 3143 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3145 panic("TCP: failed to alloc ehash_locks"); 3144 panic("TCP: failed to alloc ehash_locks");
3146 tcp_hashinfo.bhash = 3145 tcp_hashinfo.bhash =
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5d6b1a609da8..e4695dde1af6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2194,18 +2194,6 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
2194#ifdef CONFIG_PROC_FS 2194#ifdef CONFIG_PROC_FS
2195/* Proc filesystem TCP sock list dumping. */ 2195/* Proc filesystem TCP sock list dumping. */
2196 2196
2197static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2198{
2199 return hlist_nulls_empty(head) ? NULL :
2200 list_entry(head->first, struct inet_timewait_sock, tw_node);
2201}
2202
2203static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2204{
2205 return !is_a_nulls(tw->tw_node.next) ?
2206 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2207}
2208
2209/* 2197/*
2210 * Get next listener socket follow cur. If cur is NULL, get first socket 2198 * Get next listener socket follow cur. If cur is NULL, get first socket
2211 * starting from bucket given in st->bucket; when st->bucket is zero the 2199 * starting from bucket given in st->bucket; when st->bucket is zero the
@@ -2309,10 +2297,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2309 return rc; 2297 return rc;
2310} 2298}
2311 2299
2312static inline bool empty_bucket(struct tcp_iter_state *st) 2300static inline bool empty_bucket(const struct tcp_iter_state *st)
2313{ 2301{
2314 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 2302 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2315 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2316} 2303}
2317 2304
2318/* 2305/*
@@ -2329,7 +2316,6 @@ static void *established_get_first(struct seq_file *seq)
2329 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2316 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2330 struct sock *sk; 2317 struct sock *sk;
2331 struct hlist_nulls_node *node; 2318 struct hlist_nulls_node *node;
2332 struct inet_timewait_sock *tw;
2333 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2319 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2334 2320
2335 /* Lockless fast path for the common case of empty buckets */ 2321 /* Lockless fast path for the common case of empty buckets */
@@ -2345,18 +2331,7 @@ static void *established_get_first(struct seq_file *seq)
2345 rc = sk; 2331 rc = sk;
2346 goto out; 2332 goto out;
2347 } 2333 }
2348 st->state = TCP_SEQ_STATE_TIME_WAIT;
2349 inet_twsk_for_each(tw, node,
2350 &tcp_hashinfo.ehash[st->bucket].twchain) {
2351 if (tw->tw_family != st->family ||
2352 !net_eq(twsk_net(tw), net)) {
2353 continue;
2354 }
2355 rc = tw;
2356 goto out;
2357 }
2358 spin_unlock_bh(lock); 2334 spin_unlock_bh(lock);
2359 st->state = TCP_SEQ_STATE_ESTABLISHED;
2360 } 2335 }
2361out: 2336out:
2362 return rc; 2337 return rc;
@@ -2365,7 +2340,6 @@ out:
2365static void *established_get_next(struct seq_file *seq, void *cur) 2340static void *established_get_next(struct seq_file *seq, void *cur)
2366{ 2341{
2367 struct sock *sk = cur; 2342 struct sock *sk = cur;
2368 struct inet_timewait_sock *tw;
2369 struct hlist_nulls_node *node; 2343 struct hlist_nulls_node *node;
2370 struct tcp_iter_state *st = seq->private; 2344 struct tcp_iter_state *st = seq->private;
2371 struct net *net = seq_file_net(seq); 2345 struct net *net = seq_file_net(seq);
@@ -2373,45 +2347,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)
2373 ++st->num; 2347 ++st->num;
2374 ++st->offset; 2348 ++st->offset;
2375 2349
2376 if (st->state == TCP_SEQ_STATE_TIME_WAIT) { 2350 sk = sk_nulls_next(sk);
2377 tw = cur;
2378 tw = tw_next(tw);
2379get_tw:
2380 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2381 tw = tw_next(tw);
2382 }
2383 if (tw) {
2384 cur = tw;
2385 goto out;
2386 }
2387 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2388 st->state = TCP_SEQ_STATE_ESTABLISHED;
2389
2390 /* Look for next non empty bucket */
2391 st->offset = 0;
2392 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2393 empty_bucket(st))
2394 ;
2395 if (st->bucket > tcp_hashinfo.ehash_mask)
2396 return NULL;
2397
2398 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2399 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2400 } else
2401 sk = sk_nulls_next(sk);
2402 2351
2403 sk_nulls_for_each_from(sk, node) { 2352 sk_nulls_for_each_from(sk, node) {
2404 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 2353 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2405 goto found; 2354 return sk;
2406 } 2355 }
2407 2356
2408 st->state = TCP_SEQ_STATE_TIME_WAIT; 2357 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2409 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); 2358 ++st->bucket;
2410 goto get_tw; 2359 return established_get_first(seq);
2411found:
2412 cur = sk;
2413out:
2414 return cur;
2415} 2360}
2416 2361
2417static void *established_get_idx(struct seq_file *seq, loff_t pos) 2362static void *established_get_idx(struct seq_file *seq, loff_t pos)
@@ -2464,10 +2409,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
2464 if (rc) 2409 if (rc)
2465 break; 2410 break;
2466 st->bucket = 0; 2411 st->bucket = 0;
2412 st->state = TCP_SEQ_STATE_ESTABLISHED;
2467 /* Fallthrough */ 2413 /* Fallthrough */
2468 case TCP_SEQ_STATE_ESTABLISHED: 2414 case TCP_SEQ_STATE_ESTABLISHED:
2469 case TCP_SEQ_STATE_TIME_WAIT:
2470 st->state = TCP_SEQ_STATE_ESTABLISHED;
2471 if (st->bucket > tcp_hashinfo.ehash_mask) 2415 if (st->bucket > tcp_hashinfo.ehash_mask)
2472 break; 2416 break;
2473 rc = established_get_first(seq); 2417 rc = established_get_first(seq);
@@ -2524,7 +2468,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2524 } 2468 }
2525 break; 2469 break;
2526 case TCP_SEQ_STATE_ESTABLISHED: 2470 case TCP_SEQ_STATE_ESTABLISHED:
2527 case TCP_SEQ_STATE_TIME_WAIT:
2528 rc = established_get_next(seq, v); 2471 rc = established_get_next(seq, v);
2529 break; 2472 break;
2530 } 2473 }
@@ -2548,7 +2491,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2548 if (v != SEQ_START_TOKEN) 2491 if (v != SEQ_START_TOKEN)
2549 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); 2492 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2550 break; 2493 break;
2551 case TCP_SEQ_STATE_TIME_WAIT:
2552 case TCP_SEQ_STATE_ESTABLISHED: 2494 case TCP_SEQ_STATE_ESTABLISHED:
2553 if (v) 2495 if (v)
2554 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2496 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2707,6 +2649,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2707static int tcp4_seq_show(struct seq_file *seq, void *v) 2649static int tcp4_seq_show(struct seq_file *seq, void *v)
2708{ 2650{
2709 struct tcp_iter_state *st; 2651 struct tcp_iter_state *st;
2652 struct sock *sk = v;
2710 int len; 2653 int len;
2711 2654
2712 if (v == SEQ_START_TOKEN) { 2655 if (v == SEQ_START_TOKEN) {
@@ -2721,14 +2664,14 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
2721 switch (st->state) { 2664 switch (st->state) {
2722 case TCP_SEQ_STATE_LISTENING: 2665 case TCP_SEQ_STATE_LISTENING:
2723 case TCP_SEQ_STATE_ESTABLISHED: 2666 case TCP_SEQ_STATE_ESTABLISHED:
2724 get_tcp4_sock(v, seq, st->num, &len); 2667 if (sk->sk_state == TCP_TIME_WAIT)
2668 get_timewait4_sock(v, seq, st->num, &len);
2669 else
2670 get_tcp4_sock(v, seq, st->num, &len);
2725 break; 2671 break;
2726 case TCP_SEQ_STATE_OPENREQ: 2672 case TCP_SEQ_STATE_OPENREQ:
2727 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); 2673 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2728 break; 2674 break;
2729 case TCP_SEQ_STATE_TIME_WAIT:
2730 get_timewait4_sock(v, seq, st->num, &len);
2731 break;
2732 } 2675 }
2733 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); 2676 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2734out: 2677out:
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 066640e0ba8e..46440777e1c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -89,43 +89,36 @@ begin:
89 sk_nulls_for_each_rcu(sk, node, &head->chain) { 89 sk_nulls_for_each_rcu(sk, node, &head->chain) {
90 if (sk->sk_hash != hash) 90 if (sk->sk_hash != hash)
91 continue; 91 continue;
92 if (likely(INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { 92 if (sk->sk_state == TCP_TIME_WAIT) {
93 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) 93 if (!INET6_TW_MATCH(sk, net, saddr, daddr, ports, dif))
94 goto begintw; 94 continue;
95 } else {
96 if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
97 continue;
98 }
99 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
100 goto out;
101
102 if (sk->sk_state == TCP_TIME_WAIT) {
103 if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
104 ports, dif))) {
105 sock_gen_put(sk);
106 goto begin;
107 }
108 } else {
95 if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, 109 if (unlikely(!INET6_MATCH(sk, net, saddr, daddr,
96 ports, dif))) { 110 ports, dif))) {
97 sock_put(sk); 111 sock_put(sk);
98 goto begin; 112 goto begin;
99 } 113 }
100 goto out; 114 goto found;
101 } 115 }
102 } 116 }
103 if (get_nulls_value(node) != slot) 117 if (get_nulls_value(node) != slot)
104 goto begin; 118 goto begin;
105
106begintw:
107 /* Must check for a TIME_WAIT'er before going to listener hash. */
108 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
109 if (sk->sk_hash != hash)
110 continue;
111 if (likely(INET6_TW_MATCH(sk, net, saddr, daddr,
112 ports, dif))) {
113 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
114 sk = NULL;
115 goto out;
116 }
117 if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
118 ports, dif))) {
119 inet_twsk_put(inet_twsk(sk));
120 goto begintw;
121 }
122 goto out;
123 }
124 }
125 if (get_nulls_value(node) != slot)
126 goto begintw;
127 sk = NULL;
128out: 119out:
120 sk = NULL;
121found:
129 rcu_read_unlock(); 122 rcu_read_unlock();
130 return sk; 123 return sk;
131} 124}
@@ -248,31 +241,25 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
248 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 241 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
249 struct sock *sk2; 242 struct sock *sk2;
250 const struct hlist_nulls_node *node; 243 const struct hlist_nulls_node *node;
251 struct inet_timewait_sock *tw; 244 struct inet_timewait_sock *tw = NULL;
252 int twrefcnt = 0; 245 int twrefcnt = 0;
253 246
254 spin_lock(lock); 247 spin_lock(lock);
255 248
256 /* Check TIME-WAIT sockets first. */ 249 sk_nulls_for_each(sk2, node, &head->chain) {
257 sk_nulls_for_each(sk2, node, &head->twchain) {
258 if (sk2->sk_hash != hash) 250 if (sk2->sk_hash != hash)
259 continue; 251 continue;
260 252
261 if (likely(INET6_TW_MATCH(sk2, net, saddr, daddr, 253 if (sk2->sk_state == TCP_TIME_WAIT) {
262 ports, dif))) { 254 if (likely(INET6_TW_MATCH(sk2, net, saddr, daddr,
263 tw = inet_twsk(sk2); 255 ports, dif))) {
264 if (twsk_unique(sk, sk2, twp)) 256 tw = inet_twsk(sk2);
265 goto unique; 257 if (twsk_unique(sk, sk2, twp))
266 else 258 goto unique;
267 goto not_unique; 259 else
260 goto not_unique;
261 }
268 } 262 }
269 }
270 tw = NULL;
271
272 /* And established part... */
273 sk_nulls_for_each(sk2, node, &head->chain) {
274 if (sk2->sk_hash != hash)
275 continue;
276 if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) 263 if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif)))
277 goto not_unique; 264 goto not_unique;
278 } 265 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index dde8bad04481..528e61afaf5e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1834,6 +1834,7 @@ static void get_timewait6_sock(struct seq_file *seq,
1834static int tcp6_seq_show(struct seq_file *seq, void *v) 1834static int tcp6_seq_show(struct seq_file *seq, void *v)
1835{ 1835{
1836 struct tcp_iter_state *st; 1836 struct tcp_iter_state *st;
1837 struct sock *sk = v;
1837 1838
1838 if (v == SEQ_START_TOKEN) { 1839 if (v == SEQ_START_TOKEN) {
1839 seq_puts(seq, 1840 seq_puts(seq,
@@ -1849,14 +1850,14 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
1849 switch (st->state) { 1850 switch (st->state) {
1850 case TCP_SEQ_STATE_LISTENING: 1851 case TCP_SEQ_STATE_LISTENING:
1851 case TCP_SEQ_STATE_ESTABLISHED: 1852 case TCP_SEQ_STATE_ESTABLISHED:
1852 get_tcp6_sock(seq, v, st->num); 1853 if (sk->sk_state == TCP_TIME_WAIT)
1854 get_timewait6_sock(seq, v, st->num);
1855 else
1856 get_tcp6_sock(seq, v, st->num);
1853 break; 1857 break;
1854 case TCP_SEQ_STATE_OPENREQ: 1858 case TCP_SEQ_STATE_OPENREQ:
1855 get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); 1859 get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid);
1856 break; 1860 break;
1857 case TCP_SEQ_STATE_TIME_WAIT:
1858 get_timewait6_sock(seq, v, st->num);
1859 break;
1860 } 1861 }
1861out: 1862out:
1862 return 0; 1863 return 0;