diff options
Diffstat (limited to 'net/ipv4/inet_timewait_sock.c')
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 112 |
1 files changed, 76 insertions, 36 deletions
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781f35cd..0fdf45e4c90c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,22 +14,33 @@ | |||
14 | #include <net/inet_timewait_sock.h> | 14 | #include <net/inet_timewait_sock.h> |
15 | #include <net/ip.h> | 15 | #include <net/ip.h> |
16 | 16 | ||
17 | |||
18 | /* | ||
19 | * unhash a timewait socket from established hash | ||
20 | * lock must be held by caller | |
21 | */ | ||
22 | int inet_twsk_unhash(struct inet_timewait_sock *tw) | ||
23 | { | ||
24 | if (hlist_nulls_unhashed(&tw->tw_node)) | ||
25 | return 0; | ||
26 | |||
27 | hlist_nulls_del_rcu(&tw->tw_node); | ||
28 | sk_nulls_node_init(&tw->tw_node); | ||
29 | return 1; | ||
30 | } | ||
31 | |||
17 | /* Must be called with locally disabled BHs. */ | 32 | /* Must be called with locally disabled BHs. */ |
18 | static void __inet_twsk_kill(struct inet_timewait_sock *tw, | 33 | static void __inet_twsk_kill(struct inet_timewait_sock *tw, |
19 | struct inet_hashinfo *hashinfo) | 34 | struct inet_hashinfo *hashinfo) |
20 | { | 35 | { |
21 | struct inet_bind_hashbucket *bhead; | 36 | struct inet_bind_hashbucket *bhead; |
22 | struct inet_bind_bucket *tb; | 37 | struct inet_bind_bucket *tb; |
38 | int refcnt; | ||
23 | /* Unlink from established hashes. */ | 39 | /* Unlink from established hashes. */ |
24 | spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); | 40 | spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); |
25 | 41 | ||
26 | spin_lock(lock); | 42 | spin_lock(lock); |
27 | if (hlist_nulls_unhashed(&tw->tw_node)) { | 43 | refcnt = inet_twsk_unhash(tw); |
28 | spin_unlock(lock); | ||
29 | return; | ||
30 | } | ||
31 | hlist_nulls_del_rcu(&tw->tw_node); | ||
32 | sk_nulls_node_init(&tw->tw_node); | ||
33 | spin_unlock(lock); | 44 | spin_unlock(lock); |
34 | 45 | ||
35 | /* Disassociate with bind bucket. */ | 46 | /* Disassociate with bind bucket. */ |
@@ -37,9 +48,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, | |||
37 | hashinfo->bhash_size)]; | 48 | hashinfo->bhash_size)]; |
38 | spin_lock(&bhead->lock); | 49 | spin_lock(&bhead->lock); |
39 | tb = tw->tw_tb; | 50 | tb = tw->tw_tb; |
40 | __hlist_del(&tw->tw_bind_node); | 51 | if (tb) { |
41 | tw->tw_tb = NULL; | 52 | __hlist_del(&tw->tw_bind_node); |
42 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); | 53 | tw->tw_tb = NULL; |
54 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); | ||
55 | refcnt++; | ||
56 | } | ||
43 | spin_unlock(&bhead->lock); | 57 | spin_unlock(&bhead->lock); |
44 | #ifdef SOCK_REFCNT_DEBUG | 58 | #ifdef SOCK_REFCNT_DEBUG |
45 | if (atomic_read(&tw->tw_refcnt) != 1) { | 59 | if (atomic_read(&tw->tw_refcnt) != 1) { |
@@ -47,7 +61,10 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, | |||
47 | tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); | 61 | tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); |
48 | } | 62 | } |
49 | #endif | 63 | #endif |
50 | inet_twsk_put(tw); | 64 | while (refcnt) { |
65 | inet_twsk_put(tw); | ||
66 | refcnt--; | ||
67 | } | ||
51 | } | 68 | } |
52 | 69 | ||
53 | static noinline void inet_twsk_free(struct inet_timewait_sock *tw) | 70 | static noinline void inet_twsk_free(struct inet_timewait_sock *tw) |
@@ -86,7 +103,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | |||
86 | Note, that any socket with inet->num != 0 MUST be bound in | 103 | Note, that any socket with inet->num != 0 MUST be bound in |
87 | binding cache, even if it is closed. | 104 | binding cache, even if it is closed. |
88 | */ | 105 | */ |
89 | bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, | 106 | bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, |
90 | hashinfo->bhash_size)]; | 107 | hashinfo->bhash_size)]; |
91 | spin_lock(&bhead->lock); | 108 | spin_lock(&bhead->lock); |
92 | tw->tw_tb = icsk->icsk_bind_hash; | 109 | tw->tw_tb = icsk->icsk_bind_hash; |
@@ -101,13 +118,22 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, | |||
101 | * Should be done before removing sk from established chain | 118 | * Should be done before removing sk from established chain |
102 | * because readers are lockless and search established first. | 119 | * because readers are lockless and search established first. |
103 | */ | 120 | */ |
104 | atomic_inc(&tw->tw_refcnt); | ||
105 | inet_twsk_add_node_rcu(tw, &ehead->twchain); | 121 | inet_twsk_add_node_rcu(tw, &ehead->twchain); |
106 | 122 | ||
107 | /* Step 3: Remove SK from established hash. */ | 123 | /* Step 3: Remove SK from established hash. */ |
108 | if (__sk_nulls_del_node_init_rcu(sk)) | 124 | if (__sk_nulls_del_node_init_rcu(sk)) |
109 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 125 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
110 | 126 | ||
127 | /* | ||
128 | * Notes : | ||
129 | * - We initially set tw_refcnt to 0 in inet_twsk_alloc() | ||
130 | * - We add one reference for the bhash link | ||
131 | * - We add one reference for the ehash link | ||
132 | * - We want this refcnt update done before allowing other | ||
133 | * threads to find this tw in ehash chain. | ||
134 | */ | ||
135 | atomic_add(1 + 1 + 1, &tw->tw_refcnt); | ||
136 | |||
111 | spin_unlock(lock); | 137 | spin_unlock(lock); |
112 | } | 138 | } |
113 | 139 | ||
@@ -124,14 +150,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat | |||
124 | kmemcheck_annotate_bitfield(tw, flags); | 150 | kmemcheck_annotate_bitfield(tw, flags); |
125 | 151 | ||
126 | /* Give us an identity. */ | 152 | /* Give us an identity. */ |
127 | tw->tw_daddr = inet->daddr; | 153 | tw->tw_daddr = inet->inet_daddr; |
128 | tw->tw_rcv_saddr = inet->rcv_saddr; | 154 | tw->tw_rcv_saddr = inet->inet_rcv_saddr; |
129 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | 155 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; |
130 | tw->tw_num = inet->num; | 156 | tw->tw_num = inet->inet_num; |
131 | tw->tw_state = TCP_TIME_WAIT; | 157 | tw->tw_state = TCP_TIME_WAIT; |
132 | tw->tw_substate = state; | 158 | tw->tw_substate = state; |
133 | tw->tw_sport = inet->sport; | 159 | tw->tw_sport = inet->inet_sport; |
134 | tw->tw_dport = inet->dport; | 160 | tw->tw_dport = inet->inet_dport; |
135 | tw->tw_family = sk->sk_family; | 161 | tw->tw_family = sk->sk_family; |
136 | tw->tw_reuse = sk->sk_reuse; | 162 | tw->tw_reuse = sk->sk_reuse; |
137 | tw->tw_hash = sk->sk_hash; | 163 | tw->tw_hash = sk->sk_hash; |
@@ -139,7 +165,12 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat | |||
139 | tw->tw_transparent = inet->transparent; | 165 | tw->tw_transparent = inet->transparent; |
140 | tw->tw_prot = sk->sk_prot_creator; | 166 | tw->tw_prot = sk->sk_prot_creator; |
141 | twsk_net_set(tw, hold_net(sock_net(sk))); | 167 | twsk_net_set(tw, hold_net(sock_net(sk))); |
142 | atomic_set(&tw->tw_refcnt, 1); | 168 | /* |
169 | * Because we use RCU lookups, we should not set tw_refcnt | ||
170 | * to a non-zero value before everything is set up for this | |
171 | * timewait socket. | ||
172 | */ | ||
173 | atomic_set(&tw->tw_refcnt, 0); | ||
143 | inet_twsk_dead_node_init(tw); | 174 | inet_twsk_dead_node_init(tw); |
144 | __module_get(tw->tw_prot->owner); | 175 | __module_get(tw->tw_prot->owner); |
145 | } | 176 | } |
@@ -421,37 +452,46 @@ out: | |||
421 | 452 | ||
422 | EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); | 453 | EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); |
423 | 454 | ||
424 | void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, | 455 | void inet_twsk_purge(struct inet_hashinfo *hashinfo, |
425 | struct inet_timewait_death_row *twdr, int family) | 456 | struct inet_timewait_death_row *twdr, int family) |
426 | { | 457 | { |
427 | struct inet_timewait_sock *tw; | 458 | struct inet_timewait_sock *tw; |
428 | struct sock *sk; | 459 | struct sock *sk; |
429 | struct hlist_nulls_node *node; | 460 | struct hlist_nulls_node *node; |
430 | int h; | 461 | unsigned int slot; |
431 | 462 | ||
432 | local_bh_disable(); | 463 | for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { |
433 | for (h = 0; h < (hashinfo->ehash_size); h++) { | 464 | struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; |
434 | struct inet_ehash_bucket *head = | 465 | restart_rcu: |
435 | inet_ehash_bucket(hashinfo, h); | 466 | rcu_read_lock(); |
436 | spinlock_t *lock = inet_ehash_lockp(hashinfo, h); | ||
437 | restart: | 467 | restart: |
438 | spin_lock(lock); | 468 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { |
439 | sk_nulls_for_each(sk, node, &head->twchain) { | ||
440 | |||
441 | tw = inet_twsk(sk); | 469 | tw = inet_twsk(sk); |
442 | if (!net_eq(twsk_net(tw), net) || | 470 | if ((tw->tw_family != family) || |
443 | tw->tw_family != family) | 471 | atomic_read(&twsk_net(tw)->count)) |
472 | continue; | ||
473 | |||
474 | if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) | ||
444 | continue; | 475 | continue; |
445 | 476 | ||
446 | atomic_inc(&tw->tw_refcnt); | 477 | if (unlikely((tw->tw_family != family) || |
447 | spin_unlock(lock); | 478 | atomic_read(&twsk_net(tw)->count))) { |
479 | inet_twsk_put(tw); | ||
480 | goto restart; | ||
481 | } | ||
482 | |||
483 | rcu_read_unlock(); | ||
448 | inet_twsk_deschedule(tw, twdr); | 484 | inet_twsk_deschedule(tw, twdr); |
449 | inet_twsk_put(tw); | 485 | inet_twsk_put(tw); |
450 | 486 | goto restart_rcu; | |
451 | goto restart; | ||
452 | } | 487 | } |
453 | spin_unlock(lock); | 488 | /* If the nulls value we got at the end of this lookup is |
489 | * not the expected one, we must restart lookup. | ||
490 | * We probably met an item that was moved to another chain. | ||
491 | */ | ||
492 | if (get_nulls_value(node) != slot) | ||
493 | goto restart; | ||
494 | rcu_read_unlock(); | ||
454 | } | 495 | } |
455 | local_bh_enable(); | ||
456 | } | 496 | } |
457 | EXPORT_SYMBOL_GPL(inet_twsk_purge); | 497 | EXPORT_SYMBOL_GPL(inet_twsk_purge); |