about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/inet_hashtables.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r-- net/ipv4/inet_hashtables.c 277
1 files changed, 142 insertions, 135 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 44981906fb91..6a1045da48d2 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 35 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
36 36
37 if (tb != NULL) { 37 if (tb != NULL) {
38 tb->ib_net = hold_net(net); 38 write_pnet(&tb->ib_net, hold_net(net));
39 tb->port = snum; 39 tb->port = snum;
40 tb->fastreuse = 0; 40 tb->fastreuse = 0;
41 INIT_HLIST_HEAD(&tb->owners); 41 INIT_HLIST_HEAD(&tb->owners);
@@ -51,7 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
51{ 51{
52 if (hlist_empty(&tb->owners)) { 52 if (hlist_empty(&tb->owners)) {
53 __hlist_del(&tb->node); 53 __hlist_del(&tb->node);
54 release_net(tb->ib_net); 54 release_net(ib_net(tb));
55 kmem_cache_free(cachep, tb); 55 kmem_cache_free(cachep, tb);
56 } 56 }
57} 57}
@@ -110,33 +110,29 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
110 110
111EXPORT_SYMBOL_GPL(__inet_inherit_port); 111EXPORT_SYMBOL_GPL(__inet_inherit_port);
112 112
113/* 113static inline int compute_score(struct sock *sk, struct net *net,
114 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. 114 const unsigned short hnum, const __be32 daddr,
115 * Look, when several writers sleep and reader wakes them up, all but one 115 const int dif)
116 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
117 * this, _but_ remember, it adds useless work on UP machines (wake up each
118 * exclusive lock release). It should be ifdefed really.
119 */
120void inet_listen_wlock(struct inet_hashinfo *hashinfo)
121 __acquires(hashinfo->lhash_lock)
122{ 116{
123 write_lock(&hashinfo->lhash_lock); 117 int score = -1;
124 118 struct inet_sock *inet = inet_sk(sk);
125 if (atomic_read(&hashinfo->lhash_users)) {
126 DEFINE_WAIT(wait);
127 119
128 for (;;) { 120 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
129 prepare_to_wait_exclusive(&hashinfo->lhash_wait, 121 !ipv6_only_sock(sk)) {
130 &wait, TASK_UNINTERRUPTIBLE); 122 __be32 rcv_saddr = inet->rcv_saddr;
131 if (!atomic_read(&hashinfo->lhash_users)) 123 score = sk->sk_family == PF_INET ? 1 : 0;
132 break; 124 if (rcv_saddr) {
133 write_unlock_bh(&hashinfo->lhash_lock); 125 if (rcv_saddr != daddr)
134 schedule(); 126 return -1;
135 write_lock_bh(&hashinfo->lhash_lock); 127 score += 2;
128 }
129 if (sk->sk_bound_dev_if) {
130 if (sk->sk_bound_dev_if != dif)
131 return -1;
132 score += 2;
136 } 133 }
137
138 finish_wait(&hashinfo->lhash_wait, &wait);
139 } 134 }
135 return score;
140} 136}
141 137
142/* 138/*
@@ -145,72 +141,48 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo)
145 * remote address for the connection. So always assume those are both 141 * remote address for the connection. So always assume those are both
146 * wildcarded during the search since they can never be otherwise. 142 * wildcarded during the search since they can never be otherwise.
147 */ 143 */
148static struct sock *inet_lookup_listener_slow(struct net *net,
149 const struct hlist_head *head,
150 const __be32 daddr,
151 const unsigned short hnum,
152 const int dif)
153{
154 struct sock *result = NULL, *sk;
155 const struct hlist_node *node;
156 int hiscore = -1;
157
158 sk_for_each(sk, node, head) {
159 const struct inet_sock *inet = inet_sk(sk);
160
161 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
162 !ipv6_only_sock(sk)) {
163 const __be32 rcv_saddr = inet->rcv_saddr;
164 int score = sk->sk_family == PF_INET ? 1 : 0;
165
166 if (rcv_saddr) {
167 if (rcv_saddr != daddr)
168 continue;
169 score += 2;
170 }
171 if (sk->sk_bound_dev_if) {
172 if (sk->sk_bound_dev_if != dif)
173 continue;
174 score += 2;
175 }
176 if (score == 5)
177 return sk;
178 if (score > hiscore) {
179 hiscore = score;
180 result = sk;
181 }
182 }
183 }
184 return result;
185}
186 144
187/* Optimize the common listener case. */ 145
188struct sock *__inet_lookup_listener(struct net *net, 146struct sock *__inet_lookup_listener(struct net *net,
189 struct inet_hashinfo *hashinfo, 147 struct inet_hashinfo *hashinfo,
190 const __be32 daddr, const unsigned short hnum, 148 const __be32 daddr, const unsigned short hnum,
191 const int dif) 149 const int dif)
192{ 150{
193 struct sock *sk = NULL; 151 struct sock *sk, *result;
194 const struct hlist_head *head; 152 struct hlist_nulls_node *node;
195 153 unsigned int hash = inet_lhashfn(net, hnum);
196 read_lock(&hashinfo->lhash_lock); 154 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
197 head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; 155 int score, hiscore;
198 if (!hlist_empty(head)) { 156
199 const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); 157 rcu_read_lock();
200 158begin:
201 if (inet->num == hnum && !sk->sk_node.next && 159 result = NULL;
202 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 160 hiscore = -1;
203 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 161 sk_nulls_for_each_rcu(sk, node, &ilb->head) {
204 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) 162 score = compute_score(sk, net, hnum, daddr, dif);
205 goto sherry_cache; 163 if (score > hiscore) {
206 sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); 164 result = sk;
165 hiscore = score;
166 }
207 } 167 }
208 if (sk) { 168 /*
209sherry_cache: 169 * if the nulls value we got at the end of this lookup is
210 sock_hold(sk); 170 * not the expected one, we must restart lookup.
171 * We probably met an item that was moved to another chain.
172 */
173 if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
174 goto begin;
175 if (result) {
176 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
177 result = NULL;
178 else if (unlikely(compute_score(result, net, hnum, daddr,
179 dif) < hiscore)) {
180 sock_put(result);
181 goto begin;
182 }
211 } 183 }
212 read_unlock(&hashinfo->lhash_lock); 184 rcu_read_unlock();
213 return sk; 185 return result;
214} 186}
215EXPORT_SYMBOL_GPL(__inet_lookup_listener); 187EXPORT_SYMBOL_GPL(__inet_lookup_listener);
216 188
@@ -223,35 +195,65 @@ struct sock * __inet_lookup_established(struct net *net,
223 INET_ADDR_COOKIE(acookie, saddr, daddr) 195 INET_ADDR_COOKIE(acookie, saddr, daddr)
224 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 196 const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
225 struct sock *sk; 197 struct sock *sk;
226 const struct hlist_node *node; 198 const struct hlist_nulls_node *node;
227 /* Optimize here for direct hit, only listening connections can 199 /* Optimize here for direct hit, only listening connections can
228 * have wildcards anyways. 200 * have wildcards anyways.
229 */ 201 */
230 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 202 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
231 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); 203 unsigned int slot = hash & (hashinfo->ehash_size - 1);
232 rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); 204 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
233 205
234 prefetch(head->chain.first); 206 rcu_read_lock();
235 read_lock(lock); 207begin:
236 sk_for_each(sk, node, &head->chain) { 208 sk_nulls_for_each_rcu(sk, node, &head->chain) {
237 if (INET_MATCH(sk, net, hash, acookie, 209 if (INET_MATCH(sk, net, hash, acookie,
238 saddr, daddr, ports, dif)) 210 saddr, daddr, ports, dif)) {
239 goto hit; /* You sunk my battleship! */ 211 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
212 goto begintw;
213 if (unlikely(!INET_MATCH(sk, net, hash, acookie,
214 saddr, daddr, ports, dif))) {
215 sock_put(sk);
216 goto begin;
217 }
218 goto out;
219 }
240 } 220 }
221 /*
222 * if the nulls value we got at the end of this lookup is
223 * not the expected one, we must restart lookup.
224 * We probably met an item that was moved to another chain.
225 */
226 if (get_nulls_value(node) != slot)
227 goto begin;
241 228
229begintw:
242 /* Must check for a TIME_WAIT'er before going to listener hash. */ 230 /* Must check for a TIME_WAIT'er before going to listener hash. */
243 sk_for_each(sk, node, &head->twchain) { 231 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
244 if (INET_TW_MATCH(sk, net, hash, acookie, 232 if (INET_TW_MATCH(sk, net, hash, acookie,
245 saddr, daddr, ports, dif)) 233 saddr, daddr, ports, dif)) {
246 goto hit; 234 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
235 sk = NULL;
236 goto out;
237 }
238 if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
239 saddr, daddr, ports, dif))) {
240 sock_put(sk);
241 goto begintw;
242 }
243 goto out;
244 }
247 } 245 }
246 /*
247 * if the nulls value we got at the end of this lookup is
248 * not the expected one, we must restart lookup.
249 * We probably met an item that was moved to another chain.
250 */
251 if (get_nulls_value(node) != slot)
252 goto begintw;
248 sk = NULL; 253 sk = NULL;
249out: 254out:
250 read_unlock(lock); 255 rcu_read_unlock();
251 return sk; 256 return sk;
252hit:
253 sock_hold(sk);
254 goto out;
255} 257}
256EXPORT_SYMBOL_GPL(__inet_lookup_established); 258EXPORT_SYMBOL_GPL(__inet_lookup_established);
257 259
@@ -270,16 +272,15 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
270 struct net *net = sock_net(sk); 272 struct net *net = sock_net(sk);
271 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); 273 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
272 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 274 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
273 rwlock_t *lock = inet_ehash_lockp(hinfo, hash); 275 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
274 struct sock *sk2; 276 struct sock *sk2;
275 const struct hlist_node *node; 277 const struct hlist_nulls_node *node;
276 struct inet_timewait_sock *tw; 278 struct inet_timewait_sock *tw;
277 279
278 prefetch(head->chain.first); 280 spin_lock(lock);
279 write_lock(lock);
280 281
281 /* Check TIME-WAIT sockets first. */ 282 /* Check TIME-WAIT sockets first. */
282 sk_for_each(sk2, node, &head->twchain) { 283 sk_nulls_for_each(sk2, node, &head->twchain) {
283 tw = inet_twsk(sk2); 284 tw = inet_twsk(sk2);
284 285
285 if (INET_TW_MATCH(sk2, net, hash, acookie, 286 if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +294,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
293 tw = NULL; 294 tw = NULL;
294 295
295 /* And established part... */ 296 /* And established part... */
296 sk_for_each(sk2, node, &head->chain) { 297 sk_nulls_for_each(sk2, node, &head->chain) {
297 if (INET_MATCH(sk2, net, hash, acookie, 298 if (INET_MATCH(sk2, net, hash, acookie,
298 saddr, daddr, ports, dif)) 299 saddr, daddr, ports, dif))
299 goto not_unique; 300 goto not_unique;
@@ -306,9 +307,9 @@ unique:
306 inet->sport = htons(lport); 307 inet->sport = htons(lport);
307 sk->sk_hash = hash; 308 sk->sk_hash = hash;
308 WARN_ON(!sk_unhashed(sk)); 309 WARN_ON(!sk_unhashed(sk));
309 __sk_add_node(sk, &head->chain); 310 __sk_nulls_add_node_rcu(sk, &head->chain);
311 spin_unlock(lock);
310 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 312 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
311 write_unlock(lock);
312 313
313 if (twp) { 314 if (twp) {
314 *twp = tw; 315 *twp = tw;
@@ -324,7 +325,7 @@ unique:
324 return 0; 325 return 0;
325 326
326not_unique: 327not_unique:
327 write_unlock(lock); 328 spin_unlock(lock);
328 return -EADDRNOTAVAIL; 329 return -EADDRNOTAVAIL;
329} 330}
330 331
@@ -338,8 +339,8 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
338void __inet_hash_nolisten(struct sock *sk) 339void __inet_hash_nolisten(struct sock *sk)
339{ 340{
340 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 341 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
341 struct hlist_head *list; 342 struct hlist_nulls_head *list;
342 rwlock_t *lock; 343 spinlock_t *lock;
343 struct inet_ehash_bucket *head; 344 struct inet_ehash_bucket *head;
344 345
345 WARN_ON(!sk_unhashed(sk)); 346 WARN_ON(!sk_unhashed(sk));
@@ -349,18 +350,17 @@ void __inet_hash_nolisten(struct sock *sk)
349 list = &head->chain; 350 list = &head->chain;
350 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 351 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
351 352
352 write_lock(lock); 353 spin_lock(lock);
353 __sk_add_node(sk, list); 354 __sk_nulls_add_node_rcu(sk, list);
355 spin_unlock(lock);
354 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 356 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
355 write_unlock(lock);
356} 357}
357EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 358EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
358 359
359static void __inet_hash(struct sock *sk) 360static void __inet_hash(struct sock *sk)
360{ 361{
361 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 362 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
362 struct hlist_head *list; 363 struct inet_listen_hashbucket *ilb;
363 rwlock_t *lock;
364 364
365 if (sk->sk_state != TCP_LISTEN) { 365 if (sk->sk_state != TCP_LISTEN) {
366 __inet_hash_nolisten(sk); 366 __inet_hash_nolisten(sk);
@@ -368,14 +368,12 @@ static void __inet_hash(struct sock *sk)
368 } 368 }
369 369
370 WARN_ON(!sk_unhashed(sk)); 370 WARN_ON(!sk_unhashed(sk));
371 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 371 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
372 lock = &hashinfo->lhash_lock;
373 372
374 inet_listen_wlock(hashinfo); 373 spin_lock(&ilb->lock);
375 __sk_add_node(sk, list); 374 __sk_nulls_add_node_rcu(sk, &ilb->head);
376 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 375 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
377 write_unlock(lock); 376 spin_unlock(&ilb->lock);
378 wake_up(&hashinfo->lhash_wait);
379} 377}
380 378
381void inet_hash(struct sock *sk) 379void inet_hash(struct sock *sk)
@@ -390,27 +388,23 @@ EXPORT_SYMBOL_GPL(inet_hash);
390 388
391void inet_unhash(struct sock *sk) 389void inet_unhash(struct sock *sk)
392{ 390{
393 rwlock_t *lock;
394 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 391 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
392 spinlock_t *lock;
393 int done;
395 394
396 if (sk_unhashed(sk)) 395 if (sk_unhashed(sk))
397 goto out; 396 return;
398 397
399 if (sk->sk_state == TCP_LISTEN) { 398 if (sk->sk_state == TCP_LISTEN)
400 local_bh_disable(); 399 lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
401 inet_listen_wlock(hashinfo); 400 else
402 lock = &hashinfo->lhash_lock;
403 } else {
404 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 401 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
405 write_lock_bh(lock);
406 }
407 402
408 if (__sk_del_node_init(sk)) 403 spin_lock_bh(lock);
404 done =__sk_nulls_del_node_init_rcu(sk);
405 if (done)
409 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 406 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
410 write_unlock_bh(lock); 407 spin_unlock_bh(lock);
411out:
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&hashinfo->lhash_wait);
414} 408}
415EXPORT_SYMBOL_GPL(inet_unhash); 409EXPORT_SYMBOL_GPL(inet_unhash);
416 410
@@ -449,7 +443,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
449 * unique enough. 443 * unique enough.
450 */ 444 */
451 inet_bind_bucket_for_each(tb, node, &head->chain) { 445 inet_bind_bucket_for_each(tb, node, &head->chain) {
452 if (tb->ib_net == net && tb->port == port) { 446 if (ib_net(tb) == net && tb->port == port) {
453 WARN_ON(hlist_empty(&tb->owners)); 447 WARN_ON(hlist_empty(&tb->owners));
454 if (tb->fastreuse >= 0) 448 if (tb->fastreuse >= 0)
455 goto next_port; 449 goto next_port;
@@ -524,3 +518,16 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
524} 518}
525 519
526EXPORT_SYMBOL_GPL(inet_hash_connect); 520EXPORT_SYMBOL_GPL(inet_hash_connect);
521
522void inet_hashinfo_init(struct inet_hashinfo *h)
523{
524 int i;
525
526 for (i = 0; i < INET_LHTABLE_SIZE; i++) {
527 spin_lock_init(&h->listening_hash[i].lock);
528 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
529 i + LISTENING_NULLS_BASE);
530 }
531}
532
533EXPORT_SYMBOL_GPL(inet_hashinfo_init);