Diffstat (limited to 'lib/rhashtable.c')
-rw-r--r--   lib/rhashtable.c | 170
1 file changed, 69 insertions, 101 deletions
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 71fd0dd45ce3..cea4244e032b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * Based on the following paper:
@@ -34,12 +34,17 @@
 enum {
 	RHT_LOCK_NORMAL,
 	RHT_LOCK_NESTED,
-	RHT_LOCK_NESTED2,
 };
 
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
+ * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
+ * a single lock always covers both buckets which may both contains
+ * entries which link to the same bucket of the old table during resizing.
+ * This allows to simplify the locking as locking the bucket in both
+ * tables during resize always guarantee protection.
+ *
  * IMPORTANT: When holding the bucket lock of both the old and new table
  * during expansions and shrinking, the old bucket lock must always be
  * acquired first.
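The guarantee described in the new comment is plain power-of-two arithmetic: if a table of size S gets at most S/2 bucket locks and a lock is chosen by masking the hash with the lock count minus one (which is how bucket_lock() elsewhere in this file appears to do it via tbl->locks_mask; that mapping is an assumption about code not shown in this diff), then buckets h and h + S/2 can never select different locks. A standalone sketch in plain userspace C, not kernel code, that checks this:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int size = 8;			/* table size, power of two */
	unsigned int nlocks = size / 2;		/* the cap introduced in the next hunk */
	unsigned int mask = nlocks - 1;		/* assumed mapping: lock = hash & mask */

	for (unsigned int h = 0; h < size / 2; h++) {
		unsigned int sibling = h + size / 2;

		/* The two buckets that may both hold entries linking to the
		 * same bucket of the other table during a resize always share
		 * one lock, because size / 2 is a multiple of nlocks.
		 */
		assert((h & mask) == (sibling & mask));
		printf("buckets %u and %u -> lock %u\n", h, sibling, h & mask);
	}
	return 0;
}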
@@ -128,8 +133,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
 	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
 	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 
-	/* Never allocate more than one lock per bucket */
-	size = min_t(unsigned int, size, tbl->size);
+	/* Never allocate more than 0.5 locks per bucket */
+	size = min_t(unsigned int, size, tbl->size >> 1);
 
 	if (sizeof(spinlock_t) != 0) {
 #ifdef CONFIG_NUMA
@@ -211,13 +216,36 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
-static void hashtable_chain_unzip(const struct rhashtable *ht,
+static void lock_buckets(struct bucket_table *new_tbl,
+			 struct bucket_table *old_tbl, unsigned int hash)
+	__acquires(old_bucket_lock)
+{
+	spin_lock_bh(bucket_lock(old_tbl, hash));
+	if (new_tbl != old_tbl)
+		spin_lock_bh_nested(bucket_lock(new_tbl, hash),
+				    RHT_LOCK_NESTED);
+}
+
+static void unlock_buckets(struct bucket_table *new_tbl,
+			   struct bucket_table *old_tbl, unsigned int hash)
+	__releases(old_bucket_lock)
+{
+	if (new_tbl != old_tbl)
+		spin_unlock_bh(bucket_lock(new_tbl, hash));
+	spin_unlock_bh(bucket_lock(old_tbl, hash));
+}
+
+/**
+ * Unlink entries on bucket which hash to different bucket.
+ *
+ * Returns true if no more work needs to be performed on the bucket.
+ */
+static bool hashtable_chain_unzip(const struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
 				  struct bucket_table *old_tbl,
 				  size_t old_hash)
 {
 	struct rhash_head *he, *p, *next;
-	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
 	unsigned int new_hash, new_hash2;
 
 	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
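lock_buckets() and unlock_buckets() lean on the bucket_lock() helper defined earlier in lib/rhashtable.c, which this patch does not touch and which the hunks above therefore do not show. For orientation, it selects the lock by masking the hash with the per-table locks_mask, roughly:

/* Not part of this diff; shown for reference and possibly paraphrased. */
static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
{
	return &tbl->locks[hash & tbl->locks_mask];
}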
@@ -226,10 +254,10 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 				   old_hash);
 	if (rht_is_a_nulls(p))
-		return;
+		return false;
 
-	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+	new_hash = head_hashfn(ht, new_tbl, p);
+	ASSERT_BUCKET_LOCK(new_tbl, new_hash);
 
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
@@ -237,22 +265,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 		new_hash2 = head_hashfn(ht, new_tbl, he);
+		ASSERT_BUCKET_LOCK(new_tbl, new_hash2);
+
 		if (new_hash != new_hash2)
 			break;
 		p = he;
 	}
 	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
 
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-
-	/* If we have encountered an entry that maps to a different bucket in
-	 * the new table, lock down that bucket as well as we might cut off
-	 * the end of the chain.
-	 */
-	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
-
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
@@ -271,21 +291,16 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rcu_assign_pointer(p->next, next);
 
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_unlock_bh(new_bucket_lock2);
-	spin_unlock_bh(new_bucket_lock);
+	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+				   old_hash);
+
+	return !rht_is_a_nulls(p);
 }
 
 static void link_old_to_new(struct bucket_table *new_tbl,
 			    unsigned int new_hash, struct rhash_head *entry)
 {
-	spinlock_t *new_bucket_lock;
-
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
-	spin_unlock_bh(new_bucket_lock);
 }
 
 /**
@@ -308,7 +323,6 @@ int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
-	spinlock_t *old_bucket_lock;
 	unsigned int new_hash, old_hash;
 	bool complete = false;
 
@@ -338,16 +352,14 @@ int rhashtable_expand(struct rhashtable *ht)
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 		old_hash = rht_bucket_index(old_tbl, new_hash);
-		old_bucket_lock = bucket_lock(old_tbl, old_hash);
-
-		spin_lock_bh(old_bucket_lock);
+		lock_buckets(new_tbl, old_tbl, new_hash);
 		rht_for_each(he, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
 				link_old_to_new(new_tbl, new_hash, he);
 				break;
 			}
 		}
-		spin_unlock_bh(old_bucket_lock);
+		unlock_buckets(new_tbl, old_tbl, new_hash);
 	}
 
 	/* Publish the new table pointer. Lookups may now traverse
@@ -370,18 +382,13 @@ int rhashtable_expand(struct rhashtable *ht)
 		 */
 		complete = true;
 		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
-			struct rhash_head *head;
+			lock_buckets(new_tbl, old_tbl, old_hash);
 
-			old_bucket_lock = bucket_lock(old_tbl, old_hash);
-			spin_lock_bh(old_bucket_lock);
-
-			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
-			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
-						      old_tbl, old_hash);
-			if (!rht_is_a_nulls(head))
+			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
+						  old_hash))
 				complete = false;
 
-			spin_unlock_bh(old_bucket_lock);
+			unlock_buckets(new_tbl, old_tbl, old_hash);
 		}
 	}
 
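The complete flag set in this hunk is consumed by a retry loop in rhashtable_expand() whose surrounding lines fall outside the context shown. In rough, simplified form (a sketch based on lib/rhashtable.c of this period, not the exact code), the unzip pass now repeats until hashtable_chain_unzip() reports every old bucket as drained:

	complete = false;
	while (!complete && !ht->being_destroyed) {
		/* Wait for readers; all new readers see the new table. */
		synchronize_rcu();

		complete = true;
		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
			lock_buckets(new_tbl, old_tbl, old_hash);

			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
						  old_hash))
				complete = false;

			unlock_buckets(new_tbl, old_tbl, old_hash);
		}
	}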
@@ -409,7 +416,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
-	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
 	unsigned int new_hash;
 
 	ASSERT_RHT_MUTEX(ht);
@@ -427,36 +433,16 @@ int rhashtable_shrink(struct rhashtable *ht)
 	 * always divide the size in half when shrinking, each bucket
 	 * in the new table maps to exactly two buckets in the old
 	 * table.
-	 *
-	 * As removals can occur concurrently on the old table, we need
-	 * to lock down both matching buckets in the old table.
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
-		old_bucket_lock1 = bucket_lock(tbl, new_hash);
-		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
-		new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-		spin_lock_bh(old_bucket_lock1);
-
-		/* Depending on the lock per buckets mapping, the bucket in
-		 * the lower and upper region may map to the same lock.
-		 */
-		if (old_bucket_lock1 != old_bucket_lock2) {
-			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
-		} else {
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-		}
+		lock_buckets(new_tbl, tbl, new_hash);
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash + new_tbl->size]);
 
-		spin_unlock_bh(new_bucket_lock);
-		if (old_bucket_lock1 != old_bucket_lock2)
-			spin_unlock_bh(old_bucket_lock2);
-		spin_unlock_bh(old_bucket_lock1);
+		unlock_buckets(new_tbl, tbl, new_hash);
 	}
 
 	/* Publish the new, valid hash table */
@@ -547,19 +533,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
  */
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
-	spinlock_t *lock;
+	struct bucket_table *tbl, *old_tbl;
 	unsigned hash;
 
 	rcu_read_lock();
 
 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = head_hashfn(ht, tbl, obj);
-	lock = bucket_lock(tbl, hash);
 
-	spin_lock_bh(lock);
+	lock_buckets(tbl, old_tbl, hash);
 	__rhashtable_insert(ht, obj, tbl, hash);
-	spin_unlock_bh(lock);
+	unlock_buckets(tbl, old_tbl, hash);
 
 	rcu_read_unlock();
 }
@@ -582,21 +567,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  */
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
+	struct bucket_table *tbl, *new_tbl, *old_tbl;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	spinlock_t *lock;
-	unsigned int hash;
+	unsigned int hash, new_hash;
 	bool ret = false;
 
 	rcu_read_lock();
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	hash = head_hashfn(ht, tbl, obj);
-
-	lock = bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	new_hash = head_hashfn(ht, new_tbl, obj);
 
+	lock_buckets(new_tbl, old_tbl, new_hash);
 restart:
+	hash = rht_bucket_index(tbl, new_hash);
 	pprev = &tbl->buckets[hash];
 	rht_for_each(he, tbl, hash) {
 		if (he != obj) {
@@ -615,18 +599,12 @@ restart:
 	 * resizing. Thus traversing both is fine and the added cost is
 	 * very rare.
 	 */
-	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
-		spin_unlock_bh(lock);
-
-		tbl = rht_dereference_rcu(ht->future_tbl, ht);
-		hash = head_hashfn(ht, tbl, obj);
-
-		lock = bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
+	if (tbl != new_tbl) {
+		tbl = new_tbl;
 		goto restart;
 	}
 
-	spin_unlock_bh(lock);
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (ret) {
 		atomic_dec(&ht->nelems);
@@ -782,24 +760,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 				      void *arg)
 {
 	struct bucket_table *new_tbl, *old_tbl;
-	spinlock_t *new_bucket_lock, *old_bucket_lock;
-	u32 new_hash, old_hash;
+	u32 new_hash;
 	bool success = true;
 
 	BUG_ON(!ht->p.key_len);
 
 	rcu_read_lock();
-
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	old_hash = head_hashfn(ht, old_tbl, obj);
-	old_bucket_lock = bucket_lock(old_tbl, old_hash);
-	spin_lock_bh(old_bucket_lock);
-
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	new_hash = head_hashfn(ht, new_tbl, obj);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-	if (unlikely(old_tbl != new_tbl))
-		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	lock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
 				      compare, arg)) {
@@ -810,10 +781,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 	__rhashtable_insert(ht, obj, new_tbl, new_hash);
 
 exit:
-	if (unlikely(old_tbl != new_tbl))
-		spin_unlock_bh(new_bucket_lock);
-	spin_unlock_bh(old_bucket_lock);
-
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 	rcu_read_unlock();
 
 	return success;