author	Thomas Graf <tgraf@suug.ch>	2015-02-04 20:03:32 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-06 18:18:34 -0500
commit	a5ec68e3b8f2c95ea1a5d23dd543abbe0c8d0624 (patch)
tree	d9229fd3b7fd4b3321dded59e7bb4b8925e37cc9
parent	c88455ce50ae4224d84960ce2baa53e61580df27 (diff)
rhashtable: Use a single bucket lock for sibling buckets
rhashtable currently allows up to one bucket lock per bucket. This requires
multiple levels of complicated nested locking because, when resizing, a
single bucket of the smaller table maps to two buckets in the larger table.
So far rhashtable has explicitly locked both buckets in the larger table.

By excluding the highest bit of the hash from the bucket lock map, and thus
only allowing locks to buckets in a ratio of 1:2, the locking can be
simplified considerably without losing the benefits of multiple locks.
Larger tables which benefit from multiple locks will not have a single lock
per bucket anyway.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	lib/rhashtable.c	170
1 file changed, 69 insertions(+), 101 deletions(-)
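Why the size/2 cap is sufficient: during an expansion, bucket old_hash of the
old table unzips into buckets old_hash and old_hash + old_size of the doubled
table, and those two indices differ only in their highest bit. Since the lock
index is the bucket index masked by (nlocks - 1), and nlocks never exceeds
half the table size, that highest bit never reaches the lock map, so both
sibling buckets always select the same lock. The standalone user-space sketch
below (not part of the patch; sizes and names are illustrative) checks exactly
this invariant:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: rhashtable picks a bucket lock by masking the bucket
 * index with (number of locks - 1), the lock count being a power of two.
 * After this patch the lock count is additionally capped at size / 2.
 */
static uint32_t lock_index(uint32_t bucket, uint32_t nlocks)
{
	return bucket & (nlocks - 1);
}

int main(void)
{
	const uint32_t old_size = 64;		/* table size before expand */
	const uint32_t new_size = 2 * old_size;	/* doubled on expand */
	const uint32_t nlocks = new_size / 2;	/* the new size >> 1 cap */

	for (uint32_t old_hash = 0; old_hash < old_size; old_hash++) {
		/* Bucket old_hash of the old table unzips into buckets
		 * old_hash and old_hash + old_size of the new table; they
		 * differ only in the highest bit, which the cap masks out,
		 * so one lock always covers the sibling pair.
		 */
		assert(lock_index(old_hash, nlocks) ==
		       lock_index(old_hash + old_size, nlocks));
	}
	printf("every sibling bucket pair shares one lock\n");
	return 0;
}

The asserts hold for any power-of-two old_size, which is why a single
lock_buckets()/unlock_buckets() pair in the patch below can replace the
nested two- and three-lock dances.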
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 71fd0dd45ce3..cea4244e032b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * Based on the following paper:
@@ -34,12 +34,17 @@
 enum {
 	RHT_LOCK_NORMAL,
 	RHT_LOCK_NESTED,
-	RHT_LOCK_NESTED2,
 };
 
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
+ * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
+ * a single lock always covers both buckets which may both contains
+ * entries which link to the same bucket of the old table during resizing.
+ * This allows to simplify the locking as locking the bucket in both
+ * tables during resize always guarantee protection.
+ *
  * IMPORTANT: When holding the bucket lock of both the old and new table
  * during expansions and shrinking, the old bucket lock must always be
  * acquired first.
@@ -128,8 +133,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
 	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
 	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 
-	/* Never allocate more than one lock per bucket */
-	size = min_t(unsigned int, size, tbl->size);
+	/* Never allocate more than 0.5 locks per bucket */
+	size = min_t(unsigned int, size, tbl->size >> 1);
 
 	if (sizeof(spinlock_t) != 0) {
 #ifdef CONFIG_NUMA
@@ -211,13 +216,36 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
-static void hashtable_chain_unzip(const struct rhashtable *ht,
+static void lock_buckets(struct bucket_table *new_tbl,
+			 struct bucket_table *old_tbl, unsigned int hash)
+	__acquires(old_bucket_lock)
+{
+	spin_lock_bh(bucket_lock(old_tbl, hash));
+	if (new_tbl != old_tbl)
+		spin_lock_bh_nested(bucket_lock(new_tbl, hash),
+				    RHT_LOCK_NESTED);
+}
+
+static void unlock_buckets(struct bucket_table *new_tbl,
+			   struct bucket_table *old_tbl, unsigned int hash)
+	__releases(old_bucket_lock)
+{
+	if (new_tbl != old_tbl)
+		spin_unlock_bh(bucket_lock(new_tbl, hash));
+	spin_unlock_bh(bucket_lock(old_tbl, hash));
+}
+
+/**
+ * Unlink entries on bucket which hash to different bucket.
+ *
+ * Returns true if no more work needs to be performed on the bucket.
+ */
+static bool hashtable_chain_unzip(const struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
 				  struct bucket_table *old_tbl,
 				  size_t old_hash)
 {
 	struct rhash_head *he, *p, *next;
-	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
 	unsigned int new_hash, new_hash2;
 
 	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
@@ -226,10 +254,10 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 				   old_hash);
 	if (rht_is_a_nulls(p))
-		return;
+		return false;
 
-	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+	new_hash = head_hashfn(ht, new_tbl, p);
+	ASSERT_BUCKET_LOCK(new_tbl, new_hash);
 
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
@@ -237,22 +265,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 		new_hash2 = head_hashfn(ht, new_tbl, he);
+		ASSERT_BUCKET_LOCK(new_tbl, new_hash2);
+
 		if (new_hash != new_hash2)
 			break;
 		p = he;
 	}
 	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
 
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-
-	/* If we have encountered an entry that maps to a different bucket in
-	 * the new table, lock down that bucket as well as we might cut off
-	 * the end of the chain.
-	 */
-	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
-
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
@@ -271,21 +291,16 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rcu_assign_pointer(p->next, next);
 
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_unlock_bh(new_bucket_lock2);
-	spin_unlock_bh(new_bucket_lock);
+	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+				   old_hash);
+
+	return !rht_is_a_nulls(p);
 }
 
 static void link_old_to_new(struct bucket_table *new_tbl,
 			    unsigned int new_hash, struct rhash_head *entry)
 {
-	spinlock_t *new_bucket_lock;
-
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
-	spin_unlock_bh(new_bucket_lock);
 }
 
 /**
@@ -308,7 +323,6 @@ int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
-	spinlock_t *old_bucket_lock;
 	unsigned int new_hash, old_hash;
 	bool complete = false;
 
@@ -338,16 +352,14 @@ int rhashtable_expand(struct rhashtable *ht)
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 		old_hash = rht_bucket_index(old_tbl, new_hash);
-		old_bucket_lock = bucket_lock(old_tbl, old_hash);
-
-		spin_lock_bh(old_bucket_lock);
+		lock_buckets(new_tbl, old_tbl, new_hash);
 		rht_for_each(he, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
 				link_old_to_new(new_tbl, new_hash, he);
 				break;
 			}
 		}
-		spin_unlock_bh(old_bucket_lock);
+		unlock_buckets(new_tbl, old_tbl, new_hash);
 	}
 
 	/* Publish the new table pointer. Lookups may now traverse
@@ -370,18 +382,13 @@ int rhashtable_expand(struct rhashtable *ht)
 		 */
 		complete = true;
 		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
-			struct rhash_head *head;
+			lock_buckets(new_tbl, old_tbl, old_hash);
 
-			old_bucket_lock = bucket_lock(old_tbl, old_hash);
-			spin_lock_bh(old_bucket_lock);
-
-			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
-			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
-						      old_tbl, old_hash);
-			if (!rht_is_a_nulls(head))
+			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
+						  old_hash))
 				complete = false;
 
-			spin_unlock_bh(old_bucket_lock);
+			unlock_buckets(new_tbl, old_tbl, old_hash);
 		}
 	}
 
@@ -409,7 +416,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
-	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
 	unsigned int new_hash;
 
 	ASSERT_RHT_MUTEX(ht);
@@ -427,36 +433,16 @@ int rhashtable_shrink(struct rhashtable *ht)
 	 * always divide the size in half when shrinking, each bucket
 	 * in the new table maps to exactly two buckets in the old
 	 * table.
-	 *
-	 * As removals can occur concurrently on the old table, we need
-	 * to lock down both matching buckets in the old table.
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
-		old_bucket_lock1 = bucket_lock(tbl, new_hash);
-		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
-		new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-		spin_lock_bh(old_bucket_lock1);
-
-		/* Depending on the lock per buckets mapping, the bucket in
-		 * the lower and upper region may map to the same lock.
-		 */
-		if (old_bucket_lock1 != old_bucket_lock2) {
-			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
-		} else {
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-		}
+		lock_buckets(new_tbl, tbl, new_hash);
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash + new_tbl->size]);
 
-		spin_unlock_bh(new_bucket_lock);
-		if (old_bucket_lock1 != old_bucket_lock2)
-			spin_unlock_bh(old_bucket_lock2);
-		spin_unlock_bh(old_bucket_lock1);
+		unlock_buckets(new_tbl, tbl, new_hash);
 	}
 
 	/* Publish the new, valid hash table */
@@ -547,19 +533,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
  */
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
-	spinlock_t *lock;
+	struct bucket_table *tbl, *old_tbl;
 	unsigned hash;
 
 	rcu_read_lock();
 
 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = head_hashfn(ht, tbl, obj);
-	lock = bucket_lock(tbl, hash);
 
-	spin_lock_bh(lock);
+	lock_buckets(tbl, old_tbl, hash);
 	__rhashtable_insert(ht, obj, tbl, hash);
-	spin_unlock_bh(lock);
+	unlock_buckets(tbl, old_tbl, hash);
 
 	rcu_read_unlock();
 }
@@ -582,21 +567,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  */
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
+	struct bucket_table *tbl, *new_tbl, *old_tbl;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	spinlock_t *lock;
-	unsigned int hash;
+	unsigned int hash, new_hash;
 	bool ret = false;
 
 	rcu_read_lock();
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	hash = head_hashfn(ht, tbl, obj);
-
-	lock = bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	new_hash = head_hashfn(ht, new_tbl, obj);
 
+	lock_buckets(new_tbl, old_tbl, new_hash);
 restart:
+	hash = rht_bucket_index(tbl, new_hash);
 	pprev = &tbl->buckets[hash];
 	rht_for_each(he, tbl, hash) {
 		if (he != obj) {
@@ -615,18 +599,12 @@ restart:
 	 * resizing. Thus traversing both is fine and the added cost is
 	 * very rare.
 	 */
-	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
-		spin_unlock_bh(lock);
-
-		tbl = rht_dereference_rcu(ht->future_tbl, ht);
-		hash = head_hashfn(ht, tbl, obj);
-
-		lock = bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
+	if (tbl != new_tbl) {
+		tbl = new_tbl;
 		goto restart;
 	}
 
-	spin_unlock_bh(lock);
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (ret) {
 		atomic_dec(&ht->nelems);
@@ -782,24 +760,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 				      void *arg)
 {
 	struct bucket_table *new_tbl, *old_tbl;
-	spinlock_t *new_bucket_lock, *old_bucket_lock;
-	u32 new_hash, old_hash;
+	u32 new_hash;
 	bool success = true;
 
 	BUG_ON(!ht->p.key_len);
 
 	rcu_read_lock();
-
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	old_hash = head_hashfn(ht, old_tbl, obj);
-	old_bucket_lock = bucket_lock(old_tbl, old_hash);
-	spin_lock_bh(old_bucket_lock);
-
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	new_hash = head_hashfn(ht, new_tbl, obj);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-	if (unlikely(old_tbl != new_tbl))
-		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	lock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
 				      compare, arg)) {
@@ -810,10 +781,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 	__rhashtable_insert(ht, obj, new_tbl, new_hash);
 
 exit:
-	if (unlikely(old_tbl != new_tbl))
-		spin_unlock_bh(new_bucket_lock);
-	spin_unlock_bh(old_bucket_lock);
-
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 	rcu_read_unlock();
 
 	return success;
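For reference, a user-space model of the resulting locking discipline, with
pthread spinlocks standing in for the kernel's spin_lock_bh() and
spin_lock_bh_nested(); the helper names are taken from the patch, but the
bucket_table layout shown here is a deliberately simplified assumption:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the kernel's struct bucket_table; only the
 * fields needed to model the lock mapping are kept.
 */
struct bucket_table {
	unsigned int size;		/* number of hash buckets */
	unsigned int locks_mask;	/* nlocks - 1, with nlocks <= size / 2 */
	pthread_spinlock_t *locks;
};

static pthread_spinlock_t *bucket_lock(struct bucket_table *tbl,
				       unsigned int hash)
{
	return &tbl->locks[hash & tbl->locks_mask];
}

/* Old-table lock first; the new-table lock is only taken while a resize
 * is in flight (new_tbl != old_tbl), mirroring the patch.
 */
static void lock_buckets(struct bucket_table *new_tbl,
			 struct bucket_table *old_tbl, unsigned int hash)
{
	pthread_spin_lock(bucket_lock(old_tbl, hash));
	if (new_tbl != old_tbl)
		pthread_spin_lock(bucket_lock(new_tbl, hash));
}

static void unlock_buckets(struct bucket_table *new_tbl,
			   struct bucket_table *old_tbl, unsigned int hash)
{
	if (new_tbl != old_tbl)
		pthread_spin_unlock(bucket_lock(new_tbl, hash));
	pthread_spin_unlock(bucket_lock(old_tbl, hash));
}

static struct bucket_table *alloc_table(unsigned int size)
{
	/* Error handling omitted for brevity. */
	struct bucket_table *tbl = malloc(sizeof(*tbl));
	unsigned int nlocks = size / 2;		/* the cap from this patch */
	unsigned int i;

	tbl->size = size;
	tbl->locks_mask = nlocks - 1;
	tbl->locks = calloc(nlocks, sizeof(*tbl->locks));
	for (i = 0; i < nlocks; i++)
		pthread_spin_init(&tbl->locks[i], PTHREAD_PROCESS_PRIVATE);
	return tbl;
}

int main(void)
{
	struct bucket_table *old_tbl = alloc_table(64);
	struct bucket_table *new_tbl = alloc_table(128);
	unsigned int hash = 200;	/* any hash computed for the new table */

	lock_buckets(new_tbl, old_tbl, hash);
	/* ...mutate the chains hanging off both tables' buckets... */
	unlock_buckets(new_tbl, old_tbl, hash);

	printf("locked and unlocked old+new buckets for hash %u\n", hash);
	return 0;
}

Every mutation path touched by the patch follows this shape: compute the hash
against the future table, call lock_buckets() (old-table lock first, new-table
lock only during a resize), mutate, then unlock_buckets() in reverse order.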