author		David S. Miller <davem@davemloft.net>	2015-02-06 18:18:39 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-06 18:18:39 -0500
commit		41e8b206f8df6daf90e132a73fc2c60f51a36c6b (patch)
tree		709001bcb9373ca4d622f70d4c48095aea4d1a61 /lib
parent		2ca292d968ef20cb04f31192d1f626bd8d782960 (diff)
parent		cf52d52f9ccb9966ac019d9f79824195583e3e6c (diff)
Merge branch 'rhashtable-next'
Thomas Graf says:
====================
rhashtable fixes
This series fixes all remaining known issues with rhashtable that
have been reported. In particular the race condition reported by
Ying Xue.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
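
[Editor's note] The invariant the series leans on is spelled out in the new comment added to lib/rhashtable.c below: at most tbl->size/2 bucket locks are allocated, and the lock is chosen by hash & tbl->locks_mask, so the two old-table buckets that fold into a single bucket when the table halves always share one lock. A minimal standalone sketch of that arithmetic (userspace C, illustrative names only, not part of the patch):

/* Illustration only: with a power-of-two lock array of at most size/2
 * entries, buckets h and h + size/2 always select the same lock, so one
 * lock covers both source buckets of a shrink.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int size = 64;          /* table size, power of two */
	unsigned int nlocks = size / 2;  /* upper bound imposed by the patch */
	unsigned int locks_mask = nlocks - 1;
	unsigned int h;

	for (h = 0; h < size / 2; h++)
		assert((h & locks_mask) == ((h + size / 2) & locks_mask));

	printf("one lock covers both source buckets for all %u hashes\n",
	       size / 2);
	return 0;
}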
Diffstat (limited to 'lib')
-rw-r--r--	lib/rhashtable.c	305
1 file changed, 167 insertions, 138 deletions
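
[Editor's note] The resize and mutation paths in the diff below now take the old- and new-table bucket locks through a single lock_buckets()/unlock_buckets() pair, always acquiring the old-table lock first and touching the new-table lock only while a resize is in flight. A userspace analogue of that pairing (pthread mutexes instead of kernel spinlocks; the struct and names here are placeholders, not the kernel API):

#include <pthread.h>
#include <stdio.h>

struct toy_table {
	unsigned int size;        /* power of two */
	unsigned int locks_mask;  /* nlocks - 1, with nlocks <= size / 2 */
	pthread_mutex_t *locks;
};

static pthread_mutex_t *bucket_lock(struct toy_table *tbl, unsigned int hash)
{
	return &tbl->locks[hash & tbl->locks_mask];
}

/* Old-table lock first, new-table lock only if a resize is in flight. */
static void lock_buckets(struct toy_table *new_tbl, struct toy_table *old_tbl,
			 unsigned int hash)
{
	pthread_mutex_lock(bucket_lock(old_tbl, hash));
	if (new_tbl != old_tbl)
		pthread_mutex_lock(bucket_lock(new_tbl, hash));
}

/* Release in reverse order. */
static void unlock_buckets(struct toy_table *new_tbl, struct toy_table *old_tbl,
			   unsigned int hash)
{
	if (new_tbl != old_tbl)
		pthread_mutex_unlock(bucket_lock(new_tbl, hash));
	pthread_mutex_unlock(bucket_lock(old_tbl, hash));
}

int main(void)
{
	pthread_mutex_t locks[2] = { PTHREAD_MUTEX_INITIALIZER,
				     PTHREAD_MUTEX_INITIALIZER };
	struct toy_table tbl = { .size = 4, .locks_mask = 1, .locks = locks };

	/* No resize in flight: old and new table are the same object. */
	lock_buckets(&tbl, &tbl, 3);
	/* ... mutate bucket 3 ... */
	unlock_buckets(&tbl, &tbl, 3);

	printf("locked and unlocked bucket 3 under a single lock\n");
	return 0;
}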
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 057919164e23..5919d63f58e4 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * Based on the following paper:
@@ -34,12 +34,17 @@
 enum {
 	RHT_LOCK_NORMAL,
 	RHT_LOCK_NESTED,
-	RHT_LOCK_NESTED2,
 };
 
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
+ * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
+ * a single lock always covers both buckets which may both contains
+ * entries which link to the same bucket of the old table during resizing.
+ * This allows to simplify the locking as locking the bucket in both
+ * tables during resize always guarantee protection.
+ *
  * IMPORTANT: When holding the bucket lock of both the old and new table
  * during expansions and shrinking, the old bucket lock must always be
  * acquired first.
@@ -49,26 +54,6 @@ static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
 	return &tbl->locks[hash & tbl->locks_mask];
 }
 
-#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
-#define ASSERT_BUCKET_LOCK(TBL, HASH) \
-	BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
-
-#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rht_mutex_is_held(struct rhashtable *ht)
-{
-	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
-}
-EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
-
-int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
-{
-	spinlock_t *lock = bucket_lock(tbl, hash);
-
-	return (debug_locks) ? lockdep_is_held(lock) : 1;
-}
-EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
-#endif
-
 static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
 {
 	return (void *) he - ht->p.head_offset;
@@ -94,13 +79,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
 
 static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
 {
-	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
-	u32 hash;
-
-	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
-	hash >>= HASH_RESERVED_SPACE;
-
-	return rht_bucket_index(tbl, hash);
+	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
 }
 
 static u32 head_hashfn(const struct rhashtable *ht,
@@ -110,6 +89,77 @@ static u32 head_hashfn(const struct rhashtable *ht,
 	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
 }
 
+#ifdef CONFIG_PROVE_LOCKING
+static void debug_dump_buckets(const struct rhashtable *ht,
+			       const struct bucket_table *tbl)
+{
+	struct rhash_head *he;
+	unsigned int i, hash;
+
+	for (i = 0; i < tbl->size; i++) {
+		pr_warn(" [Bucket %d] ", i);
+		rht_for_each_rcu(he, tbl, i) {
+			hash = head_hashfn(ht, tbl, he);
+			pr_cont("[hash = %#x, lock = %p] ",
+				hash, bucket_lock(tbl, hash));
+		}
+		pr_cont("\n");
+	}
+
+}
+
+static void debug_dump_table(struct rhashtable *ht,
+			     const struct bucket_table *tbl,
+			     unsigned int hash)
+{
+	struct bucket_table *old_tbl, *future_tbl;
+
+	pr_emerg("BUG: lock for hash %#x in table %p not held\n",
+		 hash, tbl);
+
+	rcu_read_lock();
+	future_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	if (future_tbl != old_tbl) {
+		pr_warn("Future table %p (size: %zd)\n",
+			future_tbl, future_tbl->size);
+		debug_dump_buckets(ht, future_tbl);
+	}
+
+	pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size);
+	debug_dump_buckets(ht, old_tbl);
+
+	rcu_read_unlock();
+}
+
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)				\
+	do {								\
+		if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) {	\
+			debug_dump_table(HT, TBL, HASH);		\
+			BUG();						\
+		}							\
+	} while (0)
+
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+	spinlock_t *lock = bucket_lock(tbl, hash);
+
+	return (debug_locks) ? lockdep_is_held(lock) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)
+#endif
+
+
 static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
 {
 	struct rhash_head __rcu **pprev;
@@ -134,8 +184,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
 	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
 	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 
-	/* Never allocate more than one lock per bucket */
-	size = min_t(unsigned int, size, tbl->size);
+	/* Never allocate more than 0.5 locks per bucket */
+	size = min_t(unsigned int, size, tbl->size >> 1);
 
 	if (sizeof(spinlock_t) != 0) {
 #ifdef CONFIG_NUMA
@@ -217,25 +267,48 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
-static void hashtable_chain_unzip(const struct rhashtable *ht,
+static void lock_buckets(struct bucket_table *new_tbl,
+			 struct bucket_table *old_tbl, unsigned int hash)
+	__acquires(old_bucket_lock)
+{
+	spin_lock_bh(bucket_lock(old_tbl, hash));
+	if (new_tbl != old_tbl)
+		spin_lock_bh_nested(bucket_lock(new_tbl, hash),
+				    RHT_LOCK_NESTED);
+}
+
+static void unlock_buckets(struct bucket_table *new_tbl,
+			   struct bucket_table *old_tbl, unsigned int hash)
+	__releases(old_bucket_lock)
+{
+	if (new_tbl != old_tbl)
+		spin_unlock_bh(bucket_lock(new_tbl, hash));
+	spin_unlock_bh(bucket_lock(old_tbl, hash));
+}
+
+/**
+ * Unlink entries on bucket which hash to different bucket.
+ *
+ * Returns true if no more work needs to be performed on the bucket.
+ */
+static bool hashtable_chain_unzip(struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
 				  struct bucket_table *old_tbl,
 				  size_t old_hash)
 {
 	struct rhash_head *he, *p, *next;
-	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
 	unsigned int new_hash, new_hash2;
 
-	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
+	ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash);
 
 	/* Old bucket empty, no work needed. */
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 				   old_hash);
 	if (rht_is_a_nulls(p))
-		return;
+		return false;
 
-	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+	new_hash = head_hashfn(ht, new_tbl, p);
+	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
 
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
@@ -243,22 +316,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 		new_hash2 = head_hashfn(ht, new_tbl, he);
+		ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2);
+
 		if (new_hash != new_hash2)
 			break;
 		p = he;
 	}
 	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
 
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-
-	/* If we have encountered an entry that maps to a different bucket in
-	 * the new table, lock down that bucket as well as we might cut off
-	 * the end of the chain.
-	 */
-	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
-
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
@@ -277,21 +342,18 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rcu_assign_pointer(p->next, next);
 
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_unlock_bh(new_bucket_lock2);
-	spin_unlock_bh(new_bucket_lock);
+	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+				   old_hash);
+
+	return !rht_is_a_nulls(p);
 }
 
-static void link_old_to_new(struct bucket_table *new_tbl,
+static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl,
 			    unsigned int new_hash, struct rhash_head *entry)
 {
-	spinlock_t *new_bucket_lock;
+	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
 
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
-	spin_unlock_bh(new_bucket_lock);
 }
 
 /**
@@ -314,7 +376,6 @@ int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
-	spinlock_t *old_bucket_lock;
 	unsigned int new_hash, old_hash;
 	bool complete = false;
 
@@ -344,24 +405,16 @@ int rhashtable_expand(struct rhashtable *ht)
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 		old_hash = rht_bucket_index(old_tbl, new_hash);
-		old_bucket_lock = bucket_lock(old_tbl, old_hash);
-
-		spin_lock_bh(old_bucket_lock);
+		lock_buckets(new_tbl, old_tbl, new_hash);
 		rht_for_each(he, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
-				link_old_to_new(new_tbl, new_hash, he);
+				link_old_to_new(ht, new_tbl, new_hash, he);
 				break;
 			}
 		}
-		spin_unlock_bh(old_bucket_lock);
+		unlock_buckets(new_tbl, old_tbl, new_hash);
 	}
 
-	/* Publish the new table pointer. Lookups may now traverse
-	 * the new table, but they will not benefit from any
-	 * additional efficiency until later steps unzip the buckets.
-	 */
-	rcu_assign_pointer(ht->tbl, new_tbl);
-
 	/* Unzip interleaved hash chains */
 	while (!complete && !ht->being_destroyed) {
 		/* Wait for readers. All new readers will see the new
@@ -376,21 +429,19 @@ int rhashtable_expand(struct rhashtable *ht)
 		 */
 		complete = true;
 		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
-			struct rhash_head *head;
-
-			old_bucket_lock = bucket_lock(old_tbl, old_hash);
-			spin_lock_bh(old_bucket_lock);
+			lock_buckets(new_tbl, old_tbl, old_hash);
 
-			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
-			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
-						      old_tbl, old_hash);
-			if (!rht_is_a_nulls(head))
+			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
+						  old_hash))
 				complete = false;
 
-			spin_unlock_bh(old_bucket_lock);
+			unlock_buckets(new_tbl, old_tbl, old_hash);
 		}
 	}
 
+	rcu_assign_pointer(ht->tbl, new_tbl);
+	synchronize_rcu();
+
 	bucket_table_free(old_tbl);
 	return 0;
 }
@@ -415,7 +466,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
-	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
 	unsigned int new_hash;
 
 	ASSERT_RHT_MUTEX(ht);
@@ -433,36 +483,17 @@ int rhashtable_shrink(struct rhashtable *ht)
 	 * always divide the size in half when shrinking, each bucket
 	 * in the new table maps to exactly two buckets in the old
 	 * table.
-	 *
-	 * As removals can occur concurrently on the old table, we need
-	 * to lock down both matching buckets in the old table.
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
-		old_bucket_lock1 = bucket_lock(tbl, new_hash);
-		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
-		new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-		spin_lock_bh(old_bucket_lock1);
-
-		/* Depending on the lock per buckets mapping, the bucket in
-		 * the lower and upper region may map to the same lock.
-		 */
-		if (old_bucket_lock1 != old_bucket_lock2) {
-			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
-		} else {
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-		}
+		lock_buckets(new_tbl, tbl, new_hash);
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
+		ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size);
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash + new_tbl->size]);
 
-		spin_unlock_bh(new_bucket_lock);
-		if (old_bucket_lock1 != old_bucket_lock2)
-			spin_unlock_bh(old_bucket_lock2);
-		spin_unlock_bh(old_bucket_lock1);
+		unlock_buckets(new_tbl, tbl, new_hash);
 	}
 
 	/* Publish the new, valid hash table */
@@ -524,6 +555,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 	struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash],
 							 tbl, hash);
 
+	ASSERT_BUCKET_LOCK(ht, tbl, hash);
+
 	if (rht_is_a_nulls(head))
 		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
 	else
@@ -553,19 +586,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
  */
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
-	spinlock_t *lock;
+	struct bucket_table *tbl, *old_tbl;
 	unsigned hash;
 
 	rcu_read_lock();
 
 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = head_hashfn(ht, tbl, obj);
-	lock = bucket_lock(tbl, hash);
 
-	spin_lock_bh(lock);
+	lock_buckets(tbl, old_tbl, hash);
 	__rhashtable_insert(ht, obj, tbl, hash);
-	spin_unlock_bh(lock);
+	unlock_buckets(tbl, old_tbl, hash);
 
 	rcu_read_unlock();
 }
@@ -588,21 +620,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  */
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
+	struct bucket_table *tbl, *new_tbl, *old_tbl;
 	struct rhash_head __rcu **pprev;
-	struct rhash_head *he;
-	spinlock_t *lock;
-	unsigned int hash;
+	struct rhash_head *he, *he2;
+	unsigned int hash, new_hash;
 	bool ret = false;
 
 	rcu_read_lock();
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	hash = head_hashfn(ht, tbl, obj);
-
-	lock = bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 
+	lock_buckets(new_tbl, old_tbl, new_hash);
 restart:
+	hash = rht_bucket_index(tbl, new_hash);
 	pprev = &tbl->buckets[hash];
 	rht_for_each(he, tbl, hash) {
 		if (he != obj) {
@@ -610,8 +641,22 @@ restart:
 			continue;
 		}
 
-		rcu_assign_pointer(*pprev, obj->next);
+		ASSERT_BUCKET_LOCK(ht, tbl, hash);
+
+		if (unlikely(new_tbl != tbl)) {
+			rht_for_each_continue(he2, he->next, tbl, hash) {
+				if (head_hashfn(ht, tbl, he2) == hash) {
+					rcu_assign_pointer(*pprev, he2);
+					goto found;
+				}
+			}
+
+			INIT_RHT_NULLS_HEAD(*pprev, ht, hash);
+		} else {
+			rcu_assign_pointer(*pprev, obj->next);
+		}
 
+found:
 		ret = true;
 		break;
 	}
@@ -621,18 +666,12 @@ restart:
 	 * resizing. Thus traversing both is fine and the added cost is
 	 * very rare.
 	 */
-	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
-		spin_unlock_bh(lock);
-
-		tbl = rht_dereference_rcu(ht->future_tbl, ht);
-		hash = head_hashfn(ht, tbl, obj);
-
-		lock = bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
+	if (tbl != new_tbl) {
+		tbl = new_tbl;
 		goto restart;
 	}
 
-	spin_unlock_bh(lock);
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (ret) {
 		atomic_dec(&ht->nelems);
@@ -788,24 +827,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 				      void *arg)
 {
 	struct bucket_table *new_tbl, *old_tbl;
-	spinlock_t *new_bucket_lock, *old_bucket_lock;
-	u32 new_hash, old_hash;
+	u32 new_hash;
 	bool success = true;
 
 	BUG_ON(!ht->p.key_len);
 
 	rcu_read_lock();
-
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	old_hash = head_hashfn(ht, old_tbl, obj);
-	old_bucket_lock = bucket_lock(old_tbl, old_hash);
-	spin_lock_bh(old_bucket_lock);
-
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	new_hash = head_hashfn(ht, new_tbl, obj);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-	if (unlikely(old_tbl != new_tbl))
-		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	lock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
 				      compare, arg)) {
@@ -816,10 +848,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 	__rhashtable_insert(ht, obj, new_tbl, new_hash);
 
 exit:
-	if (unlikely(old_tbl != new_tbl))
-		spin_unlock_bh(new_bucket_lock);
-	spin_unlock_bh(old_bucket_lock);
-
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 	rcu_read_unlock();
 
 	return success;