author     Thomas Graf <tgraf@suug.ch>              2015-01-02 17:00:20 -0500
committer  David S. Miller <davem@davemloft.net>    2015-01-03 14:32:57 -0500
commit     97defe1ecf868b8127f8e62395499d6a06e4c4b1 (patch)
tree       d3ed6d3db4943e01b1ae58e73580537ba1642d9e /lib/rhashtable.c
parent     113948d841e8d78039e5dbbb5248f5b73e99eafa (diff)
rhashtable: Per bucket locks & deferred expansion/shrinking
Introduces an array of spinlocks to protect bucket mutations. The number of spinlocks per CPU is configurable, and the lock for a bucket is selected based on the bucket's hash. This allows for parallel insertions and removals of entries which do not share a lock.

The patch also defers expansion and shrinking to a worker queue, which allows insertion and removal from atomic context. Insertions and deletions may occur in parallel to a resize and are only held up briefly while the particular bucket is linked or unzipped.

Mutations of the bucket table pointer are protected by a new mutex; read access is RCU protected.

In the event of an expansion or shrinking, the newly allocated bucket table is exposed as a so-called future table as soon as the resize process starts. Lookups, deletions, and insertions will briefly use both tables. The future table becomes the main table after an RCU grace period and after the initial linking of the old table to the new table has been performed. Optimization of the chains to make use of the new number of buckets follows only once the new table is in use.

The side effect of this is that during that RCU grace period, a bucket traversal using any rht_for_each() variant on the main table will not see any insertions performed during the grace period, since those land in the future table. A lookup will still see them, as it searches both tables if needed.

Having multiple insertions and removals occur in parallel requires nelems to become an atomic counter.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'lib/rhashtable.c')
-rw-r--r--   lib/rhashtable.c | 458
1 file changed, 344 insertions(+), 114 deletions(-)
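The commit message above centres on lock striping: a small, power-of-two sized array of spinlocks shared by all buckets, with the lock for a bucket chosen by masking its hash. As a rough user-space illustration (POSIX threads; every name below is invented and none of this is the kernel API), the scheme looks like this:

/* Minimal user-space model of per-bucket lock striping; illustrative only. */
#include <pthread.h>
#include <stdlib.h>

struct tbl_model {
    unsigned int        nbuckets;
    unsigned int        locks_mask;  /* number of locks - 1, power of two */
    pthread_spinlock_t *locks;       /* fewer locks than buckets */
    void              **buckets;
};

/* Same idea as bucket_lock() in the patch: the hash picks the lock stripe. */
static pthread_spinlock_t *model_bucket_lock(struct tbl_model *t, unsigned int hash)
{
    return &t->locks[hash & t->locks_mask];
}

static int model_init(struct tbl_model *t, unsigned int nbuckets, unsigned int nlocks)
{
    unsigned int i;

    t->nbuckets = nbuckets;
    t->locks_mask = nlocks - 1;      /* nlocks must be a power of two */
    t->locks = calloc(nlocks, sizeof(*t->locks));
    t->buckets = calloc(nbuckets, sizeof(*t->buckets));
    if (!t->locks || !t->buckets)
        return -1;
    for (i = 0; i < nlocks; i++)
        pthread_spin_init(&t->locks[i], PTHREAD_PROCESS_PRIVATE);
    return 0;
}

Two writers whose hashes differ in the masked bits take different spinlocks and can proceed in parallel; writers that collide on a stripe serialize, which is the trade-off the locks_mul parameter tunes.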
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e6b85c4a5828..312e3437c7bc 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -26,19 +26,42 @@
26 26
27#define HASH_DEFAULT_SIZE 64UL 27#define HASH_DEFAULT_SIZE 64UL
28#define HASH_MIN_SIZE 4UL 28#define HASH_MIN_SIZE 4UL
29#define BUCKET_LOCKS_PER_CPU 128UL
30
31enum {
32 RHT_LOCK_NORMAL,
33 RHT_LOCK_NESTED,
34 RHT_LOCK_NESTED2,
35};
36
37/* The bucket lock is selected based on the hash and protects mutations
38 * on a group of hash buckets.
39 *
40 * IMPORTANT: When holding the bucket lock of both the old and new table
41 * during expansions and shrinking, the old bucket lock must always be
42 * acquired first.
43 */
44static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
45{
46 return &tbl->locks[hash & tbl->locks_mask];
47}
29 48
30#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) 49#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
50#define ASSERT_BUCKET_LOCK(TBL, HASH) \
51 BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
31 52
32#ifdef CONFIG_PROVE_LOCKING 53#ifdef CONFIG_PROVE_LOCKING
33int lockdep_rht_mutex_is_held(const struct rhashtable *ht) 54int lockdep_rht_mutex_is_held(struct rhashtable *ht)
34{ 55{
35 return ht->p.mutex_is_held(ht->p.parent); 56 return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
36} 57}
37EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); 58EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
38 59
39int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) 60int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
40{ 61{
41 return 1; 62 spinlock_t *lock = bucket_lock(tbl, hash);
63
64 return (debug_locks) ? lockdep_is_held(lock) : 1;
42} 65}
43EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); 66EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
44#endif 67#endif
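The IMPORTANT note in the hunk above is a lock-ordering rule: whenever a resize step holds a bucket lock in both the old and the new table, the old-table lock is taken first, so two such steps can never deadlock against each other. A compressed user-space sketch of that discipline (illustrative helpers, not the kernel code; the aliasing check mirrors the striped locks, where two hashes may map to the same spinlock):

#include <pthread.h>

/* Always old-table lock first, then new-table lock; release in reverse. */
static void lock_old_then_new(pthread_spinlock_t *old_lock, pthread_spinlock_t *new_lock)
{
    pthread_spin_lock(old_lock);
    if (new_lock != old_lock)   /* striped locks can alias */
        pthread_spin_lock(new_lock);
}

static void unlock_new_then_old(pthread_spinlock_t *old_lock, pthread_spinlock_t *new_lock)
{
    if (new_lock != old_lock)
        pthread_spin_unlock(new_lock);
    pthread_spin_unlock(old_lock);
}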
@@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
66 return hash; 89 return hash;
67} 90}
68 91
69static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) 92static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
70{ 93{
71 struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 94 struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
72 u32 hash; 95 u32 hash;
@@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
95 return pprev; 118 return pprev;
96} 119}
97 120
98static struct bucket_table *bucket_table_alloc(size_t nbuckets) 121static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
122{
123 unsigned int i, size;
124#if defined(CONFIG_PROVE_LOCKING)
125 unsigned int nr_pcpus = 2;
126#else
127 unsigned int nr_pcpus = num_possible_cpus();
128#endif
129
130 nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
131 size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
132
133 /* Never allocate more than one lock per bucket */
134 size = min_t(unsigned int, size, tbl->size);
135
136 if (sizeof(spinlock_t) != 0) {
137#ifdef CONFIG_NUMA
138 if (size * sizeof(spinlock_t) > PAGE_SIZE)
139 tbl->locks = vmalloc(size * sizeof(spinlock_t));
140 else
141#endif
142 tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
143 GFP_KERNEL);
144 if (!tbl->locks)
145 return -ENOMEM;
146 for (i = 0; i < size; i++)
147 spin_lock_init(&tbl->locks[i]);
148 }
149 tbl->locks_mask = size - 1;
150
151 return 0;
152}
153
154static void bucket_table_free(const struct bucket_table *tbl)
155{
156 if (tbl)
157 kvfree(tbl->locks);
158
159 kvfree(tbl);
160}
161
162static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
163 size_t nbuckets)
99{ 164{
100 struct bucket_table *tbl; 165 struct bucket_table *tbl;
101 size_t size; 166 size_t size;
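Concretely, the sizing rule in alloc_bucket_locks() above caps the CPU count at 32, multiplies by locks_mul (128 per CPU by default), rounds the result up to a power of two and never allocates more locks than buckets. A stand-alone arithmetic demo of that rule (the roundup helper is re-implemented here only for the example):

#include <stdio.h>

static unsigned int roundup_pow_of_two_demo(unsigned int n)
{
    unsigned int p = 1;

    while (p < n)
        p <<= 1;
    return p;
}

int main(void)
{
    unsigned int nr_cpus = 4;       /* assume a 4-CPU machine */
    unsigned int locks_mul = 128;   /* BUCKET_LOCKS_PER_CPU */
    unsigned int nbuckets = 64;     /* HASH_DEFAULT_SIZE */
    unsigned int size;

    if (nr_cpus > 32)               /* capped at 32 CPUs */
        nr_cpus = 32;
    size = roundup_pow_of_two_demo(nr_cpus * locks_mul);
    if (size > nbuckets)            /* never more than one lock per bucket */
        size = nbuckets;

    printf("lock array size: %u (locks_mask 0x%x)\n", size, size - 1);
    return 0;                       /* prints: lock array size: 64 (locks_mask 0x3f) */
}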
@@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets)
110 175
111 tbl->size = nbuckets; 176 tbl->size = nbuckets;
112 177
113 return tbl; 178 if (alloc_bucket_locks(ht, tbl) < 0) {
114} 179 bucket_table_free(tbl);
180 return NULL;
181 }
115 182
116static void bucket_table_free(const struct bucket_table *tbl) 183 return tbl;
117{
118 kvfree(tbl);
119} 184}
120 185
121/** 186/**
@@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl)
126bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) 191bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
127{ 192{
128 /* Expand table when exceeding 75% load */ 193 /* Expand table when exceeding 75% load */
129 return ht->nelems > (new_size / 4 * 3); 194 return atomic_read(&ht->nelems) > (new_size / 4 * 3);
130} 195}
131EXPORT_SYMBOL_GPL(rht_grow_above_75); 196EXPORT_SYMBOL_GPL(rht_grow_above_75);
132 197
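The decision helpers here are unchanged apart from reading nelems atomically; the watermarks stay at 75% and 30% of the size passed in. For a 64-bucket table the integer arithmetic works out as follows:

#include <stdio.h>

int main(void)
{
    unsigned int size = 64;

    /* rht_grow_above_75(): expand when nelems > size / 4 * 3 */
    printf("grow when nelems > %u\n", size / 4 * 3);    /* 48 */
    /* rht_shrink_below_30(): shrink when nelems < size * 3 / 10 */
    printf("shrink when nelems < %u\n", size * 3 / 10); /* 19 */
    return 0;
}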
@@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75);
138bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) 203bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
139{ 204{
140 /* Shrink table beneath 30% load */ 205 /* Shrink table beneath 30% load */
141 return ht->nelems < (new_size * 3 / 10); 206 return atomic_read(&ht->nelems) < (new_size * 3 / 10);
142} 207}
143EXPORT_SYMBOL_GPL(rht_shrink_below_30); 208EXPORT_SYMBOL_GPL(rht_shrink_below_30);
144 209
145static void hashtable_chain_unzip(const struct rhashtable *ht, 210static void hashtable_chain_unzip(const struct rhashtable *ht,
146 const struct bucket_table *new_tbl, 211 const struct bucket_table *new_tbl,
147 struct bucket_table *old_tbl, size_t n) 212 struct bucket_table *old_tbl,
213 size_t old_hash)
148{ 214{
149 struct rhash_head *he, *p, *next; 215 struct rhash_head *he, *p, *next;
150 unsigned int h; 216 spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
217 unsigned int new_hash, new_hash2;
218
219 ASSERT_BUCKET_LOCK(old_tbl, old_hash);
151 220
152 /* Old bucket empty, no work needed. */ 221 /* Old bucket empty, no work needed. */
153 p = rht_dereference(old_tbl->buckets[n], ht); 222 p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
223 old_hash);
154 if (!p) 224 if (!p)
155 return; 225 return;
156 226
227 new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
228 new_bucket_lock = bucket_lock(new_tbl, new_hash);
229
157 /* Advance the old bucket pointer one or more times until it 230 /* Advance the old bucket pointer one or more times until it
158 * reaches a node that doesn't hash to the same bucket as the 231 * reaches a node that doesn't hash to the same bucket as the
159 * previous node p. Call the previous node p; 232 * previous node p. Call the previous node p;
160 */ 233 */
161 h = head_hashfn(ht, new_tbl, p); 234 rht_for_each_continue(he, p->next, old_tbl, old_hash) {
162 rht_for_each_continue(he, p->next, old_tbl, n) { 235 new_hash2 = head_hashfn(ht, new_tbl, he);
163 if (head_hashfn(ht, new_tbl, he) != h) 236 if (new_hash != new_hash2)
164 break; 237 break;
165 p = he; 238 p = he;
166 } 239 }
167 RCU_INIT_POINTER(old_tbl->buckets[n], p->next); 240 rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
241
242 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
243
244 /* If we have encountered an entry that maps to a different bucket in
245 * the new table, lock down that bucket as well as we might cut off
246 * the end of the chain.
247 */
248 new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
249 if (new_bucket_lock != new_bucket_lock2)
250 spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
168 251
169 /* Find the subsequent node which does hash to the same 252 /* Find the subsequent node which does hash to the same
170 * bucket as node P, or NULL if no such node exists. 253 * bucket as node P, or NULL if no such node exists.
171 */ 254 */
172 next = NULL; 255 next = NULL;
173 if (he) { 256 if (he) {
174 rht_for_each_continue(he, he->next, old_tbl, n) { 257 rht_for_each_continue(he, he->next, old_tbl, old_hash) {
175 if (head_hashfn(ht, new_tbl, he) == h) { 258 if (head_hashfn(ht, new_tbl, he) == new_hash) {
176 next = he; 259 next = he;
177 break; 260 break;
178 } 261 }
@@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
182 /* Set p's next pointer to that subsequent node pointer, 265 /* Set p's next pointer to that subsequent node pointer,
183 * bypassing the nodes which do not hash to p's bucket 266 * bypassing the nodes which do not hash to p's bucket
184 */ 267 */
185 RCU_INIT_POINTER(p->next, next); 268 rcu_assign_pointer(p->next, next);
269
270 if (new_bucket_lock != new_bucket_lock2)
271 spin_unlock_bh(new_bucket_lock2);
272 spin_unlock_bh(new_bucket_lock);
273}
274
275static void link_old_to_new(struct bucket_table *new_tbl,
276 unsigned int new_hash, struct rhash_head *entry)
277{
278 spinlock_t *new_bucket_lock;
279
280 new_bucket_lock = bucket_lock(new_tbl, new_hash);
281
282 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
283 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
284 spin_unlock_bh(new_bucket_lock);
186} 285}
187 286
188/** 287/**
@@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
195 * This function may only be called in a context where it is safe to call 294 * This function may only be called in a context where it is safe to call
196 * synchronize_rcu(), e.g. not within a rcu_read_lock() section. 295 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
197 * 296 *
198 * The caller must ensure that no concurrent table mutations take place. 297 * The caller must ensure that no concurrent resizing occurs by holding
199 * It is however valid to have concurrent lookups if they are RCU protected. 298 * ht->mutex.
299 *
300 * It is valid to have concurrent insertions and deletions protected by per
301 * bucket locks or concurrent RCU protected lookups and traversals.
200 */ 302 */
201int rhashtable_expand(struct rhashtable *ht) 303int rhashtable_expand(struct rhashtable *ht)
202{ 304{
203 struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); 305 struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
204 struct rhash_head *he; 306 struct rhash_head *he;
205 unsigned int i, h; 307 spinlock_t *old_bucket_lock;
206 bool complete; 308 unsigned int new_hash, old_hash;
309 bool complete = false;
207 310
208 ASSERT_RHT_MUTEX(ht); 311 ASSERT_RHT_MUTEX(ht);
209 312
210 if (ht->p.max_shift && ht->shift >= ht->p.max_shift) 313 if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
211 return 0; 314 return 0;
212 315
213 new_tbl = bucket_table_alloc(old_tbl->size * 2); 316 new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
214 if (new_tbl == NULL) 317 if (new_tbl == NULL)
215 return -ENOMEM; 318 return -ENOMEM;
216 319
217 ht->shift++; 320 ht->shift++;
218 321
219 /* For each new bucket, search the corresponding old bucket 322 /* Make insertions go into the new, empty table right away. Deletions
220 * for the first entry that hashes to the new bucket, and 323 * and lookups will be attempted in both tables until we synchronize.
221 * link the new bucket to that entry. Since all the entries 324 * The synchronize_rcu() guarantees for the new table to be picked up
222 * which will end up in the new bucket appear in the same 325 * so no new additions go into the old table while we relink.
223 * old bucket, this constructs an entirely valid new hash 326 */
224 * table, but with multiple buckets "zipped" together into a 327 rcu_assign_pointer(ht->future_tbl, new_tbl);
225 * single imprecise chain. 328 synchronize_rcu();
329
330 /* For each new bucket, search the corresponding old bucket for the
331 * first entry that hashes to the new bucket, and link the end of
332 * newly formed bucket chain (containing entries added to future
333 * table) to that entry. Since all the entries which will end up in
334 * the new bucket appear in the same old bucket, this constructs an
335 * entirely valid new hash table, but with multiple buckets
336 * "zipped" together into a single imprecise chain.
226 */ 337 */
227 for (i = 0; i < new_tbl->size; i++) { 338 for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
228 h = rht_bucket_index(old_tbl, i); 339 old_hash = rht_bucket_index(old_tbl, new_hash);
229 rht_for_each(he, old_tbl, h) { 340 old_bucket_lock = bucket_lock(old_tbl, old_hash);
230 if (head_hashfn(ht, new_tbl, he) == i) { 341
231 RCU_INIT_POINTER(new_tbl->buckets[i], he); 342 spin_lock_bh(old_bucket_lock);
343 rht_for_each(he, old_tbl, old_hash) {
344 if (head_hashfn(ht, new_tbl, he) == new_hash) {
345 link_old_to_new(new_tbl, new_hash, he);
232 break; 346 break;
233 } 347 }
234 } 348 }
349 spin_unlock_bh(old_bucket_lock);
235 } 350 }
236 351
237 /* Publish the new table pointer. Lookups may now traverse 352 /* Publish the new table pointer. Lookups may now traverse
@@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht)
241 rcu_assign_pointer(ht->tbl, new_tbl); 356 rcu_assign_pointer(ht->tbl, new_tbl);
242 357
243 /* Unzip interleaved hash chains */ 358 /* Unzip interleaved hash chains */
244 do { 359 while (!complete && !ht->being_destroyed) {
245 /* Wait for readers. All new readers will see the new 360 /* Wait for readers. All new readers will see the new
246 * table, and thus no references to the old table will 361 * table, and thus no references to the old table will
247 * remain. 362 * remain.
@@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht)
253 * table): ... 368 * table): ...
254 */ 369 */
255 complete = true; 370 complete = true;
256 for (i = 0; i < old_tbl->size; i++) { 371 for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
257 hashtable_chain_unzip(ht, new_tbl, old_tbl, i); 372 old_bucket_lock = bucket_lock(old_tbl, old_hash);
258 if (old_tbl->buckets[i] != NULL) 373 spin_lock_bh(old_bucket_lock);
374
375 hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
376 if (old_tbl->buckets[old_hash] != NULL)
259 complete = false; 377 complete = false;
378
379 spin_unlock_bh(old_bucket_lock);
260 } 380 }
261 } while (!complete); 381 }
262 382
263 bucket_table_free(old_tbl); 383 bucket_table_free(old_tbl);
264 return 0; 384 return 0;
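One detail worth spelling out from the expansion loop above: because the table always doubles, the old bucket index is just the new index with the top bit masked off, so every new bucket draws its entries from exactly one old bucket and every old bucket feeds exactly two new ones. A throwaway demo of that arithmetic (assuming the index is computed as hash & (size - 1), which is what rht_bucket_index() boils down to):

#include <stdio.h>

int main(void)
{
    unsigned int old_size = 4, new_size = 8, new_hash;

    for (new_hash = 0; new_hash < new_size; new_hash++)
        printf("new bucket %u draws from old bucket %u\n",
               new_hash, new_hash & (old_size - 1));
    return 0;
}

New buckets 1 and 5, for example, both draw from old bucket 1, which is exactly why the old chains come out "zipped" and need the unzip pass afterwards; the same pairing lets the shrink path below simply append old buckets new_hash and new_hash + new_tbl->size onto new bucket new_hash.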
@@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
272 * This function may only be called in a context where it is safe to call 392 * This function may only be called in a context where it is safe to call
273 * synchronize_rcu(), e.g. not within a rcu_read_lock() section. 393 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
274 * 394 *
395 * The caller must ensure that no concurrent resizing occurs by holding
396 * ht->mutex.
397 *
275 * The caller must ensure that no concurrent table mutations take place. 398 * The caller must ensure that no concurrent table mutations take place.
276 * It is however valid to have concurrent lookups if they are RCU protected. 399 * It is however valid to have concurrent lookups if they are RCU protected.
400 *
401 * It is valid to have concurrent insertions and deletions protected by per
402 * bucket locks or concurrent RCU protected lookups and traversals.
277 */ 403 */
278int rhashtable_shrink(struct rhashtable *ht) 404int rhashtable_shrink(struct rhashtable *ht)
279{ 405{
280 struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht); 406 struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
281 unsigned int i; 407 spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
408 unsigned int new_hash;
282 409
283 ASSERT_RHT_MUTEX(ht); 410 ASSERT_RHT_MUTEX(ht);
284 411
285 if (ht->shift <= ht->p.min_shift) 412 if (ht->shift <= ht->p.min_shift)
286 return 0; 413 return 0;
287 414
288 ntbl = bucket_table_alloc(tbl->size / 2); 415 new_tbl = bucket_table_alloc(ht, tbl->size / 2);
289 if (ntbl == NULL) 416 if (new_tbl == NULL)
290 return -ENOMEM; 417 return -ENOMEM;
291 418
292 ht->shift--; 419 rcu_assign_pointer(ht->future_tbl, new_tbl);
420 synchronize_rcu();
293 421
294 /* Link each bucket in the new table to the first bucket 422 /* Link the first entry in the old bucket to the end of the
295 * in the old table that contains entries which will hash 423 * bucket in the new table. As entries are concurrently being
296 * to the new bucket. 424 * added to the new table, lock down the new bucket. As we
425 * always divide the size in half when shrinking, each bucket
426 * in the new table maps to exactly two buckets in the old
427 * table.
428 *
429 * As removals can occur concurrently on the old table, we need
430 * to lock down both matching buckets in the old table.
297 */ 431 */
298 for (i = 0; i < ntbl->size; i++) { 432 for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
299 ntbl->buckets[i] = tbl->buckets[i]; 433 old_bucket_lock1 = bucket_lock(tbl, new_hash);
300 RCU_INIT_POINTER(*bucket_tail(ntbl, i), 434 old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
301 tbl->buckets[i + ntbl->size]); 435 new_bucket_lock = bucket_lock(new_tbl, new_hash);
302 436
437 spin_lock_bh(old_bucket_lock1);
438 spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
439 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
440
441 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
442 tbl->buckets[new_hash]);
443 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
444 tbl->buckets[new_hash + new_tbl->size]);
445
446 spin_unlock_bh(new_bucket_lock);
447 spin_unlock_bh(old_bucket_lock2);
448 spin_unlock_bh(old_bucket_lock1);
303 } 449 }
304 450
305 /* Publish the new, valid hash table */ 451 /* Publish the new, valid hash table */
306 rcu_assign_pointer(ht->tbl, ntbl); 452 rcu_assign_pointer(ht->tbl, new_tbl);
453 ht->shift--;
307 454
308 /* Wait for readers. No new readers will have references to the 455 /* Wait for readers. No new readers will have references to the
309 * old hash table. 456 * old hash table.
@@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht)
316} 463}
317EXPORT_SYMBOL_GPL(rhashtable_shrink); 464EXPORT_SYMBOL_GPL(rhashtable_shrink);
318 465
466static void rht_deferred_worker(struct work_struct *work)
467{
468 struct rhashtable *ht;
469 struct bucket_table *tbl;
470
471 ht = container_of(work, struct rhashtable, run_work.work);
472 mutex_lock(&ht->mutex);
473 tbl = rht_dereference(ht->tbl, ht);
474
475 if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
476 rhashtable_expand(ht);
477 else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
478 rhashtable_shrink(ht);
479
480 mutex_unlock(&ht->mutex);
481}
482
319/** 483/**
320 * rhashtable_insert - insert object into hash hash table 484 * rhashtable_insert - insert object into hash hash table
321 * @ht: hash table 485 * @ht: hash table
322 * @obj: pointer to hash head inside object 486 * @obj: pointer to hash head inside object
323 * 487 *
324 * Will automatically grow the table via rhashtable_expand() if the the 488 * Will take a per bucket spinlock to protect against mutual mutations
325 * grow_decision function specified at rhashtable_init() returns true. 489 * on the same bucket. Multiple insertions may occur in parallel unless
490 * they map to the same bucket lock.
326 * 491 *
327 * The caller must ensure that no concurrent table mutations occur. It is 492 * It is safe to call this function from atomic context.
328 * however valid to have concurrent lookups if they are RCU protected. 493 *
494 * Will trigger an automatic deferred table resizing if the size grows
495 * beyond the watermark indicated by grow_decision() which can be passed
496 * to rhashtable_init().
329 */ 497 */
330void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) 498void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
331{ 499{
332 struct bucket_table *tbl = rht_dereference(ht->tbl, ht); 500 struct bucket_table *tbl;
333 u32 hash; 501 spinlock_t *lock;
502 unsigned hash;
334 503
335 ASSERT_RHT_MUTEX(ht); 504 rcu_read_lock();
336 505
506 tbl = rht_dereference_rcu(ht->future_tbl, ht);
337 hash = head_hashfn(ht, tbl, obj); 507 hash = head_hashfn(ht, tbl, obj);
508 lock = bucket_lock(tbl, hash);
509
510 spin_lock_bh(lock);
338 RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); 511 RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
339 rcu_assign_pointer(tbl->buckets[hash], obj); 512 rcu_assign_pointer(tbl->buckets[hash], obj);
340 ht->nelems++; 513 spin_unlock_bh(lock);
341 514
342 if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) 515 atomic_inc(&ht->nelems);
343 rhashtable_expand(ht); 516
517 /* Only grow the table if no resizing is currently in progress. */
518 if (ht->tbl != ht->future_tbl &&
519 ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
520 schedule_delayed_work(&ht->run_work, 0);
521
522 rcu_read_unlock();
344} 523}
345EXPORT_SYMBOL_GPL(rhashtable_insert); 524EXPORT_SYMBOL_GPL(rhashtable_insert);
346 525
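rht_deferred_worker() and the insert path above split the work: insert and remove only update the atomic element count and schedule the work item, while the worker serializes on ht->mutex and performs the actual expand or shrink. A hedged user-space sketch of that shape (invented names, a plain function standing in for the kernel workqueue):

#include <pthread.h>
#include <stdbool.h>

struct table_model {
    pthread_mutex_t mutex;        /* serializes resizes, like ht->mutex */
    unsigned int    size;
    unsigned int    nelems;       /* atomic in the real code */
    bool (*grow_decision)(unsigned int nelems, unsigned int size);
    bool (*shrink_decision)(unsigned int nelems, unsigned int size);
};

static void expand(struct table_model *t)  { t->size *= 2; }
static void shrink(struct table_model *t)  { t->size /= 2; }

/* Runs from a worker thread, never from the insert/remove fast path. */
static void deferred_resize_worker(struct table_model *t)
{
    pthread_mutex_lock(&t->mutex);

    if (t->grow_decision && t->grow_decision(t->nelems, t->size))
        expand(t);
    else if (t->shrink_decision && t->shrink_decision(t->nelems, t->size))
        shrink(t);

    pthread_mutex_unlock(&t->mutex);
}

Keeping the resize out of the fast paths is what makes rhashtable_insert() callable from atomic context: the insert itself only takes a bucket spinlock with bottom halves disabled.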
@@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
361 */ 540 */
362bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) 541bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
363{ 542{
364 struct bucket_table *tbl = rht_dereference(ht->tbl, ht); 543 struct bucket_table *tbl;
365 struct rhash_head __rcu **pprev; 544 struct rhash_head __rcu **pprev;
366 struct rhash_head *he; 545 struct rhash_head *he;
367 u32 h; 546 spinlock_t *lock;
547 unsigned int hash;
368 548
369 ASSERT_RHT_MUTEX(ht); 549 rcu_read_lock();
550 tbl = rht_dereference_rcu(ht->tbl, ht);
551 hash = head_hashfn(ht, tbl, obj);
370 552
371 h = head_hashfn(ht, tbl, obj); 553 lock = bucket_lock(tbl, hash);
554 spin_lock_bh(lock);
372 555
373 pprev = &tbl->buckets[h]; 556restart:
374 rht_for_each(he, tbl, h) { 557 pprev = &tbl->buckets[hash];
558 rht_for_each(he, tbl, hash) {
375 if (he != obj) { 559 if (he != obj) {
376 pprev = &he->next; 560 pprev = &he->next;
377 continue; 561 continue;
378 } 562 }
379 563
380 RCU_INIT_POINTER(*pprev, he->next); 564 rcu_assign_pointer(*pprev, obj->next);
381 ht->nelems--; 565 atomic_dec(&ht->nelems);
382 566
383 if (ht->p.shrink_decision && 567 spin_unlock_bh(lock);
568
569 if (ht->tbl != ht->future_tbl &&
570 ht->p.shrink_decision &&
384 ht->p.shrink_decision(ht, tbl->size)) 571 ht->p.shrink_decision(ht, tbl->size))
385 rhashtable_shrink(ht); 572 schedule_delayed_work(&ht->run_work, 0);
573
574 rcu_read_unlock();
386 575
387 return true; 576 return true;
388 } 577 }
389 578
579 if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
580 spin_unlock_bh(lock);
581
582 tbl = rht_dereference_rcu(ht->tbl, ht);
583 hash = head_hashfn(ht, tbl, obj);
584
585 lock = bucket_lock(tbl, hash);
586 spin_lock_bh(lock);
587 goto restart;
588 }
589
590 spin_unlock_bh(lock);
591 rcu_read_unlock();
592
390 return false; 593 return false;
391} 594}
392EXPORT_SYMBOL_GPL(rhashtable_remove); 595EXPORT_SYMBOL_GPL(rhashtable_remove);
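The restart logic in rhashtable_remove() above covers the window in which a resize publishes a new main table while the search runs: if the object was not found and ht->tbl changed, the entry may only be reachable through the other table, so the removal re-hashes and retries there. A simplified user-space model of just that retry (C11 atomics standing in for RCU; every name and type here is invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; int key; };
struct table { struct node *buckets[8]; };

static bool remove_key(struct table *tbl, int key)
{
    struct node **pprev = &tbl->buckets[key & 7];

    for (; *pprev; pprev = &(*pprev)->next) {
        if ((*pprev)->key == key) {
            *pprev = (*pprev)->next;   /* unlink */
            return true;
        }
    }
    return false;
}

static bool remove_with_retry(struct table *_Atomic *current, int key)
{
    struct table *tbl = atomic_load(current);

    for (;;) {
        if (remove_key(tbl, key))
            return true;
        /* Not found: if a resize published a new main table meanwhile,
         * the entry could live there instead, so retry against it. */
        if (tbl == atomic_load(current))
            return false;
        tbl = atomic_load(current);
    }
}

int main(void)
{
    struct table t = { { NULL } };
    struct node n = { NULL, 3 };
    struct table *_Atomic current = &t;

    t.buckets[3] = &n;
    printf("removed: %d\n", remove_with_retry(&current, 3));        /* 1 */
    printf("removed again: %d\n", remove_with_retry(&current, 3));  /* 0 */
    return 0;
}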
@@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove);
402 * This lookup function may only be used for fixed key hash table (key_len 605 * This lookup function may only be used for fixed key hash table (key_len
403 * paramter set). It will BUG() if used inappropriately. 606 * paramter set). It will BUG() if used inappropriately.
404 * 607 *
405 * Lookups may occur in parallel with hash mutations as long as the lookup is 608 * Lookups may occur in parallel with hashtable mutations and resizing.
406 * guarded by rcu_read_lock(). The caller must take care of this.
407 */ 609 */
408void *rhashtable_lookup(const struct rhashtable *ht, const void *key) 610void *rhashtable_lookup(struct rhashtable *ht, const void *key)
409{ 611{
410 const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 612 const struct bucket_table *tbl, *old_tbl;
411 struct rhash_head *he; 613 struct rhash_head *he;
412 u32 h; 614 u32 hash;
413 615
414 BUG_ON(!ht->p.key_len); 616 BUG_ON(!ht->p.key_len);
415 617
416 h = key_hashfn(ht, key, ht->p.key_len); 618 rcu_read_lock();
417 rht_for_each_rcu(he, tbl, h) { 619 old_tbl = rht_dereference_rcu(ht->tbl, ht);
620 tbl = rht_dereference_rcu(ht->future_tbl, ht);
621 hash = key_hashfn(ht, key, ht->p.key_len);
622restart:
623 rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
418 if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, 624 if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
419 ht->p.key_len)) 625 ht->p.key_len))
420 continue; 626 continue;
627 rcu_read_unlock();
421 return rht_obj(ht, he); 628 return rht_obj(ht, he);
422 } 629 }
423 630
631 if (unlikely(tbl != old_tbl)) {
632 tbl = old_tbl;
633 goto restart;
634 }
635
636 rcu_read_unlock();
424 return NULL; 637 return NULL;
425} 638}
426EXPORT_SYMBOL_GPL(rhashtable_lookup); 639EXPORT_SYMBOL_GPL(rhashtable_lookup);
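During a resize, entries inserted after the future table was published exist only in that table, which is why the lookup above searches the future table first and only then falls back to the old main table. A toy user-space illustration of the two-table search order (flat arrays instead of hash chains; all names invented):

#include <stdio.h>
#include <string.h>

struct obj { const char *key; };

static struct obj *find_in(struct obj **slots, int n, const char *key)
{
    int i;

    for (i = 0; i < n; i++)
        if (slots[i] && strcmp(slots[i]->key, key) == 0)
            return slots[i];
    return NULL;
}

int main(void)
{
    struct obj a = { "old" }, b = { "new" };
    struct obj *main_tbl[2]   = { &a, NULL };   /* published before the resize */
    struct obj *future_tbl[2] = { &b, NULL };   /* receives new insertions */
    struct obj *he;

    he = find_in(future_tbl, 2, "new");         /* future table first */
    if (!he)
        he = find_in(main_tbl, 2, "new");       /* then the old main table */
    printf("found: %s\n", he ? he->key : "(none)");
    return 0;
}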
@@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
435 * Traverses the bucket chain behind the provided hash value and calls the 648 * Traverses the bucket chain behind the provided hash value and calls the
436 * specified compare function for each entry. 649 * specified compare function for each entry.
437 * 650 *
438 * Lookups may occur in parallel with hash mutations as long as the lookup is 651 * Lookups may occur in parallel with hashtable mutations and resizing.
439 * guarded by rcu_read_lock(). The caller must take care of this.
440 * 652 *
441 * Returns the first entry on which the compare function returned true. 653 * Returns the first entry on which the compare function returned true.
442 */ 654 */
443void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, 655void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
444 bool (*compare)(void *, void *), void *arg) 656 bool (*compare)(void *, void *), void *arg)
445{ 657{
446 const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 658 const struct bucket_table *tbl, *old_tbl;
447 struct rhash_head *he; 659 struct rhash_head *he;
448 u32 hash; 660 u32 hash;
449 661
662 rcu_read_lock();
663
664 old_tbl = rht_dereference_rcu(ht->tbl, ht);
665 tbl = rht_dereference_rcu(ht->future_tbl, ht);
450 hash = key_hashfn(ht, key, ht->p.key_len); 666 hash = key_hashfn(ht, key, ht->p.key_len);
451 rht_for_each_rcu(he, tbl, hash) { 667restart:
668 rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
452 if (!compare(rht_obj(ht, he), arg)) 669 if (!compare(rht_obj(ht, he), arg))
453 continue; 670 continue;
671 rcu_read_unlock();
454 return rht_obj(ht, he); 672 return rht_obj(ht, he);
455 } 673 }
456 674
675 if (unlikely(tbl != old_tbl)) {
676 tbl = old_tbl;
677 goto restart;
678 }
679 rcu_read_unlock();
680
457 return NULL; 681 return NULL;
458} 682}
459EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); 683EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
@@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
485 * .key_offset = offsetof(struct test_obj, key), 709 * .key_offset = offsetof(struct test_obj, key),
486 * .key_len = sizeof(int), 710 * .key_len = sizeof(int),
487 * .hashfn = jhash, 711 * .hashfn = jhash,
488 * #ifdef CONFIG_PROVE_LOCKING
489 * .mutex_is_held = &my_mutex_is_held,
490 * #endif
491 * }; 712 * };
492 * 713 *
493 * Configuration Example 2: Variable length keys 714 * Configuration Example 2: Variable length keys
@@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
507 * .head_offset = offsetof(struct test_obj, node), 728 * .head_offset = offsetof(struct test_obj, node),
508 * .hashfn = jhash, 729 * .hashfn = jhash,
509 * .obj_hashfn = my_hash_fn, 730 * .obj_hashfn = my_hash_fn,
510 * #ifdef CONFIG_PROVE_LOCKING
511 * .mutex_is_held = &my_mutex_is_held,
512 * #endif
513 * }; 731 * };
514 */ 732 */
515int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) 733int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
@@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
529 if (params->nelem_hint) 747 if (params->nelem_hint)
530 size = rounded_hashtable_size(params); 748 size = rounded_hashtable_size(params);
531 749
532 tbl = bucket_table_alloc(size); 750 memset(ht, 0, sizeof(*ht));
751 mutex_init(&ht->mutex);
752 memcpy(&ht->p, params, sizeof(*params));
753
754 if (params->locks_mul)
755 ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
756 else
757 ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
758
759 tbl = bucket_table_alloc(ht, size);
533 if (tbl == NULL) 760 if (tbl == NULL)
534 return -ENOMEM; 761 return -ENOMEM;
535 762
536 memset(ht, 0, sizeof(*ht));
537 ht->shift = ilog2(tbl->size); 763 ht->shift = ilog2(tbl->size);
538 memcpy(&ht->p, params, sizeof(*params));
539 RCU_INIT_POINTER(ht->tbl, tbl); 764 RCU_INIT_POINTER(ht->tbl, tbl);
765 RCU_INIT_POINTER(ht->future_tbl, tbl);
540 766
541 if (!ht->p.hash_rnd) 767 if (!ht->p.hash_rnd)
542 get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); 768 get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
543 769
770 if (ht->p.grow_decision || ht->p.shrink_decision)
771 INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);
772
544 return 0; 773 return 0;
545} 774}
546EXPORT_SYMBOL_GPL(rhashtable_init); 775EXPORT_SYMBOL_GPL(rhashtable_init);
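rhashtable_init() now copies the parameters, rounds a caller-supplied locks_mul up to a power of two (falling back to BUCKET_LOCKS_PER_CPU), and arms the deferred work item only when a grow or shrink decision is supplied. In the spirit of the kerneldoc configuration examples earlier in this file, an initialization might look like the sketch below (struct test_obj and all numeric values are purely illustrative):

/* Illustrative configuration only; field values are not recommendations. */
struct rhashtable_params params = {
    .nelem_hint      = 1024,       /* pre-size for roughly 1024 entries */
    .head_offset     = offsetof(struct test_obj, node),
    .key_offset      = offsetof(struct test_obj, key),
    .key_len         = sizeof(int),
    .hashfn          = jhash,
    .locks_mul       = 64,         /* bucket locks per CPU, rounded up to a power of two */
    .grow_decision   = rht_grow_above_75,
    .shrink_decision = rht_shrink_below_30,
};

With the decision callbacks set, inserts and removals schedule the deferred worker themselves; direct calls to rhashtable_expand() or rhashtable_shrink(), as in the self-test further down, must now be made under ht->mutex.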
@@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init);
553 * has to make sure that no resizing may happen by unpublishing the hashtable 782 * has to make sure that no resizing may happen by unpublishing the hashtable
554 * and waiting for the quiescent cycle before releasing the bucket array. 783 * and waiting for the quiescent cycle before releasing the bucket array.
555 */ 784 */
556void rhashtable_destroy(const struct rhashtable *ht) 785void rhashtable_destroy(struct rhashtable *ht)
557{ 786{
558 bucket_table_free(ht->tbl); 787 ht->being_destroyed = true;
788
789 mutex_lock(&ht->mutex);
790
791 cancel_delayed_work(&ht->run_work);
792 bucket_table_free(rht_dereference(ht->tbl, ht));
793
794 mutex_unlock(&ht->mutex);
559} 795}
560EXPORT_SYMBOL_GPL(rhashtable_destroy); 796EXPORT_SYMBOL_GPL(rhashtable_destroy);
561 797
@@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy);
570#define TEST_PTR ((void *) 0xdeadbeef) 806#define TEST_PTR ((void *) 0xdeadbeef)
571#define TEST_NEXPANDS 4 807#define TEST_NEXPANDS 4
572 808
573#ifdef CONFIG_PROVE_LOCKING
574static int test_mutex_is_held(void *parent)
575{
576 return 1;
577}
578#endif
579
580struct test_obj { 809struct test_obj {
581 void *ptr; 810 void *ptr;
582 int value; 811 int value;
@@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet)
646 i, tbl->buckets[i], cnt); 875 i, tbl->buckets[i], cnt);
647 } 876 }
648 877
649 pr_info(" Traversal complete: counted=%u, nelems=%zu, entries=%d\n", 878 pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n",
650 total, ht->nelems, TEST_ENTRIES); 879 total, atomic_read(&ht->nelems), TEST_ENTRIES);
651 880
652 if (total != ht->nelems || total != TEST_ENTRIES) 881 if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
653 pr_warn("Test failed: Total count mismatch ^^^"); 882 pr_warn("Test failed: Total count mismatch ^^^");
654} 883}
655 884
@@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
688 917
689 for (i = 0; i < TEST_NEXPANDS; i++) { 918 for (i = 0; i < TEST_NEXPANDS; i++) {
690 pr_info(" Table expansion iteration %u...\n", i); 919 pr_info(" Table expansion iteration %u...\n", i);
920 mutex_lock(&ht->mutex);
691 rhashtable_expand(ht); 921 rhashtable_expand(ht);
922 mutex_unlock(&ht->mutex);
692 923
693 rcu_read_lock(); 924 rcu_read_lock();
694 pr_info(" Verifying lookups...\n"); 925 pr_info(" Verifying lookups...\n");
@@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
698 929
699 for (i = 0; i < TEST_NEXPANDS; i++) { 930 for (i = 0; i < TEST_NEXPANDS; i++) {
700 pr_info(" Table shrinkage iteration %u...\n", i); 931 pr_info(" Table shrinkage iteration %u...\n", i);
932 mutex_lock(&ht->mutex);
701 rhashtable_shrink(ht); 933 rhashtable_shrink(ht);
934 mutex_unlock(&ht->mutex);
702 935
703 rcu_read_lock(); 936 rcu_read_lock();
704 pr_info(" Verifying lookups...\n"); 937 pr_info(" Verifying lookups...\n");
@@ -741,9 +974,6 @@ static int __init test_rht_init(void)
741 .key_offset = offsetof(struct test_obj, value), 974 .key_offset = offsetof(struct test_obj, value),
742 .key_len = sizeof(int), 975 .key_len = sizeof(int),
743 .hashfn = jhash, 976 .hashfn = jhash,
744#ifdef CONFIG_PROVE_LOCKING
745 .mutex_is_held = &test_mutex_is_held,
746#endif
747 .grow_decision = rht_grow_above_75, 977 .grow_decision = rht_grow_above_75,
748 .shrink_decision = rht_shrink_below_30, 978 .shrink_decision = rht_shrink_below_30,
749 }; 979 };