diff options
author | Thomas Graf <tgraf@suug.ch> | 2015-01-02 17:00:20 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-01-03 14:32:57 -0500 |
commit | 97defe1ecf868b8127f8e62395499d6a06e4c4b1 (patch) | |
tree | d3ed6d3db4943e01b1ae58e73580537ba1642d9e /lib/rhashtable.c | |
parent | 113948d841e8d78039e5dbbb5248f5b73e99eafa (diff) |
rhashtable: Per bucket locks & deferred expansion/shrinking
Introduces an array of spinlocks to protect bucket mutations. The number
of spinlocks per CPU is configurable and selected based on the hash of
the bucket. This allows for parallel insertions and removals of entries
which do not share a lock.
The patch also defers expansion and shrinking to a worker queue which
allows insertion and removal from atomic context. Insertions and
deletions may occur in parallel to it and are only held up briefly
while the particular bucket is linked or unzipped.
Mutations of the bucket table pointer are protected by a new mutex; read
access is RCU protected.
In the event of an expansion or shrinking, the new bucket table allocated
is exposed as a so called future table as soon as the resize process
starts. Lookups, deletions, and insertions will briefly use both tables.
The future table becomes the main table after an RCU grace period and
initial linking of the old to the new table was performed. Optimization
of the chains to make use of the new number of buckets follows only once
the new table is in use.
The side effect of this is that during that RCU grace period, a bucket
traversal using any rht_for_each() variant on the main table will not see
any insertions performed during the RCU grace period which would at that
point land in the future table. The lookup will see them as it searches
both tables if needed.
Having multiple insertions and removals occur in parallel requires nelems
to become an atomic counter.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'lib/rhashtable.c')
-rw-r--r-- | lib/rhashtable.c | 458 |
1 files changed, 344 insertions, 114 deletions
diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e6b85c4a5828..312e3437c7bc 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c | |||
@@ -26,19 +26,42 @@ | |||
26 | 26 | ||
27 | #define HASH_DEFAULT_SIZE 64UL | 27 | #define HASH_DEFAULT_SIZE 64UL |
28 | #define HASH_MIN_SIZE 4UL | 28 | #define HASH_MIN_SIZE 4UL |
29 | #define BUCKET_LOCKS_PER_CPU 128UL | ||
30 | |||
31 | enum { | ||
32 | RHT_LOCK_NORMAL, | ||
33 | RHT_LOCK_NESTED, | ||
34 | RHT_LOCK_NESTED2, | ||
35 | }; | ||
36 | |||
37 | /* The bucket lock is selected based on the hash and protects mutations | ||
38 | * on a group of hash buckets. | ||
39 | * | ||
40 | * IMPORTANT: When holding the bucket lock of both the old and new table | ||
41 | * during expansions and shrinking, the old bucket lock must always be | ||
42 | * acquired first. | ||
43 | */ | ||
44 | static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) | ||
45 | { | ||
46 | return &tbl->locks[hash & tbl->locks_mask]; | ||
47 | } | ||
29 | 48 | ||
30 | #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) | 49 | #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) |
50 | #define ASSERT_BUCKET_LOCK(TBL, HASH) \ | ||
51 | BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH)) | ||
31 | 52 | ||
32 | #ifdef CONFIG_PROVE_LOCKING | 53 | #ifdef CONFIG_PROVE_LOCKING |
33 | int lockdep_rht_mutex_is_held(const struct rhashtable *ht) | 54 | int lockdep_rht_mutex_is_held(struct rhashtable *ht) |
34 | { | 55 | { |
35 | return ht->p.mutex_is_held(ht->p.parent); | 56 | return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; |
36 | } | 57 | } |
37 | EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); | 58 | EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); |
38 | 59 | ||
39 | int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) | 60 | int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) |
40 | { | 61 | { |
41 | return 1; | 62 | spinlock_t *lock = bucket_lock(tbl, hash); |
63 | |||
64 | return (debug_locks) ? lockdep_is_held(lock) : 1; | ||
42 | } | 65 | } |
43 | EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); | 66 | EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); |
44 | #endif | 67 | #endif |
@@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) | |||
66 | return hash; | 89 | return hash; |
67 | } | 90 | } |
68 | 91 | ||
69 | static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) | 92 | static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) |
70 | { | 93 | { |
71 | struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); | 94 | struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); |
72 | u32 hash; | 95 | u32 hash; |
@@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) | |||
95 | return pprev; | 118 | return pprev; |
96 | } | 119 | } |
97 | 120 | ||
98 | static struct bucket_table *bucket_table_alloc(size_t nbuckets) | 121 | static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) |
122 | { | ||
123 | unsigned int i, size; | ||
124 | #if defined(CONFIG_PROVE_LOCKING) | ||
125 | unsigned int nr_pcpus = 2; | ||
126 | #else | ||
127 | unsigned int nr_pcpus = num_possible_cpus(); | ||
128 | #endif | ||
129 | |||
130 | nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); | ||
131 | size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); | ||
132 | |||
133 | /* Never allocate more than one lock per bucket */ | ||
134 | size = min_t(unsigned int, size, tbl->size); | ||
135 | |||
136 | if (sizeof(spinlock_t) != 0) { | ||
137 | #ifdef CONFIG_NUMA | ||
138 | if (size * sizeof(spinlock_t) > PAGE_SIZE) | ||
139 | tbl->locks = vmalloc(size * sizeof(spinlock_t)); | ||
140 | else | ||
141 | #endif | ||
142 | tbl->locks = kmalloc_array(size, sizeof(spinlock_t), | ||
143 | GFP_KERNEL); | ||
144 | if (!tbl->locks) | ||
145 | return -ENOMEM; | ||
146 | for (i = 0; i < size; i++) | ||
147 | spin_lock_init(&tbl->locks[i]); | ||
148 | } | ||
149 | tbl->locks_mask = size - 1; | ||
150 | |||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | static void bucket_table_free(const struct bucket_table *tbl) | ||
155 | { | ||
156 | if (tbl) | ||
157 | kvfree(tbl->locks); | ||
158 | |||
159 | kvfree(tbl); | ||
160 | } | ||
161 | |||
162 | static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, | ||
163 | size_t nbuckets) | ||
99 | { | 164 | { |
100 | struct bucket_table *tbl; | 165 | struct bucket_table *tbl; |
101 | size_t size; | 166 | size_t size; |
@@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets) | |||
110 | 175 | ||
111 | tbl->size = nbuckets; | 176 | tbl->size = nbuckets; |
112 | 177 | ||
113 | return tbl; | 178 | if (alloc_bucket_locks(ht, tbl) < 0) { |
114 | } | 179 | bucket_table_free(tbl); |
180 | return NULL; | ||
181 | } | ||
115 | 182 | ||
116 | static void bucket_table_free(const struct bucket_table *tbl) | 183 | return tbl; |
117 | { | ||
118 | kvfree(tbl); | ||
119 | } | 184 | } |
120 | 185 | ||
121 | /** | 186 | /** |
@@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl) | |||
126 | bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) | 191 | bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) |
127 | { | 192 | { |
128 | /* Expand table when exceeding 75% load */ | 193 | /* Expand table when exceeding 75% load */ |
129 | return ht->nelems > (new_size / 4 * 3); | 194 | return atomic_read(&ht->nelems) > (new_size / 4 * 3); |
130 | } | 195 | } |
131 | EXPORT_SYMBOL_GPL(rht_grow_above_75); | 196 | EXPORT_SYMBOL_GPL(rht_grow_above_75); |
132 | 197 | ||
@@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75); | |||
138 | bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) | 203 | bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) |
139 | { | 204 | { |
140 | /* Shrink table beneath 30% load */ | 205 | /* Shrink table beneath 30% load */ |
141 | return ht->nelems < (new_size * 3 / 10); | 206 | return atomic_read(&ht->nelems) < (new_size * 3 / 10); |
142 | } | 207 | } |
143 | EXPORT_SYMBOL_GPL(rht_shrink_below_30); | 208 | EXPORT_SYMBOL_GPL(rht_shrink_below_30); |
144 | 209 | ||
145 | static void hashtable_chain_unzip(const struct rhashtable *ht, | 210 | static void hashtable_chain_unzip(const struct rhashtable *ht, |
146 | const struct bucket_table *new_tbl, | 211 | const struct bucket_table *new_tbl, |
147 | struct bucket_table *old_tbl, size_t n) | 212 | struct bucket_table *old_tbl, |
213 | size_t old_hash) | ||
148 | { | 214 | { |
149 | struct rhash_head *he, *p, *next; | 215 | struct rhash_head *he, *p, *next; |
150 | unsigned int h; | 216 | spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL; |
217 | unsigned int new_hash, new_hash2; | ||
218 | |||
219 | ASSERT_BUCKET_LOCK(old_tbl, old_hash); | ||
151 | 220 | ||
152 | /* Old bucket empty, no work needed. */ | 221 | /* Old bucket empty, no work needed. */ |
153 | p = rht_dereference(old_tbl->buckets[n], ht); | 222 | p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, |
223 | old_hash); | ||
154 | if (!p) | 224 | if (!p) |
155 | return; | 225 | return; |
156 | 226 | ||
227 | new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); | ||
228 | new_bucket_lock = bucket_lock(new_tbl, new_hash); | ||
229 | |||
157 | /* Advance the old bucket pointer one or more times until it | 230 | /* Advance the old bucket pointer one or more times until it |
158 | * reaches a node that doesn't hash to the same bucket as the | 231 | * reaches a node that doesn't hash to the same bucket as the |
159 | * previous node p. Call the previous node p; | 232 | * previous node p. Call the previous node p; |
160 | */ | 233 | */ |
161 | h = head_hashfn(ht, new_tbl, p); | 234 | rht_for_each_continue(he, p->next, old_tbl, old_hash) { |
162 | rht_for_each_continue(he, p->next, old_tbl, n) { | 235 | new_hash2 = head_hashfn(ht, new_tbl, he); |
163 | if (head_hashfn(ht, new_tbl, he) != h) | 236 | if (new_hash != new_hash2) |
164 | break; | 237 | break; |
165 | p = he; | 238 | p = he; |
166 | } | 239 | } |
167 | RCU_INIT_POINTER(old_tbl->buckets[n], p->next); | 240 | rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); |
241 | |||
242 | spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); | ||
243 | |||
244 | /* If we have encountered an entry that maps to a different bucket in | ||
245 | * the new table, lock down that bucket as well as we might cut off | ||
246 | * the end of the chain. | ||
247 | */ | ||
248 | new_bucket_lock2 = bucket_lock(new_tbl, new_hash); | ||
249 | if (new_bucket_lock != new_bucket_lock2) | ||
250 | spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2); | ||
168 | 251 | ||
169 | /* Find the subsequent node which does hash to the same | 252 | /* Find the subsequent node which does hash to the same |
170 | * bucket as node P, or NULL if no such node exists. | 253 | * bucket as node P, or NULL if no such node exists. |
171 | */ | 254 | */ |
172 | next = NULL; | 255 | next = NULL; |
173 | if (he) { | 256 | if (he) { |
174 | rht_for_each_continue(he, he->next, old_tbl, n) { | 257 | rht_for_each_continue(he, he->next, old_tbl, old_hash) { |
175 | if (head_hashfn(ht, new_tbl, he) == h) { | 258 | if (head_hashfn(ht, new_tbl, he) == new_hash) { |
176 | next = he; | 259 | next = he; |
177 | break; | 260 | break; |
178 | } | 261 | } |
@@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, | |||
182 | /* Set p's next pointer to that subsequent node pointer, | 265 | /* Set p's next pointer to that subsequent node pointer, |
183 | * bypassing the nodes which do not hash to p's bucket | 266 | * bypassing the nodes which do not hash to p's bucket |
184 | */ | 267 | */ |
185 | RCU_INIT_POINTER(p->next, next); | 268 | rcu_assign_pointer(p->next, next); |
269 | |||
270 | if (new_bucket_lock != new_bucket_lock2) | ||
271 | spin_unlock_bh(new_bucket_lock2); | ||
272 | spin_unlock_bh(new_bucket_lock); | ||
273 | } | ||
274 | |||
275 | static void link_old_to_new(struct bucket_table *new_tbl, | ||
276 | unsigned int new_hash, struct rhash_head *entry) | ||
277 | { | ||
278 | spinlock_t *new_bucket_lock; | ||
279 | |||
280 | new_bucket_lock = bucket_lock(new_tbl, new_hash); | ||
281 | |||
282 | spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); | ||
283 | rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); | ||
284 | spin_unlock_bh(new_bucket_lock); | ||
186 | } | 285 | } |
187 | 286 | ||
188 | /** | 287 | /** |
@@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, | |||
195 | * This function may only be called in a context where it is safe to call | 294 | * This function may only be called in a context where it is safe to call |
196 | * synchronize_rcu(), e.g. not within a rcu_read_lock() section. | 295 | * synchronize_rcu(), e.g. not within a rcu_read_lock() section. |
197 | * | 296 | * |
198 | * The caller must ensure that no concurrent table mutations take place. | 297 | * The caller must ensure that no concurrent resizing occurs by holding |
199 | * It is however valid to have concurrent lookups if they are RCU protected. | 298 | * ht->mutex. |
299 | * | ||
300 | * It is valid to have concurrent insertions and deletions protected by per | ||
301 | * bucket locks or concurrent RCU protected lookups and traversals. | ||
200 | */ | 302 | */ |
201 | int rhashtable_expand(struct rhashtable *ht) | 303 | int rhashtable_expand(struct rhashtable *ht) |
202 | { | 304 | { |
203 | struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); | 305 | struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); |
204 | struct rhash_head *he; | 306 | struct rhash_head *he; |
205 | unsigned int i, h; | 307 | spinlock_t *old_bucket_lock; |
206 | bool complete; | 308 | unsigned int new_hash, old_hash; |
309 | bool complete = false; | ||
207 | 310 | ||
208 | ASSERT_RHT_MUTEX(ht); | 311 | ASSERT_RHT_MUTEX(ht); |
209 | 312 | ||
210 | if (ht->p.max_shift && ht->shift >= ht->p.max_shift) | 313 | if (ht->p.max_shift && ht->shift >= ht->p.max_shift) |
211 | return 0; | 314 | return 0; |
212 | 315 | ||
213 | new_tbl = bucket_table_alloc(old_tbl->size * 2); | 316 | new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); |
214 | if (new_tbl == NULL) | 317 | if (new_tbl == NULL) |
215 | return -ENOMEM; | 318 | return -ENOMEM; |
216 | 319 | ||
217 | ht->shift++; | 320 | ht->shift++; |
218 | 321 | ||
219 | /* For each new bucket, search the corresponding old bucket | 322 | /* Make insertions go into the new, empty table right away. Deletions |
220 | * for the first entry that hashes to the new bucket, and | 323 | * and lookups will be attempted in both tables until we synchronize. |
221 | * link the new bucket to that entry. Since all the entries | 324 | * The synchronize_rcu() guarantees for the new table to be picked up |
222 | * which will end up in the new bucket appear in the same | 325 | * so no new additions go into the old table while we relink. |
223 | * old bucket, this constructs an entirely valid new hash | 326 | */ |
224 | * table, but with multiple buckets "zipped" together into a | 327 | rcu_assign_pointer(ht->future_tbl, new_tbl); |
225 | * single imprecise chain. | 328 | synchronize_rcu(); |
329 | |||
330 | /* For each new bucket, search the corresponding old bucket for the | ||
331 | * first entry that hashes to the new bucket, and link the end of | ||
332 | * newly formed bucket chain (containing entries added to future | ||
333 | * table) to that entry. Since all the entries which will end up in | ||
334 | * the new bucket appear in the same old bucket, this constructs an | ||
335 | * entirely valid new hash table, but with multiple buckets | ||
336 | * "zipped" together into a single imprecise chain. | ||
226 | */ | 337 | */ |
227 | for (i = 0; i < new_tbl->size; i++) { | 338 | for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { |
228 | h = rht_bucket_index(old_tbl, i); | 339 | old_hash = rht_bucket_index(old_tbl, new_hash); |
229 | rht_for_each(he, old_tbl, h) { | 340 | old_bucket_lock = bucket_lock(old_tbl, old_hash); |
230 | if (head_hashfn(ht, new_tbl, he) == i) { | 341 | |
231 | RCU_INIT_POINTER(new_tbl->buckets[i], he); | 342 | spin_lock_bh(old_bucket_lock); |
343 | rht_for_each(he, old_tbl, old_hash) { | ||
344 | if (head_hashfn(ht, new_tbl, he) == new_hash) { | ||
345 | link_old_to_new(new_tbl, new_hash, he); | ||
232 | break; | 346 | break; |
233 | } | 347 | } |
234 | } | 348 | } |
349 | spin_unlock_bh(old_bucket_lock); | ||
235 | } | 350 | } |
236 | 351 | ||
237 | /* Publish the new table pointer. Lookups may now traverse | 352 | /* Publish the new table pointer. Lookups may now traverse |
@@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht) | |||
241 | rcu_assign_pointer(ht->tbl, new_tbl); | 356 | rcu_assign_pointer(ht->tbl, new_tbl); |
242 | 357 | ||
243 | /* Unzip interleaved hash chains */ | 358 | /* Unzip interleaved hash chains */ |
244 | do { | 359 | while (!complete && !ht->being_destroyed) { |
245 | /* Wait for readers. All new readers will see the new | 360 | /* Wait for readers. All new readers will see the new |
246 | * table, and thus no references to the old table will | 361 | * table, and thus no references to the old table will |
247 | * remain. | 362 | * remain. |
@@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht) | |||
253 | * table): ... | 368 | * table): ... |
254 | */ | 369 | */ |
255 | complete = true; | 370 | complete = true; |
256 | for (i = 0; i < old_tbl->size; i++) { | 371 | for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { |
257 | hashtable_chain_unzip(ht, new_tbl, old_tbl, i); | 372 | old_bucket_lock = bucket_lock(old_tbl, old_hash); |
258 | if (old_tbl->buckets[i] != NULL) | 373 | spin_lock_bh(old_bucket_lock); |
374 | |||
375 | hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); | ||
376 | if (old_tbl->buckets[old_hash] != NULL) | ||
259 | complete = false; | 377 | complete = false; |
378 | |||
379 | spin_unlock_bh(old_bucket_lock); | ||
260 | } | 380 | } |
261 | } while (!complete); | 381 | } |
262 | 382 | ||
263 | bucket_table_free(old_tbl); | 383 | bucket_table_free(old_tbl); |
264 | return 0; | 384 | return 0; |
@@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); | |||
272 | * This function may only be called in a context where it is safe to call | 392 | * This function may only be called in a context where it is safe to call |
273 | * synchronize_rcu(), e.g. not within a rcu_read_lock() section. | 393 | * synchronize_rcu(), e.g. not within a rcu_read_lock() section. |
274 | * | 394 | * |
395 | * The caller must ensure that no concurrent resizing occurs by holding | ||
396 | * ht->mutex. | ||
397 | * | ||
275 | * The caller must ensure that no concurrent table mutations take place. | 398 | * The caller must ensure that no concurrent table mutations take place. |
276 | * It is however valid to have concurrent lookups if they are RCU protected. | 399 | * It is however valid to have concurrent lookups if they are RCU protected. |
400 | * | ||
401 | * It is valid to have concurrent insertions and deletions protected by per | ||
402 | * bucket locks or concurrent RCU protected lookups and traversals. | ||
277 | */ | 403 | */ |
278 | int rhashtable_shrink(struct rhashtable *ht) | 404 | int rhashtable_shrink(struct rhashtable *ht) |
279 | { | 405 | { |
280 | struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht); | 406 | struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); |
281 | unsigned int i; | 407 | spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2; |
408 | unsigned int new_hash; | ||
282 | 409 | ||
283 | ASSERT_RHT_MUTEX(ht); | 410 | ASSERT_RHT_MUTEX(ht); |
284 | 411 | ||
285 | if (ht->shift <= ht->p.min_shift) | 412 | if (ht->shift <= ht->p.min_shift) |
286 | return 0; | 413 | return 0; |
287 | 414 | ||
288 | ntbl = bucket_table_alloc(tbl->size / 2); | 415 | new_tbl = bucket_table_alloc(ht, tbl->size / 2); |
289 | if (ntbl == NULL) | 416 | if (new_tbl == NULL) |
290 | return -ENOMEM; | 417 | return -ENOMEM; |
291 | 418 | ||
292 | ht->shift--; | 419 | rcu_assign_pointer(ht->future_tbl, new_tbl); |
420 | synchronize_rcu(); | ||
293 | 421 | ||
294 | /* Link each bucket in the new table to the first bucket | 422 | /* Link the first entry in the old bucket to the end of the |
295 | * in the old table that contains entries which will hash | 423 | * bucket in the new table. As entries are concurrently being |
296 | * to the new bucket. | 424 | * added to the new table, lock down the new bucket. As we |
425 | * always divide the size in half when shrinking, each bucket | ||
426 | * in the new table maps to exactly two buckets in the old | ||
427 | * table. | ||
428 | * | ||
429 | * As removals can occur concurrently on the old table, we need | ||
430 | * to lock down both matching buckets in the old table. | ||
297 | */ | 431 | */ |
298 | for (i = 0; i < ntbl->size; i++) { | 432 | for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { |
299 | ntbl->buckets[i] = tbl->buckets[i]; | 433 | old_bucket_lock1 = bucket_lock(tbl, new_hash); |
300 | RCU_INIT_POINTER(*bucket_tail(ntbl, i), | 434 | old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size); |
301 | tbl->buckets[i + ntbl->size]); | 435 | new_bucket_lock = bucket_lock(new_tbl, new_hash); |
302 | 436 | ||
437 | spin_lock_bh(old_bucket_lock1); | ||
438 | spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED); | ||
439 | spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2); | ||
440 | |||
441 | rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), | ||
442 | tbl->buckets[new_hash]); | ||
443 | rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), | ||
444 | tbl->buckets[new_hash + new_tbl->size]); | ||
445 | |||
446 | spin_unlock_bh(new_bucket_lock); | ||
447 | spin_unlock_bh(old_bucket_lock2); | ||
448 | spin_unlock_bh(old_bucket_lock1); | ||
303 | } | 449 | } |
304 | 450 | ||
305 | /* Publish the new, valid hash table */ | 451 | /* Publish the new, valid hash table */ |
306 | rcu_assign_pointer(ht->tbl, ntbl); | 452 | rcu_assign_pointer(ht->tbl, new_tbl); |
453 | ht->shift--; | ||
307 | 454 | ||
308 | /* Wait for readers. No new readers will have references to the | 455 | /* Wait for readers. No new readers will have references to the |
309 | * old hash table. | 456 | * old hash table. |
@@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht) | |||
316 | } | 463 | } |
317 | EXPORT_SYMBOL_GPL(rhashtable_shrink); | 464 | EXPORT_SYMBOL_GPL(rhashtable_shrink); |
318 | 465 | ||
466 | static void rht_deferred_worker(struct work_struct *work) | ||
467 | { | ||
468 | struct rhashtable *ht; | ||
469 | struct bucket_table *tbl; | ||
470 | |||
471 | ht = container_of(work, struct rhashtable, run_work.work); | ||
472 | mutex_lock(&ht->mutex); | ||
473 | tbl = rht_dereference(ht->tbl, ht); | ||
474 | |||
475 | if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) | ||
476 | rhashtable_expand(ht); | ||
477 | else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) | ||
478 | rhashtable_shrink(ht); | ||
479 | |||
480 | mutex_unlock(&ht->mutex); | ||
481 | } | ||
482 | |||
319 | /** | 483 | /** |
320 | * rhashtable_insert - insert object into hash hash table | 484 | * rhashtable_insert - insert object into hash hash table |
321 | * @ht: hash table | 485 | * @ht: hash table |
322 | * @obj: pointer to hash head inside object | 486 | * @obj: pointer to hash head inside object |
323 | * | 487 | * |
324 | * Will automatically grow the table via rhashtable_expand() if the the | 488 | * Will take a per bucket spinlock to protect against mutual mutations |
325 | * grow_decision function specified at rhashtable_init() returns true. | 489 | * on the same bucket. Multiple insertions may occur in parallel unless |
490 | * they map to the same bucket lock. | ||
326 | * | 491 | * |
327 | * The caller must ensure that no concurrent table mutations occur. It is | 492 | * It is safe to call this function from atomic context. |
328 | * however valid to have concurrent lookups if they are RCU protected. | 493 | * |
494 | * Will trigger an automatic deferred table resizing if the size grows | ||
495 | * beyond the watermark indicated by grow_decision() which can be passed | ||
496 | * to rhashtable_init(). | ||
329 | */ | 497 | */ |
330 | void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) | 498 | void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) |
331 | { | 499 | { |
332 | struct bucket_table *tbl = rht_dereference(ht->tbl, ht); | 500 | struct bucket_table *tbl; |
333 | u32 hash; | 501 | spinlock_t *lock; |
502 | unsigned hash; | ||
334 | 503 | ||
335 | ASSERT_RHT_MUTEX(ht); | 504 | rcu_read_lock(); |
336 | 505 | ||
506 | tbl = rht_dereference_rcu(ht->future_tbl, ht); | ||
337 | hash = head_hashfn(ht, tbl, obj); | 507 | hash = head_hashfn(ht, tbl, obj); |
508 | lock = bucket_lock(tbl, hash); | ||
509 | |||
510 | spin_lock_bh(lock); | ||
338 | RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); | 511 | RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); |
339 | rcu_assign_pointer(tbl->buckets[hash], obj); | 512 | rcu_assign_pointer(tbl->buckets[hash], obj); |
340 | ht->nelems++; | 513 | spin_unlock_bh(lock); |
341 | 514 | ||
342 | if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) | 515 | atomic_inc(&ht->nelems); |
343 | rhashtable_expand(ht); | 516 | |
517 | /* Only grow the table if no resizing is currently in progress. */ | ||
518 | if (ht->tbl != ht->future_tbl && | ||
519 | ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) | ||
520 | schedule_delayed_work(&ht->run_work, 0); | ||
521 | |||
522 | rcu_read_unlock(); | ||
344 | } | 523 | } |
345 | EXPORT_SYMBOL_GPL(rhashtable_insert); | 524 | EXPORT_SYMBOL_GPL(rhashtable_insert); |
346 | 525 | ||
@@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); | |||
361 | */ | 540 | */ |
362 | bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) | 541 | bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) |
363 | { | 542 | { |
364 | struct bucket_table *tbl = rht_dereference(ht->tbl, ht); | 543 | struct bucket_table *tbl; |
365 | struct rhash_head __rcu **pprev; | 544 | struct rhash_head __rcu **pprev; |
366 | struct rhash_head *he; | 545 | struct rhash_head *he; |
367 | u32 h; | 546 | spinlock_t *lock; |
547 | unsigned int hash; | ||
368 | 548 | ||
369 | ASSERT_RHT_MUTEX(ht); | 549 | rcu_read_lock(); |
550 | tbl = rht_dereference_rcu(ht->tbl, ht); | ||
551 | hash = head_hashfn(ht, tbl, obj); | ||
370 | 552 | ||
371 | h = head_hashfn(ht, tbl, obj); | 553 | lock = bucket_lock(tbl, hash); |
554 | spin_lock_bh(lock); | ||
372 | 555 | ||
373 | pprev = &tbl->buckets[h]; | 556 | restart: |
374 | rht_for_each(he, tbl, h) { | 557 | pprev = &tbl->buckets[hash]; |
558 | rht_for_each(he, tbl, hash) { | ||
375 | if (he != obj) { | 559 | if (he != obj) { |
376 | pprev = &he->next; | 560 | pprev = &he->next; |
377 | continue; | 561 | continue; |
378 | } | 562 | } |
379 | 563 | ||
380 | RCU_INIT_POINTER(*pprev, he->next); | 564 | rcu_assign_pointer(*pprev, obj->next); |
381 | ht->nelems--; | 565 | atomic_dec(&ht->nelems); |
382 | 566 | ||
383 | if (ht->p.shrink_decision && | 567 | spin_unlock_bh(lock); |
568 | |||
569 | if (ht->tbl != ht->future_tbl && | ||
570 | ht->p.shrink_decision && | ||
384 | ht->p.shrink_decision(ht, tbl->size)) | 571 | ht->p.shrink_decision(ht, tbl->size)) |
385 | rhashtable_shrink(ht); | 572 | schedule_delayed_work(&ht->run_work, 0); |
573 | |||
574 | rcu_read_unlock(); | ||
386 | 575 | ||
387 | return true; | 576 | return true; |
388 | } | 577 | } |
389 | 578 | ||
579 | if (tbl != rht_dereference_rcu(ht->tbl, ht)) { | ||
580 | spin_unlock_bh(lock); | ||
581 | |||
582 | tbl = rht_dereference_rcu(ht->tbl, ht); | ||
583 | hash = head_hashfn(ht, tbl, obj); | ||
584 | |||
585 | lock = bucket_lock(tbl, hash); | ||
586 | spin_lock_bh(lock); | ||
587 | goto restart; | ||
588 | } | ||
589 | |||
590 | spin_unlock_bh(lock); | ||
591 | rcu_read_unlock(); | ||
592 | |||
390 | return false; | 593 | return false; |
391 | } | 594 | } |
392 | EXPORT_SYMBOL_GPL(rhashtable_remove); | 595 | EXPORT_SYMBOL_GPL(rhashtable_remove); |
@@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove); | |||
402 | * This lookup function may only be used for fixed key hash table (key_len | 605 | * This lookup function may only be used for fixed key hash table (key_len |
403 | * paramter set). It will BUG() if used inappropriately. | 606 | * paramter set). It will BUG() if used inappropriately. |
404 | * | 607 | * |
405 | * Lookups may occur in parallel with hash mutations as long as the lookup is | 608 | * Lookups may occur in parallel with hashtable mutations and resizing. |
406 | * guarded by rcu_read_lock(). The caller must take care of this. | ||
407 | */ | 609 | */ |
408 | void *rhashtable_lookup(const struct rhashtable *ht, const void *key) | 610 | void *rhashtable_lookup(struct rhashtable *ht, const void *key) |
409 | { | 611 | { |
410 | const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); | 612 | const struct bucket_table *tbl, *old_tbl; |
411 | struct rhash_head *he; | 613 | struct rhash_head *he; |
412 | u32 h; | 614 | u32 hash; |
413 | 615 | ||
414 | BUG_ON(!ht->p.key_len); | 616 | BUG_ON(!ht->p.key_len); |
415 | 617 | ||
416 | h = key_hashfn(ht, key, ht->p.key_len); | 618 | rcu_read_lock(); |
417 | rht_for_each_rcu(he, tbl, h) { | 619 | old_tbl = rht_dereference_rcu(ht->tbl, ht); |
620 | tbl = rht_dereference_rcu(ht->future_tbl, ht); | ||
621 | hash = key_hashfn(ht, key, ht->p.key_len); | ||
622 | restart: | ||
623 | rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { | ||
418 | if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, | 624 | if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, |
419 | ht->p.key_len)) | 625 | ht->p.key_len)) |
420 | continue; | 626 | continue; |
627 | rcu_read_unlock(); | ||
421 | return rht_obj(ht, he); | 628 | return rht_obj(ht, he); |
422 | } | 629 | } |
423 | 630 | ||
631 | if (unlikely(tbl != old_tbl)) { | ||
632 | tbl = old_tbl; | ||
633 | goto restart; | ||
634 | } | ||
635 | |||
636 | rcu_read_unlock(); | ||
424 | return NULL; | 637 | return NULL; |
425 | } | 638 | } |
426 | EXPORT_SYMBOL_GPL(rhashtable_lookup); | 639 | EXPORT_SYMBOL_GPL(rhashtable_lookup); |
@@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); | |||
435 | * Traverses the bucket chain behind the provided hash value and calls the | 648 | * Traverses the bucket chain behind the provided hash value and calls the |
436 | * specified compare function for each entry. | 649 | * specified compare function for each entry. |
437 | * | 650 | * |
438 | * Lookups may occur in parallel with hash mutations as long as the lookup is | 651 | * Lookups may occur in parallel with hashtable mutations and resizing. |
439 | * guarded by rcu_read_lock(). The caller must take care of this. | ||
440 | * | 652 | * |
441 | * Returns the first entry on which the compare function returned true. | 653 | * Returns the first entry on which the compare function returned true. |
442 | */ | 654 | */ |
443 | void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, | 655 | void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, |
444 | bool (*compare)(void *, void *), void *arg) | 656 | bool (*compare)(void *, void *), void *arg) |
445 | { | 657 | { |
446 | const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); | 658 | const struct bucket_table *tbl, *old_tbl; |
447 | struct rhash_head *he; | 659 | struct rhash_head *he; |
448 | u32 hash; | 660 | u32 hash; |
449 | 661 | ||
662 | rcu_read_lock(); | ||
663 | |||
664 | old_tbl = rht_dereference_rcu(ht->tbl, ht); | ||
665 | tbl = rht_dereference_rcu(ht->future_tbl, ht); | ||
450 | hash = key_hashfn(ht, key, ht->p.key_len); | 666 | hash = key_hashfn(ht, key, ht->p.key_len); |
451 | rht_for_each_rcu(he, tbl, hash) { | 667 | restart: |
668 | rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { | ||
452 | if (!compare(rht_obj(ht, he), arg)) | 669 | if (!compare(rht_obj(ht, he), arg)) |
453 | continue; | 670 | continue; |
671 | rcu_read_unlock(); | ||
454 | return rht_obj(ht, he); | 672 | return rht_obj(ht, he); |
455 | } | 673 | } |
456 | 674 | ||
675 | if (unlikely(tbl != old_tbl)) { | ||
676 | tbl = old_tbl; | ||
677 | goto restart; | ||
678 | } | ||
679 | rcu_read_unlock(); | ||
680 | |||
457 | return NULL; | 681 | return NULL; |
458 | } | 682 | } |
459 | EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); | 683 | EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); |
@@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) | |||
485 | * .key_offset = offsetof(struct test_obj, key), | 709 | * .key_offset = offsetof(struct test_obj, key), |
486 | * .key_len = sizeof(int), | 710 | * .key_len = sizeof(int), |
487 | * .hashfn = jhash, | 711 | * .hashfn = jhash, |
488 | * #ifdef CONFIG_PROVE_LOCKING | ||
489 | * .mutex_is_held = &my_mutex_is_held, | ||
490 | * #endif | ||
491 | * }; | 712 | * }; |
492 | * | 713 | * |
493 | * Configuration Example 2: Variable length keys | 714 | * Configuration Example 2: Variable length keys |
@@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) | |||
507 | * .head_offset = offsetof(struct test_obj, node), | 728 | * .head_offset = offsetof(struct test_obj, node), |
508 | * .hashfn = jhash, | 729 | * .hashfn = jhash, |
509 | * .obj_hashfn = my_hash_fn, | 730 | * .obj_hashfn = my_hash_fn, |
510 | * #ifdef CONFIG_PROVE_LOCKING | ||
511 | * .mutex_is_held = &my_mutex_is_held, | ||
512 | * #endif | ||
513 | * }; | 731 | * }; |
514 | */ | 732 | */ |
515 | int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) | 733 | int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) |
@@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) | |||
529 | if (params->nelem_hint) | 747 | if (params->nelem_hint) |
530 | size = rounded_hashtable_size(params); | 748 | size = rounded_hashtable_size(params); |
531 | 749 | ||
532 | tbl = bucket_table_alloc(size); | 750 | memset(ht, 0, sizeof(*ht)); |
751 | mutex_init(&ht->mutex); | ||
752 | memcpy(&ht->p, params, sizeof(*params)); | ||
753 | |||
754 | if (params->locks_mul) | ||
755 | ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); | ||
756 | else | ||
757 | ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; | ||
758 | |||
759 | tbl = bucket_table_alloc(ht, size); | ||
533 | if (tbl == NULL) | 760 | if (tbl == NULL) |
534 | return -ENOMEM; | 761 | return -ENOMEM; |
535 | 762 | ||
536 | memset(ht, 0, sizeof(*ht)); | ||
537 | ht->shift = ilog2(tbl->size); | 763 | ht->shift = ilog2(tbl->size); |
538 | memcpy(&ht->p, params, sizeof(*params)); | ||
539 | RCU_INIT_POINTER(ht->tbl, tbl); | 764 | RCU_INIT_POINTER(ht->tbl, tbl); |
765 | RCU_INIT_POINTER(ht->future_tbl, tbl); | ||
540 | 766 | ||
541 | if (!ht->p.hash_rnd) | 767 | if (!ht->p.hash_rnd) |
542 | get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); | 768 | get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); |
543 | 769 | ||
770 | if (ht->p.grow_decision || ht->p.shrink_decision) | ||
771 | INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker); | ||
772 | |||
544 | return 0; | 773 | return 0; |
545 | } | 774 | } |
546 | EXPORT_SYMBOL_GPL(rhashtable_init); | 775 | EXPORT_SYMBOL_GPL(rhashtable_init); |
@@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init); | |||
553 | * has to make sure that no resizing may happen by unpublishing the hashtable | 782 | * has to make sure that no resizing may happen by unpublishing the hashtable |
554 | * and waiting for the quiescent cycle before releasing the bucket array. | 783 | * and waiting for the quiescent cycle before releasing the bucket array. |
555 | */ | 784 | */ |
556 | void rhashtable_destroy(const struct rhashtable *ht) | 785 | void rhashtable_destroy(struct rhashtable *ht) |
557 | { | 786 | { |
558 | bucket_table_free(ht->tbl); | 787 | ht->being_destroyed = true; |
788 | |||
789 | mutex_lock(&ht->mutex); | ||
790 | |||
791 | cancel_delayed_work(&ht->run_work); | ||
792 | bucket_table_free(rht_dereference(ht->tbl, ht)); | ||
793 | |||
794 | mutex_unlock(&ht->mutex); | ||
559 | } | 795 | } |
560 | EXPORT_SYMBOL_GPL(rhashtable_destroy); | 796 | EXPORT_SYMBOL_GPL(rhashtable_destroy); |
561 | 797 | ||
@@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy); | |||
570 | #define TEST_PTR ((void *) 0xdeadbeef) | 806 | #define TEST_PTR ((void *) 0xdeadbeef) |
571 | #define TEST_NEXPANDS 4 | 807 | #define TEST_NEXPANDS 4 |
572 | 808 | ||
573 | #ifdef CONFIG_PROVE_LOCKING | ||
574 | static int test_mutex_is_held(void *parent) | ||
575 | { | ||
576 | return 1; | ||
577 | } | ||
578 | #endif | ||
579 | |||
580 | struct test_obj { | 809 | struct test_obj { |
581 | void *ptr; | 810 | void *ptr; |
582 | int value; | 811 | int value; |
@@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) | |||
646 | i, tbl->buckets[i], cnt); | 875 | i, tbl->buckets[i], cnt); |
647 | } | 876 | } |
648 | 877 | ||
649 | pr_info(" Traversal complete: counted=%u, nelems=%zu, entries=%d\n", | 878 | pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n", |
650 | total, ht->nelems, TEST_ENTRIES); | 879 | total, atomic_read(&ht->nelems), TEST_ENTRIES); |
651 | 880 | ||
652 | if (total != ht->nelems || total != TEST_ENTRIES) | 881 | if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES) |
653 | pr_warn("Test failed: Total count mismatch ^^^"); | 882 | pr_warn("Test failed: Total count mismatch ^^^"); |
654 | } | 883 | } |
655 | 884 | ||
@@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht) | |||
688 | 917 | ||
689 | for (i = 0; i < TEST_NEXPANDS; i++) { | 918 | for (i = 0; i < TEST_NEXPANDS; i++) { |
690 | pr_info(" Table expansion iteration %u...\n", i); | 919 | pr_info(" Table expansion iteration %u...\n", i); |
920 | mutex_lock(&ht->mutex); | ||
691 | rhashtable_expand(ht); | 921 | rhashtable_expand(ht); |
922 | mutex_unlock(&ht->mutex); | ||
692 | 923 | ||
693 | rcu_read_lock(); | 924 | rcu_read_lock(); |
694 | pr_info(" Verifying lookups...\n"); | 925 | pr_info(" Verifying lookups...\n"); |
@@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht) | |||
698 | 929 | ||
699 | for (i = 0; i < TEST_NEXPANDS; i++) { | 930 | for (i = 0; i < TEST_NEXPANDS; i++) { |
700 | pr_info(" Table shrinkage iteration %u...\n", i); | 931 | pr_info(" Table shrinkage iteration %u...\n", i); |
932 | mutex_lock(&ht->mutex); | ||
701 | rhashtable_shrink(ht); | 933 | rhashtable_shrink(ht); |
934 | mutex_unlock(&ht->mutex); | ||
702 | 935 | ||
703 | rcu_read_lock(); | 936 | rcu_read_lock(); |
704 | pr_info(" Verifying lookups...\n"); | 937 | pr_info(" Verifying lookups...\n"); |
@@ -741,9 +974,6 @@ static int __init test_rht_init(void) | |||
741 | .key_offset = offsetof(struct test_obj, value), | 974 | .key_offset = offsetof(struct test_obj, value), |
742 | .key_len = sizeof(int), | 975 | .key_len = sizeof(int), |
743 | .hashfn = jhash, | 976 | .hashfn = jhash, |
744 | #ifdef CONFIG_PROVE_LOCKING | ||
745 | .mutex_is_held = &test_mutex_is_held, | ||
746 | #endif | ||
747 | .grow_decision = rht_grow_above_75, | 977 | .grow_decision = rht_grow_above_75, |
748 | .shrink_decision = rht_shrink_below_30, | 978 | .shrink_decision = rht_shrink_below_30, |
749 | }; | 979 | }; |