aboutsummaryrefslogtreecommitdiffstats
path: root/fs/mbcache.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/mbcache.c')
-rw-r--r--fs/mbcache.c1093
1 files changed, 334 insertions, 759 deletions
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
1/* 1#include <linux/spinlock.h>
2 * linux/fs/mbcache.c
3 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
4 */
5
6/*
7 * Filesystem Meta Information Block Cache (mbcache)
8 *
9 * The mbcache caches blocks of block devices that need to be located
10 * by their device/block number, as well as by other criteria (such
11 * as the block's contents).
12 *
13 * There can only be one cache entry in a cache per device and block number.
14 * Additional indexes need not be unique in this sense. The number of
15 * additional indexes (=other criteria) can be hardwired at compile time
16 * or specified at cache create time.
17 *
18 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
19 * in the cache. A valid entry is in the main hash tables of the cache,
20 * and may also be in the lru list. An invalid entry is not in any hashes
21 * or lists.
22 *
23 * A valid cache entry is only in the lru list if no handles refer to it.
24 * Invalid cache entries will be freed when the last handle to the cache
25 * entry is released. Entries that cannot be freed immediately are put
26 * back on the lru list.
27 */
28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
45 * lock), and mb_cach_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
47 * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52 * index hash chian, it needs to lock the corresponding hash chain. For each
53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59 * block hash chain and also no longer being referenced, both e_used,
60 * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
64#include <linux/kernel.h>
65#include <linux/module.h>
66
67#include <linux/hash.h>
68#include <linux/fs.h>
69#include <linux/mm.h>
70#include <linux/slab.h> 2#include <linux/slab.h>
71#include <linux/sched.h> 3#include <linux/list.h>
72#include <linux/list_bl.h> 4#include <linux/list_bl.h>
5#include <linux/module.h>
6#include <linux/sched.h>
7#include <linux/workqueue.h>
73#include <linux/mbcache.h> 8#include <linux/mbcache.h>
74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
76#include <linux/log2.h>
77
78#ifdef MB_CACHE_DEBUG
79# define mb_debug(f...) do { \
80 printk(KERN_DEBUG f); \
81 printk("\n"); \
82 } while (0)
83#define mb_assert(c) do { if (!(c)) \
84 printk(KERN_ERR "assertion " #c " failed\n"); \
85 } while(0)
86#else
87# define mb_debug(f...) do { } while(0)
88# define mb_assert(c) do { } while(0)
89#endif
90#define mb_error(f...) do { \
91 printk(KERN_ERR f); \
92 printk("\n"); \
93 } while(0)
94
95#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
96
97#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
98#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
99 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
100
101static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
102static struct blockgroup_lock *mb_cache_bg_lock;
103static struct kmem_cache *mb_cache_kmem_cache;
104
105MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
106MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
107MODULE_LICENSE("GPL");
108
109EXPORT_SYMBOL(mb_cache_create);
110EXPORT_SYMBOL(mb_cache_shrink);
111EXPORT_SYMBOL(mb_cache_destroy);
112EXPORT_SYMBOL(mb_cache_entry_alloc);
113EXPORT_SYMBOL(mb_cache_entry_insert);
114EXPORT_SYMBOL(mb_cache_entry_release);
115EXPORT_SYMBOL(mb_cache_entry_free);
116EXPORT_SYMBOL(mb_cache_entry_get);
117#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
118EXPORT_SYMBOL(mb_cache_entry_find_first);
119EXPORT_SYMBOL(mb_cache_entry_find_next);
120#endif
121 9
122/* 10/*
123 * Global data: list of all mbcache's, lru list, and a spinlock for 11 * Mbcache is a simple key-value store. Keys need not be unique, however
124 * accessing cache data structures on SMP machines. The lru list is 12 * key-value pairs are expected to be unique (we use this fact in
125 * global across all mbcaches. 13 * mb_cache_entry_delete_block()).
14 *
15 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
16 * They use hash of a block contents as a key and block number as a value.
17 * That's why keys need not be unique (different xattr blocks may end up having
18 * the same hash). However block number always uniquely identifies a cache
19 * entry.
20 *
21 * We provide functions for creation and removal of entries, search by key,
22 * and a special "delete entry with given key-value pair" operation. Fixed
23 * size hash table is used for fast key lookups.
126 */ 24 */
127 25
128static LIST_HEAD(mb_cache_list); 26struct mb_cache {
129static LIST_HEAD(mb_cache_lru_list); 27 /* Hash table of entries */
130static DEFINE_SPINLOCK(mb_cache_spinlock); 28 struct hlist_bl_head *c_hash;
131 29 /* log2 of hash table size */
132static inline void 30 int c_bucket_bits;
133__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) 31 /* Maximum entries in cache to avoid degrading hash too much */
134{ 32 int c_max_entries;
135 spin_lock(bgl_lock_ptr(mb_cache_bg_lock, 33 /* Protects c_list, c_entry_count */
136 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 34 spinlock_t c_list_lock;
137} 35 struct list_head c_list;
138 36 /* Number of entries in cache */
139static inline void 37 unsigned long c_entry_count;
140__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) 38 struct shrinker c_shrink;
141{ 39 /* Work for shrinking when the cache has too many entries */
142 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, 40 struct work_struct c_shrink_work;
143 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 41};
144}
145
146static inline int
147__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
148{
149 return !hlist_bl_unhashed(&ce->e_block_list);
150}
151 42
43static struct kmem_cache *mb_entry_cache;
152 44
153static inline void 45static unsigned long mb_cache_shrink(struct mb_cache *cache,
154__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) 46 unsigned int nr_to_scan);
155{
156 if (__mb_cache_entry_is_block_hashed(ce))
157 hlist_bl_del_init(&ce->e_block_list);
158}
159 47
160static inline int 48static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
161__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) 49 u32 key)
162{ 50{
163 return !hlist_bl_unhashed(&ce->e_index.o_list); 51 return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
164} 52}
165 53
166static inline void 54/*
167__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) 55 * Number of entries to reclaim synchronously when there are too many entries
168{ 56 * in cache
169 if (__mb_cache_entry_is_index_hashed(ce)) 57 */
170 hlist_bl_del_init(&ce->e_index.o_list); 58#define SYNC_SHRINK_BATCH 64
171}
172 59
173/* 60/*
174 * __mb_cache_entry_unhash_unlock() 61 * mb_cache_entry_create - create entry in cache
175 * 62 * @cache - cache where the entry should be created
176 * This function is called to unhash both the block and index hash 63 * @mask - gfp mask with which the entry should be allocated
177 * chain. 64 * @key - key of the entry
178 * It assumes both the block and index hash chain is locked upon entry. 65 * @block - block that contains data
179 * It also unlock both hash chains both exit 66 * @reusable - is the block reusable by other inodes?
67 *
68 * Creates entry in @cache with key @key and records that data is stored in
69 * block @block. The function returns -EBUSY if entry with the same key
70 * and for the same block already exists in cache. Otherwise 0 is returned.
180 */ 71 */
181static inline void 72int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
182__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) 73 sector_t block, bool reusable)
183{ 74{
184 __mb_cache_entry_unhash_index(ce); 75 struct mb_cache_entry *entry, *dup;
185 hlist_bl_unlock(ce->e_index_hash_p); 76 struct hlist_bl_node *dup_node;
186 __mb_cache_entry_unhash_block(ce); 77 struct hlist_bl_head *head;
187 hlist_bl_unlock(ce->e_block_hash_p); 78
79 /* Schedule background reclaim if there are too many entries */
80 if (cache->c_entry_count >= cache->c_max_entries)
81 schedule_work(&cache->c_shrink_work);
82 /* Do some sync reclaim if background reclaim cannot keep up */
83 if (cache->c_entry_count >= 2*cache->c_max_entries)
84 mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
85
86 entry = kmem_cache_alloc(mb_entry_cache, mask);
87 if (!entry)
88 return -ENOMEM;
89
90 INIT_LIST_HEAD(&entry->e_list);
91 /* One ref for hash, one ref returned */
92 atomic_set(&entry->e_refcnt, 1);
93 entry->e_key = key;
94 entry->e_block = block;
95 entry->e_reusable = reusable;
96 head = mb_cache_entry_head(cache, key);
97 hlist_bl_lock(head);
98 hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
99 if (dup->e_key == key && dup->e_block == block) {
100 hlist_bl_unlock(head);
101 kmem_cache_free(mb_entry_cache, entry);
102 return -EBUSY;
103 }
104 }
105 hlist_bl_add_head(&entry->e_hash_list, head);
106 hlist_bl_unlock(head);
107
108 spin_lock(&cache->c_list_lock);
109 list_add_tail(&entry->e_list, &cache->c_list);
110 /* Grab ref for LRU list */
111 atomic_inc(&entry->e_refcnt);
112 cache->c_entry_count++;
113 spin_unlock(&cache->c_list_lock);
114
115 return 0;
188} 116}
117EXPORT_SYMBOL(mb_cache_entry_create);
189 118
190static void 119void __mb_cache_entry_free(struct mb_cache_entry *entry)
191__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
192{ 120{
193 struct mb_cache *cache = ce->e_cache; 121 kmem_cache_free(mb_entry_cache, entry);
194
195 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
196 kmem_cache_free(cache->c_entry_cache, ce);
197 atomic_dec(&cache->c_entry_count);
198} 122}
123EXPORT_SYMBOL(__mb_cache_entry_free);
199 124
200static void 125static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
201__mb_cache_entry_release(struct mb_cache_entry *ce) 126 struct mb_cache_entry *entry,
127 u32 key)
202{ 128{
203 /* First lock the entry to serialize access to its local data. */ 129 struct mb_cache_entry *old_entry = entry;
204 __spin_lock_mb_cache_entry(ce); 130 struct hlist_bl_node *node;
205 /* Wake up all processes queuing for this cache entry. */ 131 struct hlist_bl_head *head;
206 if (ce->e_queued) 132
207 wake_up_all(&mb_cache_queue); 133 head = mb_cache_entry_head(cache, key);
208 if (ce->e_used >= MB_CACHE_WRITER) 134 hlist_bl_lock(head);
209 ce->e_used -= MB_CACHE_WRITER; 135 if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
210 /* 136 node = entry->e_hash_list.next;
211 * Make sure that all cache entries on lru_list have 137 else
212 * both e_used and e_qued of 0s. 138 node = hlist_bl_first(head);
213 */ 139 while (node) {
214 ce->e_used--; 140 entry = hlist_bl_entry(node, struct mb_cache_entry,
215 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { 141 e_hash_list);
216 if (!__mb_cache_entry_is_block_hashed(ce)) { 142 if (entry->e_key == key && entry->e_reusable) {
217 __spin_unlock_mb_cache_entry(ce); 143 atomic_inc(&entry->e_refcnt);
218 goto forget; 144 goto out;
219 } 145 }
220 /* 146 node = node->next;
221 * Need access to lru list, first drop entry lock,
222 * then reacquire the lock in the proper order.
223 */
224 spin_lock(&mb_cache_spinlock);
225 if (list_empty(&ce->e_lru_list))
226 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
227 spin_unlock(&mb_cache_spinlock);
228 } 147 }
229 __spin_unlock_mb_cache_entry(ce); 148 entry = NULL;
230 return; 149out:
231forget: 150 hlist_bl_unlock(head);
232 mb_assert(list_empty(&ce->e_lru_list)); 151 if (old_entry)
233 __mb_cache_entry_forget(ce, GFP_KERNEL); 152 mb_cache_entry_put(cache, old_entry);
153
154 return entry;
234} 155}
235 156
236/* 157/*
237 * mb_cache_shrink_scan() memory pressure callback 158 * mb_cache_entry_find_first - find the first entry in cache with given key
238 * 159 * @cache: cache where we should search
239 * This function is called by the kernel memory management when memory 160 * @key: key to look for
240 * gets low.
241 * 161 *
242 * @shrink: (ignored) 162 * Search in @cache for entry with key @key. Grabs reference to the first
243 * @sc: shrink_control passed from reclaim 163 * entry found and returns the entry.
244 *
245 * Returns the number of objects freed.
246 */ 164 */
247static unsigned long 165struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
248mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 166 u32 key)
249{ 167{
250 LIST_HEAD(free_list); 168 return __entry_find(cache, NULL, key);
251 struct mb_cache_entry *entry, *tmp;
252 int nr_to_scan = sc->nr_to_scan;
253 gfp_t gfp_mask = sc->gfp_mask;
254 unsigned long freed = 0;
255
256 mb_debug("trying to free %d entries", nr_to_scan);
257 spin_lock(&mb_cache_spinlock);
258 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
259 struct mb_cache_entry *ce =
260 list_entry(mb_cache_lru_list.next,
261 struct mb_cache_entry, e_lru_list);
262 list_del_init(&ce->e_lru_list);
263 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
264 continue;
265 spin_unlock(&mb_cache_spinlock);
266 /* Prevent any find or get operation on the entry */
267 hlist_bl_lock(ce->e_block_hash_p);
268 hlist_bl_lock(ce->e_index_hash_p);
269 /* Ignore if it is touched by a find/get */
270 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
271 !list_empty(&ce->e_lru_list)) {
272 hlist_bl_unlock(ce->e_index_hash_p);
273 hlist_bl_unlock(ce->e_block_hash_p);
274 spin_lock(&mb_cache_spinlock);
275 continue;
276 }
277 __mb_cache_entry_unhash_unlock(ce);
278 list_add_tail(&ce->e_lru_list, &free_list);
279 spin_lock(&mb_cache_spinlock);
280 }
281 spin_unlock(&mb_cache_spinlock);
282
283 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
284 __mb_cache_entry_forget(entry, gfp_mask);
285 freed++;
286 }
287 return freed;
288} 169}
170EXPORT_SYMBOL(mb_cache_entry_find_first);
289 171
290static unsigned long 172/*
291mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 173 * mb_cache_entry_find_next - find next entry in cache with the same
174 * @cache: cache where we should search
175 * @entry: entry to start search from
176 *
177 * Finds next entry in the hash chain which has the same key as @entry.
178 * If @entry is unhashed (which can happen when deletion of entry races
179 * with the search), finds the first entry in the hash chain. The function
180 * drops reference to @entry and returns with a reference to the found entry.
181 */
182struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
183 struct mb_cache_entry *entry)
292{ 184{
293 struct mb_cache *cache; 185 return __entry_find(cache, entry, entry->e_key);
294 unsigned long count = 0;
295
296 spin_lock(&mb_cache_spinlock);
297 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
298 mb_debug("cache %s (%d)", cache->c_name,
299 atomic_read(&cache->c_entry_count));
300 count += atomic_read(&cache->c_entry_count);
301 }
302 spin_unlock(&mb_cache_spinlock);
303
304 return vfs_pressure_ratio(count);
305} 186}
306 187EXPORT_SYMBOL(mb_cache_entry_find_next);
307static struct shrinker mb_cache_shrinker = {
308 .count_objects = mb_cache_shrink_count,
309 .scan_objects = mb_cache_shrink_scan,
310 .seeks = DEFAULT_SEEKS,
311};
312 188
313/* 189/*
314 * mb_cache_create() create a new cache 190 * mb_cache_entry_get - get a cache entry by block number (and key)
315 * 191 * @cache - cache we work with
316 * All entries in one cache are equal size. Cache entries may be from 192 * @key - key of block number @block
317 * multiple devices. If this is the first mbcache created, registers 193 * @block - block number
318 * the cache with kernel memory management. Returns NULL if no more
319 * memory was available.
320 *
321 * @name: name of the cache (informal)
322 * @bucket_bits: log2(number of hash buckets)
323 */ 194 */
324struct mb_cache * 195struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
325mb_cache_create(const char *name, int bucket_bits) 196 sector_t block)
326{ 197{
327 int n, bucket_count = 1 << bucket_bits; 198 struct hlist_bl_node *node;
328 struct mb_cache *cache = NULL; 199 struct hlist_bl_head *head;
329 200 struct mb_cache_entry *entry;
330 if (!mb_cache_bg_lock) { 201
331 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), 202 head = mb_cache_entry_head(cache, key);
332 GFP_KERNEL); 203 hlist_bl_lock(head);
333 if (!mb_cache_bg_lock) 204 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
334 return NULL; 205 if (entry->e_key == key && entry->e_block == block) {
335 bgl_lock_init(mb_cache_bg_lock); 206 atomic_inc(&entry->e_refcnt);
336 } 207 goto out;
337 208 }
338 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
339 if (!cache)
340 return NULL;
341 cache->c_name = name;
342 atomic_set(&cache->c_entry_count, 0);
343 cache->c_bucket_bits = bucket_bits;
344 cache->c_block_hash = kmalloc(bucket_count *
345 sizeof(struct hlist_bl_head), GFP_KERNEL);
346 if (!cache->c_block_hash)
347 goto fail;
348 for (n=0; n<bucket_count; n++)
349 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
350 cache->c_index_hash = kmalloc(bucket_count *
351 sizeof(struct hlist_bl_head), GFP_KERNEL);
352 if (!cache->c_index_hash)
353 goto fail;
354 for (n=0; n<bucket_count; n++)
355 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
356 if (!mb_cache_kmem_cache) {
357 mb_cache_kmem_cache = kmem_cache_create(name,
358 sizeof(struct mb_cache_entry), 0,
359 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
360 if (!mb_cache_kmem_cache)
361 goto fail2;
362 } 209 }
363 cache->c_entry_cache = mb_cache_kmem_cache; 210 entry = NULL;
364 211out:
365 /* 212 hlist_bl_unlock(head);
366 * Set an upper limit on the number of cache entries so that the hash 213 return entry;
367 * chains won't grow too long.
368 */
369 cache->c_max_entries = bucket_count << 4;
370
371 spin_lock(&mb_cache_spinlock);
372 list_add(&cache->c_cache_list, &mb_cache_list);
373 spin_unlock(&mb_cache_spinlock);
374 return cache;
375
376fail2:
377 kfree(cache->c_index_hash);
378
379fail:
380 kfree(cache->c_block_hash);
381 kfree(cache);
382 return NULL;
383} 214}
215EXPORT_SYMBOL(mb_cache_entry_get);
384 216
385 217/* mb_cache_entry_delete_block - remove information about block from cache
386/* 218 * @cache - cache we work with
387 * mb_cache_shrink() 219 * @key - key of block @block
388 * 220 * @block - block number
389 * Removes all cache entries of a device from the cache. All cache entries
390 * currently in use cannot be freed, and thus remain in the cache. All others
391 * are freed.
392 * 221 *
393 * @bdev: which device's cache entries to shrink 222 * Remove entry from cache @cache with key @key with data stored in @block.
394 */ 223 */
395void 224void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
396mb_cache_shrink(struct block_device *bdev) 225 sector_t block)
397{ 226{
398 LIST_HEAD(free_list); 227 struct hlist_bl_node *node;
399 struct list_head *l; 228 struct hlist_bl_head *head;
400 struct mb_cache_entry *ce, *tmp; 229 struct mb_cache_entry *entry;
401 230
402 l = &mb_cache_lru_list; 231 head = mb_cache_entry_head(cache, key);
403 spin_lock(&mb_cache_spinlock); 232 hlist_bl_lock(head);
404 while (!list_is_last(l, &mb_cache_lru_list)) { 233 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
405 l = l->next; 234 if (entry->e_key == key && entry->e_block == block) {
406 ce = list_entry(l, struct mb_cache_entry, e_lru_list); 235 /* We keep hash list reference to keep entry alive */
407 if (ce->e_bdev == bdev) { 236 hlist_bl_del_init(&entry->e_hash_list);
408 list_del_init(&ce->e_lru_list); 237 hlist_bl_unlock(head);
409 if (ce->e_used || ce->e_queued || 238 spin_lock(&cache->c_list_lock);
410 atomic_read(&ce->e_refcnt)) 239 if (!list_empty(&entry->e_list)) {
411 continue; 240 list_del_init(&entry->e_list);
412 spin_unlock(&mb_cache_spinlock); 241 cache->c_entry_count--;
413 /* 242 atomic_dec(&entry->e_refcnt);
414 * Prevent any find or get operation on the entry.
415 */
416 hlist_bl_lock(ce->e_block_hash_p);
417 hlist_bl_lock(ce->e_index_hash_p);
418 /* Ignore if it is touched by a find/get */
419 if (ce->e_used || ce->e_queued ||
420 atomic_read(&ce->e_refcnt) ||
421 !list_empty(&ce->e_lru_list)) {
422 hlist_bl_unlock(ce->e_index_hash_p);
423 hlist_bl_unlock(ce->e_block_hash_p);
424 l = &mb_cache_lru_list;
425 spin_lock(&mb_cache_spinlock);
426 continue;
427 } 243 }
428 __mb_cache_entry_unhash_unlock(ce); 244 spin_unlock(&cache->c_list_lock);
429 mb_assert(!(ce->e_used || ce->e_queued || 245 mb_cache_entry_put(cache, entry);
430 atomic_read(&ce->e_refcnt))); 246 return;
431 list_add_tail(&ce->e_lru_list, &free_list);
432 l = &mb_cache_lru_list;
433 spin_lock(&mb_cache_spinlock);
434 } 247 }
435 } 248 }
436 spin_unlock(&mb_cache_spinlock); 249 hlist_bl_unlock(head);
437
438 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
439 __mb_cache_entry_forget(ce, GFP_KERNEL);
440 }
441} 250}
251EXPORT_SYMBOL(mb_cache_entry_delete_block);
442 252
443 253/* mb_cache_entry_touch - cache entry got used
444/* 254 * @cache - cache the entry belongs to
445 * mb_cache_destroy() 255 * @entry - entry that got used
446 * 256 *
447 * Shrinks the cache to its minimum possible size (hopefully 0 entries), 257 * Marks entry as used to give hit higher chances of surviving in cache.
448 * and then destroys it. If this was the last mbcache, un-registers the
449 * mbcache from kernel memory management.
450 */ 258 */
451void 259void mb_cache_entry_touch(struct mb_cache *cache,
452mb_cache_destroy(struct mb_cache *cache) 260 struct mb_cache_entry *entry)
453{ 261{
454 LIST_HEAD(free_list); 262 entry->e_referenced = 1;
455 struct mb_cache_entry *ce, *tmp;
456
457 spin_lock(&mb_cache_spinlock);
458 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
459 if (ce->e_cache == cache)
460 list_move_tail(&ce->e_lru_list, &free_list);
461 }
462 list_del(&cache->c_cache_list);
463 spin_unlock(&mb_cache_spinlock);
464
465 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
466 list_del_init(&ce->e_lru_list);
467 /*
468 * Prevent any find or get operation on the entry.
469 */
470 hlist_bl_lock(ce->e_block_hash_p);
471 hlist_bl_lock(ce->e_index_hash_p);
472 mb_assert(!(ce->e_used || ce->e_queued ||
473 atomic_read(&ce->e_refcnt)));
474 __mb_cache_entry_unhash_unlock(ce);
475 __mb_cache_entry_forget(ce, GFP_KERNEL);
476 }
477
478 if (atomic_read(&cache->c_entry_count) > 0) {
479 mb_error("cache %s: %d orphaned entries",
480 cache->c_name,
481 atomic_read(&cache->c_entry_count));
482 }
483
484 if (list_empty(&mb_cache_list)) {
485 kmem_cache_destroy(mb_cache_kmem_cache);
486 mb_cache_kmem_cache = NULL;
487 }
488 kfree(cache->c_index_hash);
489 kfree(cache->c_block_hash);
490 kfree(cache);
491} 263}
264EXPORT_SYMBOL(mb_cache_entry_touch);
492 265
493/* 266static unsigned long mb_cache_count(struct shrinker *shrink,
494 * mb_cache_entry_alloc() 267 struct shrink_control *sc)
495 *
496 * Allocates a new cache entry. The new entry will not be valid initially,
497 * and thus cannot be looked up yet. It should be filled with data, and
498 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
499 * if no more memory was available.
500 */
501struct mb_cache_entry *
502mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
503{ 268{
504 struct mb_cache_entry *ce; 269 struct mb_cache *cache = container_of(shrink, struct mb_cache,
505 270 c_shrink);
506 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
507 struct list_head *l;
508
509 l = &mb_cache_lru_list;
510 spin_lock(&mb_cache_spinlock);
511 while (!list_is_last(l, &mb_cache_lru_list)) {
512 l = l->next;
513 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
514 if (ce->e_cache == cache) {
515 list_del_init(&ce->e_lru_list);
516 if (ce->e_used || ce->e_queued ||
517 atomic_read(&ce->e_refcnt))
518 continue;
519 spin_unlock(&mb_cache_spinlock);
520 /*
521 * Prevent any find or get operation on the
522 * entry.
523 */
524 hlist_bl_lock(ce->e_block_hash_p);
525 hlist_bl_lock(ce->e_index_hash_p);
526 /* Ignore if it is touched by a find/get */
527 if (ce->e_used || ce->e_queued ||
528 atomic_read(&ce->e_refcnt) ||
529 !list_empty(&ce->e_lru_list)) {
530 hlist_bl_unlock(ce->e_index_hash_p);
531 hlist_bl_unlock(ce->e_block_hash_p);
532 l = &mb_cache_lru_list;
533 spin_lock(&mb_cache_spinlock);
534 continue;
535 }
536 mb_assert(list_empty(&ce->e_lru_list));
537 mb_assert(!(ce->e_used || ce->e_queued ||
538 atomic_read(&ce->e_refcnt)));
539 __mb_cache_entry_unhash_unlock(ce);
540 goto found;
541 }
542 }
543 spin_unlock(&mb_cache_spinlock);
544 }
545 271
546 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 272 return cache->c_entry_count;
547 if (!ce)
548 return NULL;
549 atomic_inc(&cache->c_entry_count);
550 INIT_LIST_HEAD(&ce->e_lru_list);
551 INIT_HLIST_BL_NODE(&ce->e_block_list);
552 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
553 ce->e_cache = cache;
554 ce->e_queued = 0;
555 atomic_set(&ce->e_refcnt, 0);
556found:
557 ce->e_block_hash_p = &cache->c_block_hash[0];
558 ce->e_index_hash_p = &cache->c_index_hash[0];
559 ce->e_used = 1 + MB_CACHE_WRITER;
560 return ce;
561} 273}
562 274
563 275/* Shrink number of entries in cache */
564/* 276static unsigned long mb_cache_shrink(struct mb_cache *cache,
565 * mb_cache_entry_insert() 277 unsigned int nr_to_scan)
566 *
567 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
568 * the cache. After this, the cache entry can be looked up, but is not yet
569 * in the lru list as the caller still holds a handle to it. Returns 0 on
570 * success, or -EBUSY if a cache entry for that device + inode exists
571 * already (this may happen after a failed lookup, but when another process
572 * has inserted the same cache entry in the meantime).
573 *
574 * @bdev: device the cache entry belongs to
575 * @block: block number
576 * @key: lookup key
577 */
578int
579mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
580 sector_t block, unsigned int key)
581{ 278{
582 struct mb_cache *cache = ce->e_cache; 279 struct mb_cache_entry *entry;
583 unsigned int bucket; 280 struct hlist_bl_head *head;
584 struct hlist_bl_node *l; 281 unsigned int shrunk = 0;
585 struct hlist_bl_head *block_hash_p; 282
586 struct hlist_bl_head *index_hash_p; 283 spin_lock(&cache->c_list_lock);
587 struct mb_cache_entry *lce; 284 while (nr_to_scan-- && !list_empty(&cache->c_list)) {
588 285 entry = list_first_entry(&cache->c_list,
589 mb_assert(ce); 286 struct mb_cache_entry, e_list);
590 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 287 if (entry->e_referenced) {
591 cache->c_bucket_bits); 288 entry->e_referenced = 0;
592 block_hash_p = &cache->c_block_hash[bucket]; 289 list_move_tail(&cache->c_list, &entry->e_list);
593 hlist_bl_lock(block_hash_p); 290 continue;
594 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
595 if (lce->e_bdev == bdev && lce->e_block == block) {
596 hlist_bl_unlock(block_hash_p);
597 return -EBUSY;
598 } 291 }
292 list_del_init(&entry->e_list);
293 cache->c_entry_count--;
294 /*
295 * We keep LRU list reference so that entry doesn't go away
296 * from under us.
297 */
298 spin_unlock(&cache->c_list_lock);
299 head = mb_cache_entry_head(cache, entry->e_key);
300 hlist_bl_lock(head);
301 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
302 hlist_bl_del_init(&entry->e_hash_list);
303 atomic_dec(&entry->e_refcnt);
304 }
305 hlist_bl_unlock(head);
306 if (mb_cache_entry_put(cache, entry))
307 shrunk++;
308 cond_resched();
309 spin_lock(&cache->c_list_lock);
599 } 310 }
600 mb_assert(!__mb_cache_entry_is_block_hashed(ce)); 311 spin_unlock(&cache->c_list_lock);
601 __mb_cache_entry_unhash_block(ce);
602 __mb_cache_entry_unhash_index(ce);
603 ce->e_bdev = bdev;
604 ce->e_block = block;
605 ce->e_block_hash_p = block_hash_p;
606 ce->e_index.o_key = key;
607 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
608 hlist_bl_unlock(block_hash_p);
609 bucket = hash_long(key, cache->c_bucket_bits);
610 index_hash_p = &cache->c_index_hash[bucket];
611 hlist_bl_lock(index_hash_p);
612 ce->e_index_hash_p = index_hash_p;
613 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
614 hlist_bl_unlock(index_hash_p);
615 return 0;
616}
617 312
313 return shrunk;
314}
618 315
619/* 316static unsigned long mb_cache_scan(struct shrinker *shrink,
620 * mb_cache_entry_release() 317 struct shrink_control *sc)
621 *
622 * Release a handle to a cache entry. When the last handle to a cache entry
623 * is released it is either freed (if it is invalid) or otherwise inserted
624 * in to the lru list.
625 */
626void
627mb_cache_entry_release(struct mb_cache_entry *ce)
628{ 318{
629 __mb_cache_entry_release(ce); 319 int nr_to_scan = sc->nr_to_scan;
320 struct mb_cache *cache = container_of(shrink, struct mb_cache,
321 c_shrink);
322 return mb_cache_shrink(cache, nr_to_scan);
630} 323}
631 324
325/* We shrink 1/X of the cache when we have too many entries in it */
326#define SHRINK_DIVISOR 16
632 327
633/* 328static void mb_cache_shrink_worker(struct work_struct *work)
634 * mb_cache_entry_free()
635 *
636 */
637void
638mb_cache_entry_free(struct mb_cache_entry *ce)
639{ 329{
640 mb_assert(ce); 330 struct mb_cache *cache = container_of(work, struct mb_cache,
641 mb_assert(list_empty(&ce->e_lru_list)); 331 c_shrink_work);
642 hlist_bl_lock(ce->e_index_hash_p); 332 mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
643 __mb_cache_entry_unhash_index(ce);
644 hlist_bl_unlock(ce->e_index_hash_p);
645 hlist_bl_lock(ce->e_block_hash_p);
646 __mb_cache_entry_unhash_block(ce);
647 hlist_bl_unlock(ce->e_block_hash_p);
648 __mb_cache_entry_release(ce);
649} 333}
650 334
651
652/* 335/*
653 * mb_cache_entry_get() 336 * mb_cache_create - create cache
337 * @bucket_bits: log2 of the hash table size
654 * 338 *
655 * Get a cache entry by device / block number. (There can only be one entry 339 * Create cache for keys with 2^bucket_bits hash entries.
656 * in the cache per device and block.) Returns NULL if no such cache entry
657 * exists. The returned cache entry is locked for exclusive access ("single
658 * writer").
659 */ 340 */
660struct mb_cache_entry * 341struct mb_cache *mb_cache_create(int bucket_bits)
661mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
662 sector_t block)
663{ 342{
664 unsigned int bucket; 343 struct mb_cache *cache;
665 struct hlist_bl_node *l; 344 int bucket_count = 1 << bucket_bits;
666 struct mb_cache_entry *ce; 345 int i;
667 struct hlist_bl_head *block_hash_p;
668
669 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
670 cache->c_bucket_bits);
671 block_hash_p = &cache->c_block_hash[bucket];
672 /* First serialize access to the block corresponding hash chain. */
673 hlist_bl_lock(block_hash_p);
674 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
675 mb_assert(ce->e_block_hash_p == block_hash_p);
676 if (ce->e_bdev == bdev && ce->e_block == block) {
677 /*
678 * Prevent a free from removing the entry.
679 */
680 atomic_inc(&ce->e_refcnt);
681 hlist_bl_unlock(block_hash_p);
682 __spin_lock_mb_cache_entry(ce);
683 atomic_dec(&ce->e_refcnt);
684 if (ce->e_used > 0) {
685 DEFINE_WAIT(wait);
686 while (ce->e_used > 0) {
687 ce->e_queued++;
688 prepare_to_wait(&mb_cache_queue, &wait,
689 TASK_UNINTERRUPTIBLE);
690 __spin_unlock_mb_cache_entry(ce);
691 schedule();
692 __spin_lock_mb_cache_entry(ce);
693 ce->e_queued--;
694 }
695 finish_wait(&mb_cache_queue, &wait);
696 }
697 ce->e_used += 1 + MB_CACHE_WRITER;
698 __spin_unlock_mb_cache_entry(ce);
699 346
700 if (!list_empty(&ce->e_lru_list)) { 347 if (!try_module_get(THIS_MODULE))
701 spin_lock(&mb_cache_spinlock); 348 return NULL;
702 list_del_init(&ce->e_lru_list); 349
703 spin_unlock(&mb_cache_spinlock); 350 cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
704 } 351 if (!cache)
705 if (!__mb_cache_entry_is_block_hashed(ce)) { 352 goto err_out;
706 __mb_cache_entry_release(ce); 353 cache->c_bucket_bits = bucket_bits;
707 return NULL; 354 cache->c_max_entries = bucket_count << 4;
708 } 355 INIT_LIST_HEAD(&cache->c_list);
709 return ce; 356 spin_lock_init(&cache->c_list_lock);
710 } 357 cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
358 GFP_KERNEL);
359 if (!cache->c_hash) {
360 kfree(cache);
361 goto err_out;
711 } 362 }
712 hlist_bl_unlock(block_hash_p); 363 for (i = 0; i < bucket_count; i++)
713 return NULL; 364 INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
714}
715 365
716#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 366 cache->c_shrink.count_objects = mb_cache_count;
367 cache->c_shrink.scan_objects = mb_cache_scan;
368 cache->c_shrink.seeks = DEFAULT_SEEKS;
369 register_shrinker(&cache->c_shrink);
717 370
718static struct mb_cache_entry * 371 INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
719__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
720 struct block_device *bdev, unsigned int key)
721{
722 372
723 /* The index hash chain is alredy acquire by caller. */ 373 return cache;
724 while (l != NULL) { 374
725 struct mb_cache_entry *ce = 375err_out:
726 hlist_bl_entry(l, struct mb_cache_entry, 376 module_put(THIS_MODULE);
727 e_index.o_list);
728 mb_assert(ce->e_index_hash_p == head);
729 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
730 /*
731 * Prevent a free from removing the entry.
732 */
733 atomic_inc(&ce->e_refcnt);
734 hlist_bl_unlock(head);
735 __spin_lock_mb_cache_entry(ce);
736 atomic_dec(&ce->e_refcnt);
737 ce->e_used++;
738 /* Incrementing before holding the lock gives readers
739 priority over writers. */
740 if (ce->e_used >= MB_CACHE_WRITER) {
741 DEFINE_WAIT(wait);
742
743 while (ce->e_used >= MB_CACHE_WRITER) {
744 ce->e_queued++;
745 prepare_to_wait(&mb_cache_queue, &wait,
746 TASK_UNINTERRUPTIBLE);
747 __spin_unlock_mb_cache_entry(ce);
748 schedule();
749 __spin_lock_mb_cache_entry(ce);
750 ce->e_queued--;
751 }
752 finish_wait(&mb_cache_queue, &wait);
753 }
754 __spin_unlock_mb_cache_entry(ce);
755 if (!list_empty(&ce->e_lru_list)) {
756 spin_lock(&mb_cache_spinlock);
757 list_del_init(&ce->e_lru_list);
758 spin_unlock(&mb_cache_spinlock);
759 }
760 if (!__mb_cache_entry_is_block_hashed(ce)) {
761 __mb_cache_entry_release(ce);
762 return ERR_PTR(-EAGAIN);
763 }
764 return ce;
765 }
766 l = l->next;
767 }
768 hlist_bl_unlock(head);
769 return NULL; 377 return NULL;
770} 378}
771 379EXPORT_SYMBOL(mb_cache_create);
772 380
773/* 381/*
774 * mb_cache_entry_find_first() 382 * mb_cache_destroy - destroy cache
775 * 383 * @cache: the cache to destroy
776 * Find the first cache entry on a given device with a certain key in
777 * an additional index. Additional matches can be found with
778 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
779 * returned cache entry is locked for shared access ("multiple readers").
780 * 384 *
781 * @cache: the cache to search 385 * Free all entries in cache and cache itself. Caller must make sure nobody
782 * @bdev: the device the cache entry should belong to 386 * (except shrinker) can reach @cache when calling this.
783 * @key: the key in the index
784 */ 387 */
785struct mb_cache_entry * 388void mb_cache_destroy(struct mb_cache *cache)
786mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
787 unsigned int key)
788{ 389{
789 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 390 struct mb_cache_entry *entry, *next;
790 struct hlist_bl_node *l;
791 struct mb_cache_entry *ce = NULL;
792 struct hlist_bl_head *index_hash_p;
793
794 index_hash_p = &cache->c_index_hash[bucket];
795 hlist_bl_lock(index_hash_p);
796 if (!hlist_bl_empty(index_hash_p)) {
797 l = hlist_bl_first(index_hash_p);
798 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
799 } else
800 hlist_bl_unlock(index_hash_p);
801 return ce;
802}
803 391
392 unregister_shrinker(&cache->c_shrink);
804 393
805/* 394 /*
806 * mb_cache_entry_find_next() 395 * We don't bother with any locking. Cache must not be used at this
807 * 396 * point.
808 * Find the next cache entry on a given device with a certain key in an 397 */
809 * additional index. Returns NULL if no match could be found. The previous 398 list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
810 * entry is atomatically released, so that mb_cache_entry_find_next() can 399 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
811 * be called like this: 400 hlist_bl_del_init(&entry->e_hash_list);
812 * 401 atomic_dec(&entry->e_refcnt);
813 * entry = mb_cache_entry_find_first(); 402 } else
814 * while (entry) { 403 WARN_ON(1);
815 * ... 404 list_del(&entry->e_list);
816 * entry = mb_cache_entry_find_next(entry, ...); 405 WARN_ON(atomic_read(&entry->e_refcnt) != 1);
817 * } 406 mb_cache_entry_put(cache, entry);
818 * 407 }
819 * @prev: The previous match 408 kfree(cache->c_hash);
820 * @bdev: the device the cache entry should belong to 409 kfree(cache);
821 * @key: the key in the index 410 module_put(THIS_MODULE);
822 */
823struct mb_cache_entry *
824mb_cache_entry_find_next(struct mb_cache_entry *prev,
825 struct block_device *bdev, unsigned int key)
826{
827 struct mb_cache *cache = prev->e_cache;
828 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
829 struct hlist_bl_node *l;
830 struct mb_cache_entry *ce;
831 struct hlist_bl_head *index_hash_p;
832
833 index_hash_p = &cache->c_index_hash[bucket];
834 mb_assert(prev->e_index_hash_p == index_hash_p);
835 hlist_bl_lock(index_hash_p);
836 mb_assert(!hlist_bl_empty(index_hash_p));
837 l = prev->e_index.o_list.next;
838 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
839 __mb_cache_entry_release(prev);
840 return ce;
841} 411}
412EXPORT_SYMBOL(mb_cache_destroy);
842 413
843#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ 414static int __init mbcache_init(void)
844
845static int __init init_mbcache(void)
846{ 415{
847 register_shrinker(&mb_cache_shrinker); 416 mb_entry_cache = kmem_cache_create("mbcache",
417 sizeof(struct mb_cache_entry), 0,
418 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
419 BUG_ON(!mb_entry_cache);
848 return 0; 420 return 0;
849} 421}
850 422
851static void __exit exit_mbcache(void) 423static void __exit mbcache_exit(void)
852{ 424{
853 unregister_shrinker(&mb_cache_shrinker); 425 kmem_cache_destroy(mb_entry_cache);
854} 426}
855 427
856module_init(init_mbcache) 428module_init(mbcache_init)
857module_exit(exit_mbcache) 429module_exit(mbcache_exit)
858 430
431MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
432MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
433MODULE_LICENSE("GPL");