aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_mru_cache.c
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2007-07-10 21:09:12 -0400
committerTim Shimmin <tes@chook.melbourne.sgi.com>2007-07-14 01:40:53 -0400
commit2a82b8be8a8dacb48cb7371449a7a9daa558b4a8 (patch)
tree44e6a81dd0e7d7dc634e04b9230b5262a254c5ee /fs/xfs/xfs_mru_cache.c
parent0892ccd6fe13e08ad9e57007afbb78fe02d66005 (diff)
[XFS] Concurrent Multi-File Data Streams
In media spaces, video is often stored in a frame-per-file format. When dealing with uncompressed realtime HD video streams in this format, it is crucial that files do not get fragmented and that multiple files a placed contiguously on disk. When multiple streams are being ingested and played out at the same time, it is critical that the filesystem does not cross the streams and interleave them together as this creates seek and readahead cache miss latency and prevents both ingest and playout from meeting frame rate targets. This patch set creates a "stream of files" concept into the allocator to place all the data from a single stream contiguously on disk so that RAID array readahead can be used effectively. Each additional stream gets placed in different allocation groups within the filesystem, thereby ensuring that we don't cross any streams. When an AG fills up, we select a new AG for the stream that is not in use. The core of the functionality is the stream tracking - each inode that we create in a directory needs to be associated with the directories' stream. Hence every time we create a file, we look up the directories' stream object and associate the new file with that object. Once we have a stream object for a file, we use the AG that the stream object point to for allocations. If we can't allocate in that AG (e.g. it is full) we move the entire stream to another AG. Other inodes in the same stream are moved to the new AG on their next allocation (i.e. lazy update). Stream objects are kept in a cache and hold a reference on the inode. Hence the inode cannot be reclaimed while there is an outstanding stream reference. This means that on unlink we need to remove the stream association and we also need to flush all the associations on certain events that want to reclaim all unreferenced inodes (e.g. filesystem freeze). SGI-PV: 964469 SGI-Modid: xfs-linux-melb:xfs-kern:29096a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Barry Naujok <bnaujok@sgi.com> Signed-off-by: Donald Douwsma <donaldd@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com> Signed-off-by: Vlad Apostolov <vapo@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_mru_cache.c')
-rw-r--r--fs/xfs/xfs_mru_cache.c608
1 files changed, 608 insertions, 0 deletions
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 000000000000..7deb9e3cbbd3
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,608 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_mru_cache.h"
20
21/*
22 * The MRU Cache data structure consists of a data store, an array of lists and
23 * a lock to protect its internal state. At initialisation time, the client
24 * supplies an element lifetime in milliseconds and a group count, as well as a
25 * function pointer to call when deleting elements. A data structure for
26 * queueing up work in the form of timed callbacks is also included.
27 *
28 * The group count controls how many lists are created, and thereby how finely
29 * the elements are grouped in time. When reaping occurs, all the elements in
30 * all the lists whose time has expired are deleted.
31 *
32 * To give an example of how this works in practice, consider a client that
33 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
34 * five. Five internal lists will be created, each representing a two second
35 * period in time. When the first element is added, time zero for the data
36 * structure is initialised to the current time.
37 *
38 * All the elements added in the first two seconds are appended to the first
39 * list. Elements added in the third second go into the second list, and so on.
40 * If an element is accessed at any point, it is removed from its list and
41 * inserted at the head of the current most-recently-used list.
42 *
43 * The reaper function will have nothing to do until at least twelve seconds
44 * have elapsed since the first element was added. The reason for this is that
45 * if it were called at t=11s, there could be elements in the first list that
46 * have only been inactive for nine seconds, so it still does nothing. If it is
47 * called anywhere between t=12 and t=14 seconds, it will delete all the
48 * elements that remain in the first list. It's therefore possible for elements
49 * to remain in the data store even after they've been inactive for up to
50 * (t + t/g) seconds, where t is the inactive element lifetime and g is the
51 * number of groups.
52 *
53 * The above example assumes that the reaper function gets called at least once
54 * every (t/g) seconds. If it is called less frequently, unused elements will
55 * accumulate in the reap list until the reaper function is eventually called.
56 * The current implementation uses work queue callbacks to carefully time the
57 * reaper function calls, so this should happen rarely, if at all.
58 *
59 * From a design perspective, the primary reason for the choice of a list array
60 * representing discrete time intervals is that it's only practical to reap
61 * expired elements in groups of some appreciable size. This automatically
62 * introduces a granularity to element lifetimes, so there's no point storing an
63 * individual timeout with each element that specifies a more precise reap time.
64 * The bonus is a saving of sizeof(long) bytes of memory per element stored.
65 *
66 * The elements could have been stored in just one list, but an array of
67 * counters or pointers would need to be maintained to allow them to be divided
68 * up into discrete time groups. More critically, the process of touching or
69 * removing an element would involve walking large portions of the entire list,
70 * which would have a detrimental effect on performance. The additional memory
71 * requirement for the array of list heads is minimal.
72 *
73 * When an element is touched or deleted, it needs to be removed from its
74 * current list. Doubly linked lists are used to make the list maintenance
75 * portion of these operations O(1). Since reaper timing can be imprecise,
76 * inserts and lookups can occur when there are no free lists available. When
77 * this happens, all the elements on the LRU list need to be migrated to the end
78 * of the reap list. To keep the list maintenance portion of these operations
79 * O(1) also, list tails need to be accessible without walking the entire list.
80 * This is the reason why doubly linked list heads are used.
81 */
82
83/*
84 * An MRU Cache is a dynamic data structure that stores its elements in a way
85 * that allows efficient lookups, but also groups them into discrete time
86 * intervals based on insertion time. This allows elements to be efficiently
87 * and automatically reaped after a fixed period of inactivity.
88 *
89 * When a client data pointer is stored in the MRU Cache it needs to be added to
90 * both the data store and to one of the lists. It must also be possible to
91 * access each of these entries via the other, i.e. to:
92 *
93 * a) Walk a list, removing the corresponding data store entry for each item.
94 * b) Look up a data store entry, then access its list entry directly.
95 *
96 * To achieve both of these goals, each entry must contain both a list entry and
97 * a key, in addition to the user's data pointer. Note that it's not a good
98 * idea to have the client embed one of these structures at the top of their own
99 * data structure, because inserting the same item more than once would most
100 * likely result in a loop in one of the lists. That's a sure-fire recipe for
101 * an infinite loop in the code.
102 */
103typedef struct xfs_mru_cache_elem
104{
105 struct list_head list_node;
106 unsigned long key;
107 void *value;
108} xfs_mru_cache_elem_t;
109
110static kmem_zone_t *xfs_mru_elem_zone;
111static struct workqueue_struct *xfs_mru_reap_wq;
112
113/*
114 * When inserting, destroying or reaping, it's first necessary to update the
115 * lists relative to a particular time. In the case of destroying, that time
116 * will be well in the future to ensure that all items are moved to the reap
117 * list. In all other cases though, the time will be the current time.
118 *
119 * This function enters a loop, moving the contents of the LRU list to the reap
120 * list again and again until either a) the lists are all empty, or b) time zero
121 * has been advanced sufficiently to be within the immediate element lifetime.
122 *
123 * Case a) above is detected by counting how many groups are migrated and
124 * stopping when they've all been moved. Case b) is detected by monitoring the
125 * time_zero field, which is updated as each group is migrated.
126 *
127 * The return value is the earliest time that more migration could be needed, or
128 * zero if there's no need to schedule more work because the lists are empty.
129 */
130STATIC unsigned long
131_xfs_mru_cache_migrate(
132 xfs_mru_cache_t *mru,
133 unsigned long now)
134{
135 unsigned int grp;
136 unsigned int migrated = 0;
137 struct list_head *lru_list;
138
139 /* Nothing to do if the data store is empty. */
140 if (!mru->time_zero)
141 return 0;
142
143 /* While time zero is older than the time spanned by all the lists. */
144 while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
145
146 /*
147 * If the LRU list isn't empty, migrate its elements to the tail
148 * of the reap list.
149 */
150 lru_list = mru->lists + mru->lru_grp;
151 if (!list_empty(lru_list))
152 list_splice_init(lru_list, mru->reap_list.prev);
153
154 /*
155 * Advance the LRU group number, freeing the old LRU list to
156 * become the new MRU list; advance time zero accordingly.
157 */
158 mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
159 mru->time_zero += mru->grp_time;
160
161 /*
162 * If reaping is so far behind that all the elements on all the
163 * lists have been migrated to the reap list, it's now empty.
164 */
165 if (++migrated == mru->grp_count) {
166 mru->lru_grp = 0;
167 mru->time_zero = 0;
168 return 0;
169 }
170 }
171
172 /* Find the first non-empty list from the LRU end. */
173 for (grp = 0; grp < mru->grp_count; grp++) {
174
175 /* Check the grp'th list from the LRU end. */
176 lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
177 if (!list_empty(lru_list))
178 return mru->time_zero +
179 (mru->grp_count + grp) * mru->grp_time;
180 }
181
182 /* All the lists must be empty. */
183 mru->lru_grp = 0;
184 mru->time_zero = 0;
185 return 0;
186}
187
188/*
189 * When inserting or doing a lookup, an element needs to be inserted into the
190 * MRU list. The lists must be migrated first to ensure that they're
191 * up-to-date, otherwise the new element could be given a shorter lifetime in
192 * the cache than it should.
193 */
194STATIC void
195_xfs_mru_cache_list_insert(
196 xfs_mru_cache_t *mru,
197 xfs_mru_cache_elem_t *elem)
198{
199 unsigned int grp = 0;
200 unsigned long now = jiffies;
201
202 /*
203 * If the data store is empty, initialise time zero, leave grp set to
204 * zero and start the work queue timer if necessary. Otherwise, set grp
205 * to the number of group times that have elapsed since time zero.
206 */
207 if (!_xfs_mru_cache_migrate(mru, now)) {
208 mru->time_zero = now;
209 if (!mru->next_reap)
210 mru->next_reap = mru->grp_count * mru->grp_time;
211 } else {
212 grp = (now - mru->time_zero) / mru->grp_time;
213 grp = (mru->lru_grp + grp) % mru->grp_count;
214 }
215
216 /* Insert the element at the tail of the corresponding list. */
217 list_add_tail(&elem->list_node, mru->lists + grp);
218}
219
220/*
221 * When destroying or reaping, all the elements that were migrated to the reap
222 * list need to be deleted. For each element this involves removing it from the
223 * data store, removing it from the reap list, calling the client's free
224 * function and deleting the element from the element zone.
225 */
226STATIC void
227_xfs_mru_cache_clear_reap_list(
228 xfs_mru_cache_t *mru)
229{
230 xfs_mru_cache_elem_t *elem, *next;
231 struct list_head tmp;
232
233 INIT_LIST_HEAD(&tmp);
234 list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
235
236 /* Remove the element from the data store. */
237 radix_tree_delete(&mru->store, elem->key);
238
239 /*
240 * remove to temp list so it can be freed without
241 * needing to hold the lock
242 */
243 list_move(&elem->list_node, &tmp);
244 }
245 mutex_spinunlock(&mru->lock, 0);
246
247 list_for_each_entry_safe(elem, next, &tmp, list_node) {
248
249 /* Remove the element from the reap list. */
250 list_del_init(&elem->list_node);
251
252 /* Call the client's free function with the key and value pointer. */
253 mru->free_func(elem->key, elem->value);
254
255 /* Free the element structure. */
256 kmem_zone_free(xfs_mru_elem_zone, elem);
257 }
258
259 mutex_spinlock(&mru->lock);
260}
261
262/*
263 * We fire the reap timer every group expiry interval so
264 * we always have a reaper ready to run. This makes shutdown
265 * and flushing of the reaper easy to do. Hence we need to
266 * keep when the next reap must occur so we can determine
267 * at each interval whether there is anything we need to do.
268 */
269STATIC void
270_xfs_mru_cache_reap(
271 struct work_struct *work)
272{
273 xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
274 unsigned long now;
275
276 ASSERT(mru && mru->lists);
277 if (!mru || !mru->lists)
278 return;
279
280 mutex_spinlock(&mru->lock);
281 now = jiffies;
282 if (mru->reap_all ||
283 (mru->next_reap && time_after(now, mru->next_reap))) {
284 if (mru->reap_all)
285 now += mru->grp_count * mru->grp_time * 2;
286 mru->next_reap = _xfs_mru_cache_migrate(mru, now);
287 _xfs_mru_cache_clear_reap_list(mru);
288 }
289
290 /*
291 * the process that triggered the reap_all is responsible
292 * for restating the periodic reap if it is required.
293 */
294 if (!mru->reap_all)
295 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
296 mru->reap_all = 0;
297 mutex_spinunlock(&mru->lock, 0);
298}
299
300int
301xfs_mru_cache_init(void)
302{
303 xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
304 "xfs_mru_cache_elem");
305 if (!xfs_mru_elem_zone)
306 return ENOMEM;
307
308 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
309 if (!xfs_mru_reap_wq) {
310 kmem_zone_destroy(xfs_mru_elem_zone);
311 return ENOMEM;
312 }
313
314 return 0;
315}
316
317void
318xfs_mru_cache_uninit(void)
319{
320 destroy_workqueue(xfs_mru_reap_wq);
321 kmem_zone_destroy(xfs_mru_elem_zone);
322}
323
324/*
325 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
326 * with the address of the pointer, a lifetime value in milliseconds, a group
327 * count and a free function to use when deleting elements. This function
328 * returns 0 if the initialisation was successful.
329 */
330int
331xfs_mru_cache_create(
332 xfs_mru_cache_t **mrup,
333 unsigned int lifetime_ms,
334 unsigned int grp_count,
335 xfs_mru_cache_free_func_t free_func)
336{
337 xfs_mru_cache_t *mru = NULL;
338 int err = 0, grp;
339 unsigned int grp_time;
340
341 if (mrup)
342 *mrup = NULL;
343
344 if (!mrup || !grp_count || !lifetime_ms || !free_func)
345 return EINVAL;
346
347 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
348 return EINVAL;
349
350 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
351 return ENOMEM;
352
353 /* An extra list is needed to avoid reaping up to a grp_time early. */
354 mru->grp_count = grp_count + 1;
355 mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
356
357 if (!mru->lists) {
358 err = ENOMEM;
359 goto exit;
360 }
361
362 for (grp = 0; grp < mru->grp_count; grp++)
363 INIT_LIST_HEAD(mru->lists + grp);
364
365 /*
366 * We use GFP_KERNEL radix tree preload and do inserts under a
367 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
368 */
369 INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
370 INIT_LIST_HEAD(&mru->reap_list);
371 spinlock_init(&mru->lock, "xfs_mru_cache");
372 INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
373
374 mru->grp_time = grp_time;
375 mru->free_func = free_func;
376
377 /* start up the reaper event */
378 mru->next_reap = 0;
379 mru->reap_all = 0;
380 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
381
382 *mrup = mru;
383
384exit:
385 if (err && mru && mru->lists)
386 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
387 if (err && mru)
388 kmem_free(mru, sizeof(*mru));
389
390 return err;
391}
392
393/*
394 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
395 * free functions as they're deleted. When this function returns, the caller is
396 * guaranteed that all the free functions for all the elements have finished
397 * executing.
398 *
399 * While we are flushing, we stop the periodic reaper event from triggering.
400 * Normally, we want to restart this periodic event, but if we are shutting
401 * down the cache we do not want it restarted. hence the restart parameter
402 * where 0 = do not restart reaper and 1 = restart reaper.
403 */
404void
405xfs_mru_cache_flush(
406 xfs_mru_cache_t *mru,
407 int restart)
408{
409 if (!mru || !mru->lists)
410 return;
411
412 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
413
414 mutex_spinlock(&mru->lock);
415 mru->reap_all = 1;
416 mutex_spinunlock(&mru->lock, 0);
417
418 queue_work(xfs_mru_reap_wq, &mru->work.work);
419 flush_workqueue(xfs_mru_reap_wq);
420
421 mutex_spinlock(&mru->lock);
422 WARN_ON_ONCE(mru->reap_all != 0);
423 mru->reap_all = 0;
424 if (restart)
425 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
426 mutex_spinunlock(&mru->lock, 0);
427}
428
429void
430xfs_mru_cache_destroy(
431 xfs_mru_cache_t *mru)
432{
433 if (!mru || !mru->lists)
434 return;
435
436 /* we don't want the reaper to restart here */
437 xfs_mru_cache_flush(mru, 0);
438
439 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
440 kmem_free(mru, sizeof(*mru));
441}
442
443/*
444 * To insert an element, call xfs_mru_cache_insert() with the data store, the
445 * element's key and the client data pointer. This function returns 0 on
446 * success or ENOMEM if memory for the data element couldn't be allocated.
447 */
448int
449xfs_mru_cache_insert(
450 xfs_mru_cache_t *mru,
451 unsigned long key,
452 void *value)
453{
454 xfs_mru_cache_elem_t *elem;
455
456 ASSERT(mru && mru->lists);
457 if (!mru || !mru->lists)
458 return EINVAL;
459
460 elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
461 if (!elem)
462 return ENOMEM;
463
464 if (radix_tree_preload(GFP_KERNEL)) {
465 kmem_zone_free(xfs_mru_elem_zone, elem);
466 return ENOMEM;
467 }
468
469 INIT_LIST_HEAD(&elem->list_node);
470 elem->key = key;
471 elem->value = value;
472
473 mutex_spinlock(&mru->lock);
474
475 radix_tree_insert(&mru->store, key, elem);
476 radix_tree_preload_end();
477 _xfs_mru_cache_list_insert(mru, elem);
478
479 mutex_spinunlock(&mru->lock, 0);
480
481 return 0;
482}
483
484/*
485 * To remove an element without calling the free function, call
486 * xfs_mru_cache_remove() with the data store and the element's key. On success
487 * the client data pointer for the removed element is returned, otherwise this
488 * function will return a NULL pointer.
489 */
490void *
491xfs_mru_cache_remove(
492 xfs_mru_cache_t *mru,
493 unsigned long key)
494{
495 xfs_mru_cache_elem_t *elem;
496 void *value = NULL;
497
498 ASSERT(mru && mru->lists);
499 if (!mru || !mru->lists)
500 return NULL;
501
502 mutex_spinlock(&mru->lock);
503 elem = radix_tree_delete(&mru->store, key);
504 if (elem) {
505 value = elem->value;
506 list_del(&elem->list_node);
507 }
508
509 mutex_spinunlock(&mru->lock, 0);
510
511 if (elem)
512 kmem_zone_free(xfs_mru_elem_zone, elem);
513
514 return value;
515}
516
517/*
518 * To remove and element and call the free function, call xfs_mru_cache_delete()
519 * with the data store and the element's key.
520 */
521void
522xfs_mru_cache_delete(
523 xfs_mru_cache_t *mru,
524 unsigned long key)
525{
526 void *value = xfs_mru_cache_remove(mru, key);
527
528 if (value)
529 mru->free_func(key, value);
530}
531
532/*
533 * To look up an element using its key, call xfs_mru_cache_lookup() with the
534 * data store and the element's key. If found, the element will be moved to the
535 * head of the MRU list to indicate that it's been touched.
536 *
537 * The internal data structures are protected by a spinlock that is STILL HELD
538 * when this function returns. Call xfs_mru_cache_done() to release it. Note
539 * that it is not safe to call any function that might sleep in the interim.
540 *
541 * The implementation could have used reference counting to avoid this
542 * restriction, but since most clients simply want to get, set or test a member
543 * of the returned data structure, the extra per-element memory isn't warranted.
544 *
545 * If the element isn't found, this function returns NULL and the spinlock is
546 * released. xfs_mru_cache_done() should NOT be called when this occurs.
547 */
548void *
549xfs_mru_cache_lookup(
550 xfs_mru_cache_t *mru,
551 unsigned long key)
552{
553 xfs_mru_cache_elem_t *elem;
554
555 ASSERT(mru && mru->lists);
556 if (!mru || !mru->lists)
557 return NULL;
558
559 mutex_spinlock(&mru->lock);
560 elem = radix_tree_lookup(&mru->store, key);
561 if (elem) {
562 list_del(&elem->list_node);
563 _xfs_mru_cache_list_insert(mru, elem);
564 }
565 else
566 mutex_spinunlock(&mru->lock, 0);
567
568 return elem ? elem->value : NULL;
569}
570
571/*
572 * To look up an element using its key, but leave its location in the internal
573 * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
574 * function returns NULL.
575 *
576 * See the comments above the declaration of the xfs_mru_cache_lookup() function
577 * for important locking information pertaining to this call.
578 */
579void *
580xfs_mru_cache_peek(
581 xfs_mru_cache_t *mru,
582 unsigned long key)
583{
584 xfs_mru_cache_elem_t *elem;
585
586 ASSERT(mru && mru->lists);
587 if (!mru || !mru->lists)
588 return NULL;
589
590 mutex_spinlock(&mru->lock);
591 elem = radix_tree_lookup(&mru->store, key);
592 if (!elem)
593 mutex_spinunlock(&mru->lock, 0);
594
595 return elem ? elem->value : NULL;
596}
597
598/*
599 * To release the internal data structure spinlock after having performed an
600 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
601 * with the data store pointer.
602 */
603void
604xfs_mru_cache_done(
605 xfs_mru_cache_t *mru)
606{
607 mutex_spinunlock(&mru->lock, 0);
608}