aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2014-04-22 17:11:51 -0400
committerDave Chinner <david@fromorbit.com>2014-04-22 17:11:51 -0400
commit2cd2ef6a300b1ac912bb515b75451585c3d33ea9 (patch)
treecae34ff7663a57aec71cea7b2a9b3555d70b32c4
parentf37211c336d722805493aec8b13afdbb92bbfd98 (diff)
xfs: rewrite the filestream allocator using the dentry cache
In Linux we will always be able to find a parent inode for files that are undergoing I/O. Use this to simplify the file stream allocator by only keeping track of parent inodes. Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--fs/xfs/xfs_filestream.c660
-rw-r--r--fs/xfs/xfs_filestream.h31
-rw-r--r--fs/xfs/xfs_inode.c24
3 files changed, 171 insertions, 544 deletions
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c422110c7c43..ff6f90215c8a 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc. 2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * Copyright (c) 2014 Christoph Hellwig.
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -32,101 +33,28 @@
32#include "xfs_filestream.h" 33#include "xfs_filestream.h"
33#include "xfs_trace.h" 34#include "xfs_trace.h"
34 35
35#ifdef XFS_FILESTREAMS_TRACE
36
37ktrace_t *xfs_filestreams_trace_buf;
38
39STATIC void
40xfs_filestreams_trace(
41 xfs_mount_t *mp, /* mount point */
42 int type, /* type of trace */
43 const char *func, /* source function */
44 int line, /* source line number */
45 __psunsigned_t arg0,
46 __psunsigned_t arg1,
47 __psunsigned_t arg2,
48 __psunsigned_t arg3,
49 __psunsigned_t arg4,
50 __psunsigned_t arg5)
51{
52 ktrace_enter(xfs_filestreams_trace_buf,
53 (void *)(__psint_t)(type | (line << 16)),
54 (void *)func,
55 (void *)(__psunsigned_t)current_pid(),
56 (void *)mp,
57 (void *)(__psunsigned_t)arg0,
58 (void *)(__psunsigned_t)arg1,
59 (void *)(__psunsigned_t)arg2,
60 (void *)(__psunsigned_t)arg3,
61 (void *)(__psunsigned_t)arg4,
62 (void *)(__psunsigned_t)arg5,
63 NULL, NULL, NULL, NULL, NULL, NULL);
64}
65
66#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
67#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
68#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
69#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
70#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
71#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
72#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
73 xfs_filestreams_trace(mp, t, __func__, __LINE__, \
74 (__psunsigned_t)a0, (__psunsigned_t)a1, \
75 (__psunsigned_t)a2, (__psunsigned_t)a3, \
76 (__psunsigned_t)a4, (__psunsigned_t)a5)
77
78#define TRACE_AG_SCAN(mp, ag, ag2) \
79 TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
80#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
81 TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
82#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
83 TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
84 cnt, free, scan, flag)
85#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
86 TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
87#define TRACE_FREE(mp, ip, pip, ag, cnt) \
88 TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
89#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
90 TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
91#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
92 TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
93#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
94 TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
95#define TRACE_ORPHAN(mp, ip, ag) \
96 TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
97
98
99#else
100#define TRACE_AG_SCAN(mp, ag, ag2) 36#define TRACE_AG_SCAN(mp, ag, ag2)
101#define TRACE_AG_PICK1(mp, max_ag, maxfree) 37#define TRACE_AG_PICK1(mp, max_ag, maxfree)
102#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) 38#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
103#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
104#define TRACE_FREE(mp, ip, pip, ag, cnt) 39#define TRACE_FREE(mp, ip, pip, ag, cnt)
105#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) 40#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
106#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
107#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
108#define TRACE_ORPHAN(mp, ip, ag)
109#endif
110 41
111static kmem_zone_t *item_zone; 42static kmem_zone_t *item_zone;
112 43
113/* 44struct xfs_fstrm_item {
114 * Structure for associating a file or a directory with an allocation group. 45 struct xfs_mru_cache_elem mru;
115 * The parent directory pointer is only needed for files, but since there will 46 struct xfs_inode *ip;
116 * generally be vastly more files than directories in the cache, using the same 47 xfs_agnumber_t ag; /* AG in use for this directory */
117 * data structure simplifies the code with very little memory overhead. 48};
118 */ 49
119typedef struct fstrm_item 50enum xfs_fstrm_alloc {
120{ 51 XFS_PICK_USERDATA = 1,
121 struct xfs_mru_cache_elem mru; 52 XFS_PICK_LOWSPACE = 2,
122 xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ 53};
123 xfs_inode_t *ip; /* inode self-pointer. */
124 xfs_inode_t *pip; /* Parent directory inode pointer. */
125} fstrm_item_t;
126 54
127/* 55/*
128 * Allocation group filestream associations are tracked with per-ag atomic 56 * Allocation group filestream associations are tracked with per-ag atomic
129 * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a 57 * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
130 * particular AG already has active filestreams associated with it. The mount 58 * particular AG already has active filestreams associated with it. The mount
131 * point's m_peraglock is used to protect these counters from per-ag array 59 * point's m_peraglock is used to protect these counters from per-ag array
132 * re-allocation during a growfs operation. When xfs_growfs_data_private() is 60 * re-allocation during a growfs operation. When xfs_growfs_data_private() is
@@ -201,23 +129,42 @@ xfs_filestream_put_ag(
201 xfs_perag_put(pag); 129 xfs_perag_put(pag);
202} 130}
203 131
132static void
133xfs_fstrm_free_func(
134 struct xfs_mru_cache_elem *mru)
135{
136 struct xfs_fstrm_item *item =
137 container_of(mru, struct xfs_fstrm_item, mru);
138
139 xfs_filestream_put_ag(item->ip->i_mount, item->ag);
140
141 TRACE_FREE(mp, ip, NULL, item->ag,
142 xfs_filestream_peek_ag(mp, item->ag));
143
144 kmem_zone_free(item_zone, item);
145}
146
204/* 147/*
205 * Scan the AGs starting at startag looking for an AG that isn't in use and has 148 * Scan the AGs starting at startag looking for an AG that isn't in use and has
206 * at least minlen blocks free. 149 * at least minlen blocks free.
207 */ 150 */
208static int 151static int
209_xfs_filestream_pick_ag( 152xfs_filestream_pick_ag(
210 xfs_mount_t *mp, 153 struct xfs_inode *ip,
211 xfs_agnumber_t startag, 154 xfs_agnumber_t startag,
212 xfs_agnumber_t *agp, 155 xfs_agnumber_t *agp,
213 int flags, 156 int flags,
214 xfs_extlen_t minlen) 157 xfs_extlen_t minlen)
215{ 158{
216 int streams, max_streams; 159 struct xfs_mount *mp = ip->i_mount;
217 int err, trylock, nscan; 160 struct xfs_fstrm_item *item;
218 xfs_extlen_t longest, free, minfree, maxfree = 0; 161 struct xfs_perag *pag;
219 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 162 xfs_extlen_t longest, free, minfree, maxfree = 0;
220 struct xfs_perag *pag; 163 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
164 int streams, max_streams;
165 int err, trylock, nscan;
166
167 ASSERT(S_ISDIR(ip->i_d.di_mode));
221 168
222 /* 2% of an AG's blocks must be free for it to be chosen. */ 169 /* 2% of an AG's blocks must be free for it to be chosen. */
223 minfree = mp->m_sb.sb_agblocks / 50; 170 minfree = mp->m_sb.sb_agblocks / 50;
@@ -321,205 +268,55 @@ next_ag:
321 268
322 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); 269 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
323 270
324 return 0; 271 if (*agp == NULLAGNUMBER)
325}
326
327/*
328 * Set the allocation group number for a file or a directory, updating inode
329 * references and per-AG references as appropriate.
330 */
331static int
332_xfs_filestream_update_ag(
333 xfs_inode_t *ip,
334 xfs_inode_t *pip,
335 xfs_agnumber_t ag)
336{
337 int err = 0;
338 xfs_mount_t *mp;
339 fstrm_item_t *item;
340 xfs_agnumber_t old_ag;
341 xfs_inode_t *old_pip;
342 struct xfs_mru_cache_elem *mru;
343
344 /*
345 * Either ip is a regular file and pip is a directory, or ip is a
346 * directory and pip is NULL.
347 */
348 ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
349 S_ISDIR(pip->i_d.di_mode)) ||
350 (S_ISDIR(ip->i_d.di_mode) && !pip)));
351
352 mp = ip->i_mount;
353
354 mru = xfs_mru_cache_lookup(mp->m_filestream, ip->i_ino);
355 if (mru) {
356 item = container_of(mru, fstrm_item_t, mru);
357
358 ASSERT(item->ip == ip);
359 old_ag = item->ag;
360 item->ag = ag;
361 old_pip = item->pip;
362 item->pip = pip;
363 xfs_mru_cache_done(mp->m_filestream);
364
365 /*
366 * If the AG has changed, drop the old ref and take a new one,
367 * effectively transferring the reference from old to new AG.
368 */
369 if (ag != old_ag) {
370 xfs_filestream_put_ag(mp, old_ag);
371 xfs_filestream_get_ag(mp, ag);
372 }
373
374 /*
375 * If ip is a file and its pip has changed, drop the old ref and
376 * take a new one.
377 */
378 if (pip && pip != old_pip) {
379 IRELE(old_pip);
380 IHOLD(pip);
381 }
382
383 TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
384 ag, xfs_filestream_peek_ag(mp, ag));
385 return 0; 272 return 0;
386 }
387 273
274 err = ENOMEM;
388 item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); 275 item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
389 if (!item) 276 if (!item)
390 return ENOMEM; 277 goto out_put_ag;
391 278
392 item->ag = ag; 279 item->ag = *agp;
393 item->ip = ip; 280 item->ip = ip;
394 item->pip = pip;
395 281
396 err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); 282 err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
397 if (err) { 283 if (err) {
398 kmem_zone_free(item_zone, item); 284 if (err == EEXIST)
399 return err; 285 err = 0;
286 goto out_free_item;
400 } 287 }
401 288
402 /* Take a reference on the AG. */
403 xfs_filestream_get_ag(mp, ag);
404
405 /*
406 * Take a reference on the inode itself regardless of whether it's a
407 * regular file or a directory.
408 */
409 IHOLD(ip);
410
411 /*
412 * In the case of a regular file, take a reference on the parent inode
413 * as well to ensure it remains in-core.
414 */
415 if (pip)
416 IHOLD(pip);
417
418 TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
419 ag, xfs_filestream_peek_ag(mp, ag));
420
421 return 0; 289 return 0;
422}
423
424/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
425STATIC void
426xfs_fstrm_free_func(
427 struct xfs_mru_cache_elem *mru)
428{
429 fstrm_item_t *item =
430 container_of(mru, fstrm_item_t, mru);
431 xfs_inode_t *ip = item->ip;
432
433 /* Drop the reference taken on the AG when the item was added. */
434 xfs_filestream_put_ag(ip->i_mount, item->ag);
435
436 TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
437 xfs_filestream_peek_ag(ip->i_mount, item->ag));
438
439 /*
440 * _xfs_filestream_update_ag() always takes a reference on the inode
441 * itself, whether it's a file or a directory. Release it here.
442 * This can result in the inode being freed and so we must
443 * not hold any inode locks when freeing filesstreams objects
444 * otherwise we can deadlock here.
445 */
446 IRELE(ip);
447 290
448 /* 291out_free_item:
449 * In the case of a regular file, _xfs_filestream_update_ag() also
450 * takes a ref on the parent inode to keep it in-core. Release that
451 * too.
452 */
453 if (item->pip)
454 IRELE(item->pip);
455
456 /* Finally, free the memory allocated for the item. */
457 kmem_zone_free(item_zone, item); 292 kmem_zone_free(item_zone, item);
293out_put_ag:
294 xfs_filestream_put_ag(mp, *agp);
295 return err;
458} 296}
459 297
460/* 298static struct xfs_inode *
461 * xfs_filestream_init() is called at xfs initialisation time to set up the 299xfs_filestream_get_parent(
462 * memory zone that will be used for filestream data structure allocation. 300 struct xfs_inode *ip)
463 */
464int
465xfs_filestream_init(void)
466{
467 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
468 if (!item_zone)
469 return -ENOMEM;
470
471 return 0;
472}
473
474/*
475 * xfs_filestream_uninit() is called at xfs termination time to destroy the
476 * memory zone that was used for filestream data structure allocation.
477 */
478void
479xfs_filestream_uninit(void)
480{ 301{
481 kmem_zone_destroy(item_zone); 302 struct inode *inode = VFS_I(ip), *dir = NULL;
482} 303 struct dentry *dentry, *parent;
483 304
484/* 305 dentry = d_find_alias(inode);
485 * xfs_filestream_mount() is called when a file system is mounted with the 306 if (!dentry)
486 * filestream option. It is responsible for allocating the data structures 307 goto out;
487 * needed to track the new file system's file streams.
488 */
489int
490xfs_filestream_mount(
491 xfs_mount_t *mp)
492{
493 int err;
494 unsigned int lifetime, grp_count;
495 308
496 /* 309 parent = dget_parent(dentry);
497 * The filestream timer tunable is currently fixed within the range of 310 if (!parent)
498 * one second to four minutes, with five seconds being the default. The 311 goto out_dput;
499 * group count is somewhat arbitrary, but it'd be nice to adhere to the
500 * timer tunable to within about 10 percent. This requires at least 10
501 * groups.
502 */
503 lifetime = xfs_fstrm_centisecs * 10;
504 grp_count = 10;
505 312
506 err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, 313 dir = igrab(parent->d_inode);
507 xfs_fstrm_free_func); 314 dput(parent);
508 315
509 return err; 316out_dput:
510} 317 dput(dentry);
511 318out:
512/* 319 return dir ? XFS_I(dir) : NULL;
513 * xfs_filestream_unmount() is called when a file system that was mounted with
514 * the filestream option is unmounted. It drains the data structures created
515 * to track the file system's file streams and frees all the memory that was
516 * allocated.
517 */
518void
519xfs_filestream_unmount(
520 xfs_mount_t *mp)
521{
522 xfs_mru_cache_destroy(mp->m_filestream);
523} 320}
524 321
525/* 322/*
@@ -528,94 +325,61 @@ xfs_filestream_unmount(
528 */ 325 */
529xfs_agnumber_t 326xfs_agnumber_t
530xfs_filestream_lookup_ag( 327xfs_filestream_lookup_ag(
531 xfs_inode_t *ip) 328 struct xfs_inode *ip)
532{ 329{
533 struct xfs_mount *mp = ip->i_mount; 330 struct xfs_mount *mp = ip->i_mount;
331 struct xfs_fstrm_item *item;
332 struct xfs_inode *pip = NULL;
333 xfs_agnumber_t ag = NULLAGNUMBER;
334 int ref = 0;
534 struct xfs_mru_cache_elem *mru; 335 struct xfs_mru_cache_elem *mru;
535 fstrm_item_t *item;
536 xfs_agnumber_t ag;
537 int ref;
538 336
539 if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { 337 ASSERT(S_ISREG(ip->i_d.di_mode));
540 ASSERT(0);
541 return NULLAGNUMBER;
542 }
543 338
544 mru = xfs_mru_cache_lookup(mp->m_filestream, ip->i_ino); 339 pip = xfs_filestream_get_parent(ip);
545 if (!mru) { 340 if (!pip)
546 TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); 341 goto out;
547 return NULLAGNUMBER; 342
548 } 343 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
344 if (!mru)
345 goto out;
346
347 item = container_of(mru, struct xfs_fstrm_item, mru);
549 348
550 item = container_of(mru, fstrm_item_t, mru);
551 ASSERT(ip == item->ip);
552 ag = item->ag; 349 ag = item->ag;
553 ref = xfs_filestream_peek_ag(ip->i_mount, ag);
554 xfs_mru_cache_done(mp->m_filestream); 350 xfs_mru_cache_done(mp->m_filestream);
555 351
556 TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); 352 ref = xfs_filestream_peek_ag(ip->i_mount, ag);
353out:
354 TRACE_LOOKUP(mp, ip, pip, ag, ref);
355 IRELE(pip);
557 return ag; 356 return ag;
558} 357}
559 358
560/* 359/*
561 * xfs_filestream_associate() should only be called to associate a regular file 360 * Make sure a directory has a filestream associated with it.
562 * with its parent directory. Calling it with a child directory isn't
563 * appropriate because filestreams don't apply to entire directory hierarchies.
564 * Creating a file in a child directory of an existing filestream directory
565 * starts a new filestream with its own allocation group association.
566 * 361 *
567 * Returns < 0 on error, 0 if successful association occurred, > 0 if 362 * This is called when creating regular files in an directory that has
568 * we failed to get an association because of locking issues. 363 * filestreams enabled, so that a stream is ready by the time we need it
364 * in the allocator for the files inside the directory.
569 */ 365 */
570int 366int
571xfs_filestream_associate( 367xfs_filestream_associate(
572 xfs_inode_t *pip, 368 struct xfs_inode *pip)
573 xfs_inode_t *ip)
574{ 369{
370 struct xfs_mount *mp = pip->i_mount;
575 struct xfs_mru_cache_elem *mru; 371 struct xfs_mru_cache_elem *mru;
576 xfs_mount_t *mp; 372 xfs_agnumber_t startag, ag;
577 fstrm_item_t *item;
578 xfs_agnumber_t ag, rotorstep, startag;
579 int err = 0;
580 373
581 ASSERT(S_ISDIR(pip->i_d.di_mode)); 374 ASSERT(S_ISDIR(pip->i_d.di_mode));
582 ASSERT(S_ISREG(ip->i_d.di_mode));
583 if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
584 return -EINVAL;
585
586 mp = pip->i_mount;
587 375
588 /* 376 /*
589 * We have a problem, Houston. 377 * If the directory already has a file stream associated we're done.
590 *
591 * Taking the iolock here violates inode locking order - we already
592 * hold the ilock. Hence if we block getting this lock we may never
593 * wake. Unfortunately, that means if we can't get the lock, we're
594 * screwed in terms of getting a stream association - we can't spin
595 * waiting for the lock because someone else is waiting on the lock we
596 * hold and we cannot drop that as we are in a transaction here.
597 *
598 * Lucky for us, this inversion is not a problem because it's a
599 * directory inode that we are trying to lock here.
600 *
601 * So, if we can't get the iolock without sleeping then just give up
602 */ 378 */
603 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
604 return 1;
605
606 /* If the parent directory is already in the cache, use its AG. */
607 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); 379 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
608 if (mru) { 380 if (mru) {
609 item = container_of(mru, fstrm_item_t, mru);
610
611 ASSERT(item->ip == pip);
612 ag = item->ag;
613 xfs_mru_cache_done(mp->m_filestream); 381 xfs_mru_cache_done(mp->m_filestream);
614 382 return 0;
615 TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
616 err = _xfs_filestream_update_ag(ip, pip, ag);
617
618 goto exit;
619 } 383 }
620 384
621 /* 385 /*
@@ -623,201 +387,107 @@ xfs_filestream_associate(
623 * use the directory inode's AG. 387 * use the directory inode's AG.
624 */ 388 */
625 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 389 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
626 rotorstep = xfs_rotorstep; 390 xfs_agnumber_t rotorstep = xfs_rotorstep;
627 startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; 391 startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
628 mp->m_agfrotor = (mp->m_agfrotor + 1) % 392 mp->m_agfrotor = (mp->m_agfrotor + 1) %
629 (mp->m_sb.sb_agcount * rotorstep); 393 (mp->m_sb.sb_agcount * rotorstep);
630 } else 394 } else
631 startag = XFS_INO_TO_AGNO(mp, pip->i_ino); 395 startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
632 396
633 /* Pick a new AG for the parent inode starting at startag. */ 397 return xfs_filestream_pick_ag(pip, startag, &ag, 0, 0);
634 err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
635 if (err || ag == NULLAGNUMBER)
636 goto exit_did_pick;
637
638 /* Associate the parent inode with the AG. */
639 err = _xfs_filestream_update_ag(pip, NULL, ag);
640 if (err)
641 goto exit_did_pick;
642
643 /* Associate the file inode with the AG. */
644 err = _xfs_filestream_update_ag(ip, pip, ag);
645 if (err)
646 goto exit_did_pick;
647
648 TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
649
650exit_did_pick:
651 /*
652 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
653 * reference it took on it, since the file and directory will have taken
654 * their own now if they were successfully cached.
655 */
656 if (ag != NULLAGNUMBER)
657 xfs_filestream_put_ag(mp, ag);
658
659exit:
660 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
661 return -err;
662} 398}
663 399
664/* 400/*
665 * Pick a new allocation group for the current file and its file stream. This 401 * Pick a new allocation group for the current file and its file stream.
666 * function is called by xfs_bmap_filestreams() with the mount point's per-ag 402 *
667 * lock held. 403 * This is called when the allocator can't find a suitable extent in the
404 * current AG, and we have to move the stream into a new AG with more space.
668 */ 405 */
669int 406int
670xfs_filestream_new_ag( 407xfs_filestream_new_ag(
671 struct xfs_bmalloca *ap, 408 struct xfs_bmalloca *ap,
672 xfs_agnumber_t *agp) 409 xfs_agnumber_t *agp)
673{ 410{
674 struct xfs_mru_cache_elem *mru, *mru2; 411 struct xfs_inode *ip = ap->ip, *pip;
675 int flags, err; 412 struct xfs_mount *mp = ip->i_mount;
676 xfs_inode_t *ip, *pip = NULL; 413 xfs_extlen_t minlen = ap->length;
677 xfs_mount_t *mp; 414 xfs_agnumber_t startag = 0;
678 xfs_extlen_t minlen; 415 int flags, err = 0;
679 fstrm_item_t *dir, *file; 416 struct xfs_mru_cache_elem *mru;
680 xfs_agnumber_t ag = NULLAGNUMBER;
681
682 ip = ap->ip;
683 mp = ip->i_mount;
684 minlen = ap->length;
685 *agp = NULLAGNUMBER;
686
687 /*
688 * Look for the file in the cache, removing it if it's found. Doing
689 * this allows it to be held across the dir lookup that follows.
690 */
691 mru = xfs_mru_cache_remove(mp->m_filestream, ip->i_ino);
692 if (mru) {
693 file = container_of(mru, fstrm_item_t, mru);
694 ASSERT(ip == file->ip);
695
696 /* Save the file's parent inode and old AG number for later. */
697 pip = file->pip;
698 ag = file->ag;
699
700 /* Look for the file's directory in the cache. */
701 mru2 = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
702 if (mru2) {
703 dir = container_of(mru2, fstrm_item_t, mru);
704 ASSERT(pip == dir->ip);
705
706 /*
707 * If the directory has already moved on to a new AG,
708 * use that AG as the new AG for the file. Don't
709 * forget to twiddle the AG refcounts to match the
710 * movement.
711 */
712 if (dir->ag != file->ag) {
713 xfs_filestream_put_ag(mp, file->ag);
714 xfs_filestream_get_ag(mp, dir->ag);
715 *agp = file->ag = dir->ag;
716 }
717 417
718 xfs_mru_cache_done(mp->m_filestream); 418 *agp = NULLAGNUMBER;
719 }
720 419
721 /* 420 pip = xfs_filestream_get_parent(ip);
722 * Put the file back in the cache. If this fails, the free 421 if (!pip)
723 * function needs to be called to tidy up in the same way as if 422 goto exit;
724 * the item had simply expired from the cache.
725 */
726 err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, mru);
727 if (err) {
728 xfs_fstrm_free_func(mru);
729 return err;
730 }
731 423
732 /* 424 mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
733 * If the file's AG was moved to the directory's new AG, there's 425 if (mru) {
734 * nothing more to be done. 426 struct xfs_fstrm_item *item =
735 */ 427 container_of(mru, struct xfs_fstrm_item, mru);
736 if (*agp != NULLAGNUMBER) { 428 startag = (item->ag + 1) % mp->m_sb.sb_agcount;
737 TRACE_MOVEAG(mp, ip, pip,
738 ag, xfs_filestream_peek_ag(mp, ag),
739 *agp, xfs_filestream_peek_ag(mp, *agp));
740 return 0;
741 }
742 } 429 }
743 430
744 /*
745 * If the file's parent directory is known, take its iolock in exclusive
746 * mode to prevent two sibling files from racing each other to migrate
747 * themselves and their parent to different AGs.
748 *
749 * Note that we lock the parent directory iolock inside the child
750 * iolock here. That's fine as we never hold both parent and child
751 * iolock in any other place. This is different from the ilock,
752 * which requires locking of the child after the parent for namespace
753 * operations.
754 */
755 if (pip)
756 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
757
758 /*
759 * A new AG needs to be found for the file. If the file's parent
760 * directory is also known, it will be moved to the new AG as well to
761 * ensure that files created inside it in future use the new AG.
762 */
763 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
764 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 431 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
765 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); 432 (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
766 433
767 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); 434 err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
768 if (err || *agp == NULLAGNUMBER)
769 goto exit;
770 435
771 /* 436 /*
772 * If the file wasn't found in the file cache, then its parent directory 437 * Only free the item here so we skip over the old AG earlier.
773 * inode isn't known. For this to have happened, the file must either
774 * be pre-existing, or it was created long enough ago that its cache
775 * entry has expired. This isn't the sort of usage that the filestreams
776 * allocator is trying to optimise, so there's no point trying to track
777 * its new AG somehow in the filestream data structures.
778 */ 438 */
779 if (!pip) { 439 if (mru)
780 TRACE_ORPHAN(mp, ip, *agp); 440 xfs_fstrm_free_func(mru);
781 goto exit;
782 }
783 441
784 /* Associate the parent inode with the AG. */ 442 IRELE(pip);
785 err = _xfs_filestream_update_ag(pip, NULL, *agp); 443exit:
786 if (err) 444 if (*agp == NULLAGNUMBER)
787 goto exit; 445 *agp = 0;
788 446 return err;
789 /* Associate the file inode with the AG. */ 447}
790 err = _xfs_filestream_update_ag(ip, pip, *agp);
791 if (err)
792 goto exit;
793 448
794 TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, 449void
795 *agp, xfs_filestream_peek_ag(mp, *agp)); 450xfs_filestream_deassociate(
451 struct xfs_inode *ip)
452{
453 xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
454}
796 455
797exit: 456int
457xfs_filestream_mount(
458 xfs_mount_t *mp)
459{
798 /* 460 /*
799 * If _xfs_filestream_pick_ag() returned a valid AG, remove the 461 * The filestream timer tunable is currently fixed within the range of
800 * reference it took on it, since the file and directory will have taken 462 * one second to four minutes, with five seconds being the default. The
801 * their own now if they were successfully cached. 463 * group count is somewhat arbitrary, but it'd be nice to adhere to the
464 * timer tunable to within about 10 percent. This requires at least 10
465 * groups.
802 */ 466 */
803 if (*agp != NULLAGNUMBER) 467 return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
804 xfs_filestream_put_ag(mp, *agp); 468 10, xfs_fstrm_free_func);
805 else 469}
806 *agp = 0;
807 470
808 if (pip) 471void
809 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 472xfs_filestream_unmount(
473 xfs_mount_t *mp)
474{
475 xfs_mru_cache_destroy(mp->m_filestream);
476}
810 477
811 return err; 478
479/* needs to return a positive errno for the init path */
480int
481xfs_filestream_init(void)
482{
483 item_zone = kmem_zone_init(sizeof(struct xfs_fstrm_item), "fstrm_item");
484 if (!item_zone)
485 return -ENOMEM;
486 return 0;
812} 487}
813 488
814/*
815 * Remove an association between an inode and a filestream object.
816 * Typically this is done on last close of an unlinked file.
817 */
818void 489void
819xfs_filestream_deassociate( 490xfs_filestream_uninit(void)
820 xfs_inode_t *ip)
821{ 491{
822 xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); 492 kmem_zone_destroy(item_zone);
823} 493}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index c4fa9a0cd62f..e3a25f891d08 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -20,44 +20,17 @@
20 20
21struct xfs_mount; 21struct xfs_mount;
22struct xfs_inode; 22struct xfs_inode;
23struct xfs_perag;
24struct xfs_bmalloca; 23struct xfs_bmalloca;
25 24
26#ifdef XFS_FILESTREAMS_TRACE
27#define XFS_FSTRM_KTRACE_INFO 1
28#define XFS_FSTRM_KTRACE_AGSCAN 2
29#define XFS_FSTRM_KTRACE_AGPICK1 3
30#define XFS_FSTRM_KTRACE_AGPICK2 4
31#define XFS_FSTRM_KTRACE_UPDATE 5
32#define XFS_FSTRM_KTRACE_FREE 6
33#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
34#define XFS_FSTRM_KTRACE_ASSOCIATE 8
35#define XFS_FSTRM_KTRACE_MOVEAG 9
36#define XFS_FSTRM_KTRACE_ORPHAN 10
37
38#define XFS_FSTRM_KTRACE_SIZE 16384
39extern ktrace_t *xfs_filestreams_trace_buf;
40
41#endif
42
43/* allocation selection flags */
44typedef enum xfs_fstrm_alloc {
45 XFS_PICK_USERDATA = 1,
46 XFS_PICK_LOWSPACE = 2,
47} xfs_fstrm_alloc_t;
48
49/* prototypes for filestream.c */
50int xfs_filestream_init(void); 25int xfs_filestream_init(void);
51void xfs_filestream_uninit(void); 26void xfs_filestream_uninit(void);
52int xfs_filestream_mount(struct xfs_mount *mp); 27int xfs_filestream_mount(struct xfs_mount *mp);
53void xfs_filestream_unmount(struct xfs_mount *mp); 28void xfs_filestream_unmount(struct xfs_mount *mp);
54xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
55int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
56void xfs_filestream_deassociate(struct xfs_inode *ip); 29void xfs_filestream_deassociate(struct xfs_inode *ip);
30xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
31int xfs_filestream_associate(struct xfs_inode *dip);
57int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); 32int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
58 33
59
60/* filestreams for the inode? */
61static inline int 34static inline int
62xfs_inode_is_filestream( 35xfs_inode_is_filestream(
63 struct xfs_inode *ip) 36 struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3328320592a6..b9b531f7fa3d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -846,9 +846,9 @@ xfs_ialloc(
846 846
847 /* now we have set up the vfs inode we can associate the filestream */ 847 /* now we have set up the vfs inode we can associate the filestream */
848 if (filestreams) { 848 if (filestreams) {
849 error = xfs_filestream_associate(pip, ip); 849 error = xfs_filestream_associate(pip);
850 if (error < 0) 850 if (error)
851 return -error; 851 return error;
852 } 852 }
853 853
854 *ipp = ip; 854 *ipp = ip;
@@ -1696,16 +1696,6 @@ xfs_release(
1696 int truncated; 1696 int truncated;
1697 1697
1698 /* 1698 /*
1699 * If we are using filestreams, and we have an unlinked
1700 * file that we are processing the last close on, then nothing
1701 * will be able to reopen and write to this file. Purge this
1702 * inode from the filestreams cache so that it doesn't delay
1703 * teardown of the inode.
1704 */
1705 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1706 xfs_filestream_deassociate(ip);
1707
1708 /*
1709 * If we previously truncated this file and removed old data 1699 * If we previously truncated this file and removed old data
1710 * in the process, we want to initiate "early" writeout on 1700 * in the process, we want to initiate "early" writeout on
1711 * the last close. This is an attempt to combat the notorious 1701 * the last close. This is an attempt to combat the notorious
@@ -2661,13 +2651,7 @@ xfs_remove(
2661 if (error) 2651 if (error)
2662 goto std_return; 2652 goto std_return;
2663 2653
2664 /* 2654 if (is_dir && xfs_inode_is_filestream(ip))
2665 * If we are using filestreams, kill the stream association.
2666 * If the file is still open it may get a new one but that
2667 * will get killed on last close in xfs_close() so we don't
2668 * have to worry about that.
2669 */
2670 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2671 xfs_filestream_deassociate(ip); 2655 xfs_filestream_deassociate(ip);
2672 2656
2673 return 0; 2657 return 0;