author    Kentaro Makita <k-makita@np.css.fujitsu.com>  2008-07-24 00:27:13 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-07-24 13:47:15 -0400
commit    da3bbdd4632c0171406b2677e31494afa5bde2f8 (patch)
tree      6a1947b2a32041ef0d14044e659ba7d546d55483 /fs/dcache.c
parent    3c82d0ce2c4f642b2f24ef98707a030543b06b90 (diff)
fix soft lock up at NFS mount via per-SB LRU-list of unused dentries
[Summary]

Split the LRU list of unused dentries into one list per superblock, to avoid
soft lockups during NFS mounts and remounts of any filesystem.

Previously posted here: http://lkml.org/lkml/2008/3/5/590

[Description]

- background

dentry_unused is a list of dentries which are not referenced.  It grows as
references on directories or files are released, and it can become very long
when there is plenty of free memory.

- the problem

When shrink_dcache_sb() is called, it scans the whole of dentry_unused
linearly under spin_lock(), skipping every dentry whose d_sb differs from the
given superblock.  This scan is very expensive when the list is long, and very
ineffective when there are many superblocks: to shrink the unused dentries of
one superblock, we end up scanning the unused dentries of every superblock in
the system.  For example, to unmount a filesystem with 500 unused dentries we
may scan 1,000,000 or more unused dentries belonging to other superblocks.

In our case, at NFS mount time*, shrink_dcache_sb() is called to shrink the
unused dentries on the NFS superblock, but it scans 100,000,000 unused
dentries on other superblocks, such as local ext3 filesystems.  I hear that
NFS mounting took about one minute on some systems in production.

*: NFS uses a virtual filesystem in the RPC layer, so NFS is affected by this
problem.  100,000,000 is a realistic number on large systems.

A per-superblock LRU of unused dentries reduces this cost to a reasonable
level.

- how to fix

This problem is solved by David Chinner's "Per-superblock unused dentry LRU
lists V3" (1), so I rebased it and added a fix to reclaim with fairness, as
requested in Andrew Morton's comments (2).

1) http://lkml.org/lkml/2006/5/25/318
2) http://lkml.org/lkml/2006/5/25/320

Split the LRU list of unused dentries into per-superblock lists.  NFS mounting
then only checks the dentries of its own superblock instead of all of them.
This splitting breaks the global LRU ordering, so reclaim is made fair by
calculating the number of dentries to scan on each superblock as follows (a
concrete sketch of this arithmetic is shown below):

  number of dentries to scan on this sb =
    count * (number of dentries on this sb / number of dentries in the machine)

- ToDo

 - Measure performance numbers and run stress tests.

 - If an unmount occurs while prune_dcache() is scanning a superblock, the
   next superblock cannot be reached because it is gone, so scanning restarts
   from the first superblock.  This makes reclaim slightly unfair to the first
   superblock, but it should happen very rarely.

- test results

Results on a 6GB box with an excessive number of unused dentries.

Without the patch:

$ cat /proc/sys/fs/dentry-state
10181835  10180203  45  0  0  0

# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real    0m1.830s
user    0m0.001s
sys     0m1.653s

With this patch:

$ cat /proc/sys/fs/dentry-state
10236610  10234751  45  0  0  0

# mount -t nfs 10.124.60.70:/work/kernel-src nfs
real    0m0.106s
user    0m0.002s
sys     0m0.032s

[akpm@linux-foundation.org: fix comments]
Signed-off-by: Kentaro Makita <k-makita@np.css.fujitsu.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Chinner <dgc@sgi.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
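To make the fairness arithmetic concrete, here is a minimal userspace sketch
of the calculation described above.  The function and variable names
(scan_count_for_sb, unused, per_sb_unused) are illustrative, not from the
patch; the real code divides by a prune_ratio first so that the intermediate
product count * per_sb_unused never has to be computed, avoiding int overflow.

#include <stdio.h>

/*
 * Scan roughly count * (per_sb_unused / unused) dentries on one superblock,
 * computed as per_sb_unused / (unused / count) to avoid overflow.
 */
static int scan_count_for_sb(int count, int unused, int per_sb_unused)
{
	int prune_ratio;

	if (count >= unused)
		prune_ratio = 1;
	else
		prune_ratio = unused / count;

	if (prune_ratio != 1)
		return per_sb_unused / prune_ratio + 1;
	return per_sb_unused;
}

int main(void)
{
	/*
	 * Example: the shrinker asks for 500 dentries, the machine has
	 * 10,000,000 unused dentries and this sb holds 2,000,000 of them,
	 * so roughly 100 are scanned here (101 with the +1 rounding).
	 */
	printf("%d\n", scan_count_for_sb(500, 10000000, 2000000));
	return 0;
}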
Diffstat (limited to 'fs/dcache.c')
-rw-r--r--	fs/dcache.c	335
1 file changed, 180 insertions(+), 155 deletions(-)
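The diffstat above is limited to fs/dcache.c, so the per-superblock fields
that the new helpers rely on do not appear in this view; the rest of the
commit adds them to struct super_block (and presumably initialises
s_dentry_lru in alloc_super()).  A hedged sketch of those fields, with
unrelated members elided, not the verbatim include/linux/fs.h hunk:

/* Minimal stand-in for the kernel's struct list_head. */
struct list_head {
	struct list_head *next, *prev;
};

struct super_block {
	/* ...existing fields elided... */
	struct list_head s_dentry_lru;       /* LRU of unused dentries on this sb */
	int              s_nr_dentry_unused; /* number of dentries on that LRU */
};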
diff --git a/fs/dcache.c b/fs/dcache.c
index 6068c25b393c..3818d6ab76ca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -61,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly;
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
 static struct hlist_head *dentry_hashtable __read_mostly;
-static LIST_HEAD(dentry_unused);
 
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
@@ -96,14 +95,6 @@ static void d_free(struct dentry *dentry)
 	call_rcu(&dentry->d_u.d_rcu, d_callback);
 }
 
-static void dentry_lru_remove(struct dentry *dentry)
-{
-	if (!list_empty(&dentry->d_lru)) {
-		list_del_init(&dentry->d_lru);
-		dentry_stat.nr_unused--;
-	}
-}
-
 /*
  * Release the dentry's inode, using the filesystem
  * d_iput() operation if defined.
@@ -130,6 +121,41 @@ static void dentry_iput(struct dentry * dentry)
 	}
 }
 
+/*
+ * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+	dentry->d_sb->s_nr_dentry_unused++;
+	dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_add_tail(struct dentry *dentry)
+{
+	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+	dentry->d_sb->s_nr_dentry_unused++;
+	dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_del(struct dentry *dentry)
+{
+	if (!list_empty(&dentry->d_lru)) {
+		list_del(&dentry->d_lru);
+		dentry->d_sb->s_nr_dentry_unused--;
+		dentry_stat.nr_unused--;
+	}
+}
+
+static void dentry_lru_del_init(struct dentry *dentry)
+{
+	if (likely(!list_empty(&dentry->d_lru))) {
+		list_del_init(&dentry->d_lru);
+		dentry->d_sb->s_nr_dentry_unused--;
+		dentry_stat.nr_unused--;
+	}
+}
+
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
@@ -212,8 +238,7 @@ repeat:
 		goto kill_it;
 	if (list_empty(&dentry->d_lru)) {
 		dentry->d_flags |= DCACHE_REFERENCED;
-		list_add(&dentry->d_lru, &dentry_unused);
-		dentry_stat.nr_unused++;
+		dentry_lru_add(dentry);
 	}
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
@@ -222,7 +247,8 @@ repeat:
 unhash_it:
 	__d_drop(dentry);
 kill_it:
-	dentry_lru_remove(dentry);
+	/* if dentry was on the d_lru list delete it from there */
+	dentry_lru_del(dentry);
 	dentry = d_kill(dentry);
 	if (dentry)
 		goto repeat;
@@ -290,7 +316,7 @@ int d_invalidate(struct dentry * dentry)
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
-	dentry_lru_remove(dentry);
+	dentry_lru_del_init(dentry);
 	return dentry;
 }
 
@@ -406,133 +432,167 @@ static void prune_one_dentry(struct dentry * dentry)
 
 		if (dentry->d_op && dentry->d_op->d_delete)
 			dentry->d_op->d_delete(dentry);
-		dentry_lru_remove(dentry);
+		dentry_lru_del_init(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
 		spin_lock(&dcache_lock);
 	}
 }
 
-/**
- * prune_dcache - shrink the dcache
- * @count: number of entries to try and free
- * @sb: if given, ignore dentries for other superblocks
- *      which are being unmounted.
- *
- * Shrink the dcache. This is done when we need
- * more memory, or simply when we need to unmount
- * something (at which point we need to unuse
- * all dentries).
- *
- * This function may fail to free any resources if
- * all the dentries are in use.
- */
-
-static void prune_dcache(int count, struct super_block *sb)
-{
-	spin_lock(&dcache_lock);
-	for (; count ; count--) {
-		struct dentry *dentry;
-		struct list_head *tmp;
-		struct rw_semaphore *s_umount;
-
-		cond_resched_lock(&dcache_lock);
-
-		tmp = dentry_unused.prev;
-		if (sb) {
-			/* Try to find a dentry for this sb, but don't try
-			 * too hard, if they aren't near the tail they will
-			 * be moved down again soon
-			 */
-			int skip = count;
-			while (skip && tmp != &dentry_unused &&
-			    list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
-				skip--;
-				tmp = tmp->prev;
-			}
-		}
-		if (tmp == &dentry_unused)
-			break;
-		list_del_init(tmp);
-		prefetch(dentry_unused.prev);
-		dentry_stat.nr_unused--;
-		dentry = list_entry(tmp, struct dentry, d_lru);
-
-		spin_lock(&dentry->d_lock);
-		/*
-		 * We found an inuse dentry which was not removed from
-		 * dentry_unused because of laziness during lookup. Do not free
-		 * it - just keep it off the dentry_unused list.
-		 */
-		if (atomic_read(&dentry->d_count)) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-		/* If the dentry was recently referenced, don't free it. */
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
-			list_add(&dentry->d_lru, &dentry_unused);
-			dentry_stat.nr_unused++;
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-		/*
-		 * If the dentry is not DCACHED_REFERENCED, it is time
-		 * to remove it from the dcache, provided the super block is
-		 * NULL (which means we are trying to reclaim memory)
-		 * or this dentry belongs to the same super block that
-		 * we want to shrink.
-		 */
-		/*
-		 * If this dentry is for "my" filesystem, then I can prune it
-		 * without taking the s_umount lock (I already hold it).
-		 */
-		if (sb && dentry->d_sb == sb) {
-			prune_one_dentry(dentry);
-			continue;
-		}
-		/*
-		 * ...otherwise we need to be sure this filesystem isn't being
-		 * unmounted, otherwise we could race with
-		 * generic_shutdown_super(), and end up holding a reference to
-		 * an inode while the filesystem is unmounted.
-		 * So we try to get s_umount, and make sure s_root isn't NULL.
-		 * (Take a local copy of s_umount to avoid a use-after-free of
-		 * `dentry').
-		 */
-		s_umount = &dentry->d_sb->s_umount;
-		if (down_read_trylock(s_umount)) {
-			if (dentry->d_sb->s_root != NULL) {
-				prune_one_dentry(dentry);
-				up_read(s_umount);
-				continue;
-			}
-			up_read(s_umount);
-		}
-		spin_unlock(&dentry->d_lock);
-		/*
-		 * Insert dentry at the head of the list as inserting at the
-		 * tail leads to a cycle.
-		 */
-		list_add(&dentry->d_lru, &dentry_unused);
-		dentry_stat.nr_unused++;
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * Shrink the dcache for the specified super block.
- * This allows us to unmount a device without disturbing
- * the dcache for the other devices.
- *
- * This implementation makes just two traversals of the
- * unused list.  On the first pass we move the selected
- * dentries to the most recent end, and on the second
- * pass we free them.  The second pass must restart after
- * each dput(), but since the target dentries are all at
- * the end, it's really just a single traversal.
- */
-
+/*
+ * Shrink the dentry LRU on a given superblock.
+ * @sb   : superblock to shrink dentry LRU.
+ * @count: If count is NULL, we prune all dentries on superblock.
+ * @flags: If flags is non-zero, we need to do special processing based on
+ *         which flags are set. This means we don't need to maintain multiple
+ *         similar copies of this loop.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+	LIST_HEAD(referenced);
+	LIST_HEAD(tmp);
+	struct dentry *dentry;
+	int cnt = 0;
+
+	BUG_ON(!sb);
+	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+	spin_lock(&dcache_lock);
+	if (count != NULL)
+		/* called from prune_dcache() and shrink_dcache_parent() */
+		cnt = *count;
+restart:
+	if (count == NULL)
+		list_splice_init(&sb->s_dentry_lru, &tmp);
+	else {
+		while (!list_empty(&sb->s_dentry_lru)) {
+			dentry = list_entry(sb->s_dentry_lru.prev,
+					struct dentry, d_lru);
+			BUG_ON(dentry->d_sb != sb);
+
+			spin_lock(&dentry->d_lock);
+			/*
+			 * If we are honouring the DCACHE_REFERENCED flag and
+			 * the dentry has this flag set, don't free it. Clear
+			 * the flag and put it back on the LRU.
+			 */
+			if ((flags & DCACHE_REFERENCED)
+				&& (dentry->d_flags & DCACHE_REFERENCED)) {
+				dentry->d_flags &= ~DCACHE_REFERENCED;
+				list_move_tail(&dentry->d_lru, &referenced);
+				spin_unlock(&dentry->d_lock);
+			} else {
+				list_move_tail(&dentry->d_lru, &tmp);
+				spin_unlock(&dentry->d_lock);
+				cnt--;
+				if (!cnt)
+					break;
+			}
+		}
+	}
+	while (!list_empty(&tmp)) {
+		dentry = list_entry(tmp.prev, struct dentry, d_lru);
+		dentry_lru_del_init(dentry);
+		spin_lock(&dentry->d_lock);
+		/*
+		 * We found an inuse dentry which was not removed from
+		 * the LRU because of laziness during lookup. Do not free
+		 * it - just keep it off the LRU list.
+		 */
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
+		prune_one_dentry(dentry);
+		/* dentry->d_lock was dropped in prune_one_dentry() */
+		cond_resched_lock(&dcache_lock);
+	}
+	if (count == NULL && !list_empty(&sb->s_dentry_lru))
+		goto restart;
+	if (count != NULL)
+		*count = cnt;
+	if (!list_empty(&referenced))
+		list_splice(&referenced, &sb->s_dentry_lru);
+	spin_unlock(&dcache_lock);
+}
+
+/**
+ * prune_dcache - shrink the dcache
+ * @count: number of entries to try to free
+ *
+ * Shrink the dcache. This is done when we need more memory, or simply when we
+ * need to unmount something (at which point we need to unuse all dentries).
+ *
+ * This function may fail to free any resources if all the dentries are in use.
+ */
+static void prune_dcache(int count)
+{
+	struct super_block *sb;
+	int w_count;
+	int unused = dentry_stat.nr_unused;
+	int prune_ratio;
+	int pruned;
+
+	if (unused == 0 || count == 0)
+		return;
+	spin_lock(&dcache_lock);
+restart:
+	if (count >= unused)
+		prune_ratio = 1;
+	else
+		prune_ratio = unused / count;
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_nr_dentry_unused == 0)
+			continue;
+		sb->s_count++;
+		/* Now, we reclaim unused dentrins with fairness.
+		 * We reclaim them same percentage from each superblock.
+		 * We calculate number of dentries to scan on this sb
+		 * as follows, but the implementation is arranged to avoid
+		 * overflows:
+		 * number of dentries to scan on this sb =
+		 * count * (number of dentries on this sb /
+		 * number of dentries in the machine)
+		 */
+		spin_unlock(&sb_lock);
+		if (prune_ratio != 1)
+			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
+		else
+			w_count = sb->s_nr_dentry_unused;
+		pruned = w_count;
+		/*
+		 * We need to be sure this filesystem isn't being unmounted,
+		 * otherwise we could race with generic_shutdown_super(), and
+		 * end up holding a reference to an inode while the filesystem
+		 * is unmounted.  So we try to get s_umount, and make sure
+		 * s_root isn't NULL.
+		 */
+		if (down_read_trylock(&sb->s_umount)) {
+			if ((sb->s_root != NULL) &&
+			    (!list_empty(&sb->s_dentry_lru))) {
+				spin_unlock(&dcache_lock);
+				__shrink_dcache_sb(sb, &w_count,
+						DCACHE_REFERENCED);
+				pruned -= w_count;
+				spin_lock(&dcache_lock);
+			}
+			up_read(&sb->s_umount);
+		}
+		spin_lock(&sb_lock);
+		count -= pruned;
+		/*
+		 * restart only when sb is no longer on the list and
+		 * we have more work to do.
+		 */
+		if (__put_super_and_need_restart(sb) && count > 0) {
+			spin_unlock(&sb_lock);
+			goto restart;
+		}
+	}
+	spin_unlock(&sb_lock);
+	spin_unlock(&dcache_lock);
+}
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -541,44 +601,9 @@ static void prune_dcache(int count, struct super_block *sb)
  * is used to free the dcache before unmounting a file
  * system
  */
-
 void shrink_dcache_sb(struct super_block * sb)
 {
-	struct list_head *tmp, *next;
-	struct dentry *dentry;
-
-	/*
-	 * Pass one ... move the dentries for the specified
-	 * superblock to the most recent end of the unused list.
-	 */
-	spin_lock(&dcache_lock);
-	list_for_each_prev_safe(tmp, next, &dentry_unused) {
-		dentry = list_entry(tmp, struct dentry, d_lru);
-		if (dentry->d_sb != sb)
-			continue;
-		list_move_tail(tmp, &dentry_unused);
-	}
-
-	/*
-	 * Pass two ... free the dentries for this superblock.
-	 */
-repeat:
-	list_for_each_prev_safe(tmp, next, &dentry_unused) {
-		dentry = list_entry(tmp, struct dentry, d_lru);
-		if (dentry->d_sb != sb)
-			continue;
-		dentry_stat.nr_unused--;
-		list_del_init(tmp);
-		spin_lock(&dentry->d_lock);
-		if (atomic_read(&dentry->d_count)) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-		prune_one_dentry(dentry);
-		cond_resched_lock(&dcache_lock);
-		goto repeat;
-	}
-	spin_unlock(&dcache_lock);
+	__shrink_dcache_sb(sb, NULL, 0);
 }
 
 /*
@@ -595,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 	/* detach this root from the system */
 	spin_lock(&dcache_lock);
-	dentry_lru_remove(dentry);
+	dentry_lru_del_init(dentry);
 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
@@ -609,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 		spin_lock(&dcache_lock);
 		list_for_each_entry(loop, &dentry->d_subdirs,
 				    d_u.d_child) {
-			dentry_lru_remove(loop);
+			dentry_lru_del_init(loop);
 			__d_drop(loop);
 			cond_resched_lock(&dcache_lock);
 		}
@@ -791,14 +816,13 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
-		dentry_lru_remove(dentry);
+		dentry_lru_del_init(dentry);
 		/*
 		 * move only zero ref count dentries to the end
 		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
-			list_add_tail(&dentry->d_lru, &dentry_unused);
-			dentry_stat.nr_unused++;
+			dentry_lru_add_tail(dentry);
 			found++;
 		}
 
@@ -840,10 +864,11 @@ out:
 
 void shrink_dcache_parent(struct dentry * parent)
 {
+	struct super_block *sb = parent->d_sb;
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found, parent->d_sb);
+		__shrink_dcache_sb(sb, &found, 0);
 }
 
 /*
@@ -863,7 +888,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr, NULL);
+		prune_dcache(nr);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
@@ -1215,7 +1240,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
  * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
  * lookup is going on.
  *
- * dentry_unused list is not updated even if lookup finds the required dentry
+ * The dentry unused LRU is not updated even if lookup finds the required dentry
  * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
  * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
  * acquisition.