aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2006-06-22 17:47:28 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-06-22 18:05:57 -0400
commit: 0feae5c47aabdde59cbbec32d150e17102de37f0 (patch)
tree: 244f742d943a0516921180b840419fdc329075f0 /fs
parent: de047c1bcd7f7bcfbdc29eb5b439fb332594da3f (diff)
[PATCH] Fix dcache race during umount
The race is that the shrink_dcache_memory shrinker could get called while a filesystem is being unmounted, and could try to prune a dentry belonging to that filesystem. If it does, then it will call into iput on the inode while the dentry is no longer able to be found by the umounting process. If iput takes a while, generic_shutdown_super could get all the way through shrink_dcache_parent and shrink_dcache_anon and invalidate_inodes without ever waiting on this particular inode. Eventually the superblock gets freed anyway and if the iput tries to touch it (which some filesystems certainly do), it will lose. The promised "Self-destruct in 5 seconds" doesn't lead to a nice day. The race is closed by holding s_umount while calling prune_one_dentry on someone else's dentry. As a down_read_trylock is used, shrink_dcache_memory will no longer try to prune the dentry of a filesystem that is being unmounted, and unmount will not be able to start until any such active prune_one_dentry completes. This requires that prune_dcache *knows* which filesystem (if any) it is doing the prune on behalf of so that it can be careful of other filesystems. shrink_dcache_memory isn't called on behalf of any filesystem, and so is careful of everything. shrink_dcache_anon is now passed a super_block rather than the s_anon list out of the superblock, so it can get the s_anon list itself, and can pass the superblock down to prune_dcache. If prune_dcache finds a dentry that it cannot free, it leaves it where it is (at the tail of the list) and exits, on the assumption that some other thread will be removing that dentry soon. To try to make sure that some work gets done, a limited number of dentries which are untouchable are skipped over while choosing the dentry to work on. I believe this race was first found by Kirill Korotaev. 
Cc: Jan Blunck <jblunck@suse.de> Acked-by: Kirill Korotaev <dev@openvz.org> Cc: Olaf Hering <olh@suse.de> Acked-by: Balbir Singh <balbir@in.ibm.com> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Balbir Singh <balbir@in.ibm.com> Acked-by: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r-- fs/dcache.c 66
-rw-r--r-- fs/super.c 2
2 files changed, 61 insertions, 7 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index 940d188e5d14..385f5dbc4b0c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry)
382/** 382/**
383 * prune_dcache - shrink the dcache 383 * prune_dcache - shrink the dcache
384 * @count: number of entries to try and free 384 * @count: number of entries to try and free
385 * @sb: if given, ignore dentries for other superblocks
386 * which are being unmounted.
385 * 387 *
386 * Shrink the dcache. This is done when we need 388 * Shrink the dcache. This is done when we need
387 * more memory, or simply when we need to unmount 389 * more memory, or simply when we need to unmount
@@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry)
392 * all the dentries are in use. 394 * all the dentries are in use.
393 */ 395 */
394 396
395static void prune_dcache(int count) 397static void prune_dcache(int count, struct super_block *sb)
396{ 398{
397 spin_lock(&dcache_lock); 399 spin_lock(&dcache_lock);
398 for (; count ; count--) { 400 for (; count ; count--) {
399 struct dentry *dentry; 401 struct dentry *dentry;
400 struct list_head *tmp; 402 struct list_head *tmp;
403 struct rw_semaphore *s_umount;
401 404
402 cond_resched_lock(&dcache_lock); 405 cond_resched_lock(&dcache_lock);
403 406
404 tmp = dentry_unused.prev; 407 tmp = dentry_unused.prev;
408 if (unlikely(sb)) {
409 /* Try to find a dentry for this sb, but don't try
410 * too hard, if they aren't near the tail they will
411 * be moved down again soon
412 */
413 int skip = count;
414 while (skip && tmp != &dentry_unused &&
415 list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
416 skip--;
417 tmp = tmp->prev;
418 }
419 }
405 if (tmp == &dentry_unused) 420 if (tmp == &dentry_unused)
406 break; 421 break;
407 list_del_init(tmp); 422 list_del_init(tmp);
@@ -427,7 +442,45 @@ static void prune_dcache(int count)
427 spin_unlock(&dentry->d_lock); 442 spin_unlock(&dentry->d_lock);
428 continue; 443 continue;
429 } 444 }
430 prune_one_dentry(dentry); 445 /*
446 * If the dentry is not DCACHE_REFERENCED, it is time
447 * to remove it from the dcache, provided the super block is
448 * NULL (which means we are trying to reclaim memory)
449 * or this dentry belongs to the same super block that
450 * we want to shrink.
451 */
452 /*
453 * If this dentry is for "my" filesystem, then I can prune it
454 * without taking the s_umount lock (I already hold it).
455 */
456 if (sb && dentry->d_sb == sb) {
457 prune_one_dentry(dentry);
458 continue;
459 }
460 /*
461 * ...otherwise we need to be sure this filesystem isn't being
462 * unmounted, otherwise we could race with
463 * generic_shutdown_super(), and end up holding a reference to
464 * an inode while the filesystem is unmounted.
465 * So we try to get s_umount, and make sure s_root isn't NULL.
466 * (Take a local copy of s_umount to avoid a use-after-free of
467 * `dentry').
468 */
469 s_umount = &dentry->d_sb->s_umount;
470 if (down_read_trylock(s_umount)) {
471 if (dentry->d_sb->s_root != NULL) {
472 prune_one_dentry(dentry);
473 up_read(s_umount);
474 continue;
475 }
476 up_read(s_umount);
477 }
478 spin_unlock(&dentry->d_lock);
479 /* Cannot remove the first dentry, and it isn't appropriate
480 * to move it to the head of the list, so give up, and try
481 * later
482 */
483 break;
431 } 484 }
432 spin_unlock(&dcache_lock); 485 spin_unlock(&dcache_lock);
433} 486}
@@ -630,7 +683,7 @@ void shrink_dcache_parent(struct dentry * parent)
630 int found; 683 int found;
631 684
632 while ((found = select_parent(parent)) != 0) 685 while ((found = select_parent(parent)) != 0)
633 prune_dcache(found); 686 prune_dcache(found, parent->d_sb);
634} 687}
635 688
636/** 689/**
@@ -643,9 +696,10 @@ void shrink_dcache_parent(struct dentry * parent)
643 * done under dcache_lock. 696 * done under dcache_lock.
644 * 697 *
645 */ 698 */
646void shrink_dcache_anon(struct hlist_head *head) 699void shrink_dcache_anon(struct super_block *sb)
647{ 700{
648 struct hlist_node *lp; 701 struct hlist_node *lp;
702 struct hlist_head *head = &sb->s_anon;
649 int found; 703 int found;
650 do { 704 do {
651 found = 0; 705 found = 0;
@@ -668,7 +722,7 @@ void shrink_dcache_anon(struct hlist_head *head)
668 } 722 }
669 } 723 }
670 spin_unlock(&dcache_lock); 724 spin_unlock(&dcache_lock);
671 prune_dcache(found); 725 prune_dcache(found, sb);
672 } while(found); 726 } while(found);
673} 727}
674 728
@@ -689,7 +743,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
689 if (nr) { 743 if (nr) {
690 if (!(gfp_mask & __GFP_FS)) 744 if (!(gfp_mask & __GFP_FS))
691 return -1; 745 return -1;
692 prune_dcache(nr); 746 prune_dcache(nr, NULL);
693 } 747 }
694 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 748 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
695} 749}
diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..9d5c2add7228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
231 if (root) { 231 if (root) {
232 sb->s_root = NULL; 232 sb->s_root = NULL;
233 shrink_dcache_parent(root); 233 shrink_dcache_parent(root);
234 shrink_dcache_anon(&sb->s_anon); 234 shrink_dcache_anon(sb);
235 dput(root); 235 dput(root);
236 fsync_super(sb); 236 fsync_super(sb);
237 lock_super(sb); 237 lock_super(sb);