diff options
author | NeilBrown <neilb@suse.de> | 2006-06-22 17:47:28 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-22 18:05:57 -0400 |
commit | 0feae5c47aabdde59cbbec32d150e17102de37f0 (patch) | |
tree | 244f742d943a0516921180b840419fdc329075f0 /fs | |
parent | de047c1bcd7f7bcfbdc29eb5b439fb332594da3f (diff) |
[PATCH] Fix dcache race during umount
The race is that the shrink_dcache_memory shrinker could get called while a
filesystem is being unmounted, and could try to prune a dentry belonging to
that filesystem.
If it does, then it will call in to iput on the inode while the dentry is
no longer able to be found by the umounting process. If iput takes a
while, generic_shutdown_super could get all the way through
shrink_dcache_parent and shrink_dcache_anon and invalidate_inodes without
ever waiting on this particular inode.
Eventually the superblock gets freed anyway and if the iput tried to touch
it (which some filesystems certainly do), it will lose. The promised
"Self-destruct in 5 seconds" doesn't lead to a nice day.
The race is closed by holding s_umount while calling prune_one_dentry on
someone else's dentry. As a down_read_trylock is used,
shrink_dcache_memory will no longer try to prune the dentry of a filesystem
that is being unmounted, and unmount will not be able to start until any
such active prune_one_dentry completes.
This requires that prune_dcache *knows* which filesystem (if any) it is
doing the prune on behalf of so that it can be careful of other
filesystems. shrink_dcache_memory isn't called on behalf of any
filesystem, and so is careful of everything.
shrink_dcache_anon is now passed a super_block rather than the s_anon list
out of the superblock, so it can get the s_anon list itself, and can pass
the superblock down to prune_dcache.
If prune_dcache finds a dentry that it cannot free, it leaves it where it
is (at the tail of the list) and exits, on the assumption that some other
thread will be removing that dentry soon. To try to make sure that some
work gets done, a limited number of dentries which are untouchable are
skipped over while choosing the dentry to work on.
I believe this race was first found by Kirill Korotaev.
Cc: Jan Blunck <jblunck@suse.de>
Acked-by: Kirill Korotaev <dev@openvz.org>
Cc: Olaf Hering <olh@suse.de>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/dcache.c | 66 | ||||
-rw-r--r-- | fs/super.c | 2 |
2 files changed, 61 insertions, 7 deletions
diff --git a/fs/dcache.c b/fs/dcache.c index 940d188e5d14..385f5dbc4b0c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry) | |||
382 | /** | 382 | /** |
383 | * prune_dcache - shrink the dcache | 383 | * prune_dcache - shrink the dcache |
384 | * @count: number of entries to try and free | 384 | * @count: number of entries to try and free |
385 | * @sb: if given, ignore dentries for other superblocks | ||
386 | * which are being unmounted. | ||
385 | * | 387 | * |
386 | * Shrink the dcache. This is done when we need | 388 | * Shrink the dcache. This is done when we need |
387 | * more memory, or simply when we need to unmount | 389 | * more memory, or simply when we need to unmount |
@@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry) | |||
392 | * all the dentries are in use. | 394 | * all the dentries are in use. |
393 | */ | 395 | */ |
394 | 396 | ||
395 | static void prune_dcache(int count) | 397 | static void prune_dcache(int count, struct super_block *sb) |
396 | { | 398 | { |
397 | spin_lock(&dcache_lock); | 399 | spin_lock(&dcache_lock); |
398 | for (; count ; count--) { | 400 | for (; count ; count--) { |
399 | struct dentry *dentry; | 401 | struct dentry *dentry; |
400 | struct list_head *tmp; | 402 | struct list_head *tmp; |
403 | struct rw_semaphore *s_umount; | ||
401 | 404 | ||
402 | cond_resched_lock(&dcache_lock); | 405 | cond_resched_lock(&dcache_lock); |
403 | 406 | ||
404 | tmp = dentry_unused.prev; | 407 | tmp = dentry_unused.prev; |
408 | if (unlikely(sb)) { | ||
409 | /* Try to find a dentry for this sb, but don't try | ||
410 | * too hard, if they aren't near the tail they will | ||
411 | * be moved down again soon | ||
412 | */ | ||
413 | int skip = count; | ||
414 | while (skip && tmp != &dentry_unused && | ||
415 | list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { | ||
416 | skip--; | ||
417 | tmp = tmp->prev; | ||
418 | } | ||
419 | } | ||
405 | if (tmp == &dentry_unused) | 420 | if (tmp == &dentry_unused) |
406 | break; | 421 | break; |
407 | list_del_init(tmp); | 422 | list_del_init(tmp); |
@@ -427,7 +442,45 @@ static void prune_dcache(int count) | |||
427 | spin_unlock(&dentry->d_lock); | 442 | spin_unlock(&dentry->d_lock); |
428 | continue; | 443 | continue; |
429 | } | 444 | } |
430 | prune_one_dentry(dentry); | 445 | /* |
446 | * If the dentry is not DCACHED_REFERENCED, it is time | ||
447 | * to remove it from the dcache, provided the super block is | ||
448 | * NULL (which means we are trying to reclaim memory) | ||
449 | * or this dentry belongs to the same super block that | ||
450 | * we want to shrink. | ||
451 | */ | ||
452 | /* | ||
453 | * If this dentry is for "my" filesystem, then I can prune it | ||
454 | * without taking the s_umount lock (I already hold it). | ||
455 | */ | ||
456 | if (sb && dentry->d_sb == sb) { | ||
457 | prune_one_dentry(dentry); | ||
458 | continue; | ||
459 | } | ||
460 | /* | ||
461 | * ...otherwise we need to be sure this filesystem isn't being | ||
462 | * unmounted, otherwise we could race with | ||
463 | * generic_shutdown_super(), and end up holding a reference to | ||
464 | * an inode while the filesystem is unmounted. | ||
465 | * So we try to get s_umount, and make sure s_root isn't NULL. | ||
466 | * (Take a local copy of s_umount to avoid a use-after-free of | ||
467 | * `dentry'). | ||
468 | */ | ||
469 | s_umount = &dentry->d_sb->s_umount; | ||
470 | if (down_read_trylock(s_umount)) { | ||
471 | if (dentry->d_sb->s_root != NULL) { | ||
472 | prune_one_dentry(dentry); | ||
473 | up_read(s_umount); | ||
474 | continue; | ||
475 | } | ||
476 | up_read(s_umount); | ||
477 | } | ||
478 | spin_unlock(&dentry->d_lock); | ||
479 | /* Cannot remove the first dentry, and it isn't appropriate | ||
480 | * to move it to the head of the list, so give up, and try | ||
481 | * later | ||
482 | */ | ||
483 | break; | ||
431 | } | 484 | } |
432 | spin_unlock(&dcache_lock); | 485 | spin_unlock(&dcache_lock); |
433 | } | 486 | } |
@@ -630,7 +683,7 @@ void shrink_dcache_parent(struct dentry * parent) | |||
630 | int found; | 683 | int found; |
631 | 684 | ||
632 | while ((found = select_parent(parent)) != 0) | 685 | while ((found = select_parent(parent)) != 0) |
633 | prune_dcache(found); | 686 | prune_dcache(found, parent->d_sb); |
634 | } | 687 | } |
635 | 688 | ||
636 | /** | 689 | /** |
@@ -643,9 +696,10 @@ void shrink_dcache_parent(struct dentry * parent) | |||
643 | * done under dcache_lock. | 696 | * done under dcache_lock. |
644 | * | 697 | * |
645 | */ | 698 | */ |
646 | void shrink_dcache_anon(struct hlist_head *head) | 699 | void shrink_dcache_anon(struct super_block *sb) |
647 | { | 700 | { |
648 | struct hlist_node *lp; | 701 | struct hlist_node *lp; |
702 | struct hlist_head *head = &sb->s_anon; | ||
649 | int found; | 703 | int found; |
650 | do { | 704 | do { |
651 | found = 0; | 705 | found = 0; |
@@ -668,7 +722,7 @@ void shrink_dcache_anon(struct hlist_head *head) | |||
668 | } | 722 | } |
669 | } | 723 | } |
670 | spin_unlock(&dcache_lock); | 724 | spin_unlock(&dcache_lock); |
671 | prune_dcache(found); | 725 | prune_dcache(found, sb); |
672 | } while(found); | 726 | } while(found); |
673 | } | 727 | } |
674 | 728 | ||
@@ -689,7 +743,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) | |||
689 | if (nr) { | 743 | if (nr) { |
690 | if (!(gfp_mask & __GFP_FS)) | 744 | if (!(gfp_mask & __GFP_FS)) |
691 | return -1; | 745 | return -1; |
692 | prune_dcache(nr); | 746 | prune_dcache(nr, NULL); |
693 | } | 747 | } |
694 | return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; | 748 | return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; |
695 | } | 749 | } |
diff --git a/fs/super.c b/fs/super.c index a66f66bb8049..9d5c2add7228 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb) | |||
231 | if (root) { | 231 | if (root) { |
232 | sb->s_root = NULL; | 232 | sb->s_root = NULL; |
233 | shrink_dcache_parent(root); | 233 | shrink_dcache_parent(root); |
234 | shrink_dcache_anon(&sb->s_anon); | 234 | shrink_dcache_anon(sb); |
235 | dput(root); | 235 | dput(root); |
236 | fsync_super(sb); | 236 | fsync_super(sb); |
237 | lock_super(sb); | 237 | lock_super(sb); |