aboutsummaryrefslogtreecommitdiffstats
path: root/fs/inode.c
diff options
context:
space:
mode:
author: Nick Piggin <npiggin@suse.de> 2009-09-22 19:43:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-09-23 10:39:29 -0400
commit: 88e0fbc452ed94393bf89585c2b90edb94749b45 (patch)
tree: 374b7c8f397609da7a7b22ddee77a9c70956fcb1 /fs/inode.c
parent: 70867453092297be9afb2249e712a1f960ec0a09 (diff)
fs: turn iprune_mutex into rwsem
We have had a report of bad memory allocation latency during DVD-RAM (UDF) writing. This is causing the user's desktop session to become unusable.

Jan tracked the cause of this down to UDF inode reclaim blocking:

gnome-screens D ffff810006d1d598 0 20686 1
 ffff810006d1d508 0000000000000082 ffff810037db6718 0000000000000800
 ffff810006d1d488 ffffffff807e4280 ffffffff807e4280 ffff810006d1a580
 ffff8100bccbc140 ffff810006d1a8c0 0000000006d1d4e8 ffff810006d1a8c0
Call Trace:
 [<ffffffff804477f3>] io_schedule+0x63/0xa5
 [<ffffffff802c2587>] sync_buffer+0x3b/0x3f
 [<ffffffff80447d2a>] __wait_on_bit+0x47/0x79
 [<ffffffff80447dc6>] out_of_line_wait_on_bit+0x6a/0x77
 [<ffffffff802c24f6>] __wait_on_buffer+0x1f/0x21
 [<ffffffff802c442a>] __bread+0x70/0x86
 [<ffffffff88de9ec7>] :udf:udf_tread+0x38/0x3a
 [<ffffffff88de0fcf>] :udf:udf_update_inode+0x4d/0x68c
 [<ffffffff88de26e1>] :udf:udf_write_inode+0x1d/0x2b
 [<ffffffff802bcf85>] __writeback_single_inode+0x1c0/0x394
 [<ffffffff802bd205>] write_inode_now+0x7d/0xc4
 [<ffffffff88de2e76>] :udf:udf_clear_inode+0x3d/0x53
 [<ffffffff802b39ae>] clear_inode+0xc2/0x11b
 [<ffffffff802b3ab1>] dispose_list+0x5b/0x102
 [<ffffffff802b3d35>] shrink_icache_memory+0x1dd/0x213
 [<ffffffff8027ede3>] shrink_slab+0xe3/0x158
 [<ffffffff8027fbab>] try_to_free_pages+0x177/0x232
 [<ffffffff8027a578>] __alloc_pages+0x1fa/0x392
 [<ffffffff802951fa>] alloc_page_vma+0x176/0x189
 [<ffffffff802822d8>] __do_fault+0x10c/0x417
 [<ffffffff80284232>] handle_mm_fault+0x466/0x940
 [<ffffffff8044b922>] do_page_fault+0x676/0xabf

This blocks with iprune_mutex held, which then blocks other reclaimers:

X D ffff81009d47c400 0 17285 14831
 ffff8100844f3728 0000000000000086 0000000000000000 ffff81000000e288
 ffff81000000da00 ffffffff807e4280 ffffffff807e4280 ffff81009d47c400
 ffffffff805ff890 ffff81009d47c740 00000000844f3808 ffff81009d47c740
Call Trace:
 [<ffffffff80447f8c>] __mutex_lock_slowpath+0x72/0xa9
 [<ffffffff80447e1a>] mutex_lock+0x1e/0x22
 [<ffffffff802b3ba1>] shrink_icache_memory+0x49/0x213
 [<ffffffff8027ede3>] shrink_slab+0xe3/0x158
 [<ffffffff8027fbab>] try_to_free_pages+0x177/0x232
 [<ffffffff8027a578>] __alloc_pages+0x1fa/0x392
 [<ffffffff8029507f>] alloc_pages_current+0xd1/0xd6
 [<ffffffff80279ac0>] __get_free_pages+0xe/0x4d
 [<ffffffff802ae1b7>] __pollwait+0x5e/0xdf
 [<ffffffff8860f2b4>] :nvidia:nv_kern_poll+0x2e/0x73
 [<ffffffff802ad949>] do_select+0x308/0x506
 [<ffffffff802adced>] core_sys_select+0x1a6/0x254
 [<ffffffff802ae0b7>] sys_select+0xb5/0x157

Now I think the main problem is having the filesystem block (and do IO) in inode reclaim. The problem is that this doesn't get accounted well and penalizes a random allocator with a big latency spike caused by work generated from elsewhere.

I think the best idea would be to avoid this. By design if possible, or by deferring the hard work to an asynchronous context. If the latter, then the fs would probably want to throttle creation of new work with queue size of the deferred work, but let's not get into those details.

Anyway, the other obvious thing we looked at is the iprune_mutex which is causing the cascading blocking. We could turn this into an rwsem to improve concurrency. It is unreasonable to totally ban all potentially slow or blocking operations in inode reclaim, so I think this is a cheap way to get a small improvement.

This doesn't solve the whole problem of course. The process doing inode reclaim will still take the latency hit, and concurrent processes may end up contending on filesystem locks. So fs developers should keep these problems in mind.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jan Kara <jack@ucw.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/inode.c')
-rw-r--r-- fs/inode.c | 19
1 files changed, 12 insertions, 7 deletions
diff --git a/fs/inode.c b/fs/inode.c
index f5ff71cb3e2b..76582b06ab97 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/rwsem.h>
17#include <linux/hash.h> 18#include <linux/hash.h>
18#include <linux/swap.h> 19#include <linux/swap.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly;
87DEFINE_SPINLOCK(inode_lock); 88DEFINE_SPINLOCK(inode_lock);
88 89
89/* 90/*
90 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages 91 * iprune_sem provides exclusion between the kswapd or try_to_free_pages
91 * icache shrinking path, and the umount path. Without this exclusion, 92 * icache shrinking path, and the umount path. Without this exclusion,
92 * by the time prune_icache calls iput for the inode whose pages it has 93 * by the time prune_icache calls iput for the inode whose pages it has
93 * been invalidating, or by the time it calls clear_inode & destroy_inode 94 * been invalidating, or by the time it calls clear_inode & destroy_inode
94 * from its final dispose_list, the struct super_block they refer to 95 * from its final dispose_list, the struct super_block they refer to
95 * (for inode->i_sb->s_op) may already have been freed and reused. 96 * (for inode->i_sb->s_op) may already have been freed and reused.
97 *
98 * We make this an rwsem because the fastpath is icache shrinking. In
99 * some cases a filesystem may be doing a significant amount of work in
100 * its inode reclaim code, so this should improve parallelism.
96 */ 101 */
97static DEFINE_MUTEX(iprune_mutex); 102static DECLARE_RWSEM(iprune_sem);
98 103
99/* 104/*
100 * Statistics gathering.. 105 * Statistics gathering..
@@ -381,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
381 /* 386 /*
382 * We can reschedule here without worrying about the list's 387 * We can reschedule here without worrying about the list's
383 * consistency because the per-sb list of inodes must not 388 * consistency because the per-sb list of inodes must not
384 * change during umount anymore, and because iprune_mutex keeps 389 * change during umount anymore, and because iprune_sem keeps
385 * shrink_icache_memory() away. 390 * shrink_icache_memory() away.
386 */ 391 */
387 cond_resched_lock(&inode_lock); 392 cond_resched_lock(&inode_lock);
@@ -420,7 +425,7 @@ int invalidate_inodes(struct super_block *sb)
420 int busy; 425 int busy;
421 LIST_HEAD(throw_away); 426 LIST_HEAD(throw_away);
422 427
423 mutex_lock(&iprune_mutex); 428 down_write(&iprune_sem);
424 spin_lock(&inode_lock); 429 spin_lock(&inode_lock);
425 inotify_unmount_inodes(&sb->s_inodes); 430 inotify_unmount_inodes(&sb->s_inodes);
426 fsnotify_unmount_inodes(&sb->s_inodes); 431 fsnotify_unmount_inodes(&sb->s_inodes);
@@ -428,7 +433,7 @@ int invalidate_inodes(struct super_block *sb)
428 spin_unlock(&inode_lock); 433 spin_unlock(&inode_lock);
429 434
430 dispose_list(&throw_away); 435 dispose_list(&throw_away);
431 mutex_unlock(&iprune_mutex); 436 up_write(&iprune_sem);
432 437
433 return busy; 438 return busy;
434} 439}
@@ -467,7 +472,7 @@ static void prune_icache(int nr_to_scan)
467 int nr_scanned; 472 int nr_scanned;
468 unsigned long reap = 0; 473 unsigned long reap = 0;
469 474
470 mutex_lock(&iprune_mutex); 475 down_read(&iprune_sem);
471 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
472 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 477 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
473 struct inode *inode; 478 struct inode *inode;
@@ -509,7 +514,7 @@ static void prune_icache(int nr_to_scan)
509 spin_unlock(&inode_lock); 514 spin_unlock(&inode_lock);
510 515
511 dispose_list(&freeable); 516 dispose_list(&freeable);
512 mutex_unlock(&iprune_mutex); 517 up_read(&iprune_sem);
513} 518}
514 519
515/* 520/*