aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2012-09-19 22:42:36 -0400
committerTheodore Ts'o <tytso@mit.edu>2012-09-19 22:42:36 -0400
commit00d4e7362ed01987183e9528295de3213031309c (patch)
tree84986a838284cf45e3134ebac52197b4e2d29590
parent18888cf0883c286f238d44ee565530fe82752f06 (diff)
ext4: fix potential deadlock in ext4_nonda_switch()
In ext4_nonda_switch(), if the file system is getting full we used to call writeback_inodes_sb_if_idle(). The problem is that we can be holding i_mutex already, and this causes a potential deadlock when writeback_inodes_sb_if_idle() when it tries to take s_umount. (See lockdep output below). As it turns out we don't need need to hold s_umount; the fact that we are in the middle of the write(2) system call will keep the superblock pinned. Unfortunately writeback_inodes_sb() checks to make sure s_umount is taken, and the VFS uses a different mechanism for making sure the file system doesn't get unmounted out from under us. The simplest way of dealing with this is to just simply grab s_umount using a trylock, and skip kicking the writeback flusher thread in the very unlikely case that we can't take a read lock on s_umount without blocking. Also, we now check the cirteria for kicking the writeback thread before we decide to whether to fall back to non-delayed writeback, so if there are any outstanding delayed allocation writes, we try to get them resolved as soon as possible. [ INFO: possible circular locking dependency detected ] 3.6.0-rc1-00042-gce894ca #367 Not tainted ------------------------------------------------------- dd/8298 is trying to acquire lock: (&type->s_umount_key#18){++++..}, at: [<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46 but task is already holding lock: (&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3 which lock already depends on the new lock. 2 locks held by dd/8298: #0: (sb_writers#2){.+.+.+}, at: [<c01ddcc5>] generic_file_aio_write+0x56/0xd3 #1: (&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3 stack backtrace: Pid: 8298, comm: dd Not tainted 3.6.0-rc1-00042-gce894ca #367 Call Trace: [<c015b79c>] ? console_unlock+0x345/0x372 [<c06d62a1>] print_circular_bug+0x190/0x19d [<c019906c>] __lock_acquire+0x86d/0xb6c [<c01999db>] ? mark_held_locks+0x5c/0x7b [<c0199724>] lock_acquire+0x66/0xb9 [<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46 [<c06db935>] down_read+0x28/0x58 [<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46 [<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46 [<c026f3b2>] ext4_nonda_switch+0xe1/0xf4 [<c0271ece>] ext4_da_write_begin+0x27/0x193 [<c01dcdb0>] generic_file_buffered_write+0xc8/0x1bb [<c01ddc47>] __generic_file_aio_write+0x1dd/0x205 [<c01ddce7>] generic_file_aio_write+0x78/0xd3 [<c026d336>] ext4_file_write+0x480/0x4a6 [<c0198c1d>] ? __lock_acquire+0x41e/0xb6c [<c0180944>] ? sched_clock_cpu+0x11a/0x13e [<c01967e9>] ? trace_hardirqs_off+0xb/0xd [<c018099f>] ? local_clock+0x37/0x4e [<c0209f2c>] do_sync_write+0x67/0x9d [<c0209ec5>] ? wait_on_retry_sync_kiocb+0x44/0x44 [<c020a7b9>] vfs_write+0x7b/0xe6 [<c020a9a6>] sys_write+0x3b/0x64 [<c06dd4bd>] syscall_call+0x7/0xb Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: stable@vger.kernel.org
-rw-r--r--fs/ext4/inode.c17
-rw-r--r--fs/fs-writeback.c1
2 files changed, 11 insertions, 7 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ca76b5ed6c9e..0a31197590d7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2462,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
2462 free_blocks = EXT4_C2B(sbi, 2462 free_blocks = EXT4_C2B(sbi,
2463 percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2463 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2464 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2464 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2465 /*
2466 * Start pushing delalloc when 1/2 of free blocks are dirty.
2467 */
2468 if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
2469 !writeback_in_progress(sb->s_bdi) &&
2470 down_read_trylock(&sb->s_umount)) {
2471 writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
2472 up_read(&sb->s_umount);
2473 }
2474
2465 if (2 * free_blocks < 3 * dirty_blocks || 2475 if (2 * free_blocks < 3 * dirty_blocks ||
2466 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2476 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2467 /* 2477 /*
@@ -2470,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
2470 */ 2480 */
2471 return 1; 2481 return 1;
2472 } 2482 }
2473 /*
2474 * Even if we don't switch but are nearing capacity,
2475 * start pushing delalloc when 1/2 of free blocks are dirty.
2476 */
2477 if (free_blocks < 2 * dirty_blocks)
2478 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2479
2480 return 0; 2483 return 0;
2481} 2484}
2482 2485
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be3efc4f64f4..5602d73d4ec7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
63{ 63{
64 return test_bit(BDI_writeback_running, &bdi->state); 64 return test_bit(BDI_writeback_running, &bdi->state);
65} 65}
66EXPORT_SYMBOL(writeback_in_progress);
66 67
67static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 68static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
68{ 69{