diff options
author | Theodore Ts'o <tytso@mit.edu> | 2012-09-19 22:42:36 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2012-09-19 22:42:36 -0400 |
commit | 00d4e7362ed01987183e9528295de3213031309c (patch) | |
tree | 84986a838284cf45e3134ebac52197b4e2d29590 | |
parent | 18888cf0883c286f238d44ee565530fe82752f06 (diff) |
ext4: fix potential deadlock in ext4_nonda_switch()
In ext4_nonda_switch(), if the file system is getting full we used to
call writeback_inodes_sb_if_idle(). The problem is that we can be
holding i_mutex already, and this causes a potential deadlock when
writeback_inodes_sb_if_idle() when it tries to take s_umount. (See
lockdep output below).
As it turns out we don't need need to hold s_umount; the fact that we
are in the middle of the write(2) system call will keep the superblock
pinned. Unfortunately writeback_inodes_sb() checks to make sure
s_umount is taken, and the VFS uses a different mechanism for making
sure the file system doesn't get unmounted out from under us. The
simplest way of dealing with this is to just simply grab s_umount
using a trylock, and skip kicking the writeback flusher thread in the
very unlikely case that we can't take a read lock on s_umount without
blocking.
Also, we now check the cirteria for kicking the writeback thread
before we decide to whether to fall back to non-delayed writeback, so
if there are any outstanding delayed allocation writes, we try to get
them resolved as soon as possible.
[ INFO: possible circular locking dependency detected ]
3.6.0-rc1-00042-gce894ca #367 Not tainted
-------------------------------------------------------
dd/8298 is trying to acquire lock:
(&type->s_umount_key#18){++++..}, at: [<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46
but task is already holding lock:
(&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3
which lock already depends on the new lock.
2 locks held by dd/8298:
#0: (sb_writers#2){.+.+.+}, at: [<c01ddcc5>] generic_file_aio_write+0x56/0xd3
#1: (&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3
stack backtrace:
Pid: 8298, comm: dd Not tainted 3.6.0-rc1-00042-gce894ca #367
Call Trace:
[<c015b79c>] ? console_unlock+0x345/0x372
[<c06d62a1>] print_circular_bug+0x190/0x19d
[<c019906c>] __lock_acquire+0x86d/0xb6c
[<c01999db>] ? mark_held_locks+0x5c/0x7b
[<c0199724>] lock_acquire+0x66/0xb9
[<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46
[<c06db935>] down_read+0x28/0x58
[<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46
[<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46
[<c026f3b2>] ext4_nonda_switch+0xe1/0xf4
[<c0271ece>] ext4_da_write_begin+0x27/0x193
[<c01dcdb0>] generic_file_buffered_write+0xc8/0x1bb
[<c01ddc47>] __generic_file_aio_write+0x1dd/0x205
[<c01ddce7>] generic_file_aio_write+0x78/0xd3
[<c026d336>] ext4_file_write+0x480/0x4a6
[<c0198c1d>] ? __lock_acquire+0x41e/0xb6c
[<c0180944>] ? sched_clock_cpu+0x11a/0x13e
[<c01967e9>] ? trace_hardirqs_off+0xb/0xd
[<c018099f>] ? local_clock+0x37/0x4e
[<c0209f2c>] do_sync_write+0x67/0x9d
[<c0209ec5>] ? wait_on_retry_sync_kiocb+0x44/0x44
[<c020a7b9>] vfs_write+0x7b/0xe6
[<c020a9a6>] sys_write+0x3b/0x64
[<c06dd4bd>] syscall_call+0x7/0xb
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
-rw-r--r-- | fs/ext4/inode.c | 17 | ||||
-rw-r--r-- | fs/fs-writeback.c | 1 |
2 files changed, 11 insertions, 7 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ca76b5ed6c9e..0a31197590d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2462,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2462 | free_blocks = EXT4_C2B(sbi, | 2462 | free_blocks = EXT4_C2B(sbi, |
2463 | percpu_counter_read_positive(&sbi->s_freeclusters_counter)); | 2463 | percpu_counter_read_positive(&sbi->s_freeclusters_counter)); |
2464 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | 2464 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); |
2465 | /* | ||
2466 | * Start pushing delalloc when 1/2 of free blocks are dirty. | ||
2467 | */ | ||
2468 | if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && | ||
2469 | !writeback_in_progress(sb->s_bdi) && | ||
2470 | down_read_trylock(&sb->s_umount)) { | ||
2471 | writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); | ||
2472 | up_read(&sb->s_umount); | ||
2473 | } | ||
2474 | |||
2465 | if (2 * free_blocks < 3 * dirty_blocks || | 2475 | if (2 * free_blocks < 3 * dirty_blocks || |
2466 | free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { | 2476 | free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { |
2467 | /* | 2477 | /* |
@@ -2470,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2470 | */ | 2480 | */ |
2471 | return 1; | 2481 | return 1; |
2472 | } | 2482 | } |
2473 | /* | ||
2474 | * Even if we don't switch but are nearing capacity, | ||
2475 | * start pushing delalloc when 1/2 of free blocks are dirty. | ||
2476 | */ | ||
2477 | if (free_blocks < 2 * dirty_blocks) | ||
2478 | writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); | ||
2479 | |||
2480 | return 0; | 2483 | return 0; |
2481 | } | 2484 | } |
2482 | 2485 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be3efc4f64f4..5602d73d4ec7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) | |||
63 | { | 63 | { |
64 | return test_bit(BDI_writeback_running, &bdi->state); | 64 | return test_bit(BDI_writeback_running, &bdi->state); |
65 | } | 65 | } |
66 | EXPORT_SYMBOL(writeback_in_progress); | ||
66 | 67 | ||
67 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) | 68 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) |
68 | { | 69 | { |