diff options
author | Josef Bacik <josef@toxicpanda.com> | 2018-10-12 15:32:32 -0400 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2018-10-19 06:20:04 -0400 |
commit | c495144bc6962186feae31d687596d2472000e45 (patch) | |
tree | c17e15559aed1a066fce1ee363017dc27977fd19 | |
parent | 30928e9baac238a7330085a1c5747f0b5df444b4 (diff) |
btrfs: move the dio_sem higher up the callchain
We're getting a lockdep splat because we take the dio_sem under the
log_mutex. What we really need is to protect fsync() from logging an
extent map for an extent we never waited on higher up, so just guard the
whole thing with dio_sem.
======================================================
WARNING: possible circular locking dependency detected
4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411 Not tainted
------------------------------------------------------
aio-dio-invalid/30928 is trying to acquire lock:
0000000092621cfd (&mm->mmap_sem){++++}, at: get_user_pages_unlocked+0x5a/0x1e0
but task is already holding lock:
00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #5 (&ei->dio_sem){++++}:
lock_acquire+0xbd/0x220
down_write+0x51/0xb0
btrfs_log_changed_extents+0x80/0xa40
btrfs_log_inode+0xbaf/0x1000
btrfs_log_inode_parent+0x26f/0xa80
btrfs_log_dentry_safe+0x50/0x70
btrfs_sync_file+0x357/0x540
do_fsync+0x38/0x60
__ia32_sys_fdatasync+0x12/0x20
do_fast_syscall_32+0x9a/0x2f0
entry_SYSENTER_compat+0x84/0x96
-> #4 (&ei->log_mutex){+.+.}:
lock_acquire+0xbd/0x220
__mutex_lock+0x86/0xa10
btrfs_record_unlink_dir+0x2a/0xa0
btrfs_unlink+0x5a/0xc0
vfs_unlink+0xb1/0x1a0
do_unlinkat+0x264/0x2b0
do_fast_syscall_32+0x9a/0x2f0
entry_SYSENTER_compat+0x84/0x96
-> #3 (sb_internal#2){.+.+}:
lock_acquire+0xbd/0x220
__sb_start_write+0x14d/0x230
start_transaction+0x3e6/0x590
btrfs_evict_inode+0x475/0x640
evict+0xbf/0x1b0
btrfs_run_delayed_iputs+0x6c/0x90
cleaner_kthread+0x124/0x1a0
kthread+0x106/0x140
ret_from_fork+0x3a/0x50
-> #2 (&fs_info->cleaner_delayed_iput_mutex){+.+.}:
lock_acquire+0xbd/0x220
__mutex_lock+0x86/0xa10
btrfs_alloc_data_chunk_ondemand+0x197/0x530
btrfs_check_data_free_space+0x4c/0x90
btrfs_delalloc_reserve_space+0x20/0x60
btrfs_page_mkwrite+0x87/0x520
do_page_mkwrite+0x31/0xa0
__handle_mm_fault+0x799/0xb00
handle_mm_fault+0x7c/0xe0
__do_page_fault+0x1d3/0x4a0
async_page_fault+0x1e/0x30
-> #1 (sb_pagefaults){.+.+}:
lock_acquire+0xbd/0x220
__sb_start_write+0x14d/0x230
btrfs_page_mkwrite+0x6a/0x520
do_page_mkwrite+0x31/0xa0
__handle_mm_fault+0x799/0xb00
handle_mm_fault+0x7c/0xe0
__do_page_fault+0x1d3/0x4a0
async_page_fault+0x1e/0x30
-> #0 (&mm->mmap_sem){++++}:
__lock_acquire+0x42e/0x7a0
lock_acquire+0xbd/0x220
down_read+0x48/0xb0
get_user_pages_unlocked+0x5a/0x1e0
get_user_pages_fast+0xa4/0x150
iov_iter_get_pages+0xc3/0x340
do_direct_IO+0xf93/0x1d70
__blockdev_direct_IO+0x32d/0x1c20
btrfs_direct_IO+0x227/0x400
generic_file_direct_write+0xcf/0x180
btrfs_file_write_iter+0x308/0x58c
aio_write+0xf8/0x1d0
io_submit_one+0x3a9/0x620
__ia32_compat_sys_io_submit+0xb2/0x270
do_int80_syscall_32+0x5b/0x1a0
entry_INT80_compat+0x88/0xa0
other info that might help us debug this:
Chain exists of:
&mm->mmap_sem --> &ei->log_mutex --> &ei->dio_sem
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&ei->dio_sem);
lock(&ei->log_mutex);
lock(&ei->dio_sem);
lock(&mm->mmap_sem);
*** DEADLOCK ***
1 lock held by aio-dio-invalid/30928:
#0: 00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400
stack backtrace:
CPU: 0 PID: 30928 Comm: aio-dio-invalid Not tainted 4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
Call Trace:
dump_stack+0x7c/0xbb
print_circular_bug.isra.37+0x297/0x2a4
check_prev_add.constprop.45+0x781/0x7a0
? __lock_acquire+0x42e/0x7a0
validate_chain.isra.41+0x7f0/0xb00
__lock_acquire+0x42e/0x7a0
lock_acquire+0xbd/0x220
? get_user_pages_unlocked+0x5a/0x1e0
down_read+0x48/0xb0
? get_user_pages_unlocked+0x5a/0x1e0
get_user_pages_unlocked+0x5a/0x1e0
get_user_pages_fast+0xa4/0x150
iov_iter_get_pages+0xc3/0x340
do_direct_IO+0xf93/0x1d70
? __alloc_workqueue_key+0x358/0x490
? __blockdev_direct_IO+0x14b/0x1c20
__blockdev_direct_IO+0x32d/0x1c20
? btrfs_run_delalloc_work+0x40/0x40
? can_nocow_extent+0x490/0x490
? kvm_clock_read+0x1f/0x30
? can_nocow_extent+0x490/0x490
? btrfs_run_delalloc_work+0x40/0x40
btrfs_direct_IO+0x227/0x400
? btrfs_run_delalloc_work+0x40/0x40
generic_file_direct_write+0xcf/0x180
btrfs_file_write_iter+0x308/0x58c
aio_write+0xf8/0x1d0
? kvm_clock_read+0x1f/0x30
? __might_fault+0x3e/0x90
io_submit_one+0x3a9/0x620
? io_submit_one+0xe5/0x620
__ia32_compat_sys_io_submit+0xb2/0x270
do_int80_syscall_32+0x5b/0x1a0
entry_INT80_compat+0x88/0xa0
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- | fs/btrfs/file.c | 12 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 2 |
2 files changed, 12 insertions, 2 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15b925142793..97c7a086f7bd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
2078 | goto out; | 2078 | goto out; |
2079 | 2079 | ||
2080 | inode_lock(inode); | 2080 | inode_lock(inode); |
2081 | |||
2082 | /* | ||
2083 | * We take the dio_sem here because the tree log stuff can race with | ||
2084 | * lockless dio writes and get an extent map logged for an extent we | ||
2085 | * never waited on. We need it this high up for lockdep reasons. | ||
2086 | */ | ||
2087 | down_write(&BTRFS_I(inode)->dio_sem); | ||
2088 | |||
2081 | atomic_inc(&root->log_batch); | 2089 | atomic_inc(&root->log_batch); |
2082 | 2090 | ||
2083 | /* | 2091 | /* |
@@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
2086 | */ | 2094 | */ |
2087 | ret = btrfs_wait_ordered_range(inode, start, len); | 2095 | ret = btrfs_wait_ordered_range(inode, start, len); |
2088 | if (ret) { | 2096 | if (ret) { |
2097 | up_write(&BTRFS_I(inode)->dio_sem); | ||
2089 | inode_unlock(inode); | 2098 | inode_unlock(inode); |
2090 | goto out; | 2099 | goto out; |
2091 | } | 2100 | } |
@@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
2109 | * checked called fsync. | 2118 | * checked called fsync. |
2110 | */ | 2119 | */ |
2111 | ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); | 2120 | ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); |
2121 | up_write(&BTRFS_I(inode)->dio_sem); | ||
2112 | inode_unlock(inode); | 2122 | inode_unlock(inode); |
2113 | goto out; | 2123 | goto out; |
2114 | } | 2124 | } |
@@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
2127 | trans = btrfs_start_transaction(root, 0); | 2137 | trans = btrfs_start_transaction(root, 0); |
2128 | if (IS_ERR(trans)) { | 2138 | if (IS_ERR(trans)) { |
2129 | ret = PTR_ERR(trans); | 2139 | ret = PTR_ERR(trans); |
2140 | up_write(&BTRFS_I(inode)->dio_sem); | ||
2130 | inode_unlock(inode); | 2141 | inode_unlock(inode); |
2131 | goto out; | 2142 | goto out; |
2132 | } | 2143 | } |
@@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
2148 | * file again, but that will end up using the synchronization | 2159 | * file again, but that will end up using the synchronization |
2149 | * inside btrfs_sync_log to keep things safe. | 2160 | * inside btrfs_sync_log to keep things safe. |
2150 | */ | 2161 | */ |
2162 | up_write(&BTRFS_I(inode)->dio_sem); | ||
2151 | inode_unlock(inode); | 2163 | inode_unlock(inode); |
2152 | 2164 | ||
2153 | /* | 2165 | /* |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7147d329ddbc..e07f3376b7df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
4390 | 4390 | ||
4391 | INIT_LIST_HEAD(&extents); | 4391 | INIT_LIST_HEAD(&extents); |
4392 | 4392 | ||
4393 | down_write(&inode->dio_sem); | ||
4394 | write_lock(&tree->lock); | 4393 | write_lock(&tree->lock); |
4395 | test_gen = root->fs_info->last_trans_committed; | 4394 | test_gen = root->fs_info->last_trans_committed; |
4396 | logged_start = start; | 4395 | logged_start = start; |
@@ -4456,7 +4455,6 @@ process: | |||
4456 | } | 4455 | } |
4457 | WARN_ON(!list_empty(&extents)); | 4456 | WARN_ON(!list_empty(&extents)); |
4458 | write_unlock(&tree->lock); | 4457 | write_unlock(&tree->lock); |
4459 | up_write(&inode->dio_sem); | ||
4460 | 4458 | ||
4461 | btrfs_release_path(path); | 4459 | btrfs_release_path(path); |
4462 | if (!ret) | 4460 | if (!ret) |