diff options
author | Jan Kara <jack@suse.cz> | 2009-08-17 22:17:20 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-08-17 22:17:20 -0400 |
commit | 487caeef9fc08c0565e082c40a8aaf58dad92bbb (patch) | |
tree | 69920293cfe3a50bdbbf845be785350e7c203a2b | |
parent | 9599b0e597d810be9b8f759ea6e9619c4f983c5e (diff) |
ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks()
During truncate we are sometimes forced to start a new transaction as
the number of blocks to be journaled is both quite large and hard to
predict. So far we have restarted the transaction while holding
i_data_sem, which violates lock ordering because i_data_sem ranks below
a transaction start (and it can lead to a real deadlock with
ext4_get_blocks() mapping blocks in some page while having a
transaction open).
We fix the problem by dropping i_data_sem before restarting the
transaction and reacquiring it afterwards. It is slightly subtle why
this works:
1) By the time ext4_truncate() is called, all the page cache for the
truncated part of the file has been dropped, so get_block() should not
be called on it (we only have to invalidate the extent cache after we
reacquire i_data_sem, because some extent from the non-truncated part
could also extend into the part we are going to truncate).
2) Writes, migrate and defrag hold i_mutex, so they are blocked for the
whole duration of the truncate.
This bug was found and analyzed by Theodore Ts'o <tytso@mit.edu>.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | fs/ext4/ext4.h | 1 | ||||
-rw-r--r-- | fs/ext4/extents.c | 15 | ||||
-rw-r--r-- | fs/ext4/inode.c | 23 |
3 files changed, 32 insertions, 7 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2e9a2036c114..fb21663ffe54 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); | |||
1370 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | 1370 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
1371 | extern int ext4_can_truncate(struct inode *inode); | 1371 | extern int ext4_can_truncate(struct inode *inode); |
1372 | extern void ext4_truncate(struct inode *); | 1372 | extern void ext4_truncate(struct inode *); |
1373 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | ||
1373 | extern void ext4_set_inode_flags(struct inode *); | 1374 | extern void ext4_set_inode_flags(struct inode *); |
1374 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 1375 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
1375 | extern int ext4_alloc_da_blocks(struct inode *inode); | 1376 | extern int ext4_alloc_da_blocks(struct inode *inode); |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 13db43408533..8c20caf4aa5c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | |||
93 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | 93 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
94 | } | 94 | } |
95 | 95 | ||
96 | static int ext4_ext_journal_restart(handle_t *handle, int needed) | 96 | static int ext4_ext_truncate_extend_restart(handle_t *handle, |
97 | struct inode *inode, | ||
98 | int needed) | ||
97 | { | 99 | { |
98 | int err; | 100 | int err; |
99 | 101 | ||
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) | |||
104 | err = ext4_journal_extend(handle, needed); | 106 | err = ext4_journal_extend(handle, needed); |
105 | if (err <= 0) | 107 | if (err <= 0) |
106 | return err; | 108 | return err; |
107 | return ext4_journal_restart(handle, needed); | 109 | err = ext4_truncate_restart_trans(handle, inode, needed); |
110 | /* | ||
111 | * We have dropped i_data_sem so someone might have cached again | ||
112 | * an extent we are going to truncate. | ||
113 | */ | ||
114 | ext4_ext_invalidate_cache(inode); | ||
115 | |||
116 | return err; | ||
108 | } | 117 | } |
109 | 118 | ||
110 | /* | 119 | /* |
@@ -2150,7 +2159,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2150 | } | 2159 | } |
2151 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | 2160 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
2152 | 2161 | ||
2153 | err = ext4_ext_journal_restart(handle, credits); | 2162 | err = ext4_ext_truncate_extend_restart(handle, inode, credits); |
2154 | if (err) | 2163 | if (err) |
2155 | goto out; | 2164 | goto out; |
2156 | 2165 | ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9a4c929b16dc..d61fb523308f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | |||
192 | * so before we call here everything must be consistently dirtied against | 192 | * so before we call here everything must be consistently dirtied against |
193 | * this transaction. | 193 | * this transaction. |
194 | */ | 194 | */ |
195 | static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | 195 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
196 | int nblocks) | ||
196 | { | 197 | { |
198 | int ret; | ||
199 | |||
200 | /* | ||
201 | * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this | ||
202 | * moment, get_block can be called only for blocks inside i_size since | ||
203 | * page cache has been already dropped and writes are blocked by | ||
204 | * i_mutex. So we can safely drop the i_data_sem here. | ||
205 | */ | ||
197 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | 206 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
198 | jbd_debug(2, "restarting handle %p\n", handle); | 207 | jbd_debug(2, "restarting handle %p\n", handle); |
199 | return ext4_journal_restart(handle, blocks_for_truncate(inode)); | 208 | up_write(&EXT4_I(inode)->i_data_sem); |
209 | ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); | ||
210 | down_write(&EXT4_I(inode)->i_data_sem); | ||
211 | |||
212 | return ret; | ||
200 | } | 213 | } |
201 | 214 | ||
202 | /* | 215 | /* |
@@ -3658,7 +3671,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
3658 | ext4_handle_dirty_metadata(handle, inode, bh); | 3671 | ext4_handle_dirty_metadata(handle, inode, bh); |
3659 | } | 3672 | } |
3660 | ext4_mark_inode_dirty(handle, inode); | 3673 | ext4_mark_inode_dirty(handle, inode); |
3661 | ext4_journal_test_restart(handle, inode); | 3674 | ext4_truncate_restart_trans(handle, inode, |
3675 | blocks_for_truncate(inode)); | ||
3662 | if (bh) { | 3676 | if (bh) { |
3663 | BUFFER_TRACE(bh, "retaking write access"); | 3677 | BUFFER_TRACE(bh, "retaking write access"); |
3664 | ext4_journal_get_write_access(handle, bh); | 3678 | ext4_journal_get_write_access(handle, bh); |
@@ -3869,7 +3883,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3869 | return; | 3883 | return; |
3870 | if (try_to_extend_transaction(handle, inode)) { | 3884 | if (try_to_extend_transaction(handle, inode)) { |
3871 | ext4_mark_inode_dirty(handle, inode); | 3885 | ext4_mark_inode_dirty(handle, inode); |
3872 | ext4_journal_test_restart(handle, inode); | 3886 | ext4_truncate_restart_trans(handle, inode, |
3887 | blocks_for_truncate(inode)); | ||
3873 | } | 3888 | } |
3874 | 3889 | ||
3875 | ext4_free_blocks(handle, inode, nr, 1, 1); | 3890 | ext4_free_blocks(handle, inode, nr, 1, 1); |