aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2009-08-17 22:17:20 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2009-08-17 22:17:20 -0400
commit: 487caeef9fc08c0565e082c40a8aaf58dad92bbb (patch)
tree: 69920293cfe3a50bdbbf845be785350e7c203a2b
parent: 9599b0e597d810be9b8f759ea6e9619c4f983c5e (diff)
ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks()
During truncate we are sometimes forced to start a new transaction, as the number of blocks to be journaled is both quite large and hard to predict. So far we restarted a transaction while holding i_data_sem, and that violates lock ordering because i_data_sem ranks below a transaction start (and it can lead to a real deadlock with ext4_get_blocks() mapping blocks in some page while having a transaction open).

We fix the problem by dropping i_data_sem before restarting the transaction and reacquiring it afterwards. It's slightly subtle that this works:

1) By the time ext4_truncate() is called, all the page cache for the truncated part of the file is dropped, so get_block() should not be called on it (we only have to invalidate the extent cache after we reacquire i_data_sem, because some extent from the not-truncated part could also extend into the part we are going to truncate).

2) Writes, migrate or defrag hold i_mutex, so they are stopped for the whole duration of the truncate.

This bug has been found and analyzed by Theodore Tso <tytso@mit.edu>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- fs/ext4/ext4.h 1
-rw-r--r-- fs/ext4/extents.c 15
-rw-r--r-- fs/ext4/inode.c 23
3 files changed, 32 insertions, 7 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2e9a2036c114..fb21663ffe54 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1370extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1370extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1371extern int ext4_can_truncate(struct inode *inode); 1371extern int ext4_can_truncate(struct inode *inode);
1372extern void ext4_truncate(struct inode *); 1372extern void ext4_truncate(struct inode *);
1373extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1373extern void ext4_set_inode_flags(struct inode *); 1374extern void ext4_set_inode_flags(struct inode *);
1374extern void ext4_get_inode_flags(struct ext4_inode_info *); 1375extern void ext4_get_inode_flags(struct ext4_inode_info *);
1375extern int ext4_alloc_da_blocks(struct inode *inode); 1376extern int ext4_alloc_da_blocks(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 13db43408533..8c20caf4aa5c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
104 err = ext4_journal_extend(handle, needed); 106 err = ext4_journal_extend(handle, needed);
105 if (err <= 0) 107 if (err <= 0)
106 return err; 108 return err;
107 return ext4_journal_restart(handle, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /*
111 * We have dropped i_data_sem so someone might have cached again
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115
116 return err;
108} 117}
109 118
110/* 119/*
@@ -2150,7 +2159,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2150 } 2159 }
2151 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2160 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2152 2161
2153 err = ext4_ext_journal_restart(handle, credits); 2162 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2154 if (err) 2163 if (err)
2155 goto out; 2164 goto out;
2156 2165
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9a4c929b16dc..d61fb523308f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 192 * so before we call here everything must be consistently dirtied against
193 * this transaction. 193 * this transaction.
194 */ 194 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks)
196{ 197{
198 int ret;
199
200 /*
201 * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
202 * moment, get_block can be called only for blocks inside i_size since
203 * page cache has been already dropped and writes are blocked by
204 * i_mutex. So we can safely drop the i_data_sem here.
205 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 206 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 207 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 208 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem);
211
212 return ret;
200} 213}
201 214
202/* 215/*
@@ -3658,7 +3671,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3658 ext4_handle_dirty_metadata(handle, inode, bh); 3671 ext4_handle_dirty_metadata(handle, inode, bh);
3659 } 3672 }
3660 ext4_mark_inode_dirty(handle, inode); 3673 ext4_mark_inode_dirty(handle, inode);
3661 ext4_journal_test_restart(handle, inode); 3674 ext4_truncate_restart_trans(handle, inode,
3675 blocks_for_truncate(inode));
3662 if (bh) { 3676 if (bh) {
3663 BUFFER_TRACE(bh, "retaking write access"); 3677 BUFFER_TRACE(bh, "retaking write access");
3664 ext4_journal_get_write_access(handle, bh); 3678 ext4_journal_get_write_access(handle, bh);
@@ -3869,7 +3883,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3869 return; 3883 return;
3870 if (try_to_extend_transaction(handle, inode)) { 3884 if (try_to_extend_transaction(handle, inode)) {
3871 ext4_mark_inode_dirty(handle, inode); 3885 ext4_mark_inode_dirty(handle, inode);
3872 ext4_journal_test_restart(handle, inode); 3886 ext4_truncate_restart_trans(handle, inode,
3887 blocks_for_truncate(inode));
3873 } 3888 }
3874 3889
3875 ext4_free_blocks(handle, inode, nr, 1, 1); 3890 ext4_free_blocks(handle, inode, nr, 1, 1);