author     Aditya Kali <adityakali@google.com>    2011-09-09 19:20:51 -0400
committer  Theodore Ts'o <tytso@mit.edu>          2011-09-09 19:20:51 -0400
commit     5356f2615cd558c57a1f7d7528d1ad4de3640d96 (patch)
tree       e3590bf14d9a21c4eb365105886382bfb1131b95 /fs/ext4/extents.c
parent     d8990240d8c911064447f8aa5a440f9345a6d692 (diff)
ext4: attempt to fix race in bigalloc code path
Currently, there exists a race between delayed allocated writes and writeback when the bigalloc feature is in use. The race arises because we want to determine which blocks in a cluster are under delayed allocation, and we were using the buffer_delay(bh) check for it. But the writeback codepath clears this bit without any synchronization, which resulted in a race and an ext4 warning similar to:

EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0 reserved data blocks

The race existed in two places:
(1) between ext4_find_delalloc_range() and ext4_map_blocks() when called from the writeback code path, and
(2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where buffer_delay(bh) is set).

To fix (1), this patch introduces a new buffer_head state bit, BH_Da_Mapped. This bit is set under the protection of EXT4_I(inode)->i_data_sem when we have actually mapped the delayed allocated blocks at writeout time. We can now reliably check for this bit inside ext4_find_delalloc_range() to determine whether the reservation for the blocks has already been claimed or not.

To fix (2), it was necessary to set buffer_delay(bh) under the protection of i_data_sem. So, I extracted the very beginning of ext4_map_blocks() into a new function, ext4_da_map_blocks(), and performed the required setting of the BH_Delay bit and the quota reservation under the protection of i_data_sem. These two fixes make the checking of buffer_delay(bh) and buffer_da_mapped(bh) consistent, thus removing the race.

Tested: I was able to reproduce the problem by running 'dd' and 'fsync' in parallel. xfstests also used to reproduce this race occasionally. After the fix, both my test and xfstests were successful and no race (warning message) was observed.

Google-Bug-Id: 4997027

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
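Note: the diff on this page is limited to fs/ext4/extents.c, so the definition of the buffer_da_mapped() test used in the hunks below is not shown here. As a sketch of the ext4.h side of the change (hedged, since that hunk is outside this diffstat), the new bit would be appended to ext4's private buffer_head state bits, which start at BH_JBDPrivateStart, with accessors generated by the stock BUFFER_FNS() macro from <linux/buffer_head.h>:

	enum ext4_state_bits {
		BH_Uninit	/* blocks are allocated but uninitialized on disk */
		  = BH_JBDPrivateStart,
		BH_Da_Mapped,	/* delayed allocated blocks that now have a mapping */
	};

	BUFFER_FNS(Da_Mapped, da_mapped)

BUFFER_FNS(Da_Mapped, da_mapped) expands to buffer_da_mapped(), set_buffer_da_mapped() and clear_buffer_da_mapped(), which atomically test, set and clear the bit in bh->b_state.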
Diffstat (limited to 'fs/ext4/extents.c')
-rw-r--r--   fs/ext4/extents.c   38
1 file changed, 12 insertions(+), 26 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9b119308dae..ad39627c1fb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3296,28 +3296,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
 
 	while ((i >= lblk_start) && (i <= lblk_end)) {
 		page = find_get_page(mapping, index);
-		if (!page || !PageDirty(page))
+		if (!page)
 			goto nextpage;
 
-		if (PageWriteback(page)) {
-			/*
-			 * This might be a race with allocation and writeout. In
-			 * this case we just assume that the rest of the range
-			 * will eventually be written and there wont be any
-			 * delalloc blocks left.
-			 * TODO: the above assumption is troublesome, but might
-			 * work better in practice. other option could be note
-			 * somewhere that the cluster is getting written out and
-			 * detect that here.
-			 */
-			page_cache_release(page);
-			trace_ext4_find_delalloc_range(inode,
-					lblk_start, lblk_end,
-					search_hint_reverse,
-					0, i);
-			return 0;
-		}
-
 		if (!page_has_buffers(page))
 			goto nextpage;
 
@@ -3340,7 +3321,11 @@ static int ext4_find_delalloc_range(struct inode *inode,
 			continue;
 		}
 
-		if (buffer_delay(bh)) {
+		/* Check if the buffer is delayed allocated and that it
+		 * is not yet mapped. (when da-buffers are mapped during
+		 * their writeout, their da_mapped bit is set.)
+		 */
+		if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
 			page_cache_release(page);
 			trace_ext4_find_delalloc_range(inode,
 					lblk_start, lblk_end,
@@ -4106,6 +4091,7 @@ got_allocated_blocks:
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 							1);
 		if (reserved_clusters < allocated_clusters) {
+			struct ext4_inode_info *ei = EXT4_I(inode);
 			int reservation = allocated_clusters -
 						reserved_clusters;
 			/*
@@ -4148,11 +4134,11 @@ got_allocated_blocks:
 				 * remaining blocks finally gets written, we
 				 * could claim them.
 				 */
-				while (reservation) {
-					ext4_da_reserve_space(inode,
-							map->m_lblk);
-					reservation--;
-				}
+				dquot_reserve_block(inode,
+						EXT4_C2B(sbi, reservation));
+				spin_lock(&ei->i_block_reservation_lock);
+				ei->i_reserved_data_blocks += reservation;
+				spin_unlock(&ei->i_block_reservation_lock);
 			}
 		}
 	}
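The hunks above only show the consumer side of BH_Da_Mapped; the producer side lives in fs/ext4/inode.c, which is outside this diffstat. A minimal sketch of that side, consistent with the commit message but with an illustrative helper name (set_buffers_da_mapped() is an assumption, not quoted from this page): once ext4_map_blocks() has mapped a delayed-allocated range during writeout, every buffer head in the mapped range gets its da_mapped bit set while i_data_sem is still held for write, so ext4_find_delalloc_range() sees buffer_delay() and buffer_da_mapped() consistently.

	/* Illustrative helper (assumed shape; the actual inode.c hunk is not
	 * shown on this page): walk the page cache over the just-mapped
	 * logical block range and set BH_Da_Mapped on each buffer head.
	 * Must be called with EXT4_I(inode)->i_data_sem held for write.
	 */
	static void set_buffers_da_mapped(struct inode *inode,
					  struct ext4_map_blocks *map)
	{
		struct address_space *mapping = inode->i_mapping;
		struct pagevec pvec;
		int i, nr_pages;
		pgoff_t index, end;

		index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
		end = (map->m_lblk + map->m_len - 1) >>
			(PAGE_CACHE_SHIFT - inode->i_blkbits);

		pagevec_init(&pvec, 0);
		while (index <= end) {
			nr_pages = pagevec_lookup(&pvec, mapping, index,
						  min(end - index + 1,
						      (pgoff_t)PAGEVEC_SIZE));
			if (nr_pages == 0)
				break;
			for (i = 0; i < nr_pages; i++) {
				struct page *page = pvec.pages[i];
				struct buffer_head *bh, *head;

				if (unlikely(page->mapping != mapping) ||
				    !PageDirty(page))
					break;

				if (page_has_buffers(page)) {
					bh = head = page_buffers(page);
					do {
						set_buffer_da_mapped(bh);
						bh = bh->b_this_page;
					} while (bh != head);
				}
				index = page->index + 1;
			}
			pagevec_release(&pvec);
		}
	}

	/* In ext4_map_blocks(), after a successful allocation for a delalloc
	 * writeout and before dropping the semaphore (sketch): */
	if (retval > 0 && (map->m_flags & EXT4_MAP_MAPPED) &&
	    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
		set_buffers_da_mapped(inode, map);
	up_write(&EXT4_I(inode)->i_data_sem);

Setting the bit before i_data_sem is released is the whole point: ext4_find_delalloc_range() takes its reads under the same semaphore, so a delayed buffer can never appear both unmapped and already-claimed in a single walk.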