author		Aditya Kali <adityakali@google.com>	2011-09-09 19:20:51 -0400
committer	Theodore Ts'o <tytso@mit.edu>		2011-09-09 19:20:51 -0400
commit		5356f2615cd558c57a1f7d7528d1ad4de3640d96 (patch)
tree		e3590bf14d9a21c4eb365105886382bfb1131b95
parent		d8990240d8c911064447f8aa5a440f9345a6d692 (diff)
ext4: attempt to fix race in bigalloc code path
Currently, there exists a race between delayed allocated writes and the
writeback when the bigalloc feature is in use. The race occurs because we
wanted to determine which blocks in a cluster are under delayed allocation,
and we were using the buffer_delay(bh) check for it. But the writeback code
path clears this bit without any synchronization, which resulted in a race
and an ext4 warning similar to:

EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0
reserved data blocks

The race existed in two places:
(1) between ext4_find_delalloc_range() and ext4_map_blocks() when called
    from the writeback code path.
(2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where
    buffer_delay(bh) is set).

To fix (1), this patch introduces a new buffer_head state bit,
BH_Da_Mapped. This bit is set under the protection of
EXT4_I(inode)->i_data_sem when we have actually mapped the delayed
allocated blocks during writeout. We can now reliably check for this bit
inside ext4_find_delalloc_range() to determine whether the reservation for
the blocks has already been claimed.

To fix (2), it was necessary to set buffer_delay(bh) under the protection
of i_data_sem. So, I extracted the very beginning of ext4_map_blocks()
into a new function, ext4_da_map_blocks(), and performed the required
setting of the delay bit and the quota reservation under the protection of
i_data_sem. These two fixes make the checking of buffer_delay(bh) and
buffer_da_mapped(bh) consistent, thus removing the race.

Tested: I was able to reproduce the problem by running 'dd' and 'fsync'
in parallel. xfstests also sometimes reproduced this race. After the fix,
both my test and xfstests were successful and no race (warning message)
was observed.

Google-Bug-Id: 4997027

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
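To make the race and the fix concrete, here is a minimal user-space model;
it is an illustrative sketch only: a pthread rwlock stands in for
i_data_sem, plain ints stand in for buffer_head state, and every name in
it is an assumption rather than ext4 code.

/* Toy model (not kernel code): why testing the delay bit alone is racy,
 * and how a second bit set under the lock gives readers a stable check.
 * Build with: cc -pthread model.c */
#include <pthread.h>
#include <stdio.h>

#define NBLOCKS     16
#define B_DELAY     0x1		/* models buffer_delay(bh)     */
#define B_DA_MAPPED 0x2		/* models buffer_da_mapped(bh) */

static unsigned int state[NBLOCKS];
static pthread_rwlock_t data_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Writeback side: mark blocks mapped under the write lock first (the
 * analogue of set_buffers_da_mapped() under i_data_sem). In the model we
 * also clear the delay bit here; the real writeback clears buffer_delay()
 * without i_data_sem, which is exactly why readers must not rely on it. */
static void *writeback(void *unused)
{
	(void) unused;
	for (int i = 0; i < NBLOCKS; i++) {
		pthread_rwlock_wrlock(&data_sem);
		state[i] |= B_DA_MAPPED;
		state[i] &= ~B_DELAY;
		pthread_rwlock_unlock(&data_sem);
	}
	return NULL;
}

/* Lookup side: the analogue of ext4_find_delalloc_range(); a block still
 * holds a reservation only if it is delayed AND not yet mapped. */
static int count_delalloc(void)
{
	int n = 0;

	pthread_rwlock_rdlock(&data_sem);
	for (int i = 0; i < NBLOCKS; i++)
		if ((state[i] & B_DELAY) && !(state[i] & B_DA_MAPPED))
			n++;
	pthread_rwlock_unlock(&data_sem);
	return n;
}

int main(void)
{
	pthread_t wb;

	for (int i = 0; i < NBLOCKS; i++)
		state[i] = B_DELAY;	/* all blocks delayed-allocated */
	pthread_create(&wb, NULL, writeback, NULL);
	printf("blocks still reserved: %d\n", count_delalloc());
	pthread_join(wb, NULL);
	return 0;
}

Under the old scheme the lookup side tested only the delay bit, which
writeback clears with no synchronization, so a reader could observe a
block mid-transition and the reservation accounting went wrong, producing
the warning quoted above. With the second bit set under the lock, the
combined test (delayed && !mapped) changes atomically from the reader's
point of view.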
-rw-r--r--	fs/ext4/ext4.h		5
-rw-r--r--	fs/ext4/extents.c	38
-rw-r--r--	fs/ext4/inode.c		146
3 files changed, 134 insertions, 55 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 751277a4890c..1bbd2caebe7f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1893,7 +1893,6 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
					 int used, int quota_claim);
-extern int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2300,10 +2299,14 @@ enum ext4_state_bits {
	 * never, ever appear in a buffer_head's state
	 * flag. See EXT4_MAP_FROM_CLUSTER to see where
	 * this is used. */
+	BH_Da_Mapped,	/* Delayed allocated block that now has a mapping. This
+			 * flag is set when ext4_map_blocks is called on a
+			 * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
  * Add new method to test wether block and inode bitmaps are properly
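For context, BUFFER_FNS(Da_Mapped, da_mapped) generates the accessors used
throughout the rest of the patch; roughly, and simplified from
include/linux/buffer_head.h (an approximation of the macro expansion, not
verbatim kernel code):

/* Approximate expansion of BUFFER_FNS(Da_Mapped, da_mapped): atomic
 * bitops on the bh->b_state word, keyed by the new BH_Da_Mapped bit. */
static inline void set_buffer_da_mapped(struct buffer_head *bh)
{
	set_bit(BH_Da_Mapped, &bh->b_state);
}

static inline void clear_buffer_da_mapped(struct buffer_head *bh)
{
	clear_bit(BH_Da_Mapped, &bh->b_state);
}

static inline int buffer_da_mapped(const struct buffer_head *bh)
{
	return test_bit(BH_Da_Mapped, &bh->b_state);
}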
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9b119308daea..ad39627c1fbc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3296,28 +3296,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
 
 	while ((i >= lblk_start) && (i <= lblk_end)) {
 		page = find_get_page(mapping, index);
-		if (!page || !PageDirty(page))
+		if (!page)
 			goto nextpage;
 
-		if (PageWriteback(page)) {
-			/*
-			 * This might be a race with allocation and writeout. In
-			 * this case we just assume that the rest of the range
-			 * will eventually be written and there wont be any
-			 * delalloc blocks left.
-			 * TODO: the above assumption is troublesome, but might
-			 * work better in practice. other option could be note
-			 * somewhere that the cluster is getting written out and
-			 * detect that here.
-			 */
-			page_cache_release(page);
-			trace_ext4_find_delalloc_range(inode,
-					lblk_start, lblk_end,
-					search_hint_reverse,
-					0, i);
-			return 0;
-		}
-
 		if (!page_has_buffers(page))
 			goto nextpage;
 
@@ -3340,7 +3321,11 @@ static int ext4_find_delalloc_range(struct inode *inode,
 			continue;
 		}
 
-		if (buffer_delay(bh)) {
+		/* Check if the buffer is delayed allocated and that it
+		 * is not yet mapped. (when da-buffers are mapped during
+		 * their writeout, their da_mapped bit is set.)
+		 */
+		if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
 			page_cache_release(page);
 			trace_ext4_find_delalloc_range(inode,
 					lblk_start, lblk_end,
@@ -4106,6 +4091,7 @@ got_allocated_blocks:
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 						     1);
 			if (reserved_clusters < allocated_clusters) {
+				struct ext4_inode_info *ei = EXT4_I(inode);
 				int reservation = allocated_clusters -
 						  reserved_clusters;
 				/*
@@ -4148,11 +4134,11 @@ got_allocated_blocks:
 				 * remaining blocks finally gets written, we
 				 * could claim them.
 				 */
-				while (reservation) {
-					ext4_da_reserve_space(inode,
-							      map->m_lblk);
-					reservation--;
-				}
+				dquot_reserve_block(inode,
+						EXT4_C2B(sbi, reservation));
+				spin_lock(&ei->i_block_reservation_lock);
+				ei->i_reserved_data_blocks += reservation;
+				spin_unlock(&ei->i_block_reservation_lock);
 			}
 		}
 	}
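The removed per-block loop becomes one bulk update: quota is accounted in
filesystem blocks while the reservation counter tracks clusters, so the
cluster count is converted with EXT4_C2B(), a left shift by
s_cluster_bits. A stand-alone sketch of that arithmetic, with illustrative
values:

#include <stdio.h>

/* Illustrative stand-in for ext4's EXT4_C2B(): clusters -> blocks is a
 * left shift by s_cluster_bits (log2 of blocks per cluster). */
static unsigned long long c2b(unsigned long long clusters, int cluster_bits)
{
	return clusters << cluster_bits;
}

int main(void)
{
	/* e.g. 4k blocks, 64k clusters => 16 blocks/cluster => bits = 4 */
	int cluster_bits = 4;
	unsigned long long reservation = 3;	/* clusters still to reserve */

	printf("quota blocks to reserve: %llu\n",
	       c2b(reservation, cluster_bits));	/* prints 48 */
	return 0;
}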
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2dcd4fed96ec..1380cd29c312 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -398,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 }
 
 /*
+ * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
+ */
+static void set_buffers_da_mapped(struct inode *inode,
+				   struct ext4_map_blocks *map)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	int i, nr_pages;
+	pgoff_t index, end;
+
+	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (map->m_lblk + map->m_len - 1) >>
+		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	pagevec_init(&pvec, 0);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index,
+					  min(end - index + 1,
+					      (pgoff_t)PAGEVEC_SIZE));
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			if (unlikely(page->mapping != mapping) ||
+			    !PageDirty(page))
+				break;
+
+			if (page_has_buffers(page)) {
+				bh = head = page_buffers(page);
+				do {
+					set_buffer_da_mapped(bh);
+					bh = bh->b_this_page;
+				} while (bh != head);
+			}
+			index++;
+		}
+		pagevec_release(&pvec);
+	}
+}
+
+/*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
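The page range walked above comes from converting logical block numbers to
page-cache indices with a right shift by (PAGE_CACHE_SHIFT - i_blkbits). A
stand-alone sketch of the conversion, assuming 4k pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assumed 4k pages */

/* Illustrative version of the index arithmetic in set_buffers_da_mapped() */
static unsigned long blk_to_page(unsigned long lblk, int blkbits)
{
	return lblk >> (PAGE_CACHE_SHIFT - blkbits);
}

int main(void)
{
	/* 1k blocks (blkbits = 10): four blocks per 4k page */
	printf("m_lblk 7, len 10 -> pages %lu..%lu\n",
	       blk_to_page(7, 10), blk_to_page(7 + 10 - 1, 10));	/* 1..4 */

	/* 4k blocks (blkbits = 12): one block per page */
	printf("m_lblk 7, len 10 -> pages %lu..%lu\n",
	       blk_to_page(7, 12), blk_to_page(7 + 10 - 1, 12));	/* 7..16 */
	return 0;
}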
@@ -516,9 +559,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
+		/* If we have successfully mapped the delayed allocated blocks,
+		 * set the BH_Da_Mapped bit on them. Its important to do this
+		 * under the protection of i_data_sem.
+		 */
+		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+			set_buffers_da_mapped(inode, map);
+	}
+
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
@@ -1038,7 +1089,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
  * Reserve a single cluster located at lblock
  */
-int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1153,6 +1204,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
+			clear_buffer_da_mapped(bh);
 		}
 		curr_off = next_off;
 	} while ((bh = bh->b_this_page) != head);
@@ -1271,6 +1323,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					clear_buffer_delay(bh);
 					bh->b_blocknr = pblock;
 				}
+				if (buffer_da_mapped(bh))
+					clear_buffer_da_mapped(bh);
 				if (buffer_unwritten(bh) ||
 				    buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
@@ -1604,6 +1658,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+			      struct ext4_map_blocks *map,
+			      struct buffer_head *bh)
+{
+	int retval;
+	sector_t invalid_block = ~((sector_t) 0xffff);
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
+
+	map->m_flags = 0;
+	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, map->m_len,
+		  (unsigned long) map->m_lblk);
+	/*
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
+	 */
+	down_read((&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+	else
+		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+
+	if (retval == 0) {
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		/* If the block was allocated from previously allocated cluster,
+		 * then we dont need to reserve it again. */
+		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+			retval = ext4_da_reserve_space(inode, iblock);
+			if (retval)
+				/* not enough space to reserve */
+				goto out_unlock;
+		}
+
+		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+		 * and it should not appear on the bh->b_state.
+		 */
+		map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+	}
+
+out_unlock:
+	up_read((&EXT4_I(inode)->i_data_sem));
+
+	return retval;
+}
+
+/*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin(). It will either return mapped block or
  * reserve space for a single block.
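Delayed buffers get mapped to the "invalid_block" sentinel so that
bh->b_blocknr can never collide with a real on-disk block; if the
filesystem were large enough to contain that block number, the all-ones
value is used instead. A user-space sketch of the computation, with
sector_t modeled as uint64_t and an assumed filesystem size:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* same expression as in ext4_da_map_blocks() above */
	uint64_t invalid_block = ~((uint64_t) 0xffff);	/* 0xffffffffffff0000 */
	uint64_t s_blocks_count = 1ULL << 32;		/* assumed fs size   */

	/* If the fs could actually contain that block number, fall back
	 * to the all-ones value instead. */
	if (invalid_block < s_blocks_count)
		invalid_block = ~(uint64_t) 0;

	printf("sentinel block: %#llx\n",
	       (unsigned long long) invalid_block);
	return 0;
}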
@@ -1620,10 +1734,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 {
 	struct ext4_map_blocks map;
 	int ret = 0;
-	sector_t invalid_block = ~((sector_t) 0xffff);
-
-	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
-		invalid_block = ~0;
 
 	BUG_ON(create == 0);
 	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1636,29 +1746,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
-	if (ret < 0)
+	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+	if (ret <= 0)
 		return ret;
-	if (ret == 0) {
-		if (buffer_delay(bh))
-			return 0; /* Not sure this could or should happen */
-		/*
-		 * XXX: __block_write_begin() unmaps passed block, is it OK?
-		 */
-		/* If the block was allocated from previously allocated cluster,
-		 * then we dont need to reserve it again. */
-		if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			ret = ext4_da_reserve_space(inode, iblock);
-			if (ret)
-				/* not enough space to reserve */
-				return ret;
-		}
-
-		map_bh(bh, inode->i_sb, invalid_block);
-		set_buffer_new(bh);
-		set_buffer_delay(bh);
-		return 0;
-	}
 
 	map_bh(bh, inode->i_sb, map.m_pblk);
 	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;