aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
authorAditya Kali <adityakali@google.com>2011-09-09 19:20:51 -0400
committerTheodore Ts'o <tytso@mit.edu>2011-09-09 19:20:51 -0400
commit5356f2615cd558c57a1f7d7528d1ad4de3640d96 (patch)
treee3590bf14d9a21c4eb365105886382bfb1131b95 /fs/ext4/inode.c
parentd8990240d8c911064447f8aa5a440f9345a6d692 (diff)
ext4: attempt to fix race in bigalloc code path
Currently, there exists a race between delayed allocated writes and the writeback when bigalloc feature is in use. The race was because we wanted to determine what blocks in a cluster are under delayed allocation and we were using buffer_delayed(bh) check for it. But, the writeback codepath clears this bit without any synchronization which resulted in a race and an ext4 warning similar to: EXT4-fs (ram1): ext4_da_update_reserve_space: ino 13, used 1 with only 0 reserved data blocks The race existed in two places. (1) between ext4_find_delalloc_range() and ext4_map_blocks() when called from writeback code path. (2) between ext4_find_delalloc_range() and ext4_da_get_block_prep() (where buffer_delayed(bh) is set). To fix (1), this patch introduces a new buffer_head state bit - BH_Da_Mapped. This bit is set under the protection of EXT4_I(inode)->i_data_sem when we have actually mapped the delayed allocated blocks during the writeout time. We can now reliably check for this bit inside ext4_find_delalloc_range() to determine whether the reservation for the blocks have already been claimed or not. To fix (2), it was necessary to set buffer_delay(bh) under the protection of i_data_sem. So, I extracted the very beginning of ext4_map_blocks into a new function - ext4_da_map_blocks() - and performed the required setting of bh_delay bit and the quota reservation under the protection of i_data_sem. These two fixes make the checking of buffer_delay(bh) and buffer_da_mapped(bh) consistent, thus removing the race. Tested: I was able to reproduce the problem by running 'dd' and 'fsync' in parallel. Also, xfstests sometimes used to reproduce this race. After the fix both my test and xfstests were successful and no race (warning message) was observed. Google-Bug-Id: 4997027 Signed-off-by: Aditya Kali <adityakali@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c146
1 files changed, 118 insertions, 28 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2dcd4fed96ec..1380cd29c312 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -398,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
398} 398}
399 399
400/* 400/*
401 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
402 */
403static void set_buffers_da_mapped(struct inode *inode,
404 struct ext4_map_blocks *map)
405{
406 struct address_space *mapping = inode->i_mapping;
407 struct pagevec pvec;
408 int i, nr_pages;
409 pgoff_t index, end;
410
411 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
412 end = (map->m_lblk + map->m_len - 1) >>
413 (PAGE_CACHE_SHIFT - inode->i_blkbits);
414
415 pagevec_init(&pvec, 0);
416 while (index <= end) {
417 nr_pages = pagevec_lookup(&pvec, mapping, index,
418 min(end - index + 1,
419 (pgoff_t)PAGEVEC_SIZE));
420 if (nr_pages == 0)
421 break;
422 for (i = 0; i < nr_pages; i++) {
423 struct page *page = pvec.pages[i];
424 struct buffer_head *bh, *head;
425
426 if (unlikely(page->mapping != mapping) ||
427 !PageDirty(page))
428 break;
429
430 if (page_has_buffers(page)) {
431 bh = head = page_buffers(page);
432 do {
433 set_buffer_da_mapped(bh);
434 bh = bh->b_this_page;
435 } while (bh != head);
436 }
437 index++;
438 }
439 pagevec_release(&pvec);
440 }
441}
442
443/*
401 * The ext4_map_blocks() function tries to look up the requested blocks, 444 * The ext4_map_blocks() function tries to look up the requested blocks,
402 * and returns if the blocks are already mapped. 445 * and returns if the blocks are already mapped.
403 * 446 *
@@ -516,9 +559,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
516 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 559 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
517 ext4_da_update_reserve_space(inode, retval, 1); 560 ext4_da_update_reserve_space(inode, retval, 1);
518 } 561 }
519 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 562 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
520 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 563 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
521 564
565 /* If we have successfully mapped the delayed allocated blocks,
566 * set the BH_Da_Mapped bit on them. Its important to do this
567 * under the protection of i_data_sem.
568 */
569 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
570 set_buffers_da_mapped(inode, map);
571 }
572
522 up_write((&EXT4_I(inode)->i_data_sem)); 573 up_write((&EXT4_I(inode)->i_data_sem));
523 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 574 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
524 int ret = check_block_validity(inode, map); 575 int ret = check_block_validity(inode, map);
@@ -1038,7 +1089,7 @@ static int ext4_journalled_write_end(struct file *file,
1038/* 1089/*
1039 * Reserve a single cluster located at lblock 1090 * Reserve a single cluster located at lblock
1040 */ 1091 */
1041int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1092static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1042{ 1093{
1043 int retries = 0; 1094 int retries = 0;
1044 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1095 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1153,6 +1204,7 @@ static void ext4_da_page_release_reservation(struct page *page,
1153 if ((offset <= curr_off) && (buffer_delay(bh))) { 1204 if ((offset <= curr_off) && (buffer_delay(bh))) {
1154 to_release++; 1205 to_release++;
1155 clear_buffer_delay(bh); 1206 clear_buffer_delay(bh);
1207 clear_buffer_da_mapped(bh);
1156 } 1208 }
1157 curr_off = next_off; 1209 curr_off = next_off;
1158 } while ((bh = bh->b_this_page) != head); 1210 } while ((bh = bh->b_this_page) != head);
@@ -1271,6 +1323,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1271 clear_buffer_delay(bh); 1323 clear_buffer_delay(bh);
1272 bh->b_blocknr = pblock; 1324 bh->b_blocknr = pblock;
1273 } 1325 }
1326 if (buffer_da_mapped(bh))
1327 clear_buffer_da_mapped(bh);
1274 if (buffer_unwritten(bh) || 1328 if (buffer_unwritten(bh) ||
1275 buffer_mapped(bh)) 1329 buffer_mapped(bh))
1276 BUG_ON(bh->b_blocknr != pblock); 1330 BUG_ON(bh->b_blocknr != pblock);
@@ -1604,6 +1658,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1604} 1658}
1605 1659
1606/* 1660/*
1661 * This function is grabs code from the very beginning of
1662 * ext4_map_blocks, but assumes that the caller is from delayed write
1663 * time. This function looks up the requested blocks and sets the
1664 * buffer delay bit under the protection of i_data_sem.
1665 */
1666static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1667 struct ext4_map_blocks *map,
1668 struct buffer_head *bh)
1669{
1670 int retval;
1671 sector_t invalid_block = ~((sector_t) 0xffff);
1672
1673 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1674 invalid_block = ~0;
1675
1676 map->m_flags = 0;
1677 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1678 "logical block %lu\n", inode->i_ino, map->m_len,
1679 (unsigned long) map->m_lblk);
1680 /*
1681 * Try to see if we can get the block without requesting a new
1682 * file system block.
1683 */
1684 down_read((&EXT4_I(inode)->i_data_sem));
1685 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1686 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1687 else
1688 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1689
1690 if (retval == 0) {
1691 /*
1692 * XXX: __block_prepare_write() unmaps passed block,
1693 * is it OK?
1694 */
1695 /* If the block was allocated from previously allocated cluster,
1696 * then we dont need to reserve it again. */
1697 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
1698 retval = ext4_da_reserve_space(inode, iblock);
1699 if (retval)
1700 /* not enough space to reserve */
1701 goto out_unlock;
1702 }
1703
1704 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1705 * and it should not appear on the bh->b_state.
1706 */
1707 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1708
1709 map_bh(bh, inode->i_sb, invalid_block);
1710 set_buffer_new(bh);
1711 set_buffer_delay(bh);
1712 }
1713
1714out_unlock:
1715 up_read((&EXT4_I(inode)->i_data_sem));
1716
1717 return retval;
1718}
1719
1720/*
1607 * This is a special get_blocks_t callback which is used by 1721 * This is a special get_blocks_t callback which is used by
1608 * ext4_da_write_begin(). It will either return mapped block or 1722 * ext4_da_write_begin(). It will either return mapped block or
1609 * reserve space for a single block. 1723 * reserve space for a single block.
@@ -1620,10 +1734,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1620{ 1734{
1621 struct ext4_map_blocks map; 1735 struct ext4_map_blocks map;
1622 int ret = 0; 1736 int ret = 0;
1623 sector_t invalid_block = ~((sector_t) 0xffff);
1624
1625 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1626 invalid_block = ~0;
1627 1737
1628 BUG_ON(create == 0); 1738 BUG_ON(create == 0);
1629 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1739 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1636,29 +1746,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1636 * preallocated blocks are unmapped but should treated 1746 * preallocated blocks are unmapped but should treated
1637 * the same as allocated blocks. 1747 * the same as allocated blocks.
1638 */ 1748 */
1639 ret = ext4_map_blocks(NULL, inode, &map, 0); 1749 ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1640 if (ret < 0) 1750 if (ret <= 0)
1641 return ret; 1751 return ret;
1642 if (ret == 0) {
1643 if (buffer_delay(bh))
1644 return 0; /* Not sure this could or should happen */
1645 /*
1646 * XXX: __block_write_begin() unmaps passed block, is it OK?
1647 */
1648 /* If the block was allocated from previously allocated cluster,
1649 * then we dont need to reserve it again. */
1650 if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) {
1651 ret = ext4_da_reserve_space(inode, iblock);
1652 if (ret)
1653 /* not enough space to reserve */
1654 return ret;
1655 }
1656
1657 map_bh(bh, inode->i_sb, invalid_block);
1658 set_buffer_new(bh);
1659 set_buffer_delay(bh);
1660 return 0;
1661 }
1662 1752
1663 map_bh(bh, inode->i_sb, map.m_pblk); 1753 map_bh(bh, inode->i_sb, map.m_pblk);
1664 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1754 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;