diff options
author | Jan Kara <jack@suse.com> | 2015-12-07 15:10:44 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2015-12-07 15:10:44 -0500 |
commit | ba5843f51d468644b094674c0317c9ab95632caa (patch) | |
tree | e2ab7e39a4eff12af5bc9f2b14dafc0f08731727 /fs/ext4/inode.c | |
parent | c86d8db33a922da808a5560aa15ed663a9569b37 (diff) |
ext4: use pre-zeroed blocks for DAX page faults
Make the DAX fault path use pre-zeroed blocks to avoid races with extent
conversion and zeroing when two page faults hit the same block concurrently.
Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 86 |
1 files changed, 69 insertions, 17 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4241d0cff062..ff2f3cd38522 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -723,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, | |||
723 | 723 | ||
724 | map_bh(bh, inode->i_sb, map.m_pblk); | 724 | map_bh(bh, inode->i_sb, map.m_pblk); |
725 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 725 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
726 | if (IS_DAX(inode) && buffer_unwritten(bh)) { | ||
727 | /* | ||
728 | * dgc: I suspect unwritten conversion on ext4+DAX is | ||
729 | * fundamentally broken here when there are concurrent | ||
730 | * read/write in progress on this inode. | ||
731 | */ | ||
732 | WARN_ON_ONCE(io_end); | ||
733 | bh->b_assoc_map = inode->i_mapping; | ||
734 | bh->b_private = (void *)(unsigned long)iblock; | ||
735 | } | ||
736 | if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) | 726 | if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) |
737 | set_buffer_defer_completion(bh); | 727 | set_buffer_defer_completion(bh); |
738 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; | 728 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; |
@@ -3097,17 +3087,79 @@ static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, | |||
3097 | return ret; | 3087 | return ret; |
3098 | } | 3088 | } |
3099 | 3089 | ||
3100 | int ext4_get_block_dax(struct inode *inode, sector_t iblock, | 3090 | #ifdef CONFIG_FS_DAX |
3101 | struct buffer_head *bh_result, int create) | 3091 | int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, |
3092 | struct buffer_head *bh_result, int create) | ||
3102 | { | 3093 | { |
3103 | int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; | 3094 | int ret, err; |
3095 | int credits; | ||
3096 | struct ext4_map_blocks map; | ||
3097 | handle_t *handle = NULL; | ||
3098 | int flags = 0; | ||
3104 | 3099 | ||
3105 | if (create) | 3100 | ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", |
3106 | flags |= EXT4_GET_BLOCKS_CREATE; | ||
3107 | ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", | ||
3108 | inode->i_ino, create); | 3101 | inode->i_ino, create); |
3109 | return _ext4_get_block(inode, iblock, bh_result, flags); | 3102 | map.m_lblk = iblock; |
3103 | map.m_len = bh_result->b_size >> inode->i_blkbits; | ||
3104 | credits = ext4_chunk_trans_blocks(inode, map.m_len); | ||
3105 | if (create) { | ||
3106 | flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; | ||
3107 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); | ||
3108 | if (IS_ERR(handle)) { | ||
3109 | ret = PTR_ERR(handle); | ||
3110 | return ret; | ||
3111 | } | ||
3112 | } | ||
3113 | |||
3114 | ret = ext4_map_blocks(handle, inode, &map, flags); | ||
3115 | if (create) { | ||
3116 | err = ext4_journal_stop(handle); | ||
3117 | if (ret >= 0 && err < 0) | ||
3118 | ret = err; | ||
3119 | } | ||
3120 | if (ret <= 0) | ||
3121 | goto out; | ||
3122 | if (map.m_flags & EXT4_MAP_UNWRITTEN) { | ||
3123 | int err2; | ||
3124 | |||
3125 | /* | ||
3126 | * We are protected by i_mmap_sem so we know block cannot go | ||
3127 | * away from under us even though we dropped i_data_sem. | ||
3128 | * Convert extent to written and write zeros there. | ||
3129 | * | ||
3130 | * Note: We may get here even when create == 0. | ||
3131 | */ | ||
3132 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); | ||
3133 | if (IS_ERR(handle)) { | ||
3134 | ret = PTR_ERR(handle); | ||
3135 | goto out; | ||
3136 | } | ||
3137 | |||
3138 | err = ext4_map_blocks(handle, inode, &map, | ||
3139 | EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); | ||
3140 | if (err < 0) | ||
3141 | ret = err; | ||
3142 | err2 = ext4_journal_stop(handle); | ||
3143 | if (err2 < 0 && ret > 0) | ||
3144 | ret = err2; | ||
3145 | } | ||
3146 | out: | ||
3147 | WARN_ON_ONCE(ret == 0 && create); | ||
3148 | if (ret > 0) { | ||
3149 | map_bh(bh_result, inode->i_sb, map.m_pblk); | ||
3150 | bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | | ||
3151 | map.m_flags; | ||
3152 | /* | ||
3153 | * At least for now we have to clear BH_New so that DAX code | ||
3154 | * doesn't attempt to zero blocks again in a racy way. | ||
3155 | */ | ||
3156 | bh_result->b_state &= ~(1 << BH_New); | ||
3157 | bh_result->b_size = map.m_len << inode->i_blkbits; | ||
3158 | ret = 0; | ||
3159 | } | ||
3160 | return ret; | ||
3110 | } | 3161 | } |
3162 | #endif | ||
3111 | 3163 | ||
3112 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | 3164 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, |
3113 | ssize_t size, void *private) | 3165 | ssize_t size, void *private) |