about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2016-05-13 00:51:15 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2016-05-13 00:51:15 -0400
commit: 12735f881952c32b31bc4e433768f18489f79ec9 (patch)
tree: dd23e6d522644d094af930a68132d8071a5de05c
parent: 914f82a32d026884743fb3de9f6f0a5908a9d5dd (diff)
ext4: pre-zero allocated blocks for DAX IO
Currently ext4 treats DAX IO the same way as direct IO. I.e., it
allocates unwritten extents before IO is done and converts unwritten
extents afterwards. However this way DAX IO can race with page fault to
the same area:

ext4_ext_direct_IO()                            dax_fault()
  dax_io()
    get_block() - allocates unwritten extent
    copy_from_iter_pmem()
                                                  get_block()
                                                    - converts unwritten block
                                                      to written and zeroes it
                                                      out
  ext4_convert_unwritten_extents()

So data written with DAX IO gets lost. Similarly dax_new_buf() called
from dax_io() can overwrite data that has been already written to the
block via mmap.

Fix the problem by using pre-zeroed blocks for DAX IO the same way as we
use them for DAX mmap. The downside of this solution is that every
allocating write writes each block twice (once zeros, once data). Fixing
the race with locking is possible as well however we would need to
lock-out faults for the whole range written to by DAX IO. And that is
not easy to do without locking-out faults for the whole file which seems
too aggressive.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--  fs/ext4/ext4.h   11
-rw-r--r--  fs/ext4/file.c    4
-rw-r--r--  fs/ext4/inode.c  43
3 files changed, 44 insertions, 14 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 89e1bcb21341..b84aa1ca480a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2527,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2527struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2527struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2528int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 2528int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
2529 struct buffer_head *bh_result, int create); 2529 struct buffer_head *bh_result, int create);
2530int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2530int ext4_dax_get_block(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create); 2531 struct buffer_head *bh_result, int create);
2532int ext4_get_block(struct inode *inode, sector_t iblock, 2532int ext4_get_block(struct inode *inode, sector_t iblock,
2533 struct buffer_head *bh_result, int create); 2533 struct buffer_head *bh_result, int create);
2534int ext4_dio_get_block(struct inode *inode, sector_t iblock, 2534int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -3334,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
3334 } 3334 }
3335} 3335}
3336 3336
3337static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
3338{
3339 int blksize = 1 << inode->i_blkbits;
3340
3341 return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
3342}
3343
3337#endif /* __KERNEL__ */ 3344#endif /* __KERNEL__ */
3338 3345
3339#define EFSBADCRC EBADMSG /* Bad CRC detected */ 3346#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3e850b988923..37e28082885a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
207 if (IS_ERR(handle)) 207 if (IS_ERR(handle))
208 result = VM_FAULT_SIGBUS; 208 result = VM_FAULT_SIGBUS;
209 else 209 else
210 result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); 210 result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
211 211
212 if (write) { 212 if (write) {
213 if (!IS_ERR(handle)) 213 if (!IS_ERR(handle))
@@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
243 result = VM_FAULT_SIGBUS; 243 result = VM_FAULT_SIGBUS;
244 else 244 else
245 result = __dax_pmd_fault(vma, addr, pmd, flags, 245 result = __dax_pmd_fault(vma, addr, pmd, flags,
246 ext4_dax_mmap_get_block, NULL); 246 ext4_dax_get_block, NULL);
247 247
248 if (write) { 248 if (write) {
249 if (!IS_ERR(handle)) 249 if (!IS_ERR(handle))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4879e93c91d3..f9ab1e8cc416 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3229,13 +3229,17 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3229} 3229}
3230 3230
3231#ifdef CONFIG_FS_DAX 3231#ifdef CONFIG_FS_DAX
3232int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3232/*
3233 struct buffer_head *bh_result, int create) 3233 * Get block function for DAX IO and mmap faults. It takes care of converting
3234 * unwritten extents to written ones and initializes new / converted blocks
3235 * to zeros.
3236 */
3237int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3238 struct buffer_head *bh_result, int create)
3234{ 3239{
3235 int ret; 3240 int ret;
3236 3241
3237 ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", 3242 ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
3238 inode->i_ino, create);
3239 if (!create) 3243 if (!create)
3240 return _ext4_get_block(inode, iblock, bh_result, 0); 3244 return _ext4_get_block(inode, iblock, bh_result, 0);
3241 3245
@@ -3247,9 +3251,9 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
3247 3251
3248 if (buffer_unwritten(bh_result)) { 3252 if (buffer_unwritten(bh_result)) {
3249 /* 3253 /*
3250 * We are protected by i_mmap_sem so we know block cannot go 3254 * We are protected by i_mmap_sem or i_mutex so we know block
3251 * away from under us even though we dropped i_data_sem. 3255 * cannot go away from under us even though we dropped
3252 * Convert extent to written and write zeros there. 3256 * i_data_sem. Convert extent to written and write zeros there.
3253 */ 3257 */
3254 ret = ext4_get_block_trans(inode, iblock, bh_result, 3258 ret = ext4_get_block_trans(inode, iblock, bh_result,
3255 EXT4_GET_BLOCKS_CONVERT | 3259 EXT4_GET_BLOCKS_CONVERT |
@@ -3264,6 +3268,14 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
3264 clear_buffer_new(bh_result); 3268 clear_buffer_new(bh_result);
3265 return 0; 3269 return 0;
3266} 3270}
3271#else
3272/* Just define empty function, it will never get called. */
3273int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3274 struct buffer_head *bh_result, int create)
3275{
3276 BUG();
3277 return 0;
3278}
3267#endif 3279#endif
3268 3280
3269static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3281static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -3385,8 +3397,20 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
3385 iocb->private = NULL; 3397 iocb->private = NULL;
3386 if (overwrite) 3398 if (overwrite)
3387 get_block_func = ext4_dio_get_block_overwrite; 3399 get_block_func = ext4_dio_get_block_overwrite;
3388 else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || 3400 else if (IS_DAX(inode)) {
3389 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { 3401 /*
3402 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
3403 * writes need zeroing either because they can race with page
3404 * faults or because they use partial blocks.
3405 */
3406 if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
3407 ext4_aligned_io(inode, offset, count))
3408 get_block_func = ext4_dio_get_block;
3409 else
3410 get_block_func = ext4_dax_get_block;
3411 dio_flags = DIO_LOCKING;
3412 } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3413 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
3390 get_block_func = ext4_dio_get_block; 3414 get_block_func = ext4_dio_get_block;
3391 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; 3415 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3392 } else if (is_sync_kiocb(iocb)) { 3416 } else if (is_sync_kiocb(iocb)) {
@@ -3400,7 +3424,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3424 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3401#endif 3425#endif
3402 if (IS_DAX(inode)) { 3426 if (IS_DAX(inode)) {
3403 dio_flags &= ~DIO_SKIP_HOLES;
3404 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3427 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
3405 ext4_end_io_dio, dio_flags); 3428 ext4_end_io_dio, dio_flags);
3406 } else 3429 } else