aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2016-05-13 00:44:16 -0400
committerTheodore Ts'o <tytso@mit.edu>2016-05-13 00:44:16 -0400
commit914f82a32d026884743fb3de9f6f0a5908a9d5dd (patch)
tree7f56a9e69a6e6df90a27750c9bfc08a604a4ed10
parentdbc427ce4028580f1244b5b57ca1cbea31aad1e7 (diff)
ext4: refactor direct IO code
Currently ext4 direct IO handling is split between ext4_ext_direct_IO() and ext4_ind_direct_IO(). However the extent based function calls into the indirect based one for some cases and, for example, it is not able to handle file extending. Previously it also was not properly handling retries in case of ENOSPC errors. With DAX things would get even more contrived, so just refactor the direct IO code and, instead of the indirect / extent split, do the split by reads vs writes. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/indirect.c127
-rw-r--r--fs/ext4/inode.c131
3 files changed, 114 insertions, 146 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ba5aecc07fbc..89e1bcb21341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2587/* indirect.c */ 2587/* indirect.c */
2588extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2588extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
2589 struct ext4_map_blocks *map, int flags); 2589 struct ext4_map_blocks *map, int flags);
2590extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2591 loff_t offset);
2592extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2590extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2593extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2591extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2594extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2592extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de5..bc15c2c17633 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -649,133 +649,6 @@ out:
649} 649}
650 650
651/* 651/*
652 * O_DIRECT for ext3 (or indirect map) based files
653 *
654 * If the O_DIRECT write will extend the file then add this inode to the
655 * orphan list. So recovery will truncate it back to the original size
656 * if the machine crashes during the write.
657 *
658 * If the O_DIRECT write is intantiating holes inside i_size and the machine
659 * crashes then stale disk data _may_ be exposed inside the file. But current
660 * VFS code falls back into buffered path in that case so we are safe.
661 */
662ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
663 loff_t offset)
664{
665 struct file *file = iocb->ki_filp;
666 struct inode *inode = file->f_mapping->host;
667 struct ext4_inode_info *ei = EXT4_I(inode);
668 handle_t *handle;
669 ssize_t ret;
670 int orphan = 0;
671 size_t count = iov_iter_count(iter);
672 int retries = 0;
673
674 if (iov_iter_rw(iter) == WRITE) {
675 loff_t final_size = offset + count;
676
677 if (final_size > inode->i_size) {
678 /* Credits for sb + inode write */
679 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
680 if (IS_ERR(handle)) {
681 ret = PTR_ERR(handle);
682 goto out;
683 }
684 ret = ext4_orphan_add(handle, inode);
685 if (ret) {
686 ext4_journal_stop(handle);
687 goto out;
688 }
689 orphan = 1;
690 ei->i_disksize = inode->i_size;
691 ext4_journal_stop(handle);
692 }
693 }
694
695retry:
696 if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
697 /*
698 * Nolock dioread optimization may be dynamically disabled
699 * via ext4_inode_block_unlocked_dio(). Check inode's state
700 * while holding extra i_dio_count ref.
701 */
702 inode_dio_begin(inode);
703 smp_mb();
704 if (unlikely(ext4_test_inode_state(inode,
705 EXT4_STATE_DIOREAD_LOCK))) {
706 inode_dio_end(inode);
707 goto locked;
708 }
709 if (IS_DAX(inode))
710 ret = dax_do_io(iocb, inode, iter, offset,
711 ext4_dio_get_block, NULL, 0);
712 else
713 ret = __blockdev_direct_IO(iocb, inode,
714 inode->i_sb->s_bdev, iter,
715 offset, ext4_dio_get_block,
716 NULL, NULL, 0);
717 inode_dio_end(inode);
718 } else {
719locked:
720 if (IS_DAX(inode))
721 ret = dax_do_io(iocb, inode, iter, offset,
722 ext4_dio_get_block, NULL, DIO_LOCKING);
723 else
724 ret = blockdev_direct_IO(iocb, inode, iter, offset,
725 ext4_dio_get_block);
726
727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
728 loff_t isize = i_size_read(inode);
729 loff_t end = offset + count;
730
731 if (end > isize)
732 ext4_truncate_failed_write(inode);
733 }
734 }
735 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
736 goto retry;
737
738 if (orphan) {
739 int err;
740
741 /* Credits for sb + inode write */
742 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
743 if (IS_ERR(handle)) {
744 /* This is really bad luck. We've written the data
745 * but cannot extend i_size. Bail out and pretend
746 * the write failed... */
747 ret = PTR_ERR(handle);
748 if (inode->i_nlink)
749 ext4_orphan_del(NULL, inode);
750
751 goto out;
752 }
753 if (inode->i_nlink)
754 ext4_orphan_del(handle, inode);
755 if (ret > 0) {
756 loff_t end = offset + ret;
757 if (end > inode->i_size) {
758 ei->i_disksize = end;
759 i_size_write(inode, end);
760 /*
761 * We're going to return a positive `ret'
762 * here due to non-zero-length I/O, so there's
763 * no way of reporting error returns from
764 * ext4_mark_inode_dirty() to userspace. So
765 * ignore it.
766 */
767 ext4_mark_inode_dirty(handle, inode);
768 }
769 }
770 err = ext4_journal_stop(handle);
771 if (ret == 0)
772 ret = err;
773 }
774out:
775 return ret;
776}
777
778/*
779 * Calculate the number of metadata blocks need to reserve 652 * Calculate the number of metadata blocks need to reserve
780 * to allocate a new block at @lblocks for non extent file based file 653 * to allocate a new block at @lblocks for non extent file based file
781 */ 654 */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 32825dee81d4..4879e93c91d3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3295} 3295}
3296 3296
3297/* 3297/*
3298 * For ext4 extent files, ext4 will do direct-io write to holes, 3298 * Handling of direct IO writes.
3299 *
3300 * For ext4 extent files, ext4 will do direct-io write even to holes,
3299 * preallocated extents, and those write extend the file, no need to 3301 * preallocated extents, and those write extend the file, no need to
3300 * fall back to buffered IO. 3302 * fall back to buffered IO.
3301 * 3303 *
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3313 * if the machine crashes during the write. 3315 * if the machine crashes during the write.
3314 * 3316 *
3315 */ 3317 */
3316static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 3318static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
3317 loff_t offset) 3319 loff_t offset)
3318{ 3320{
3319 struct file *file = iocb->ki_filp; 3321 struct file *file = iocb->ki_filp;
3320 struct inode *inode = file->f_mapping->host; 3322 struct inode *inode = file->f_mapping->host;
3323 struct ext4_inode_info *ei = EXT4_I(inode);
3321 ssize_t ret; 3324 ssize_t ret;
3322 size_t count = iov_iter_count(iter); 3325 size_t count = iov_iter_count(iter);
3323 int overwrite = 0; 3326 int overwrite = 0;
3324 get_block_t *get_block_func = NULL; 3327 get_block_t *get_block_func = NULL;
3325 int dio_flags = 0; 3328 int dio_flags = 0;
3326 loff_t final_size = offset + count; 3329 loff_t final_size = offset + count;
3330 int orphan = 0;
3331 handle_t *handle;
3327 3332
3328 /* Use the old path for reads and writes beyond i_size. */ 3333 if (final_size > inode->i_size) {
3329 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3334 /* Credits for sb + inode write */
3330 return ext4_ind_direct_IO(iocb, iter, offset); 3335 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3336 if (IS_ERR(handle)) {
3337 ret = PTR_ERR(handle);
3338 goto out;
3339 }
3340 ret = ext4_orphan_add(handle, inode);
3341 if (ret) {
3342 ext4_journal_stop(handle);
3343 goto out;
3344 }
3345 orphan = 1;
3346 ei->i_disksize = inode->i_size;
3347 ext4_journal_stop(handle);
3348 }
3331 3349
3332 BUG_ON(iocb->private == NULL); 3350 BUG_ON(iocb->private == NULL);
3333 3351
@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3336 * conversion. This also disallows race between truncate() and 3354 * conversion. This also disallows race between truncate() and
3337 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 3355 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3338 */ 3356 */
3339 if (iov_iter_rw(iter) == WRITE) 3357 inode_dio_begin(inode);
3340 inode_dio_begin(inode);
3341 3358
3342 /* If we do a overwrite dio, i_mutex locking can be released */ 3359 /* If we do a overwrite dio, i_mutex locking can be released */
3343 overwrite = *((int *)iocb->private); 3360 overwrite = *((int *)iocb->private);
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3346 inode_unlock(inode); 3363 inode_unlock(inode);
3347 3364
3348 /* 3365 /*
3349 * We could direct write to holes and fallocate. 3366 * For extent mapped files we could direct write to holes and fallocate.
3350 * 3367 *
3351 * Allocated blocks to fill the hole are marked as unwritten to prevent 3368 * Allocated blocks to fill the hole are marked as unwritten to prevent
3352 * parallel buffered read to expose the stale data before DIO complete 3369 * parallel buffered read to expose the stale data before DIO complete
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3368 iocb->private = NULL; 3385 iocb->private = NULL;
3369 if (overwrite) 3386 if (overwrite)
3370 get_block_func = ext4_dio_get_block_overwrite; 3387 get_block_func = ext4_dio_get_block_overwrite;
3371 else if (is_sync_kiocb(iocb)) { 3388 else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3389 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
3390 get_block_func = ext4_dio_get_block;
3391 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3392 } else if (is_sync_kiocb(iocb)) {
3372 get_block_func = ext4_dio_get_block_unwritten_sync; 3393 get_block_func = ext4_dio_get_block_unwritten_sync;
3373 dio_flags = DIO_LOCKING; 3394 dio_flags = DIO_LOCKING;
3374 } else { 3395 } else {
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3378#ifdef CONFIG_EXT4_FS_ENCRYPTION 3399#ifdef CONFIG_EXT4_FS_ENCRYPTION
3379 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3380#endif 3401#endif
3381 if (IS_DAX(inode)) 3402 if (IS_DAX(inode)) {
3403 dio_flags &= ~DIO_SKIP_HOLES;
3382 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3404 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
3383 ext4_end_io_dio, dio_flags); 3405 ext4_end_io_dio, dio_flags);
3384 else 3406 } else
3385 ret = __blockdev_direct_IO(iocb, inode, 3407 ret = __blockdev_direct_IO(iocb, inode,
3386 inode->i_sb->s_bdev, iter, offset, 3408 inode->i_sb->s_bdev, iter, offset,
3387 get_block_func, 3409 get_block_func,
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3401 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3423 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3402 } 3424 }
3403 3425
3404 if (iov_iter_rw(iter) == WRITE) 3426 inode_dio_end(inode);
3405 inode_dio_end(inode);
3406 /* take i_mutex locking again if we do a ovewrite dio */ 3427 /* take i_mutex locking again if we do a ovewrite dio */
3407 if (overwrite) 3428 if (overwrite)
3408 inode_lock(inode); 3429 inode_lock(inode);
3409 3430
3431 if (ret < 0 && final_size > inode->i_size)
3432 ext4_truncate_failed_write(inode);
3433
3434 /* Handle extending of i_size after direct IO write */
3435 if (orphan) {
3436 int err;
3437
3438 /* Credits for sb + inode write */
3439 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3440 if (IS_ERR(handle)) {
3441 /* This is really bad luck. We've written the data
3442 * but cannot extend i_size. Bail out and pretend
3443 * the write failed... */
3444 ret = PTR_ERR(handle);
3445 if (inode->i_nlink)
3446 ext4_orphan_del(NULL, inode);
3447
3448 goto out;
3449 }
3450 if (inode->i_nlink)
3451 ext4_orphan_del(handle, inode);
3452 if (ret > 0) {
3453 loff_t end = offset + ret;
3454 if (end > inode->i_size) {
3455 ei->i_disksize = end;
3456 i_size_write(inode, end);
3457 /*
3458 * We're going to return a positive `ret'
3459 * here due to non-zero-length I/O, so there's
3460 * no way of reporting error returns from
3461 * ext4_mark_inode_dirty() to userspace. So
3462 * ignore it.
3463 */
3464 ext4_mark_inode_dirty(handle, inode);
3465 }
3466 }
3467 err = ext4_journal_stop(handle);
3468 if (ret == 0)
3469 ret = err;
3470 }
3471out:
3472 return ret;
3473}
3474
3475static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
3476 loff_t offset)
3477{
3478 int unlocked = 0;
3479 struct inode *inode = iocb->ki_filp->f_mapping->host;
3480 ssize_t ret;
3481
3482 if (ext4_should_dioread_nolock(inode)) {
3483 /*
3484 * Nolock dioread optimization may be dynamically disabled
3485 * via ext4_inode_block_unlocked_dio(). Check inode's state
3486 * while holding extra i_dio_count ref.
3487 */
3488 inode_dio_begin(inode);
3489 smp_mb();
3490 if (unlikely(ext4_test_inode_state(inode,
3491 EXT4_STATE_DIOREAD_LOCK)))
3492 inode_dio_end(inode);
3493 else
3494 unlocked = 1;
3495 }
3496 if (IS_DAX(inode)) {
3497 ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
3498 NULL, unlocked ? 0 : DIO_LOCKING);
3499 } else {
3500 ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
3501 iter, offset, ext4_dio_get_block,
3502 NULL, NULL,
3503 unlocked ? 0 : DIO_LOCKING);
3504 }
3505 if (unlocked)
3506 inode_dio_end(inode);
3410 return ret; 3507 return ret;
3411} 3508}
3412 3509
@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3434 return 0; 3531 return 0;
3435 3532
3436 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 3533 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
3437 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3534 if (iov_iter_rw(iter) == READ)
3438 ret = ext4_ext_direct_IO(iocb, iter, offset); 3535 ret = ext4_direct_IO_read(iocb, iter, offset);
3439 else 3536 else
3440 ret = ext4_ind_direct_IO(iocb, iter, offset); 3537 ret = ext4_direct_IO_write(iocb, iter, offset);
3441 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); 3538 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
3442 return ret; 3539 return ret;
3443} 3540}