diff options
-rw-r--r-- | fs/ext4/ext4.h | 2 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 127 | ||||
-rw-r--r-- | fs/ext4/inode.c | 131 |
3 files changed, 114 insertions, 146 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba5aecc07fbc..89e1bcb21341 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, | |||
2587 | /* indirect.c */ | 2587 | /* indirect.c */ |
2588 | extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | 2588 | extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, |
2589 | struct ext4_map_blocks *map, int flags); | 2589 | struct ext4_map_blocks *map, int flags); |
2590 | extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | ||
2591 | loff_t offset); | ||
2592 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); | 2590 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); |
2593 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); | 2591 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); |
2594 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); | 2592 | extern void ext4_ind_truncate(handle_t *, struct inode *inode); |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3027fa681de5..bc15c2c17633 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -649,133 +649,6 @@ out: | |||
649 | } | 649 | } |
650 | 650 | ||
651 | /* | 651 | /* |
652 | * O_DIRECT for ext3 (or indirect map) based files | ||
653 | * | ||
654 | * If the O_DIRECT write will extend the file then add this inode to the | ||
655 | * orphan list. So recovery will truncate it back to the original size | ||
656 | * if the machine crashes during the write. | ||
657 | * | ||
658 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
659 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
660 | * VFS code falls back into buffered path in that case so we are safe. | ||
661 | */ | ||
662 | ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | ||
663 | loff_t offset) | ||
664 | { | ||
665 | struct file *file = iocb->ki_filp; | ||
666 | struct inode *inode = file->f_mapping->host; | ||
667 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
668 | handle_t *handle; | ||
669 | ssize_t ret; | ||
670 | int orphan = 0; | ||
671 | size_t count = iov_iter_count(iter); | ||
672 | int retries = 0; | ||
673 | |||
674 | if (iov_iter_rw(iter) == WRITE) { | ||
675 | loff_t final_size = offset + count; | ||
676 | |||
677 | if (final_size > inode->i_size) { | ||
678 | /* Credits for sb + inode write */ | ||
679 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||
680 | if (IS_ERR(handle)) { | ||
681 | ret = PTR_ERR(handle); | ||
682 | goto out; | ||
683 | } | ||
684 | ret = ext4_orphan_add(handle, inode); | ||
685 | if (ret) { | ||
686 | ext4_journal_stop(handle); | ||
687 | goto out; | ||
688 | } | ||
689 | orphan = 1; | ||
690 | ei->i_disksize = inode->i_size; | ||
691 | ext4_journal_stop(handle); | ||
692 | } | ||
693 | } | ||
694 | |||
695 | retry: | ||
696 | if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { | ||
697 | /* | ||
698 | * Nolock dioread optimization may be dynamically disabled | ||
699 | * via ext4_inode_block_unlocked_dio(). Check inode's state | ||
700 | * while holding extra i_dio_count ref. | ||
701 | */ | ||
702 | inode_dio_begin(inode); | ||
703 | smp_mb(); | ||
704 | if (unlikely(ext4_test_inode_state(inode, | ||
705 | EXT4_STATE_DIOREAD_LOCK))) { | ||
706 | inode_dio_end(inode); | ||
707 | goto locked; | ||
708 | } | ||
709 | if (IS_DAX(inode)) | ||
710 | ret = dax_do_io(iocb, inode, iter, offset, | ||
711 | ext4_dio_get_block, NULL, 0); | ||
712 | else | ||
713 | ret = __blockdev_direct_IO(iocb, inode, | ||
714 | inode->i_sb->s_bdev, iter, | ||
715 | offset, ext4_dio_get_block, | ||
716 | NULL, NULL, 0); | ||
717 | inode_dio_end(inode); | ||
718 | } else { | ||
719 | locked: | ||
720 | if (IS_DAX(inode)) | ||
721 | ret = dax_do_io(iocb, inode, iter, offset, | ||
722 | ext4_dio_get_block, NULL, DIO_LOCKING); | ||
723 | else | ||
724 | ret = blockdev_direct_IO(iocb, inode, iter, offset, | ||
725 | ext4_dio_get_block); | ||
726 | |||
727 | if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { | ||
728 | loff_t isize = i_size_read(inode); | ||
729 | loff_t end = offset + count; | ||
730 | |||
731 | if (end > isize) | ||
732 | ext4_truncate_failed_write(inode); | ||
733 | } | ||
734 | } | ||
735 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
736 | goto retry; | ||
737 | |||
738 | if (orphan) { | ||
739 | int err; | ||
740 | |||
741 | /* Credits for sb + inode write */ | ||
742 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||
743 | if (IS_ERR(handle)) { | ||
744 | /* This is really bad luck. We've written the data | ||
745 | * but cannot extend i_size. Bail out and pretend | ||
746 | * the write failed... */ | ||
747 | ret = PTR_ERR(handle); | ||
748 | if (inode->i_nlink) | ||
749 | ext4_orphan_del(NULL, inode); | ||
750 | |||
751 | goto out; | ||
752 | } | ||
753 | if (inode->i_nlink) | ||
754 | ext4_orphan_del(handle, inode); | ||
755 | if (ret > 0) { | ||
756 | loff_t end = offset + ret; | ||
757 | if (end > inode->i_size) { | ||
758 | ei->i_disksize = end; | ||
759 | i_size_write(inode, end); | ||
760 | /* | ||
761 | * We're going to return a positive `ret' | ||
762 | * here due to non-zero-length I/O, so there's | ||
763 | * no way of reporting error returns from | ||
764 | * ext4_mark_inode_dirty() to userspace. So | ||
765 | * ignore it. | ||
766 | */ | ||
767 | ext4_mark_inode_dirty(handle, inode); | ||
768 | } | ||
769 | } | ||
770 | err = ext4_journal_stop(handle); | ||
771 | if (ret == 0) | ||
772 | ret = err; | ||
773 | } | ||
774 | out: | ||
775 | return ret; | ||
776 | } | ||
777 | |||
778 | /* | ||
779 | * Calculate the number of metadata blocks need to reserve | 652 | * Calculate the number of metadata blocks need to reserve |
780 | * to allocate a new block at @lblocks for non extent file based file | 653 | * to allocate a new block at @lblocks for non extent file based file |
781 | */ | 654 | */ |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 32825dee81d4..4879e93c91d3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3295 | } | 3295 | } |
3296 | 3296 | ||
3297 | /* | 3297 | /* |
3298 | * For ext4 extent files, ext4 will do direct-io write to holes, | 3298 | * Handling of direct IO writes. |
3299 | * | ||
3300 | * For ext4 extent files, ext4 will do direct-io write even to holes, | ||
3299 | preallocated extents, and writes that extend the file, no need to | 3301 | * preallocated extents, and writes that extend the file, no need to |
3300 | * fall back to buffered IO. | 3302 | * fall back to buffered IO. |
3301 | * | 3303 | * |
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3313 | * if the machine crashes during the write. | 3315 | * if the machine crashes during the write. |
3314 | * | 3316 | * |
3315 | */ | 3317 | */ |
3316 | static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | 3318 | static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, |
3317 | loff_t offset) | 3319 | loff_t offset) |
3318 | { | 3320 | { |
3319 | struct file *file = iocb->ki_filp; | 3321 | struct file *file = iocb->ki_filp; |
3320 | struct inode *inode = file->f_mapping->host; | 3322 | struct inode *inode = file->f_mapping->host; |
3323 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3321 | ssize_t ret; | 3324 | ssize_t ret; |
3322 | size_t count = iov_iter_count(iter); | 3325 | size_t count = iov_iter_count(iter); |
3323 | int overwrite = 0; | 3326 | int overwrite = 0; |
3324 | get_block_t *get_block_func = NULL; | 3327 | get_block_t *get_block_func = NULL; |
3325 | int dio_flags = 0; | 3328 | int dio_flags = 0; |
3326 | loff_t final_size = offset + count; | 3329 | loff_t final_size = offset + count; |
3330 | int orphan = 0; | ||
3331 | handle_t *handle; | ||
3327 | 3332 | ||
3328 | /* Use the old path for reads and writes beyond i_size. */ | 3333 | if (final_size > inode->i_size) { |
3329 | if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) | 3334 | /* Credits for sb + inode write */ |
3330 | return ext4_ind_direct_IO(iocb, iter, offset); | 3335 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); |
3336 | if (IS_ERR(handle)) { | ||
3337 | ret = PTR_ERR(handle); | ||
3338 | goto out; | ||
3339 | } | ||
3340 | ret = ext4_orphan_add(handle, inode); | ||
3341 | if (ret) { | ||
3342 | ext4_journal_stop(handle); | ||
3343 | goto out; | ||
3344 | } | ||
3345 | orphan = 1; | ||
3346 | ei->i_disksize = inode->i_size; | ||
3347 | ext4_journal_stop(handle); | ||
3348 | } | ||
3331 | 3349 | ||
3332 | BUG_ON(iocb->private == NULL); | 3350 | BUG_ON(iocb->private == NULL); |
3333 | 3351 | ||
@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3336 | * conversion. This also disallows race between truncate() and | 3354 | * conversion. This also disallows race between truncate() and |
3337 | * overwrite DIO as i_dio_count needs to be incremented under i_mutex. | 3355 | * overwrite DIO as i_dio_count needs to be incremented under i_mutex. |
3338 | */ | 3356 | */ |
3339 | if (iov_iter_rw(iter) == WRITE) | 3357 | inode_dio_begin(inode); |
3340 | inode_dio_begin(inode); | ||
3341 | 3358 | ||
3342 | /* If we do an overwrite dio, i_mutex locking can be released */ | 3359 | /* If we do an overwrite dio, i_mutex locking can be released */ |
3343 | overwrite = *((int *)iocb->private); | 3360 | overwrite = *((int *)iocb->private); |
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3346 | inode_unlock(inode); | 3363 | inode_unlock(inode); |
3347 | 3364 | ||
3348 | /* | 3365 | /* |
3349 | * We could direct write to holes and fallocate. | 3366 | * For extent mapped files we could direct write to holes and fallocate. |
3350 | * | 3367 | * |
3351 | * Allocated blocks to fill the hole are marked as unwritten to prevent | 3368 | * Allocated blocks to fill the hole are marked as unwritten to prevent |
3352 | * parallel buffered read to expose the stale data before DIO complete | 3369 | * parallel buffered read to expose the stale data before DIO complete |
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3368 | iocb->private = NULL; | 3385 | iocb->private = NULL; |
3369 | if (overwrite) | 3386 | if (overwrite) |
3370 | get_block_func = ext4_dio_get_block_overwrite; | 3387 | get_block_func = ext4_dio_get_block_overwrite; |
3371 | else if (is_sync_kiocb(iocb)) { | 3388 | else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || |
3389 | round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { | ||
3390 | get_block_func = ext4_dio_get_block; | ||
3391 | dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; | ||
3392 | } else if (is_sync_kiocb(iocb)) { | ||
3372 | get_block_func = ext4_dio_get_block_unwritten_sync; | 3393 | get_block_func = ext4_dio_get_block_unwritten_sync; |
3373 | dio_flags = DIO_LOCKING; | 3394 | dio_flags = DIO_LOCKING; |
3374 | } else { | 3395 | } else { |
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3378 | #ifdef CONFIG_EXT4_FS_ENCRYPTION | 3399 | #ifdef CONFIG_EXT4_FS_ENCRYPTION |
3379 | BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); | 3400 | BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); |
3380 | #endif | 3401 | #endif |
3381 | if (IS_DAX(inode)) | 3402 | if (IS_DAX(inode)) { |
3403 | dio_flags &= ~DIO_SKIP_HOLES; | ||
3382 | ret = dax_do_io(iocb, inode, iter, offset, get_block_func, | 3404 | ret = dax_do_io(iocb, inode, iter, offset, get_block_func, |
3383 | ext4_end_io_dio, dio_flags); | 3405 | ext4_end_io_dio, dio_flags); |
3384 | else | 3406 | } else |
3385 | ret = __blockdev_direct_IO(iocb, inode, | 3407 | ret = __blockdev_direct_IO(iocb, inode, |
3386 | inode->i_sb->s_bdev, iter, offset, | 3408 | inode->i_sb->s_bdev, iter, offset, |
3387 | get_block_func, | 3409 | get_block_func, |
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3401 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | 3423 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); |
3402 | } | 3424 | } |
3403 | 3425 | ||
3404 | if (iov_iter_rw(iter) == WRITE) | 3426 | inode_dio_end(inode); |
3405 | inode_dio_end(inode); | ||
3406 | /* take i_mutex locking again if we do an overwrite dio */ | 3427 | /* take i_mutex locking again if we do an overwrite dio */ |
3407 | if (overwrite) | 3428 | if (overwrite) |
3408 | inode_lock(inode); | 3429 | inode_lock(inode); |
3409 | 3430 | ||
3431 | if (ret < 0 && final_size > inode->i_size) | ||
3432 | ext4_truncate_failed_write(inode); | ||
3433 | |||
3434 | /* Handle extending of i_size after direct IO write */ | ||
3435 | if (orphan) { | ||
3436 | int err; | ||
3437 | |||
3438 | /* Credits for sb + inode write */ | ||
3439 | handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||
3440 | if (IS_ERR(handle)) { | ||
3441 | /* This is really bad luck. We've written the data | ||
3442 | * but cannot extend i_size. Bail out and pretend | ||
3443 | * the write failed... */ | ||
3444 | ret = PTR_ERR(handle); | ||
3445 | if (inode->i_nlink) | ||
3446 | ext4_orphan_del(NULL, inode); | ||
3447 | |||
3448 | goto out; | ||
3449 | } | ||
3450 | if (inode->i_nlink) | ||
3451 | ext4_orphan_del(handle, inode); | ||
3452 | if (ret > 0) { | ||
3453 | loff_t end = offset + ret; | ||
3454 | if (end > inode->i_size) { | ||
3455 | ei->i_disksize = end; | ||
3456 | i_size_write(inode, end); | ||
3457 | /* | ||
3458 | * We're going to return a positive `ret' | ||
3459 | * here due to non-zero-length I/O, so there's | ||
3460 | * no way of reporting error returns from | ||
3461 | * ext4_mark_inode_dirty() to userspace. So | ||
3462 | * ignore it. | ||
3463 | */ | ||
3464 | ext4_mark_inode_dirty(handle, inode); | ||
3465 | } | ||
3466 | } | ||
3467 | err = ext4_journal_stop(handle); | ||
3468 | if (ret == 0) | ||
3469 | ret = err; | ||
3470 | } | ||
3471 | out: | ||
3472 | return ret; | ||
3473 | } | ||
3474 | |||
3475 | static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter, | ||
3476 | loff_t offset) | ||
3477 | { | ||
3478 | int unlocked = 0; | ||
3479 | struct inode *inode = iocb->ki_filp->f_mapping->host; | ||
3480 | ssize_t ret; | ||
3481 | |||
3482 | if (ext4_should_dioread_nolock(inode)) { | ||
3483 | /* | ||
3484 | * Nolock dioread optimization may be dynamically disabled | ||
3485 | * via ext4_inode_block_unlocked_dio(). Check inode's state | ||
3486 | * while holding extra i_dio_count ref. | ||
3487 | */ | ||
3488 | inode_dio_begin(inode); | ||
3489 | smp_mb(); | ||
3490 | if (unlikely(ext4_test_inode_state(inode, | ||
3491 | EXT4_STATE_DIOREAD_LOCK))) | ||
3492 | inode_dio_end(inode); | ||
3493 | else | ||
3494 | unlocked = 1; | ||
3495 | } | ||
3496 | if (IS_DAX(inode)) { | ||
3497 | ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block, | ||
3498 | NULL, unlocked ? 0 : DIO_LOCKING); | ||
3499 | } else { | ||
3500 | ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, | ||
3501 | iter, offset, ext4_dio_get_block, | ||
3502 | NULL, NULL, | ||
3503 | unlocked ? 0 : DIO_LOCKING); | ||
3504 | } | ||
3505 | if (unlocked) | ||
3506 | inode_dio_end(inode); | ||
3410 | return ret; | 3507 | return ret; |
3411 | } | 3508 | } |
3412 | 3509 | ||
@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3434 | return 0; | 3531 | return 0; |
3435 | 3532 | ||
3436 | trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); | 3533 | trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); |
3437 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3534 | if (iov_iter_rw(iter) == READ) |
3438 | ret = ext4_ext_direct_IO(iocb, iter, offset); | 3535 | ret = ext4_direct_IO_read(iocb, iter, offset); |
3439 | else | 3536 | else |
3440 | ret = ext4_ind_direct_IO(iocb, iter, offset); | 3537 | ret = ext4_direct_IO_write(iocb, iter, offset); |
3441 | trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); | 3538 | trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); |
3442 | return ret; | 3539 | return ret; |
3443 | } | 3540 | } |