about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fs/ext4/ext4.h 2
-rw-r--r-- fs/ext4/indirect.c 127
-rw-r--r-- fs/ext4/inode.c 131
3 files changed, 114 insertions, 146 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ba5aecc07fbc..89e1bcb21341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2587/* indirect.c */ 2587/* indirect.c */
2588extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2588extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
2589 struct ext4_map_blocks *map, int flags); 2589 struct ext4_map_blocks *map, int flags);
2590extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2591 loff_t offset);
2592extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2590extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2593extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2591extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2594extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2592extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de5..bc15c2c17633 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -649,133 +649,6 @@ out:
649} 649}
650 650
651/* 651/*
652 * O_DIRECT for ext3 (or indirect map) based files
653 *
654 * If the O_DIRECT write will extend the file then add this inode to the
655 * orphan list. So recovery will truncate it back to the original size
656 * if the machine crashes during the write.
657 *
658 * If the O_DIRECT write is instantiating holes inside i_size and the machine
659 * crashes then stale disk data _may_ be exposed inside the file. But current
660 * VFS code falls back into buffered path in that case so we are safe.
661 */
662ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
663 loff_t offset)
664{
665 struct file *file = iocb->ki_filp;
666 struct inode *inode = file->f_mapping->host;
667 struct ext4_inode_info *ei = EXT4_I(inode);
668 handle_t *handle;
669 ssize_t ret;
670 int orphan = 0;
671 size_t count = iov_iter_count(iter);
672 int retries = 0;
673
674 if (iov_iter_rw(iter) == WRITE) {
675 loff_t final_size = offset + count;
676
677 if (final_size > inode->i_size) {
678 /* Credits for sb + inode write */
679 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
680 if (IS_ERR(handle)) {
681 ret = PTR_ERR(handle);
682 goto out;
683 }
684 ret = ext4_orphan_add(handle, inode);
685 if (ret) {
686 ext4_journal_stop(handle);
687 goto out;
688 }
689 orphan = 1;
690 ei->i_disksize = inode->i_size;
691 ext4_journal_stop(handle);
692 }
693 }
694
695retry:
696 if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
697 /*
698 * Nolock dioread optimization may be dynamically disabled
699 * via ext4_inode_block_unlocked_dio(). Check inode's state
700 * while holding extra i_dio_count ref.
701 */
702 inode_dio_begin(inode);
703 smp_mb();
704 if (unlikely(ext4_test_inode_state(inode,
705 EXT4_STATE_DIOREAD_LOCK))) {
706 inode_dio_end(inode);
707 goto locked;
708 }
709 if (IS_DAX(inode))
710 ret = dax_do_io(iocb, inode, iter, offset,
711 ext4_dio_get_block, NULL, 0);
712 else
713 ret = __blockdev_direct_IO(iocb, inode,
714 inode->i_sb->s_bdev, iter,
715 offset, ext4_dio_get_block,
716 NULL, NULL, 0);
717 inode_dio_end(inode);
718 } else {
719locked:
720 if (IS_DAX(inode))
721 ret = dax_do_io(iocb, inode, iter, offset,
722 ext4_dio_get_block, NULL, DIO_LOCKING);
723 else
724 ret = blockdev_direct_IO(iocb, inode, iter, offset,
725 ext4_dio_get_block);
726
727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
728 loff_t isize = i_size_read(inode);
729 loff_t end = offset + count;
730
731 if (end > isize)
732 ext4_truncate_failed_write(inode);
733 }
734 }
735 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
736 goto retry;
737
738 if (orphan) {
739 int err;
740
741 /* Credits for sb + inode write */
742 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
743 if (IS_ERR(handle)) {
744 /* This is really bad luck. We've written the data
745 * but cannot extend i_size. Bail out and pretend
746 * the write failed... */
747 ret = PTR_ERR(handle);
748 if (inode->i_nlink)
749 ext4_orphan_del(NULL, inode);
750
751 goto out;
752 }
753 if (inode->i_nlink)
754 ext4_orphan_del(handle, inode);
755 if (ret > 0) {
756 loff_t end = offset + ret;
757 if (end > inode->i_size) {
758 ei->i_disksize = end;
759 i_size_write(inode, end);
760 /*
761 * We're going to return a positive `ret'
762 * here due to non-zero-length I/O, so there's
763 * no way of reporting error returns from
764 * ext4_mark_inode_dirty() to userspace. So
765 * ignore it.
766 */
767 ext4_mark_inode_dirty(handle, inode);
768 }
769 }
770 err = ext4_journal_stop(handle);
771 if (ret == 0)
772 ret = err;
773 }
774out:
775 return ret;
776}
777
778/*
779 * Calculate the number of metadata blocks that need to be reserved 652 * Calculate the number of metadata blocks that need to be reserved
780 * to allocate a new block at @lblocks for a non-extent-based file 653 * to allocate a new block at @lblocks for a non-extent-based file
781 */ 654 */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 32825dee81d4..4879e93c91d3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3295} 3295}
3296 3296
3297/* 3297/*
3298 * For ext4 extent files, ext4 will do direct-io write to holes, 3298 * Handling of direct IO writes.
3299 *
3300 * For ext4 extent files, ext4 will do direct-io write even to holes,
3299 * preallocated extents, and writes that extend the file; no need to 3301 * preallocated extents, and writes that extend the file; no need to
3300 * fall back to buffered IO. 3302 * fall back to buffered IO.
3301 * 3303 *
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3313 * if the machine crashes during the write. 3315 * if the machine crashes during the write.
3314 * 3316 *
3315 */ 3317 */
3316static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 3318static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
3317 loff_t offset) 3319 loff_t offset)
3318{ 3320{
3319 struct file *file = iocb->ki_filp; 3321 struct file *file = iocb->ki_filp;
3320 struct inode *inode = file->f_mapping->host; 3322 struct inode *inode = file->f_mapping->host;
3323 struct ext4_inode_info *ei = EXT4_I(inode);
3321 ssize_t ret; 3324 ssize_t ret;
3322 size_t count = iov_iter_count(iter); 3325 size_t count = iov_iter_count(iter);
3323 int overwrite = 0; 3326 int overwrite = 0;
3324 get_block_t *get_block_func = NULL; 3327 get_block_t *get_block_func = NULL;
3325 int dio_flags = 0; 3328 int dio_flags = 0;
3326 loff_t final_size = offset + count; 3329 loff_t final_size = offset + count;
3330 int orphan = 0;
3331 handle_t *handle;
3327 3332
3328 /* Use the old path for reads and writes beyond i_size. */ 3333 if (final_size > inode->i_size) {
3329 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3334 /* Credits for sb + inode write */
3330 return ext4_ind_direct_IO(iocb, iter, offset); 3335 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3336 if (IS_ERR(handle)) {
3337 ret = PTR_ERR(handle);
3338 goto out;
3339 }
3340 ret = ext4_orphan_add(handle, inode);
3341 if (ret) {
3342 ext4_journal_stop(handle);
3343 goto out;
3344 }
3345 orphan = 1;
3346 ei->i_disksize = inode->i_size;
3347 ext4_journal_stop(handle);
3348 }
3331 3349
3332 BUG_ON(iocb->private == NULL); 3350 BUG_ON(iocb->private == NULL);
3333 3351
@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3336 * conversion. This also disallows race between truncate() and 3354 * conversion. This also disallows race between truncate() and
3337 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 3355 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3338 */ 3356 */
3339 if (iov_iter_rw(iter) == WRITE) 3357 inode_dio_begin(inode);
3340 inode_dio_begin(inode);
3341 3358
3342 /* If we do an overwrite dio, i_mutex locking can be released */ 3359 /* If we do an overwrite dio, i_mutex locking can be released */
3343 overwrite = *((int *)iocb->private); 3360 overwrite = *((int *)iocb->private);
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3346 inode_unlock(inode); 3363 inode_unlock(inode);
3347 3364
3348 /* 3365 /*
3349 * We could direct write to holes and fallocate. 3366 * For extent mapped files we could direct write to holes and fallocate.
3350 * 3367 *
3351 * Allocated blocks to fill the hole are marked as unwritten to prevent 3368 * Allocated blocks to fill the hole are marked as unwritten to prevent
3352 * parallel buffered read to expose the stale data before DIO complete 3369 * parallel buffered read to expose the stale data before DIO complete
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3368 iocb->private = NULL; 3385 iocb->private = NULL;
3369 if (overwrite) 3386 if (overwrite)
3370 get_block_func = ext4_dio_get_block_overwrite; 3387 get_block_func = ext4_dio_get_block_overwrite;
3371 else if (is_sync_kiocb(iocb)) { 3388 else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3389 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
3390 get_block_func = ext4_dio_get_block;
3391 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3392 } else if (is_sync_kiocb(iocb)) {
3372 get_block_func = ext4_dio_get_block_unwritten_sync; 3393 get_block_func = ext4_dio_get_block_unwritten_sync;
3373 dio_flags = DIO_LOCKING; 3394 dio_flags = DIO_LOCKING;
3374 } else { 3395 } else {
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3378#ifdef CONFIG_EXT4_FS_ENCRYPTION 3399#ifdef CONFIG_EXT4_FS_ENCRYPTION
3379 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3380#endif 3401#endif
3381 if (IS_DAX(inode)) 3402 if (IS_DAX(inode)) {
3403 dio_flags &= ~DIO_SKIP_HOLES;
3382 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3404 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
3383 ext4_end_io_dio, dio_flags); 3405 ext4_end_io_dio, dio_flags);
3384 else 3406 } else
3385 ret = __blockdev_direct_IO(iocb, inode, 3407 ret = __blockdev_direct_IO(iocb, inode,
3386 inode->i_sb->s_bdev, iter, offset, 3408 inode->i_sb->s_bdev, iter, offset,
3387 get_block_func, 3409 get_block_func,
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3401 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3423 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3402 } 3424 }
3403 3425
3404 if (iov_iter_rw(iter) == WRITE) 3426 inode_dio_end(inode);
3405 inode_dio_end(inode);
3406 /* take i_mutex locking again if we do an overwrite dio */ 3427 /* take i_mutex locking again if we do an overwrite dio */
3407 if (overwrite) 3428 if (overwrite)
3408 inode_lock(inode); 3429 inode_lock(inode);
3409 3430
3431 if (ret < 0 && final_size > inode->i_size)
3432 ext4_truncate_failed_write(inode);
3433
3434 /* Handle extending of i_size after direct IO write */
3435 if (orphan) {
3436 int err;
3437
3438 /* Credits for sb + inode write */
3439 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3440 if (IS_ERR(handle)) {
3441 /* This is really bad luck. We've written the data
3442 * but cannot extend i_size. Bail out and pretend
3443 * the write failed... */
3444 ret = PTR_ERR(handle);
3445 if (inode->i_nlink)
3446 ext4_orphan_del(NULL, inode);
3447
3448 goto out;
3449 }
3450 if (inode->i_nlink)
3451 ext4_orphan_del(handle, inode);
3452 if (ret > 0) {
3453 loff_t end = offset + ret;
3454 if (end > inode->i_size) {
3455 ei->i_disksize = end;
3456 i_size_write(inode, end);
3457 /*
3458 * We're going to return a positive `ret'
3459 * here due to non-zero-length I/O, so there's
3460 * no way of reporting error returns from
3461 * ext4_mark_inode_dirty() to userspace. So
3462 * ignore it.
3463 */
3464 ext4_mark_inode_dirty(handle, inode);
3465 }
3466 }
3467 err = ext4_journal_stop(handle);
3468 if (ret == 0)
3469 ret = err;
3470 }
3471out:
3472 return ret;
3473}
3474
3475static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
3476 loff_t offset)
3477{
3478 int unlocked = 0;
3479 struct inode *inode = iocb->ki_filp->f_mapping->host;
3480 ssize_t ret;
3481
3482 if (ext4_should_dioread_nolock(inode)) {
3483 /*
3484 * Nolock dioread optimization may be dynamically disabled
3485 * via ext4_inode_block_unlocked_dio(). Check inode's state
3486 * while holding extra i_dio_count ref.
3487 */
3488 inode_dio_begin(inode);
3489 smp_mb();
3490 if (unlikely(ext4_test_inode_state(inode,
3491 EXT4_STATE_DIOREAD_LOCK)))
3492 inode_dio_end(inode);
3493 else
3494 unlocked = 1;
3495 }
3496 if (IS_DAX(inode)) {
3497 ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
3498 NULL, unlocked ? 0 : DIO_LOCKING);
3499 } else {
3500 ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
3501 iter, offset, ext4_dio_get_block,
3502 NULL, NULL,
3503 unlocked ? 0 : DIO_LOCKING);
3504 }
3505 if (unlocked)
3506 inode_dio_end(inode);
3410 return ret; 3507 return ret;
3411} 3508}
3412 3509
@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3434 return 0; 3531 return 0;
3435 3532
3436 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 3533 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
3437 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3534 if (iov_iter_rw(iter) == READ)
3438 ret = ext4_ext_direct_IO(iocb, iter, offset); 3535 ret = ext4_direct_IO_read(iocb, iter, offset);
3439 else 3536 else
3440 ret = ext4_ind_direct_IO(iocb, iter, offset); 3537 ret = ext4_direct_IO_write(iocb, iter, offset);
3441 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); 3538 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
3442 return ret; 3539 return ret;
3443} 3540}