diff options
Diffstat (limited to 'fs/ext4/inode.c')
| -rw-r--r-- | fs/ext4/inode.c | 231 |
1 files changed, 196 insertions, 35 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5633af6a7045..118e16ca91d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, | |||
| 3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
| 3452 | int dio_credits; | 3452 | int dio_credits; |
| 3453 | 3453 | ||
| 3454 | ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", | ||
| 3455 | inode->i_ino, create); | ||
| 3454 | /* | 3456 | /* |
| 3455 | * DIO VFS code passes create = 0 flag for write to | 3457 | * DIO VFS code passes create = 0 flag for write to |
| 3456 | * the middle of file. It does this to avoid block | 3458 | * the middle of file. It does this to avoid block |
| @@ -3491,55 +3493,152 @@ out: | |||
| 3491 | return ret; | 3493 | return ret; |
| 3492 | } | 3494 | } |
| 3493 | 3495 | ||
| 3494 | #define DIO_AIO 0x1 | ||
| 3495 | |||
| 3496 | static void ext4_free_io_end(ext4_io_end_t *io) | 3496 | static void ext4_free_io_end(ext4_io_end_t *io) |
| 3497 | { | 3497 | { |
| 3498 | BUG_ON(!io); | ||
| 3499 | iput(io->inode); | ||
| 3498 | kfree(io); | 3500 | kfree(io); |
| 3499 | } | 3501 | } |
| 3502 | static void dump_aio_dio_list(struct inode * inode) | ||
| 3503 | { | ||
| 3504 | #ifdef EXT4_DEBUG | ||
| 3505 | struct list_head *cur, *before, *after; | ||
| 3506 | ext4_io_end_t *io, *io0, *io1; | ||
| 3507 | |||
| 3508 | if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
| 3509 | ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); | ||
| 3510 | return; | ||
| 3511 | } | ||
| 3512 | |||
| 3513 | ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); | ||
| 3514 | list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ | ||
| 3515 | cur = &io->list; | ||
| 3516 | before = cur->prev; | ||
| 3517 | io0 = container_of(before, ext4_io_end_t, list); | ||
| 3518 | after = cur->next; | ||
| 3519 | io1 = container_of(after, ext4_io_end_t, list); | ||
| 3520 | |||
| 3521 | ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", | ||
| 3522 | io, inode->i_ino, io0, io1); | ||
| 3523 | } | ||
| 3524 | #endif | ||
| 3525 | } | ||
| 3500 | 3526 | ||
| 3501 | /* | 3527 | /* |
| 3502 | * IO write completion for unwritten extents. | ||
| 3503 | * | ||
| 3504 | * check a range of space and convert unwritten extents to written. | 3528 | * check a range of space and convert unwritten extents to written. |
| 3505 | */ | 3529 | */ |
| 3506 | static void ext4_end_dio_unwritten(struct work_struct *work) | 3530 | static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) |
| 3507 | { | 3531 | { |
| 3508 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
| 3509 | struct inode *inode = io->inode; | 3532 | struct inode *inode = io->inode; |
| 3510 | loff_t offset = io->offset; | 3533 | loff_t offset = io->offset; |
| 3511 | size_t size = io->size; | 3534 | size_t size = io->size; |
| 3512 | int ret = 0; | 3535 | int ret = 0; |
| 3513 | int aio = io->flag & DIO_AIO; | ||
| 3514 | 3536 | ||
| 3515 | if (aio) | 3537 | ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," |
| 3516 | mutex_lock(&inode->i_mutex); | 3538 | "list->prev 0x%p\n", |
| 3539 | io, inode->i_ino, io->list.next, io->list.prev); | ||
| 3540 | |||
| 3541 | if (list_empty(&io->list)) | ||
| 3542 | return ret; | ||
| 3543 | |||
| 3544 | if (io->flag != DIO_AIO_UNWRITTEN) | ||
| 3545 | return ret; | ||
| 3546 | |||
| 3517 | if (offset + size <= i_size_read(inode)) | 3547 | if (offset + size <= i_size_read(inode)) |
| 3518 | ret = ext4_convert_unwritten_extents(inode, offset, size); | 3548 | ret = ext4_convert_unwritten_extents(inode, offset, size); |
| 3519 | 3549 | ||
| 3520 | if (ret < 0) | 3550 | if (ret < 0) { |
| 3521 | printk(KERN_EMERG "%s: failed to convert unwritten" | 3551 | printk(KERN_EMERG "%s: failed to convert unwritten" |
| 3522 | "extents to written extents, error is %d\n", | 3552 | "extents to written extents, error is %d" |
| 3523 | __func__, ret); | 3553 | " io is still on inode %lu aio dio list\n", |
| 3554 | __func__, ret, inode->i_ino); | ||
| 3555 | return ret; | ||
| 3556 | } | ||
| 3557 | |||
| 3558 | /* clear the DIO AIO unwritten flag */ | ||
| 3559 | io->flag = 0; | ||
| 3560 | return ret; | ||
| 3561 | } | ||
| 3562 | /* | ||
| 3563 | * work on completed aio dio IO, to convert unwritten extents to extents | ||
| 3564 | */ | ||
| 3565 | static void ext4_end_aio_dio_work(struct work_struct *work) | ||
| 3566 | { | ||
| 3567 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
| 3568 | struct inode *inode = io->inode; | ||
| 3569 | int ret = 0; | ||
| 3570 | |||
| 3571 | mutex_lock(&inode->i_mutex); | ||
| 3572 | ret = ext4_end_aio_dio_nolock(io); | ||
| 3573 | if (ret >= 0) { | ||
| 3574 | if (!list_empty(&io->list)) | ||
| 3575 | list_del_init(&io->list); | ||
| 3576 | ext4_free_io_end(io); | ||
| 3577 | } | ||
| 3578 | mutex_unlock(&inode->i_mutex); | ||
| 3579 | } | ||
| 3580 | /* | ||
| 3581 | * This function is called from ext4_sync_file(). | ||
| 3582 | * | ||
| 3583 | * When AIO DIO IO is completed, the work to convert unwritten | ||
| 3584 | * extents to written is queued on workqueue but may not get immediately | ||
| 3585 | * scheduled. When fsync is called, we need to ensure the | ||
| 3586 | * conversion is complete before fsync returns. | ||
| 3587 | * The inode keeps track of a list of completed AIO from DIO path | ||
| 3588 | * that might need to do the conversion. This function walks through | ||
| 3589 | * the list and converts the related unwritten extents to written. | ||
| 3590 | */ | ||
| 3591 | int flush_aio_dio_completed_IO(struct inode *inode) | ||
| 3592 | { | ||
| 3593 | ext4_io_end_t *io; | ||
| 3594 | int ret = 0; | ||
| 3595 | int ret2 = 0; | ||
| 3524 | 3596 | ||
| 3525 | ext4_free_io_end(io); | 3597 | if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) |
| 3526 | if (aio) | 3598 | return ret; |
| 3527 | mutex_unlock(&inode->i_mutex); | 3599 | |
| 3600 | dump_aio_dio_list(inode); | ||
| 3601 | while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
| 3602 | io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, | ||
| 3603 | ext4_io_end_t, list); | ||
| 3604 | /* | ||
| 3605 | * Calling ext4_end_aio_dio_nolock() to convert completed | ||
| 3606 | * IO to written. | ||
| 3607 | * | ||
| 3608 | * When ext4_sync_file() is called, run_queue() may already | ||
| 3609 | * be about to flush the work corresponding to this io structure. | ||
| 3610 | * It will be upset if it finds that the io structure related | ||
| 3611 | * to the work about to be scheduled has been freed. | ||
| 3612 | * | ||
| 3613 | * Thus we need to keep the io structure still valid here after | ||
| 3614 | * the conversion has finished. The io structure has a flag to | ||
| 3615 | * avoid double converting from both fsync and background work | ||
| 3616 | * queue work. | ||
| 3617 | */ | ||
| 3618 | ret = ext4_end_aio_dio_nolock(io); | ||
| 3619 | if (ret < 0) | ||
| 3620 | ret2 = ret; | ||
| 3621 | else | ||
| 3622 | list_del_init(&io->list); | ||
| 3623 | } | ||
| 3624 | return (ret2 < 0) ? ret2 : 0; | ||
| 3528 | } | 3625 | } |
| 3529 | 3626 | ||
| 3530 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) | 3627 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode) |
| 3531 | { | 3628 | { |
| 3532 | ext4_io_end_t *io = NULL; | 3629 | ext4_io_end_t *io = NULL; |
| 3533 | 3630 | ||
| 3534 | io = kmalloc(sizeof(*io), GFP_NOFS); | 3631 | io = kmalloc(sizeof(*io), GFP_NOFS); |
| 3535 | 3632 | ||
| 3536 | if (io) { | 3633 | if (io) { |
| 3634 | igrab(inode); | ||
| 3537 | io->inode = inode; | 3635 | io->inode = inode; |
| 3538 | io->flag = flag; | 3636 | io->flag = 0; |
| 3539 | io->offset = 0; | 3637 | io->offset = 0; |
| 3540 | io->size = 0; | 3638 | io->size = 0; |
| 3541 | io->error = 0; | 3639 | io->error = 0; |
| 3542 | INIT_WORK(&io->work, ext4_end_dio_unwritten); | 3640 | INIT_WORK(&io->work, ext4_end_aio_dio_work); |
| 3641 | INIT_LIST_HEAD(&io->list); | ||
| 3543 | } | 3642 | } |
| 3544 | 3643 | ||
| 3545 | return io; | 3644 | return io; |
| @@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3551 | ext4_io_end_t *io_end = iocb->private; | 3650 | ext4_io_end_t *io_end = iocb->private; |
| 3552 | struct workqueue_struct *wq; | 3651 | struct workqueue_struct *wq; |
| 3553 | 3652 | ||
| 3554 | /* if not hole or unwritten extents, just simple return */ | 3653 | ext_debug("ext4_end_io_dio(): io_end 0x%p" |
| 3555 | if (!io_end || !size || !iocb->private) | 3654 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", |
| 3655 | iocb->private, io_end->inode->i_ino, iocb, offset, | ||
| 3656 | size); | ||
| 3657 | /* if not async direct IO or dio with 0 bytes write, just return */ | ||
| 3658 | if (!io_end || !size) | ||
| 3556 | return; | 3659 | return; |
| 3660 | |||
| 3661 | /* if not aio dio with unwritten extents, just free io and return */ | ||
| 3662 | if (io_end->flag != DIO_AIO_UNWRITTEN){ | ||
| 3663 | ext4_free_io_end(io_end); | ||
| 3664 | iocb->private = NULL; | ||
| 3665 | return; | ||
| 3666 | } | ||
| 3667 | |||
| 3557 | io_end->offset = offset; | 3668 | io_end->offset = offset; |
| 3558 | io_end->size = size; | 3669 | io_end->size = size; |
| 3559 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | 3670 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; |
| 3560 | 3671 | ||
| 3561 | /* We need to convert unwritten extents to written */ | 3672 | /* queue the work to convert unwritten extents to written */ |
| 3562 | queue_work(wq, &io_end->work); | 3673 | queue_work(wq, &io_end->work); |
| 3563 | 3674 | ||
| 3564 | if (is_sync_kiocb(iocb)) | 3675 | /* Add the io_end to per-inode completed aio dio list*/ |
| 3565 | flush_workqueue(wq); | 3676 | list_add_tail(&io_end->list, |
| 3566 | 3677 | &EXT4_I(io_end->inode)->i_aio_dio_complete_list); | |
| 3567 | iocb->private = NULL; | 3678 | iocb->private = NULL; |
| 3568 | } | 3679 | } |
| 3569 | /* | 3680 | /* |
| @@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3575 | * If those blocks were preallocated, we make sure they are split, but | 3686 | * If those blocks were preallocated, we make sure they are split, but |
| 3576 | * still keep the range to write as uninitialized. | 3687 | * still keep the range to write as uninitialized. |
| 3577 | * | 3688 | * |
| 3578 | * When end_io call back function called at the last IO complete time, | 3689 | * The unwritten extents will be converted to written when DIO is completed. |
| 3579 | * those extents will be converted to written extents. | 3690 | * For async direct IO, since the IO may still be pending on return, we |
| 3691 | * set up an end_io call back function, which will do the conversion | ||
| 3692 | * when the async direct IO has completed. | ||
| 3580 | * | 3693 | * |
| 3581 | * If the O_DIRECT write will extend the file then add this inode to the | 3694 | * If the O_DIRECT write will extend the file then add this inode to the |
| 3582 | * orphan list. So recovery will truncate it back to the original size | 3695 | * orphan list. So recovery will truncate it back to the original size |
| @@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3595 | loff_t final_size = offset + count; | 3708 | loff_t final_size = offset + count; |
| 3596 | if (rw == WRITE && final_size <= inode->i_size) { | 3709 | if (rw == WRITE && final_size <= inode->i_size) { |
| 3597 | /* | 3710 | /* |
| 3598 | * For DIO we fallocate blocks for holes, we fallocate blocks | 3711 | * We could direct write to holes and fallocate. |
| 3599 | * The fallocated extent for hole is marked as uninitialized | 3712 | * |
| 3713 | * Allocated blocks to fill the hole are marked as uninitialized | ||
| 3600 | to prevent parallel buffered read to expose the stale data | 3714 | * to prevent parallel buffered read to expose the stale data |
| 3601 | * before DIO complete the data IO. | 3715 | * before DIO complete the data IO. |
| 3602 | * as for previously fallocated extents, ext4 get_block | 3716 | * |
| 3717 | * As to previously fallocated extents, ext4 get_block | ||
| 3603 | * will just simply mark the buffer mapped but still | 3718 | * will just simply mark the buffer mapped but still |
| 3604 | * keep the extents uninitialized. | 3719 | * keep the extents uninitialized. |
| 3605 | * | 3720 | * |
| 3606 | * At the end of IO, the ext4 end_io callback function | 3721 | * for non AIO case, we will convert those unwritten extents |
| 3607 | * will convert those unwritten extents to written, | 3722 | * to written after return back from blockdev_direct_IO. |
| 3608 | * | 3723 | * |
| 3724 | * for async DIO, the conversion needs to be deferred until | ||
| 3725 | * the IO is completed. The ext4 end_io callback function | ||
| 3726 | * will be called to take care of the conversion work. | ||
| 3727 | * Here for async case, we allocate an io_end structure to | ||
| 3728 | * hook to the iocb. | ||
| 3609 | */ | 3729 | */ |
| 3610 | iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); | 3730 | iocb->private = NULL; |
| 3611 | if (!iocb->private) | 3731 | EXT4_I(inode)->cur_aio_dio = NULL; |
| 3612 | return -ENOMEM; | 3732 | if (!is_sync_kiocb(iocb)) { |
| 3733 | iocb->private = ext4_init_io_end(inode); | ||
| 3734 | if (!iocb->private) | ||
| 3735 | return -ENOMEM; | ||
| 3736 | /* | ||
| 3737 | * we save the io structure for current async | ||
| 3738 | * direct IO, so that later ext4_get_blocks() | ||
| 3739 | * could flag the io structure whether there | ||
| 3740 | * is a unwritten extents needs to be converted | ||
| 3741 | * when IO is completed. | ||
| 3742 | */ | ||
| 3743 | EXT4_I(inode)->cur_aio_dio = iocb->private; | ||
| 3744 | } | ||
| 3745 | |||
| 3613 | ret = blockdev_direct_IO(rw, iocb, inode, | 3746 | ret = blockdev_direct_IO(rw, iocb, inode, |
| 3614 | inode->i_sb->s_bdev, iov, | 3747 | inode->i_sb->s_bdev, iov, |
| 3615 | offset, nr_segs, | 3748 | offset, nr_segs, |
| 3616 | ext4_get_block_dio_write, | 3749 | ext4_get_block_dio_write, |
| 3617 | ext4_end_io_dio); | 3750 | ext4_end_io_dio); |
| 3751 | if (iocb->private) | ||
| 3752 | EXT4_I(inode)->cur_aio_dio = NULL; | ||
| 3753 | /* | ||
| 3754 | * The io_end structure takes a reference to the inode, | ||
| 3755 | * that structure needs to be destroyed and the | ||
| 3756 | * reference to the inode needs to be dropped, when IO is | ||
| 3757 | * complete, even with 0 byte write, or failed. | ||
| 3758 | * | ||
| 3759 | * In the successful AIO DIO case, the io_end structure will be | ||
| 3760 | * destroyed and the reference to the inode will be dropped | ||
| 3761 | * after the end_io call back function is called. | ||
| 3762 | * | ||
| 3763 | * In the case there is 0 byte write, or error case, since | ||
| 3764 | * VFS direct IO won't invoke the end_io call back function, | ||
| 3765 | * we need to free the end_io structure here. | ||
| 3766 | */ | ||
| 3767 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | ||
| 3768 | ext4_free_io_end(iocb->private); | ||
| 3769 | iocb->private = NULL; | ||
| 3770 | } else if (ret > 0) | ||
| 3771 | /* | ||
| 3772 | * for non AIO case, since the IO is already | ||
| 3773 | * completed, we could do the conversion right here | ||
| 3774 | */ | ||
| 3775 | ret = ext4_convert_unwritten_extents(inode, | ||
| 3776 | offset, ret); | ||
| 3618 | return ret; | 3777 | return ret; |
| 3619 | } | 3778 | } |
| 3779 | |||
| 3780 | /* for write to the end of file case, we fall back to old way */ | ||
| 3620 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | 3781 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
| 3621 | } | 3782 | } |
| 3622 | 3783 | ||
