aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c231
1 files changed, 196 insertions, 35 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5633af6a7045..118e16ca91d7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3452 int dio_credits; 3452 int dio_credits;
3453 3453
3454 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3455 inode->i_ino, create);
3454 /* 3456 /*
3455 * DIO VFS code passes create = 0 flag for write to 3457 * DIO VFS code passes create = 0 flag for write to
3456 * the middle of file. It does this to avoid block 3458 * the middle of file. It does this to avoid block
@@ -3491,55 +3493,152 @@ out:
3491 return ret; 3493 return ret;
3492} 3494}
3493 3495
3494#define DIO_AIO 0x1
3495
3496static void ext4_free_io_end(ext4_io_end_t *io) 3496static void ext4_free_io_end(ext4_io_end_t *io)
3497{ 3497{
3498 BUG_ON(!io);
3499 iput(io->inode);
3498 kfree(io); 3500 kfree(io);
3499} 3501}
/*
 * Debug helper: print every io_end queued on the inode's completed
 * AIO DIO list along with its list neighbours.  The body is compiled
 * only when EXT4_DEBUG is defined; otherwise this is a no-op.
 */
static void dump_aio_dio_list(struct inode *inode)
{
#ifdef EXT4_DEBUG
	struct list_head *head = &EXT4_I(inode)->i_aio_dio_complete_list;
	struct list_head *pos;

	if (list_empty(head)) {
		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
		return;
	}

	ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
	list_for_each(pos, head) {
		/*
		 * The neighbours are only printed, never dereferenced:
		 * for the first/last entry they are computed from the
		 * list head rather than from a real io_end.
		 */
		ext4_io_end_t *cur_io = list_entry(pos, ext4_io_end_t, list);
		ext4_io_end_t *prev_io = list_entry(pos->prev, ext4_io_end_t, list);
		ext4_io_end_t *next_io = list_entry(pos->next, ext4_io_end_t, list);

		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
			   cur_io, inode->i_ino, prev_io, next_io);
	}
#endif
}
3500 3526
3501/* 3527/*
3502 * IO write completion for unwritten extents.
3503 *
3504 * check a range of space and convert unwritten extents to written. 3528 * check a range of space and convert unwritten extents to written.
3505 */ 3529 */
3506static void ext4_end_dio_unwritten(struct work_struct *work) 3530static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3507{ 3531{
3508 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3509 struct inode *inode = io->inode; 3532 struct inode *inode = io->inode;
3510 loff_t offset = io->offset; 3533 loff_t offset = io->offset;
3511 size_t size = io->size; 3534 size_t size = io->size;
3512 int ret = 0; 3535 int ret = 0;
3513 int aio = io->flag & DIO_AIO;
3514 3536
3515 if (aio) 3537 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3516 mutex_lock(&inode->i_mutex); 3538 "list->prev 0x%p\n",
3539 io, inode->i_ino, io->list.next, io->list.prev);
3540
3541 if (list_empty(&io->list))
3542 return ret;
3543
3544 if (io->flag != DIO_AIO_UNWRITTEN)
3545 return ret;
3546
3517 if (offset + size <= i_size_read(inode)) 3547 if (offset + size <= i_size_read(inode))
3518 ret = ext4_convert_unwritten_extents(inode, offset, size); 3548 ret = ext4_convert_unwritten_extents(inode, offset, size);
3519 3549
3520 if (ret < 0) 3550 if (ret < 0) {
3521 printk(KERN_EMERG "%s: failed to convert unwritten" 3551 printk(KERN_EMERG "%s: failed to convert unwritten"
3522 "extents to written extents, error is %d\n", 3552 "extents to written extents, error is %d"
3523 __func__, ret); 3553 " io is still on inode %lu aio dio list\n",
3554 __func__, ret, inode->i_ino);
3555 return ret;
3556 }
3557
3558 /* clear the DIO AIO unwritten flag */
3559 io->flag = 0;
3560 return ret;
3561}
3562/*
3563 * work on completed aio dio IO, to convert unwritten extents to extents
3564 */
3565static void ext4_end_aio_dio_work(struct work_struct *work)
3566{
3567 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3568 struct inode *inode = io->inode;
3569 int ret = 0;
3570
3571 mutex_lock(&inode->i_mutex);
3572 ret = ext4_end_aio_dio_nolock(io);
3573 if (ret >= 0) {
3574 if (!list_empty(&io->list))
3575 list_del_init(&io->list);
3576 ext4_free_io_end(io);
3577 }
3578 mutex_unlock(&inode->i_mutex);
3579}
3580/*
3581 * This function is called from ext4_sync_file().
3582 *
3583 * When AIO DIO IO is completed, the work to convert unwritten
3584 * extents to written is queued on workqueue but may not get immediately
3585 * scheduled. When fsync is called, we need to ensure the
3586 * conversion is complete before fsync returns.
3587 * The inode keeps track of a list of completed AIO from DIO path
3588 * that might needs to do the conversion. This function walks through
3589 * the list and convert the related unwritten extents to written.
3590 */
3591int flush_aio_dio_completed_IO(struct inode *inode)
3592{
3593 ext4_io_end_t *io;
3594 int ret = 0;
3595 int ret2 = 0;
3524 3596
3525 ext4_free_io_end(io); 3597 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3526 if (aio) 3598 return ret;
3527 mutex_unlock(&inode->i_mutex); 3599
3600 dump_aio_dio_list(inode);
3601 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3602 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3603 ext4_io_end_t, list);
3604 /*
3605 * Calling ext4_end_aio_dio_nolock() to convert completed
3606 * IO to written.
3607 *
3608 * When ext4_sync_file() is called, run_queue() may already
3609 * about to flush the work corresponding to this io structure.
3610 * It will be upset if it founds the io structure related
3611 * to the work-to-be schedule is freed.
3612 *
3613 * Thus we need to keep the io structure still valid here after
3614 * convertion finished. The io structure has a flag to
3615 * avoid double converting from both fsync and background work
3616 * queue work.
3617 */
3618 ret = ext4_end_aio_dio_nolock(io);
3619 if (ret < 0)
3620 ret2 = ret;
3621 else
3622 list_del_init(&io->list);
3623 }
3624 return (ret2 < 0) ? ret2 : 0;
3528} 3625}
3529 3626
3530static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) 3627static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3531{ 3628{
3532 ext4_io_end_t *io = NULL; 3629 ext4_io_end_t *io = NULL;
3533 3630
3534 io = kmalloc(sizeof(*io), GFP_NOFS); 3631 io = kmalloc(sizeof(*io), GFP_NOFS);
3535 3632
3536 if (io) { 3633 if (io) {
3634 igrab(inode);
3537 io->inode = inode; 3635 io->inode = inode;
3538 io->flag = flag; 3636 io->flag = 0;
3539 io->offset = 0; 3637 io->offset = 0;
3540 io->size = 0; 3638 io->size = 0;
3541 io->error = 0; 3639 io->error = 0;
3542 INIT_WORK(&io->work, ext4_end_dio_unwritten); 3640 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3641 INIT_LIST_HEAD(&io->list);
3543 } 3642 }
3544 3643
3545 return io; 3644 return io;
@@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3551 ext4_io_end_t *io_end = iocb->private; 3650 ext4_io_end_t *io_end = iocb->private;
3552 struct workqueue_struct *wq; 3651 struct workqueue_struct *wq;
3553 3652
3554 /* if not hole or unwritten extents, just simple return */ 3653 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3555 if (!io_end || !size || !iocb->private) 3654 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3655 iocb->private, io_end->inode->i_ino, iocb, offset,
3656 size);
3657 /* if not async direct IO or dio with 0 bytes write, just return */
3658 if (!io_end || !size)
3556 return; 3659 return;
3660
3661 /* if not aio dio with unwritten extents, just free io and return */
3662 if (io_end->flag != DIO_AIO_UNWRITTEN){
3663 ext4_free_io_end(io_end);
3664 iocb->private = NULL;
3665 return;
3666 }
3667
3557 io_end->offset = offset; 3668 io_end->offset = offset;
3558 io_end->size = size; 3669 io_end->size = size;
3559 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3670 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3560 3671
3561 /* We need to convert unwritten extents to written */ 3672 /* queue the work to convert unwritten extents to written */
3562 queue_work(wq, &io_end->work); 3673 queue_work(wq, &io_end->work);
3563 3674
3564 if (is_sync_kiocb(iocb)) 3675 /* Add the io_end to per-inode completed aio dio list*/
3565 flush_workqueue(wq); 3676 list_add_tail(&io_end->list,
3566 3677 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3567 iocb->private = NULL; 3678 iocb->private = NULL;
3568} 3679}
3569/* 3680/*
@@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3575 * If those blocks were preallocated, we make sure they are split, but 3686 * If those blocks were preallocated, we make sure they are split, but
3576 * still keep the range to write as uninitialized. 3687 * still keep the range to write as uninitialized.
3577 * 3688 *
3578 * When end_io call back function called at the last IO complete time, 3689 * The unwritten extents will be converted to written when DIO is completed.
3579 * those extents will be converted to written extents. 3690 * For async direct IO, since the IO may still be pending on return, we
3691 * set up an end_io callback function, which will do the conversion
3692 * when the async direct IO completes.
3580 * 3693 *
3581 * If the O_DIRECT write will extend the file then add this inode to the 3694 * If the O_DIRECT write will extend the file then add this inode to the
3582 * orphan list. So recovery will truncate it back to the original size 3695 * orphan list. So recovery will truncate it back to the original size
@@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3595 loff_t final_size = offset + count; 3708 loff_t final_size = offset + count;
3596 if (rw == WRITE && final_size <= inode->i_size) { 3709 if (rw == WRITE && final_size <= inode->i_size) {
3597 /* 3710 /*
3598 * For DIO we fallocate blocks for holes, we fallocate blocks 3711 * We could direct write to holes and fallocate.
3599 * The fallocated extent for hole is marked as uninitialized 3712 *
3713 * Allocated blocks to fill the hole are marked as uninitialized
3600 * to prevent paralel buffered read to expose the stale data 3714 * to prevent paralel buffered read to expose the stale data
3601 * before DIO complete the data IO. 3715 * before DIO complete the data IO.
3602 * as for previously fallocated extents, ext4 get_block 3716 *
3717 * As to previously fallocated extents, ext4 get_block
3603 * will just simply mark the buffer mapped but still 3718 * will just simply mark the buffer mapped but still
3604 * keep the extents uninitialized. 3719 * keep the extents uninitialized.
3605 * 3720 *
3606 * At the end of IO, the ext4 end_io callback function 3721 * for non AIO case, we will convert those unwritten extents
3607 * will convert those unwritten extents to written, 3722 * to written after return back from blockdev_direct_IO.
3608 * 3723 *
3724 * for async DIO, the conversion needs to be defered when
3725 * the IO is completed. The ext4 end_io callback function
3726 * will be called to take care of the conversion work.
3727 * Here for async case, we allocate an io_end structure to
3728 * hook to the iocb.
3609 */ 3729 */
3610 iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); 3730 iocb->private = NULL;
3611 if (!iocb->private) 3731 EXT4_I(inode)->cur_aio_dio = NULL;
3612 return -ENOMEM; 3732 if (!is_sync_kiocb(iocb)) {
3733 iocb->private = ext4_init_io_end(inode);
3734 if (!iocb->private)
3735 return -ENOMEM;
3736 /*
3737 * we save the io structure for current async
3738 * direct IO, so that later ext4_get_blocks()
3739 * could flag the io structure whether there
3740 * is a unwritten extents needs to be converted
3741 * when IO is completed.
3742 */
3743 EXT4_I(inode)->cur_aio_dio = iocb->private;
3744 }
3745
3613 ret = blockdev_direct_IO(rw, iocb, inode, 3746 ret = blockdev_direct_IO(rw, iocb, inode,
3614 inode->i_sb->s_bdev, iov, 3747 inode->i_sb->s_bdev, iov,
3615 offset, nr_segs, 3748 offset, nr_segs,
3616 ext4_get_block_dio_write, 3749 ext4_get_block_dio_write,
3617 ext4_end_io_dio); 3750 ext4_end_io_dio);
3751 if (iocb->private)
3752 EXT4_I(inode)->cur_aio_dio = NULL;
3753 /*
3754 * The io_end structure takes a reference to the inode,
3755 * that structure needs to be destroyed and the
3756 * reference to the inode need to be dropped, when IO is
3757 * complete, even with 0 byte write, or failed.
3758 *
3759 * In the successful AIO DIO case, the io_end structure will be
3760 * desctroyed and the reference to the inode will be dropped
3761 * after the end_io call back function is called.
3762 *
3763 * In the case there is 0 byte write, or error case, since
3764 * VFS direct IO won't invoke the end_io call back function,
3765 * we need to free the end_io structure here.
3766 */
3767 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3768 ext4_free_io_end(iocb->private);
3769 iocb->private = NULL;
3770 } else if (ret > 0)
3771 /*
3772 * for non AIO case, since the IO is already
3773 * completed, we could do the convertion right here
3774 */
3775 ret = ext4_convert_unwritten_extents(inode,
3776 offset, ret);
3618 return ret; 3777 return ret;
3619 } 3778 }
3779
3780 /* for write the the end of file case, we fall back to old way */
3620 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3781 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3621} 3782}
3622 3783