aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
authorMingming Cao <cmm@us.ibm.com>2009-09-28 15:48:29 -0400
committerTheodore Ts'o <tytso@mit.edu>2009-09-28 15:48:29 -0400
commit8d5d02e6b176565c77ff03604908b1453a22044d (patch)
tree0d29e4f28233f24960c7921c1c0a7608077bf713 /fs/ext4/inode.c
parent4c0425ff68b1b87b802ffeda7b6a46ff7da7241c (diff)
ext4: async direct IO for holes and fallocate support
For async direct IO that covers holes or fallocated extents, the end_io callback function now queues the conversion work on a workqueue but does not flush the work right away, as that might take too long. But when fsync is called after all the data IO has completed, the user expects the metadata to be updated as well before fsync returns. Thus we need to flush the conversion work when fsync() is called. This patch keeps track of a list of completed async direct IO requests that have work queued on the workqueue. When fsync() is called, it will go through the list and do the conversion. Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c231
1 files changed, 196 insertions, 35 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5633af6a7045..118e16ca91d7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3452 int dio_credits; 3452 int dio_credits;
3453 3453
3454 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3455 inode->i_ino, create);
3454 /* 3456 /*
3455 * DIO VFS code passes create = 0 flag for write to 3457 * DIO VFS code passes create = 0 flag for write to
3456 * the middle of file. It does this to avoid block 3458 * the middle of file. It does this to avoid block
@@ -3491,55 +3493,152 @@ out:
3491 return ret; 3493 return ret;
3492} 3494}
3493 3495
3494#define DIO_AIO 0x1
3495
3496static void ext4_free_io_end(ext4_io_end_t *io) 3496static void ext4_free_io_end(ext4_io_end_t *io)
3497{ 3497{
3498 BUG_ON(!io);
3499 iput(io->inode);
3498 kfree(io); 3500 kfree(io);
3499} 3501}
3502static void dump_aio_dio_list(struct inode * inode)
3503{
3504#ifdef EXT4_DEBUG
3505 struct list_head *cur, *before, *after;
3506 ext4_io_end_t *io, *io0, *io1;
3507
3508 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3509 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3510 return;
3511 }
3512
3513 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3514 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3515 cur = &io->list;
3516 before = cur->prev;
3517 io0 = container_of(before, ext4_io_end_t, list);
3518 after = cur->next;
3519 io1 = container_of(after, ext4_io_end_t, list);
3520
3521 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3522 io, inode->i_ino, io0, io1);
3523 }
3524#endif
3525}
3500 3526
3501/* 3527/*
3502 * IO write completion for unwritten extents.
3503 *
3504 * check a range of space and convert unwritten extents to written. 3528 * check a range of space and convert unwritten extents to written.
3505 */ 3529 */
3506static void ext4_end_dio_unwritten(struct work_struct *work) 3530static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3507{ 3531{
3508 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3509 struct inode *inode = io->inode; 3532 struct inode *inode = io->inode;
3510 loff_t offset = io->offset; 3533 loff_t offset = io->offset;
3511 size_t size = io->size; 3534 size_t size = io->size;
3512 int ret = 0; 3535 int ret = 0;
3513 int aio = io->flag & DIO_AIO;
3514 3536
3515 if (aio) 3537 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3516 mutex_lock(&inode->i_mutex); 3538 "list->prev 0x%p\n",
3539 io, inode->i_ino, io->list.next, io->list.prev);
3540
3541 if (list_empty(&io->list))
3542 return ret;
3543
3544 if (io->flag != DIO_AIO_UNWRITTEN)
3545 return ret;
3546
3517 if (offset + size <= i_size_read(inode)) 3547 if (offset + size <= i_size_read(inode))
3518 ret = ext4_convert_unwritten_extents(inode, offset, size); 3548 ret = ext4_convert_unwritten_extents(inode, offset, size);
3519 3549
3520 if (ret < 0) 3550 if (ret < 0) {
3521 printk(KERN_EMERG "%s: failed to convert unwritten" 3551 printk(KERN_EMERG "%s: failed to convert unwritten"
3522 "extents to written extents, error is %d\n", 3552 "extents to written extents, error is %d"
3523 __func__, ret); 3553 " io is still on inode %lu aio dio list\n",
3554 __func__, ret, inode->i_ino);
3555 return ret;
3556 }
3557
3558 /* clear the DIO AIO unwritten flag */
3559 io->flag = 0;
3560 return ret;
3561}
3562/*
3563 * work on completed aio dio IO, to convert unwritten extents to extents
3564 */
3565static void ext4_end_aio_dio_work(struct work_struct *work)
3566{
3567 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3568 struct inode *inode = io->inode;
3569 int ret = 0;
3570
3571 mutex_lock(&inode->i_mutex);
3572 ret = ext4_end_aio_dio_nolock(io);
3573 if (ret >= 0) {
3574 if (!list_empty(&io->list))
3575 list_del_init(&io->list);
3576 ext4_free_io_end(io);
3577 }
3578 mutex_unlock(&inode->i_mutex);
3579}
3580/*
3581 * This function is called from ext4_sync_file().
3582 *
3583 * When AIO DIO IO is completed, the work to convert unwritten
3584 * extents to written is queued on workqueue but may not get immediately
3585 * scheduled. When fsync is called, we need to ensure the
3586 * conversion is complete before fsync returns.
3587 * The inode keeps track of a list of completed AIO from DIO path
3588 * that might needs to do the conversion. This function walks through
3589 * the list and convert the related unwritten extents to written.
3590 */
3591int flush_aio_dio_completed_IO(struct inode *inode)
3592{
3593 ext4_io_end_t *io;
3594 int ret = 0;
3595 int ret2 = 0;
3524 3596
3525 ext4_free_io_end(io); 3597 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3526 if (aio) 3598 return ret;
3527 mutex_unlock(&inode->i_mutex); 3599
3600 dump_aio_dio_list(inode);
3601 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3602 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3603 ext4_io_end_t, list);
3604 /*
3605 * Calling ext4_end_aio_dio_nolock() to convert completed
3606 * IO to written.
3607 *
3608 * When ext4_sync_file() is called, run_queue() may already
3609 * about to flush the work corresponding to this io structure.
3610 * It will be upset if it founds the io structure related
3611 * to the work-to-be schedule is freed.
3612 *
3613 * Thus we need to keep the io structure still valid here after
3614 * convertion finished. The io structure has a flag to
3615 * avoid double converting from both fsync and background work
3616 * queue work.
3617 */
3618 ret = ext4_end_aio_dio_nolock(io);
3619 if (ret < 0)
3620 ret2 = ret;
3621 else
3622 list_del_init(&io->list);
3623 }
3624 return (ret2 < 0) ? ret2 : 0;
3528} 3625}
3529 3626
3530static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) 3627static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3531{ 3628{
3532 ext4_io_end_t *io = NULL; 3629 ext4_io_end_t *io = NULL;
3533 3630
3534 io = kmalloc(sizeof(*io), GFP_NOFS); 3631 io = kmalloc(sizeof(*io), GFP_NOFS);
3535 3632
3536 if (io) { 3633 if (io) {
3634 igrab(inode);
3537 io->inode = inode; 3635 io->inode = inode;
3538 io->flag = flag; 3636 io->flag = 0;
3539 io->offset = 0; 3637 io->offset = 0;
3540 io->size = 0; 3638 io->size = 0;
3541 io->error = 0; 3639 io->error = 0;
3542 INIT_WORK(&io->work, ext4_end_dio_unwritten); 3640 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3641 INIT_LIST_HEAD(&io->list);
3543 } 3642 }
3544 3643
3545 return io; 3644 return io;
@@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3551 ext4_io_end_t *io_end = iocb->private; 3650 ext4_io_end_t *io_end = iocb->private;
3552 struct workqueue_struct *wq; 3651 struct workqueue_struct *wq;
3553 3652
3554 /* if not hole or unwritten extents, just simple return */ 3653 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3555 if (!io_end || !size || !iocb->private) 3654 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3655 iocb->private, io_end->inode->i_ino, iocb, offset,
3656 size);
3657 /* if not async direct IO or dio with 0 bytes write, just return */
3658 if (!io_end || !size)
3556 return; 3659 return;
3660
3661 /* if not aio dio with unwritten extents, just free io and return */
3662 if (io_end->flag != DIO_AIO_UNWRITTEN){
3663 ext4_free_io_end(io_end);
3664 iocb->private = NULL;
3665 return;
3666 }
3667
3557 io_end->offset = offset; 3668 io_end->offset = offset;
3558 io_end->size = size; 3669 io_end->size = size;
3559 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3670 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3560 3671
3561 /* We need to convert unwritten extents to written */ 3672 /* queue the work to convert unwritten extents to written */
3562 queue_work(wq, &io_end->work); 3673 queue_work(wq, &io_end->work);
3563 3674
3564 if (is_sync_kiocb(iocb)) 3675 /* Add the io_end to per-inode completed aio dio list*/
3565 flush_workqueue(wq); 3676 list_add_tail(&io_end->list,
3566 3677 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3567 iocb->private = NULL; 3678 iocb->private = NULL;
3568} 3679}
3569/* 3680/*
@@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3575 * If those blocks were preallocated, we mark sure they are splited, but 3686 * If those blocks were preallocated, we mark sure they are splited, but
3576 * still keep the range to write as unintialized. 3687 * still keep the range to write as unintialized.
3577 * 3688 *
3578 * When end_io call back function called at the last IO complete time, 3689 * The unwrritten extents will be converted to written when DIO is completed.
3579 * those extents will be converted to written extents. 3690 * For async direct IO, since the IO may still pending when return, we
3691 * set up an end_io call back function, which will do the convertion
3692 * when async direct IO completed.
3580 * 3693 *
3581 * If the O_DIRECT write will extend the file then add this inode to the 3694 * If the O_DIRECT write will extend the file then add this inode to the
3582 * orphan list. So recovery will truncate it back to the original size 3695 * orphan list. So recovery will truncate it back to the original size
@@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3595 loff_t final_size = offset + count; 3708 loff_t final_size = offset + count;
3596 if (rw == WRITE && final_size <= inode->i_size) { 3709 if (rw == WRITE && final_size <= inode->i_size) {
3597 /* 3710 /*
3598 * For DIO we fallocate blocks for holes, we fallocate blocks 3711 * We could direct write to holes and fallocate.
3599 * The fallocated extent for hole is marked as uninitialized 3712 *
3713 * Allocated blocks to fill the hole are marked as uninitialized
3600 * to prevent paralel buffered read to expose the stale data 3714 * to prevent paralel buffered read to expose the stale data
3601 * before DIO complete the data IO. 3715 * before DIO complete the data IO.
3602 * as for previously fallocated extents, ext4 get_block 3716 *
3717 * As to previously fallocated extents, ext4 get_block
3603 * will just simply mark the buffer mapped but still 3718 * will just simply mark the buffer mapped but still
3604 * keep the extents uninitialized. 3719 * keep the extents uninitialized.
3605 * 3720 *
3606 * At the end of IO, the ext4 end_io callback function 3721 * for non AIO case, we will convert those unwritten extents
3607 * will convert those unwritten extents to written, 3722 * to written after return back from blockdev_direct_IO.
3608 * 3723 *
3724 * for async DIO, the conversion needs to be defered when
3725 * the IO is completed. The ext4 end_io callback function
3726 * will be called to take care of the conversion work.
3727 * Here for async case, we allocate an io_end structure to
3728 * hook to the iocb.
3609 */ 3729 */
3610 iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); 3730 iocb->private = NULL;
3611 if (!iocb->private) 3731 EXT4_I(inode)->cur_aio_dio = NULL;
3612 return -ENOMEM; 3732 if (!is_sync_kiocb(iocb)) {
3733 iocb->private = ext4_init_io_end(inode);
3734 if (!iocb->private)
3735 return -ENOMEM;
3736 /*
3737 * we save the io structure for current async
3738 * direct IO, so that later ext4_get_blocks()
3739 * could flag the io structure whether there
3740 * is a unwritten extents needs to be converted
3741 * when IO is completed.
3742 */
3743 EXT4_I(inode)->cur_aio_dio = iocb->private;
3744 }
3745
3613 ret = blockdev_direct_IO(rw, iocb, inode, 3746 ret = blockdev_direct_IO(rw, iocb, inode,
3614 inode->i_sb->s_bdev, iov, 3747 inode->i_sb->s_bdev, iov,
3615 offset, nr_segs, 3748 offset, nr_segs,
3616 ext4_get_block_dio_write, 3749 ext4_get_block_dio_write,
3617 ext4_end_io_dio); 3750 ext4_end_io_dio);
3751 if (iocb->private)
3752 EXT4_I(inode)->cur_aio_dio = NULL;
3753 /*
3754 * The io_end structure takes a reference to the inode,
3755 * that structure needs to be destroyed and the
3756 * reference to the inode need to be dropped, when IO is
3757 * complete, even with 0 byte write, or failed.
3758 *
3759 * In the successful AIO DIO case, the io_end structure will be
3760 * desctroyed and the reference to the inode will be dropped
3761 * after the end_io call back function is called.
3762 *
3763 * In the case there is 0 byte write, or error case, since
3764 * VFS direct IO won't invoke the end_io call back function,
3765 * we need to free the end_io structure here.
3766 */
3767 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3768 ext4_free_io_end(iocb->private);
3769 iocb->private = NULL;
3770 } else if (ret > 0)
3771 /*
3772 * for non AIO case, since the IO is already
3773 * completed, we could do the convertion right here
3774 */
3775 ret = ext4_convert_unwritten_extents(inode,
3776 offset, ret);
3618 return ret; 3777 return ret;
3619 } 3778 }
3779
3780 /* for write the the end of file case, we fall back to old way */
3620 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3781 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3621} 3782}
3622 3783