diff options
| -rw-r--r-- | fs/ext4/ext4.h | 12 | ||||
| -rw-r--r-- | fs/ext4/extents.c | 19 | ||||
| -rw-r--r-- | fs/ext4/fsync.c | 5 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 231 | ||||
| -rw-r--r-- | fs/ext4/super.c | 8 |
5 files changed, 234 insertions, 41 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccb4dbf359c4..b491576e11c3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -127,10 +127,11 @@ struct mpage_da_data { | |||
| 127 | int pages_written; | 127 | int pages_written; |
| 128 | int retval; | 128 | int retval; |
| 129 | }; | 129 | }; |
| 130 | 130 | #define DIO_AIO_UNWRITTEN 0x1 | |
| 131 | typedef struct ext4_io_end { | 131 | typedef struct ext4_io_end { |
| 132 | struct list_head list; /* per-file finished AIO list */ | ||
| 132 | struct inode *inode; /* file being written to */ | 133 | struct inode *inode; /* file being written to */ |
| 133 | unsigned int flag; /* sync IO or AIO */ | 134 | unsigned int flag; /* unwritten or not */ |
| 134 | int error; /* I/O error code */ | 135 | int error; /* I/O error code */ |
| 135 | ext4_lblk_t offset; /* offset in the file */ | 136 | ext4_lblk_t offset; /* offset in the file */ |
| 136 | size_t size; /* size of the extent */ | 137 | size_t size; /* size of the extent */ |
| @@ -690,6 +691,11 @@ struct ext4_inode_info { | |||
| 690 | __u16 i_extra_isize; | 691 | __u16 i_extra_isize; |
| 691 | 692 | ||
| 692 | spinlock_t i_block_reservation_lock; | 693 | spinlock_t i_block_reservation_lock; |
| 694 | |||
| 695 | /* completed async DIOs that might need unwritten extents handling */ | ||
| 696 | struct list_head i_aio_dio_complete_list; | ||
| 697 | /* current io_end structure for async DIO write*/ | ||
| 698 | ext4_io_end_t *cur_aio_dio; | ||
| 693 | }; | 699 | }; |
| 694 | 700 | ||
| 695 | /* | 701 | /* |
| @@ -1419,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle, | |||
| 1419 | struct address_space *mapping, loff_t from); | 1425 | struct address_space *mapping, loff_t from); |
| 1420 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 1426 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
| 1421 | extern qsize_t ext4_get_reserved_space(struct inode *inode); | 1427 | extern qsize_t ext4_get_reserved_space(struct inode *inode); |
| 1422 | 1428 | extern int flush_aio_dio_completed_IO(struct inode *inode); | |
| 1423 | /* ioctl.c */ | 1429 | /* ioctl.c */ |
| 1424 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | 1430 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
| 1425 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); | 1431 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a38e651c004e..10a63096a95a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -3033,6 +3033,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
| 3033 | { | 3033 | { |
| 3034 | int ret = 0; | 3034 | int ret = 0; |
| 3035 | int err = 0; | 3035 | int err = 0; |
| 3036 | ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; | ||
| 3036 | 3037 | ||
| 3037 | ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" | 3038 | ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" |
| 3038 | "block %llu, max_blocks %u, flags %d, allocated %u", | 3039 | "block %llu, max_blocks %u, flags %d, allocated %u", |
| @@ -3045,6 +3046,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, | |||
| 3045 | ret = ext4_split_unwritten_extents(handle, | 3046 | ret = ext4_split_unwritten_extents(handle, |
| 3046 | inode, path, iblock, | 3047 | inode, path, iblock, |
| 3047 | max_blocks, flags); | 3048 | max_blocks, flags); |
| 3049 | /* flag the io_end struct so the extent is converted when IO is done */ | ||
| 3050 | if (io) | ||
| 3051 | io->flag = DIO_AIO_UNWRITTEN; | ||
| 3048 | goto out; | 3052 | goto out; |
| 3049 | } | 3053 | } |
| 3050 | /* DIO end_io complete, convert the filled extent to written */ | 3054 | /* DIO end_io complete, convert the filled extent to written */ |
| @@ -3130,6 +3134,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 3130 | int err = 0, depth, ret, cache_type; | 3134 | int err = 0, depth, ret, cache_type; |
| 3131 | unsigned int allocated = 0; | 3135 | unsigned int allocated = 0; |
| 3132 | struct ext4_allocation_request ar; | 3136 | struct ext4_allocation_request ar; |
| 3137 | ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; | ||
| 3133 | 3138 | ||
| 3134 | __clear_bit(BH_New, &bh_result->b_state); | 3139 | __clear_bit(BH_New, &bh_result->b_state); |
| 3135 | ext_debug("blocks %u/%u requested for inode %lu\n", | 3140 | ext_debug("blocks %u/%u requested for inode %lu\n", |
| @@ -3279,8 +3284,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 3279 | /* try to insert new extent into found leaf and return */ | 3284 | /* try to insert new extent into found leaf and return */ |
| 3280 | ext4_ext_store_pblock(&newex, newblock); | 3285 | ext4_ext_store_pblock(&newex, newblock); |
| 3281 | newex.ee_len = cpu_to_le16(ar.len); | 3286 | newex.ee_len = cpu_to_le16(ar.len); |
| 3282 | if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ | 3287 | /* Mark uninitialized */ |
| 3288 | if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ | ||
| 3283 | ext4_ext_mark_uninitialized(&newex); | 3289 | ext4_ext_mark_uninitialized(&newex); |
| 3290 | /* | ||
| 3291 | * io_end structure was created for every async | ||
| 3292 | * direct IO write to the middle of the file. | ||
| 3293 | * To avoid unnecessary conversion for every aio dio rewrite | ||
| 3294 | * to the middle of the file, here we flag only the IO that really | ||
| 3295 | * needs the conversion. | ||
| 3296 | * | ||
| 3297 | */ | ||
| 3298 | if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) | ||
| 3299 | io->flag = DIO_AIO_UNWRITTEN; | ||
| 3300 | } | ||
| 3284 | err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); | 3301 | err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); |
| 3285 | if (err) { | 3302 | if (err) { |
| 3286 | /* free data blocks we just allocated */ | 3303 | /* free data blocks we just allocated */ |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 07475740b512..2b1531266ee2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
| @@ -44,6 +44,8 @@ | |||
| 44 | * | 44 | * |
| 45 | * What we do is just kick off a commit and wait on it. This will snapshot the | 45 | * What we do is just kick off a commit and wait on it. This will snapshot the |
| 46 | * inode to disk. | 46 | * inode to disk. |
| 47 | * | ||
| 48 | * i_mutex lock is held when entering and exiting this function | ||
| 47 | */ | 49 | */ |
| 48 | 50 | ||
| 49 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | 51 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) |
| @@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
| 56 | 58 | ||
| 57 | trace_ext4_sync_file(file, dentry, datasync); | 59 | trace_ext4_sync_file(file, dentry, datasync); |
| 58 | 60 | ||
| 61 | ret = flush_aio_dio_completed_IO(inode); | ||
| 62 | if (ret < 0) | ||
| 63 | goto out; | ||
| 59 | /* | 64 | /* |
| 60 | * data=writeback: | 65 | * data=writeback: |
| 61 | * The caller's filemap_fdatawrite()/wait will sync the data. | 66 | * The caller's filemap_fdatawrite()/wait will sync the data. |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5633af6a7045..118e16ca91d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, | |||
| 3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
| 3452 | int dio_credits; | 3452 | int dio_credits; |
| 3453 | 3453 | ||
| 3454 | ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", | ||
| 3455 | inode->i_ino, create); | ||
| 3454 | /* | 3456 | /* |
| 3455 | * DIO VFS code passes create = 0 flag for write to | 3457 | * DIO VFS code passes create = 0 flag for write to |
| 3456 | * the middle of file. It does this to avoid block | 3458 | * the middle of file. It does this to avoid block |
| @@ -3491,55 +3493,152 @@ out: | |||
| 3491 | return ret; | 3493 | return ret; |
| 3492 | } | 3494 | } |
| 3493 | 3495 | ||
| 3494 | #define DIO_AIO 0x1 | ||
| 3495 | |||
| 3496 | static void ext4_free_io_end(ext4_io_end_t *io) | 3496 | static void ext4_free_io_end(ext4_io_end_t *io) |
| 3497 | { | 3497 | { |
| 3498 | BUG_ON(!io); | ||
| 3499 | iput(io->inode); | ||
| 3498 | kfree(io); | 3500 | kfree(io); |
| 3499 | } | 3501 | } |
| 3502 | static void dump_aio_dio_list(struct inode * inode) | ||
| 3503 | { | ||
| 3504 | #ifdef EXT4_DEBUG | ||
| 3505 | struct list_head *cur, *before, *after; | ||
| 3506 | ext4_io_end_t *io, *io0, *io1; | ||
| 3507 | |||
| 3508 | if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
| 3509 | ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); | ||
| 3510 | return; | ||
| 3511 | } | ||
| 3512 | |||
| 3513 | ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); | ||
| 3514 | list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ | ||
| 3515 | cur = &io->list; | ||
| 3516 | before = cur->prev; | ||
| 3517 | io0 = container_of(before, ext4_io_end_t, list); | ||
| 3518 | after = cur->next; | ||
| 3519 | io1 = container_of(after, ext4_io_end_t, list); | ||
| 3520 | |||
| 3521 | ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", | ||
| 3522 | io, inode->i_ino, io0, io1); | ||
| 3523 | } | ||
| 3524 | #endif | ||
| 3525 | } | ||
| 3500 | 3526 | ||
| 3501 | /* | 3527 | /* |
| 3502 | * IO write completion for unwritten extents. | ||
| 3503 | * | ||
| 3504 | * check a range of space and convert unwritten extents to written. | 3528 | * check a range of space and convert unwritten extents to written. |
| 3505 | */ | 3529 | */ |
| 3506 | static void ext4_end_dio_unwritten(struct work_struct *work) | 3530 | static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) |
| 3507 | { | 3531 | { |
| 3508 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
| 3509 | struct inode *inode = io->inode; | 3532 | struct inode *inode = io->inode; |
| 3510 | loff_t offset = io->offset; | 3533 | loff_t offset = io->offset; |
| 3511 | size_t size = io->size; | 3534 | size_t size = io->size; |
| 3512 | int ret = 0; | 3535 | int ret = 0; |
| 3513 | int aio = io->flag & DIO_AIO; | ||
| 3514 | 3536 | ||
| 3515 | if (aio) | 3537 | ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," |
| 3516 | mutex_lock(&inode->i_mutex); | 3538 | "list->prev 0x%p\n", |
| 3539 | io, inode->i_ino, io->list.next, io->list.prev); | ||
| 3540 | |||
| 3541 | if (list_empty(&io->list)) | ||
| 3542 | return ret; | ||
| 3543 | |||
| 3544 | if (io->flag != DIO_AIO_UNWRITTEN) | ||
| 3545 | return ret; | ||
| 3546 | |||
| 3517 | if (offset + size <= i_size_read(inode)) | 3547 | if (offset + size <= i_size_read(inode)) |
| 3518 | ret = ext4_convert_unwritten_extents(inode, offset, size); | 3548 | ret = ext4_convert_unwritten_extents(inode, offset, size); |
| 3519 | 3549 | ||
| 3520 | if (ret < 0) | 3550 | if (ret < 0) { |
| 3521 | printk(KERN_EMERG "%s: failed to convert unwritten" | 3551 | printk(KERN_EMERG "%s: failed to convert unwritten" |
| 3522 | "extents to written extents, error is %d\n", | 3552 | "extents to written extents, error is %d" |
| 3523 | __func__, ret); | 3553 | " io is still on inode %lu aio dio list\n", |
| 3554 | __func__, ret, inode->i_ino); | ||
| 3555 | return ret; | ||
| 3556 | } | ||
| 3557 | |||
| 3558 | /* clear the DIO AIO unwritten flag */ | ||
| 3559 | io->flag = 0; | ||
| 3560 | return ret; | ||
| 3561 | } | ||
| 3562 | /* | ||
| 3563 | * work on completed aio dio IO, to convert unwritten extents to written extents | ||
| 3564 | */ | ||
| 3565 | static void ext4_end_aio_dio_work(struct work_struct *work) | ||
| 3566 | { | ||
| 3567 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
| 3568 | struct inode *inode = io->inode; | ||
| 3569 | int ret = 0; | ||
| 3570 | |||
| 3571 | mutex_lock(&inode->i_mutex); | ||
| 3572 | ret = ext4_end_aio_dio_nolock(io); | ||
| 3573 | if (ret >= 0) { | ||
| 3574 | if (!list_empty(&io->list)) | ||
| 3575 | list_del_init(&io->list); | ||
| 3576 | ext4_free_io_end(io); | ||
| 3577 | } | ||
| 3578 | mutex_unlock(&inode->i_mutex); | ||
| 3579 | } | ||
| 3580 | /* | ||
| 3581 | * This function is called from ext4_sync_file(). | ||
| 3582 | * | ||
| 3583 | * When AIO DIO IO is completed, the work to convert unwritten | ||
| 3584 | * extents to written is queued on workqueue but may not get immediately | ||
| 3585 | * scheduled. When fsync is called, we need to ensure the | ||
| 3586 | * conversion is complete before fsync returns. | ||
| 3587 | * The inode keeps track of a list of completed AIO from DIO path | ||
| 3588 | * that might need the conversion. This function walks through | ||
| 3589 | * the list and converts the related unwritten extents to written. | ||
| 3590 | */ | ||
| 3591 | int flush_aio_dio_completed_IO(struct inode *inode) | ||
| 3592 | { | ||
| 3593 | ext4_io_end_t *io; | ||
| 3594 | int ret = 0; | ||
| 3595 | int ret2 = 0; | ||
| 3524 | 3596 | ||
| 3525 | ext4_free_io_end(io); | 3597 | if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) |
| 3526 | if (aio) | 3598 | return ret; |
| 3527 | mutex_unlock(&inode->i_mutex); | 3599 | |
| 3600 | dump_aio_dio_list(inode); | ||
| 3601 | while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ | ||
| 3602 | io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, | ||
| 3603 | ext4_io_end_t, list); | ||
| 3604 | /* | ||
| 3605 | * Calling ext4_end_aio_dio_nolock() to convert completed | ||
| 3606 | * IO to written. | ||
| 3607 | * | ||
| 3608 | * When ext4_sync_file() is called, run_queue() may already be | ||
| 3609 | * about to flush the work corresponding to this io structure. | ||
| 3610 | * It will be upset if it finds that the io structure related | ||
| 3611 | * to the work to be scheduled has been freed. | ||
| 3612 | * | ||
| 3613 | * Thus we need to keep the io structure valid here after the | ||
| 3614 | * conversion has finished. The io structure has a flag to | ||
| 3615 | * avoid double conversion by both fsync and the background | ||
| 3616 | * workqueue work. | ||
| 3617 | */ | ||
| 3618 | ret = ext4_end_aio_dio_nolock(io); | ||
| 3619 | if (ret < 0) | ||
| 3620 | ret2 = ret; | ||
| 3621 | else | ||
| 3622 | list_del_init(&io->list); | ||
| 3623 | } | ||
| 3624 | return (ret2 < 0) ? ret2 : 0; | ||
| 3528 | } | 3625 | } |
| 3529 | 3626 | ||
| 3530 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) | 3627 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode) |
| 3531 | { | 3628 | { |
| 3532 | ext4_io_end_t *io = NULL; | 3629 | ext4_io_end_t *io = NULL; |
| 3533 | 3630 | ||
| 3534 | io = kmalloc(sizeof(*io), GFP_NOFS); | 3631 | io = kmalloc(sizeof(*io), GFP_NOFS); |
| 3535 | 3632 | ||
| 3536 | if (io) { | 3633 | if (io) { |
| 3634 | igrab(inode); | ||
| 3537 | io->inode = inode; | 3635 | io->inode = inode; |
| 3538 | io->flag = flag; | 3636 | io->flag = 0; |
| 3539 | io->offset = 0; | 3637 | io->offset = 0; |
| 3540 | io->size = 0; | 3638 | io->size = 0; |
| 3541 | io->error = 0; | 3639 | io->error = 0; |
| 3542 | INIT_WORK(&io->work, ext4_end_dio_unwritten); | 3640 | INIT_WORK(&io->work, ext4_end_aio_dio_work); |
| 3641 | INIT_LIST_HEAD(&io->list); | ||
| 3543 | } | 3642 | } |
| 3544 | 3643 | ||
| 3545 | return io; | 3644 | return io; |
| @@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3551 | ext4_io_end_t *io_end = iocb->private; | 3650 | ext4_io_end_t *io_end = iocb->private; |
| 3552 | struct workqueue_struct *wq; | 3651 | struct workqueue_struct *wq; |
| 3553 | 3652 | ||
| 3554 | /* if not hole or unwritten extents, just simple return */ | 3653 | ext_debug("ext4_end_io_dio(): io_end 0x%p" |
| 3555 | if (!io_end || !size || !iocb->private) | 3654 | "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", |
| 3655 | iocb->private, io_end->inode->i_ino, iocb, offset, | ||
| 3656 | size); | ||
| 3657 | /* if not async direct IO or dio with 0 bytes write, just return */ | ||
| 3658 | if (!io_end || !size) | ||
| 3556 | return; | 3659 | return; |
| 3660 | |||
| 3661 | /* if not aio dio with unwritten extents, just free io and return */ | ||
| 3662 | if (io_end->flag != DIO_AIO_UNWRITTEN){ | ||
| 3663 | ext4_free_io_end(io_end); | ||
| 3664 | iocb->private = NULL; | ||
| 3665 | return; | ||
| 3666 | } | ||
| 3667 | |||
| 3557 | io_end->offset = offset; | 3668 | io_end->offset = offset; |
| 3558 | io_end->size = size; | 3669 | io_end->size = size; |
| 3559 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | 3670 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; |
| 3560 | 3671 | ||
| 3561 | /* We need to convert unwritten extents to written */ | 3672 | /* queue the work to convert unwritten extents to written */ |
| 3562 | queue_work(wq, &io_end->work); | 3673 | queue_work(wq, &io_end->work); |
| 3563 | 3674 | ||
| 3564 | if (is_sync_kiocb(iocb)) | 3675 | /* Add the io_end to per-inode completed aio dio list*/ |
| 3565 | flush_workqueue(wq); | 3676 | list_add_tail(&io_end->list, |
| 3566 | 3677 | &EXT4_I(io_end->inode)->i_aio_dio_complete_list); | |
| 3567 | iocb->private = NULL; | 3678 | iocb->private = NULL; |
| 3568 | } | 3679 | } |
| 3569 | /* | 3680 | /* |
| @@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 3575 | * If those blocks were preallocated, we mark sure they are splited, but | 3686 | * If those blocks were preallocated, we mark sure they are splited, but |
| 3576 | * still keep the range to write as unintialized. | 3687 | * still keep the range to write as unintialized. |
| 3577 | * | 3688 | * |
| 3578 | * When end_io call back function called at the last IO complete time, | 3689 | * The unwritten extents will be converted to written when DIO is completed. |
| 3579 | * those extents will be converted to written extents. | 3690 | * For async direct IO, since the IO may still be pending on return, we |
| 3691 | * set up an end_io call back function, which will do the conversion | ||
| 3692 | * when the async direct IO has completed. | ||
| 3580 | * | 3693 | * |
| 3581 | * If the O_DIRECT write will extend the file then add this inode to the | 3694 | * If the O_DIRECT write will extend the file then add this inode to the |
| 3582 | * orphan list. So recovery will truncate it back to the original size | 3695 | * orphan list. So recovery will truncate it back to the original size |
| @@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
| 3595 | loff_t final_size = offset + count; | 3708 | loff_t final_size = offset + count; |
| 3596 | if (rw == WRITE && final_size <= inode->i_size) { | 3709 | if (rw == WRITE && final_size <= inode->i_size) { |
| 3597 | /* | 3710 | /* |
| 3598 | * For DIO we fallocate blocks for holes, we fallocate blocks | 3711 | * We could direct write to holes and fallocate. |
| 3599 | * The fallocated extent for hole is marked as uninitialized | 3712 | * |
| 3713 | * Allocated blocks to fill the hole are marked as uninitialized | ||
| 3600 | * to prevent paralel buffered read to expose the stale data | 3714 | * to prevent paralel buffered read to expose the stale data |
| 3601 | * before DIO complete the data IO. | 3715 | * before DIO complete the data IO. |
| 3602 | * as for previously fallocated extents, ext4 get_block | 3716 | * |
| 3717 | * As to previously fallocated extents, ext4 get_block | ||
| 3603 | * will just simply mark the buffer mapped but still | 3718 | * will just simply mark the buffer mapped but still |
| 3604 | * keep the extents uninitialized. | 3719 | * keep the extents uninitialized. |
| 3605 | * | 3720 | * |
| 3606 | * At the end of IO, the ext4 end_io callback function | 3721 | * for non AIO case, we will convert those unwritten extents |
| 3607 | * will convert those unwritten extents to written, | 3722 | * to written after return back from blockdev_direct_IO. |
| 3608 | * | 3723 | * |
| 3724 | * for async DIO, the conversion needs to be deferred when | ||
| 3725 | * the IO is completed. The ext4 end_io callback function | ||
| 3726 | * will be called to take care of the conversion work. | ||
| 3727 | * Here for async case, we allocate an io_end structure to | ||
| 3728 | * hook to the iocb. | ||
| 3609 | */ | 3729 | */ |
| 3610 | iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); | 3730 | iocb->private = NULL; |
| 3611 | if (!iocb->private) | 3731 | EXT4_I(inode)->cur_aio_dio = NULL; |
| 3612 | return -ENOMEM; | 3732 | if (!is_sync_kiocb(iocb)) { |
| 3733 | iocb->private = ext4_init_io_end(inode); | ||
| 3734 | if (!iocb->private) | ||
| 3735 | return -ENOMEM; | ||
| 3736 | /* | ||
| 3737 | * we save the io structure for current async | ||
| 3738 | * direct IO, so that later ext4_get_blocks() | ||
| 3739 | * could flag in the io structure whether there | ||
| 3740 | * are unwritten extents that need to be converted | ||
| 3741 | * when IO is completed. | ||
| 3742 | */ | ||
| 3743 | EXT4_I(inode)->cur_aio_dio = iocb->private; | ||
| 3744 | } | ||
| 3745 | |||
| 3613 | ret = blockdev_direct_IO(rw, iocb, inode, | 3746 | ret = blockdev_direct_IO(rw, iocb, inode, |
| 3614 | inode->i_sb->s_bdev, iov, | 3747 | inode->i_sb->s_bdev, iov, |
| 3615 | offset, nr_segs, | 3748 | offset, nr_segs, |
| 3616 | ext4_get_block_dio_write, | 3749 | ext4_get_block_dio_write, |
| 3617 | ext4_end_io_dio); | 3750 | ext4_end_io_dio); |
| 3751 | if (iocb->private) | ||
| 3752 | EXT4_I(inode)->cur_aio_dio = NULL; | ||
| 3753 | /* | ||
| 3754 | * The io_end structure takes a reference to the inode, | ||
| 3755 | * that structure needs to be destroyed and the | ||
| 3756 | * reference to the inode need to be dropped, when IO is | ||
| 3757 | * complete, even with 0 byte write, or failed. | ||
| 3758 | * | ||
| 3759 | * In the successful AIO DIO case, the io_end structure will be | ||
| 3760 | * destroyed and the reference to the inode will be dropped | ||
| 3761 | * after the end_io call back function is called. | ||
| 3762 | * | ||
| 3763 | * In the case there is 0 byte write, or error case, since | ||
| 3764 | * VFS direct IO won't invoke the end_io call back function, | ||
| 3765 | * we need to free the end_io structure here. | ||
| 3766 | */ | ||
| 3767 | if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { | ||
| 3768 | ext4_free_io_end(iocb->private); | ||
| 3769 | iocb->private = NULL; | ||
| 3770 | } else if (ret > 0) | ||
| 3771 | /* | ||
| 3772 | * for non AIO case, since the IO is already | ||
| 3773 | * completed, we could do the conversion right here | ||
| 3774 | */ | ||
| 3775 | ret = ext4_convert_unwritten_extents(inode, | ||
| 3776 | offset, ret); | ||
| 3618 | return ret; | 3777 | return ret; |
| 3619 | } | 3778 | } |
| 3779 | |||
| 3780 | /* for the write to the end of file case, we fall back to the old way */ | ||
| 3620 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | 3781 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
| 3621 | } | 3782 | } |
| 3622 | 3783 | ||
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1a03ea98fdd1..f095c60b569e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 687 | ei->i_allocated_meta_blocks = 0; | 687 | ei->i_allocated_meta_blocks = 0; |
| 688 | ei->i_delalloc_reserved_flag = 0; | 688 | ei->i_delalloc_reserved_flag = 0; |
| 689 | spin_lock_init(&(ei->i_block_reservation_lock)); | 689 | spin_lock_init(&(ei->i_block_reservation_lock)); |
| 690 | INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); | ||
| 691 | ei->cur_aio_dio = NULL; | ||
| 690 | 692 | ||
| 691 | return &ei->vfs_inode; | 693 | return &ei->vfs_inode; |
| 692 | } | 694 | } |
| @@ -3375,11 +3377,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
| 3375 | { | 3377 | { |
| 3376 | int ret = 0; | 3378 | int ret = 0; |
| 3377 | tid_t target; | 3379 | tid_t target; |
| 3380 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 3378 | 3381 | ||
| 3379 | trace_ext4_sync_fs(sb, wait); | 3382 | trace_ext4_sync_fs(sb, wait); |
| 3380 | if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { | 3383 | flush_workqueue(sbi->dio_unwritten_wq); |
| 3384 | if (jbd2_journal_start_commit(sbi->s_journal, &target)) { | ||
| 3381 | if (wait) | 3385 | if (wait) |
| 3382 | jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); | 3386 | jbd2_log_wait_commit(sbi->s_journal, target); |
| 3383 | } | 3387 | } |
| 3384 | return ret; | 3388 | return ret; |
| 3385 | } | 3389 | } |
