diff options
-rw-r--r-- | fs/ext4/ext4.h | 3 | ||||
-rw-r--r-- | fs/ext4/inode.c | 197 | ||||
-rw-r--r-- | fs/ext4/super.c | 11 |
3 files changed, 210 insertions, 1 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b4293aac162..ccb4dbf359c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -999,6 +999,9 @@ struct ext4_sb_info { | |||
999 | 999 | ||
1000 | unsigned int s_log_groups_per_flex; | 1000 | unsigned int s_log_groups_per_flex; |
1001 | struct flex_groups *s_flex_groups; | 1001 | struct flex_groups *s_flex_groups; |
1002 | |||
1003 | /* workqueue for dio unwritten */ | ||
1004 | struct workqueue_struct *dio_unwritten_wq; | ||
1002 | }; | 1005 | }; |
1003 | 1006 | ||
1004 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 1007 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index da4f2ecb5447..5633af6a7045 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/namei.h> | 37 | #include <linux/namei.h> |
38 | #include <linux/uio.h> | 38 | #include <linux/uio.h> |
39 | #include <linux/bio.h> | 39 | #include <linux/bio.h> |
40 | #include <linux/workqueue.h> | ||
40 | 41 | ||
41 | #include "ext4_jbd2.h" | 42 | #include "ext4_jbd2.h" |
42 | #include "xattr.h" | 43 | #include "xattr.h" |
@@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3356 | } | 3357 | } |
3357 | 3358 | ||
3358 | /* | 3359 | /* |
3360 | * O_DIRECT for ext3 (or indirect map) based files | ||
3361 | * | ||
3359 | * If the O_DIRECT write will extend the file then add this inode to the | 3362 | * If the O_DIRECT write will extend the file then add this inode to the |
3360 | * orphan list. So recovery will truncate it back to the original size | 3363 | * orphan list. So recovery will truncate it back to the original size |
3361 | * if the machine crashes during the write. | 3364 | * if the machine crashes during the write. |
@@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3364 | * crashes then stale disk data _may_ be exposed inside the file. But current | 3367 | * crashes then stale disk data _may_ be exposed inside the file. But current |
3365 | * VFS code falls back into buffered path in that case so we are safe. | 3368 | * VFS code falls back into buffered path in that case so we are safe. |
3366 | */ | 3369 | */ |
3367 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | 3370 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
3368 | const struct iovec *iov, loff_t offset, | 3371 | const struct iovec *iov, loff_t offset, |
3369 | unsigned long nr_segs) | 3372 | unsigned long nr_segs) |
3370 | { | 3373 | { |
@@ -3438,6 +3441,198 @@ out: | |||
3438 | return ret; | 3441 | return ret; |
3439 | } | 3442 | } |
3440 | 3443 | ||
3444 | /* Maximum number of blocks we map for direct IO at once. */ | ||
3445 | |||
3446 | static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, | ||
3447 | struct buffer_head *bh_result, int create) | ||
3448 | { | ||
3449 | handle_t *handle = NULL; | ||
3450 | int ret = 0; | ||
3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
3452 | int dio_credits; | ||
3453 | |||
3454 | /* | ||
3455 | * DIO VFS code passes create = 0 flag for write to | ||
3456 | * the middle of file. It does this to avoid block | ||
3457 | * allocation for holes, to prevent expose stale data | ||
3458 | * out when there is parallel buffered read (which does | ||
3459 | * not hold the i_mutex lock) while direct IO write has | ||
3460 | * not completed. DIO request on holes finally falls back | ||
3461 | * to buffered IO for this reason. | ||
3462 | * | ||
3463 | * For ext4 extent based file, since we support fallocate, | ||
3464 | * new allocated extent as uninitialized, for holes, we | ||
3465 | * could fallocate blocks for holes, thus parallel | ||
3466 | * buffered IO read will zero out the page when read on | ||
3467 | * a hole while parallel DIO write to the hole has not completed. | ||
3468 | * | ||
3469 | * when we come here, we know it's a direct IO write to | ||
3470 | * to the middle of file (<i_size) | ||
3471 | * so it's safe to override the create flag from VFS. | ||
3472 | */ | ||
3473 | create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; | ||
3474 | |||
3475 | if (max_blocks > DIO_MAX_BLOCKS) | ||
3476 | max_blocks = DIO_MAX_BLOCKS; | ||
3477 | dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
3478 | handle = ext4_journal_start(inode, dio_credits); | ||
3479 | if (IS_ERR(handle)) { | ||
3480 | ret = PTR_ERR(handle); | ||
3481 | goto out; | ||
3482 | } | ||
3483 | ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, | ||
3484 | create); | ||
3485 | if (ret > 0) { | ||
3486 | bh_result->b_size = (ret << inode->i_blkbits); | ||
3487 | ret = 0; | ||
3488 | } | ||
3489 | ext4_journal_stop(handle); | ||
3490 | out: | ||
3491 | return ret; | ||
3492 | } | ||
3493 | |||
3494 | #define DIO_AIO 0x1 | ||
3495 | |||
3496 | static void ext4_free_io_end(ext4_io_end_t *io) | ||
3497 | { | ||
3498 | kfree(io); | ||
3499 | } | ||
3500 | |||
3501 | /* | ||
3502 | * IO write completion for unwritten extents. | ||
3503 | * | ||
3504 | * check a range of space and convert unwritten extents to written. | ||
3505 | */ | ||
3506 | static void ext4_end_dio_unwritten(struct work_struct *work) | ||
3507 | { | ||
3508 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
3509 | struct inode *inode = io->inode; | ||
3510 | loff_t offset = io->offset; | ||
3511 | size_t size = io->size; | ||
3512 | int ret = 0; | ||
3513 | int aio = io->flag & DIO_AIO; | ||
3514 | |||
3515 | if (aio) | ||
3516 | mutex_lock(&inode->i_mutex); | ||
3517 | if (offset + size <= i_size_read(inode)) | ||
3518 | ret = ext4_convert_unwritten_extents(inode, offset, size); | ||
3519 | |||
3520 | if (ret < 0) | ||
3521 | printk(KERN_EMERG "%s: failed to convert unwritten" | ||
3522 | "extents to written extents, error is %d\n", | ||
3523 | __func__, ret); | ||
3524 | |||
3525 | ext4_free_io_end(io); | ||
3526 | if (aio) | ||
3527 | mutex_unlock(&inode->i_mutex); | ||
3528 | } | ||
3529 | |||
3530 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) | ||
3531 | { | ||
3532 | ext4_io_end_t *io = NULL; | ||
3533 | |||
3534 | io = kmalloc(sizeof(*io), GFP_NOFS); | ||
3535 | |||
3536 | if (io) { | ||
3537 | io->inode = inode; | ||
3538 | io->flag = flag; | ||
3539 | io->offset = 0; | ||
3540 | io->size = 0; | ||
3541 | io->error = 0; | ||
3542 | INIT_WORK(&io->work, ext4_end_dio_unwritten); | ||
3543 | } | ||
3544 | |||
3545 | return io; | ||
3546 | } | ||
3547 | |||
3548 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | ||
3549 | ssize_t size, void *private) | ||
3550 | { | ||
3551 | ext4_io_end_t *io_end = iocb->private; | ||
3552 | struct workqueue_struct *wq; | ||
3553 | |||
3554 | /* if not hole or unwritten extents, just simple return */ | ||
3555 | if (!io_end || !size || !iocb->private) | ||
3556 | return; | ||
3557 | io_end->offset = offset; | ||
3558 | io_end->size = size; | ||
3559 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
3560 | |||
3561 | /* We need to convert unwritten extents to written */ | ||
3562 | queue_work(wq, &io_end->work); | ||
3563 | |||
3564 | if (is_sync_kiocb(iocb)) | ||
3565 | flush_workqueue(wq); | ||
3566 | |||
3567 | iocb->private = NULL; | ||
3568 | } | ||
3569 | /* | ||
3570 | * For ext4 extent files, ext4 will do direct-io write to holes, | ||
3571 | * preallocated extents, and those write extend the file, no need to | ||
3572 | * fall back to buffered IO. | ||
3573 | * | ||
3574 | * For holes, we fallocate those blocks, mark them as unintialized | ||
3575 | * If those blocks were preallocated, we mark sure they are splited, but | ||
3576 | * still keep the range to write as unintialized. | ||
3577 | * | ||
3578 | * When end_io call back function called at the last IO complete time, | ||
3579 | * those extents will be converted to written extents. | ||
3580 | * | ||
3581 | * If the O_DIRECT write will extend the file then add this inode to the | ||
3582 | * orphan list. So recovery will truncate it back to the original size | ||
3583 | * if the machine crashes during the write. | ||
3584 | * | ||
3585 | */ | ||
3586 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | ||
3587 | const struct iovec *iov, loff_t offset, | ||
3588 | unsigned long nr_segs) | ||
3589 | { | ||
3590 | struct file *file = iocb->ki_filp; | ||
3591 | struct inode *inode = file->f_mapping->host; | ||
3592 | ssize_t ret; | ||
3593 | size_t count = iov_length(iov, nr_segs); | ||
3594 | |||
3595 | loff_t final_size = offset + count; | ||
3596 | if (rw == WRITE && final_size <= inode->i_size) { | ||
3597 | /* | ||
3598 | * For DIO we fallocate blocks for holes, we fallocate blocks | ||
3599 | * The fallocated extent for hole is marked as uninitialized | ||
3600 | * to prevent paralel buffered read to expose the stale data | ||
3601 | * before DIO complete the data IO. | ||
3602 | * as for previously fallocated extents, ext4 get_block | ||
3603 | * will just simply mark the buffer mapped but still | ||
3604 | * keep the extents uninitialized. | ||
3605 | * | ||
3606 | * At the end of IO, the ext4 end_io callback function | ||
3607 | * will convert those unwritten extents to written, | ||
3608 | * | ||
3609 | */ | ||
3610 | iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); | ||
3611 | if (!iocb->private) | ||
3612 | return -ENOMEM; | ||
3613 | ret = blockdev_direct_IO(rw, iocb, inode, | ||
3614 | inode->i_sb->s_bdev, iov, | ||
3615 | offset, nr_segs, | ||
3616 | ext4_get_block_dio_write, | ||
3617 | ext4_end_io_dio); | ||
3618 | return ret; | ||
3619 | } | ||
3620 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3621 | } | ||
3622 | |||
3623 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
3624 | const struct iovec *iov, loff_t offset, | ||
3625 | unsigned long nr_segs) | ||
3626 | { | ||
3627 | struct file *file = iocb->ki_filp; | ||
3628 | struct inode *inode = file->f_mapping->host; | ||
3629 | |||
3630 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
3631 | return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3632 | |||
3633 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
3634 | } | ||
3635 | |||
3441 | /* | 3636 | /* |
3442 | * Pages can be marked dirty completely asynchronously from ext4's journalling | 3637 | * Pages can be marked dirty completely asynchronously from ext4's journalling |
3443 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do | 3638 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16817737ba52..1a03ea98fdd1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb) | |||
580 | struct ext4_super_block *es = sbi->s_es; | 580 | struct ext4_super_block *es = sbi->s_es; |
581 | int i, err; | 581 | int i, err; |
582 | 582 | ||
583 | flush_workqueue(sbi->dio_unwritten_wq); | ||
584 | destroy_workqueue(sbi->dio_unwritten_wq); | ||
585 | |||
583 | lock_super(sb); | 586 | lock_super(sb); |
584 | lock_kernel(); | 587 | lock_kernel(); |
585 | if (sb->s_dirt) | 588 | if (sb->s_dirt) |
@@ -2801,6 +2804,12 @@ no_journal: | |||
2801 | clear_opt(sbi->s_mount_opt, NOBH); | 2804 | clear_opt(sbi->s_mount_opt, NOBH); |
2802 | } | 2805 | } |
2803 | } | 2806 | } |
2807 | EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); | ||
2808 | if (!EXT4_SB(sb)->dio_unwritten_wq) { | ||
2809 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | ||
2810 | goto failed_mount_wq; | ||
2811 | } | ||
2812 | |||
2804 | /* | 2813 | /* |
2805 | * The jbd2_journal_load will have done any necessary log recovery, | 2814 | * The jbd2_journal_load will have done any necessary log recovery, |
2806 | * so we can safely mount the rest of the filesystem now. | 2815 | * so we can safely mount the rest of the filesystem now. |
@@ -2913,6 +2922,8 @@ cantfind_ext4: | |||
2913 | 2922 | ||
2914 | failed_mount4: | 2923 | failed_mount4: |
2915 | ext4_msg(sb, KERN_ERR, "mount failed"); | 2924 | ext4_msg(sb, KERN_ERR, "mount failed"); |
2925 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | ||
2926 | failed_mount_wq: | ||
2916 | ext4_release_system_zone(sb); | 2927 | ext4_release_system_zone(sb); |
2917 | if (sbi->s_journal) { | 2928 | if (sbi->s_journal) { |
2918 | jbd2_journal_destroy(sbi->s_journal); | 2929 | jbd2_journal_destroy(sbi->s_journal); |