diff options
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/ext4/ext4.h | 3 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 197 | ||||
| -rw-r--r-- | fs/ext4/super.c | 11 |
3 files changed, 210 insertions, 1 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b4293aac162..ccb4dbf359c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -999,6 +999,9 @@ struct ext4_sb_info { | |||
| 999 | 999 | ||
| 1000 | unsigned int s_log_groups_per_flex; | 1000 | unsigned int s_log_groups_per_flex; |
| 1001 | struct flex_groups *s_flex_groups; | 1001 | struct flex_groups *s_flex_groups; |
| 1002 | |||
| 1003 | /* workqueue for dio unwritten */ | ||
| 1004 | struct workqueue_struct *dio_unwritten_wq; | ||
| 1002 | }; | 1005 | }; |
| 1003 | 1006 | ||
| 1004 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 1007 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index da4f2ecb5447..5633af6a7045 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/namei.h> | 37 | #include <linux/namei.h> |
| 38 | #include <linux/uio.h> | 38 | #include <linux/uio.h> |
| 39 | #include <linux/bio.h> | 39 | #include <linux/bio.h> |
| 40 | #include <linux/workqueue.h> | ||
| 40 | 41 | ||
| 41 | #include "ext4_jbd2.h" | 42 | #include "ext4_jbd2.h" |
| 42 | #include "xattr.h" | 43 | #include "xattr.h" |
| @@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
| 3356 | } | 3357 | } |
| 3357 | 3358 | ||
| 3358 | /* | 3359 | /* |
| 3360 | * O_DIRECT for ext3 (or indirect map) based files | ||
| 3361 | * | ||
| 3359 | * If the O_DIRECT write will extend the file then add this inode to the | 3362 | * If the O_DIRECT write will extend the file then add this inode to the |
| 3360 | * orphan list. So recovery will truncate it back to the original size | 3363 | * orphan list. So recovery will truncate it back to the original size |
| 3361 | * if the machine crashes during the write. | 3364 | * if the machine crashes during the write. |
| @@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
| 3364 | * crashes then stale disk data _may_ be exposed inside the file. But current | 3367 | * crashes then stale disk data _may_ be exposed inside the file. But current |
| 3365 | * VFS code falls back into buffered path in that case so we are safe. | 3368 | * VFS code falls back into buffered path in that case so we are safe. |
| 3366 | */ | 3369 | */ |
| 3367 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | 3370 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
| 3368 | const struct iovec *iov, loff_t offset, | 3371 | const struct iovec *iov, loff_t offset, |
| 3369 | unsigned long nr_segs) | 3372 | unsigned long nr_segs) |
| 3370 | { | 3373 | { |
| @@ -3438,6 +3441,198 @@ out: | |||
| 3438 | return ret; | 3441 | return ret; |
| 3439 | } | 3442 | } |
| 3440 | 3443 | ||
| 3444 | /* Maximum number of blocks we map for direct IO at once. */ | ||
| 3445 | |||
| 3446 | static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, | ||
| 3447 | struct buffer_head *bh_result, int create) | ||
| 3448 | { | ||
| 3449 | handle_t *handle = NULL; | ||
| 3450 | int ret = 0; | ||
| 3451 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
| 3452 | int dio_credits; | ||
| 3453 | |||
| 3454 | /* | ||
| 3455 | * DIO VFS code passes create = 0 flag for write to | ||
| 3456 | * the middle of file. It does this to avoid block | ||
| 3457 | * allocation for holes, to prevent exposing stale data | ||
| 3458 | * when there is a parallel buffered read (which does | ||
| 3459 | * not hold the i_mutex lock) while direct IO write has | ||
| 3460 | * not completed. DIO request on holes finally falls back | ||
| 3461 | * to buffered IO for this reason. | ||
| 3462 | * | ||
| 3463 | * For ext4 extent based files, since we support fallocate, | ||
| 3464 | * newly allocated extents are marked as uninitialized. For holes, | ||
| 3465 | * we can therefore fallocate the blocks, so that a parallel | ||
| 3466 | * buffered IO read will zero out the page when reading | ||
| 3467 | * a hole while a parallel DIO write to the hole has not completed. | ||
| 3468 | * | ||
| 3469 | * When we come here, we know it's a direct IO write to | ||
| 3470 | * the middle of the file (< i_size), | ||
| 3471 | * so it's safe to override the create flag from VFS. | ||
| 3472 | */ | ||
| 3473 | create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; | ||
| 3474 | |||
| 3475 | if (max_blocks > DIO_MAX_BLOCKS) | ||
| 3476 | max_blocks = DIO_MAX_BLOCKS; | ||
| 3477 | dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); | ||
| 3478 | handle = ext4_journal_start(inode, dio_credits); | ||
| 3479 | if (IS_ERR(handle)) { | ||
| 3480 | ret = PTR_ERR(handle); | ||
| 3481 | goto out; | ||
| 3482 | } | ||
| 3483 | ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, | ||
| 3484 | create); | ||
| 3485 | if (ret > 0) { | ||
| 3486 | bh_result->b_size = (ret << inode->i_blkbits); | ||
| 3487 | ret = 0; | ||
| 3488 | } | ||
| 3489 | ext4_journal_stop(handle); | ||
| 3490 | out: | ||
| 3491 | return ret; | ||
| 3492 | } | ||
| 3493 | |||
| 3494 | #define DIO_AIO 0x1 | ||
| 3495 | |||
| 3496 | static void ext4_free_io_end(ext4_io_end_t *io) | ||
| 3497 | { | ||
| 3498 | kfree(io); | ||
| 3499 | } | ||
| 3500 | |||
| 3501 | /* | ||
| 3502 | * IO write completion for unwritten extents. | ||
| 3503 | * | ||
| 3504 | * check a range of space and convert unwritten extents to written. | ||
| 3505 | */ | ||
| 3506 | static void ext4_end_dio_unwritten(struct work_struct *work) | ||
| 3507 | { | ||
| 3508 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
| 3509 | struct inode *inode = io->inode; | ||
| 3510 | loff_t offset = io->offset; | ||
| 3511 | size_t size = io->size; | ||
| 3512 | int ret = 0; | ||
| 3513 | int aio = io->flag & DIO_AIO; | ||
| 3514 | |||
| 3515 | if (aio) | ||
| 3516 | mutex_lock(&inode->i_mutex); | ||
| 3517 | if (offset + size <= i_size_read(inode)) | ||
| 3518 | ret = ext4_convert_unwritten_extents(inode, offset, size); | ||
| 3519 | |||
| 3520 | if (ret < 0) | ||
| 3521 | printk(KERN_EMERG "%s: failed to convert unwritten" | ||
| 3522 | "extents to written extents, error is %d\n", | ||
| 3523 | __func__, ret); | ||
| 3524 | |||
| 3525 | ext4_free_io_end(io); | ||
| 3526 | if (aio) | ||
| 3527 | mutex_unlock(&inode->i_mutex); | ||
| 3528 | } | ||
| 3529 | |||
| 3530 | static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) | ||
| 3531 | { | ||
| 3532 | ext4_io_end_t *io = NULL; | ||
| 3533 | |||
| 3534 | io = kmalloc(sizeof(*io), GFP_NOFS); | ||
| 3535 | |||
| 3536 | if (io) { | ||
| 3537 | io->inode = inode; | ||
| 3538 | io->flag = flag; | ||
| 3539 | io->offset = 0; | ||
| 3540 | io->size = 0; | ||
| 3541 | io->error = 0; | ||
| 3542 | INIT_WORK(&io->work, ext4_end_dio_unwritten); | ||
| 3543 | } | ||
| 3544 | |||
| 3545 | return io; | ||
| 3546 | } | ||
| 3547 | |||
| 3548 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | ||
| 3549 | ssize_t size, void *private) | ||
| 3550 | { | ||
| 3551 | ext4_io_end_t *io_end = iocb->private; | ||
| 3552 | struct workqueue_struct *wq; | ||
| 3553 | |||
| 3554 | /* if not a hole or unwritten extents, just simply return */ | ||
| 3555 | if (!io_end || !size || !iocb->private) | ||
| 3556 | return; | ||
| 3557 | io_end->offset = offset; | ||
| 3558 | io_end->size = size; | ||
| 3559 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
| 3560 | |||
| 3561 | /* We need to convert unwritten extents to written */ | ||
| 3562 | queue_work(wq, &io_end->work); | ||
| 3563 | |||
| 3564 | if (is_sync_kiocb(iocb)) | ||
| 3565 | flush_workqueue(wq); | ||
| 3566 | |||
| 3567 | iocb->private = NULL; | ||
| 3568 | } | ||
| 3569 | /* | ||
| 3570 | * For ext4 extent files, ext4 will do direct-io writes to holes, | ||
| 3571 | * preallocated extents, and those writes that extend the file, with no need to | ||
| 3572 | * fall back to buffered IO. | ||
| 3573 | * | ||
| 3574 | * For holes, we fallocate those blocks and mark them as uninitialized. | ||
| 3575 | * If those blocks were preallocated, we make sure they are split, but | ||
| 3576 | * still keep the range to write as uninitialized. | ||
| 3577 | * | ||
| 3578 | * When the end_io callback function is called at the completion of the last IO, | ||
| 3579 | * those extents will be converted to written extents. | ||
| 3580 | * | ||
| 3581 | * If the O_DIRECT write will extend the file then add this inode to the | ||
| 3582 | * orphan list. So recovery will truncate it back to the original size | ||
| 3583 | * if the machine crashes during the write. | ||
| 3584 | * | ||
| 3585 | */ | ||
| 3586 | static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | ||
| 3587 | const struct iovec *iov, loff_t offset, | ||
| 3588 | unsigned long nr_segs) | ||
| 3589 | { | ||
| 3590 | struct file *file = iocb->ki_filp; | ||
| 3591 | struct inode *inode = file->f_mapping->host; | ||
| 3592 | ssize_t ret; | ||
| 3593 | size_t count = iov_length(iov, nr_segs); | ||
| 3594 | |||
| 3595 | loff_t final_size = offset + count; | ||
| 3596 | if (rw == WRITE && final_size <= inode->i_size) { | ||
| 3597 | /* | ||
| 3598 | * For DIO we fallocate blocks for holes. | ||
| 3599 | * The fallocated extent for a hole is marked as uninitialized | ||
| 3600 | * to prevent a parallel buffered read from exposing the stale data | ||
| 3601 | * before DIO completes the data IO. | ||
| 3602 | * As for previously fallocated extents, ext4 get_block | ||
| 3603 | * will just simply mark the buffer mapped but still | ||
| 3604 | * keep the extents uninitialized. | ||
| 3605 | * | ||
| 3606 | * At the end of IO, the ext4 end_io callback function | ||
| 3607 | * will convert those unwritten extents to written. | ||
| 3608 | * | ||
| 3609 | */ | ||
| 3610 | iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); | ||
| 3611 | if (!iocb->private) | ||
| 3612 | return -ENOMEM; | ||
| 3613 | ret = blockdev_direct_IO(rw, iocb, inode, | ||
| 3614 | inode->i_sb->s_bdev, iov, | ||
| 3615 | offset, nr_segs, | ||
| 3616 | ext4_get_block_dio_write, | ||
| 3617 | ext4_end_io_dio); | ||
| 3618 | return ret; | ||
| 3619 | } | ||
| 3620 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
| 3621 | } | ||
| 3622 | |||
| 3623 | static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | ||
| 3624 | const struct iovec *iov, loff_t offset, | ||
| 3625 | unsigned long nr_segs) | ||
| 3626 | { | ||
| 3627 | struct file *file = iocb->ki_filp; | ||
| 3628 | struct inode *inode = file->f_mapping->host; | ||
| 3629 | |||
| 3630 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
| 3631 | return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
| 3632 | |||
| 3633 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | ||
| 3634 | } | ||
| 3635 | |||
| 3441 | /* | 3636 | /* |
| 3442 | * Pages can be marked dirty completely asynchronously from ext4's journalling | 3637 | * Pages can be marked dirty completely asynchronously from ext4's journalling |
| 3443 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do | 3638 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16817737ba52..1a03ea98fdd1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb) | |||
| 580 | struct ext4_super_block *es = sbi->s_es; | 580 | struct ext4_super_block *es = sbi->s_es; |
| 581 | int i, err; | 581 | int i, err; |
| 582 | 582 | ||
| 583 | flush_workqueue(sbi->dio_unwritten_wq); | ||
| 584 | destroy_workqueue(sbi->dio_unwritten_wq); | ||
| 585 | |||
| 583 | lock_super(sb); | 586 | lock_super(sb); |
| 584 | lock_kernel(); | 587 | lock_kernel(); |
| 585 | if (sb->s_dirt) | 588 | if (sb->s_dirt) |
| @@ -2801,6 +2804,12 @@ no_journal: | |||
| 2801 | clear_opt(sbi->s_mount_opt, NOBH); | 2804 | clear_opt(sbi->s_mount_opt, NOBH); |
| 2802 | } | 2805 | } |
| 2803 | } | 2806 | } |
| 2807 | EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); | ||
| 2808 | if (!EXT4_SB(sb)->dio_unwritten_wq) { | ||
| 2809 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | ||
| 2810 | goto failed_mount_wq; | ||
| 2811 | } | ||
| 2812 | |||
| 2804 | /* | 2813 | /* |
| 2805 | * The jbd2_journal_load will have done any necessary log recovery, | 2814 | * The jbd2_journal_load will have done any necessary log recovery, |
| 2806 | * so we can safely mount the rest of the filesystem now. | 2815 | * so we can safely mount the rest of the filesystem now. |
| @@ -2913,6 +2922,8 @@ cantfind_ext4: | |||
| 2913 | 2922 | ||
| 2914 | failed_mount4: | 2923 | failed_mount4: |
| 2915 | ext4_msg(sb, KERN_ERR, "mount failed"); | 2924 | ext4_msg(sb, KERN_ERR, "mount failed"); |
| 2925 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | ||
| 2926 | failed_mount_wq: | ||
| 2916 | ext4_release_system_zone(sb); | 2927 | ext4_release_system_zone(sb); |
| 2917 | if (sbi->s_journal) { | 2928 | if (sbi->s_journal) { |
| 2918 | jbd2_journal_destroy(sbi->s_journal); | 2929 | jbd2_journal_destroy(sbi->s_journal); |
