aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorMingming Cao <cmm@us.ibm.com>2009-09-28 15:48:41 -0400
committerTheodore Ts'o <tytso@mit.edu>2009-09-28 15:48:41 -0400
commit4c0425ff68b1b87b802ffeda7b6a46ff7da7241c (patch)
treea8718f5f4574af8e15fd876b24f4aec88d62451b /fs
parent0031462b5b392f90d17f1d75abb795883c44e969 (diff)
ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O
Currently the DIO VFS code passes create = 0 when writing to the middle of a file. It does this to avoid block allocation for holes, so as not to expose stale data when there is a parallel buffered read (which does not hold the i_mutex lock). Direct I/O writes into holes fall back to buffered I/O for this reason. Since preallocated extents are treated as holes when doing a get_block() lookup (the buffer is not mapped), direct I/O over fallocated space also falls back to buffered I/O. Thus ext4 silently falls back to buffered I/O in the above two cases, which is undesirable. To fix this, this patch creates uninitialized extents when a direct I/O write goes into holes in sparse files, and registers an end_io callback which converts the uninitialized extent to an initialized extent after the I/O is completed. Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/inode.c197
-rw-r--r--fs/ext4/super.c11
3 files changed, 210 insertions, 1 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2b4293aac162..ccb4dbf359c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -999,6 +999,9 @@ struct ext4_sb_info {
999 999
1000 unsigned int s_log_groups_per_flex; 1000 unsigned int s_log_groups_per_flex;
1001 struct flex_groups *s_flex_groups; 1001 struct flex_groups *s_flex_groups;
1002
1003 /* workqueue for dio unwritten */
1004 struct workqueue_struct *dio_unwritten_wq;
1002}; 1005};
1003 1006
1004static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1007static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index da4f2ecb5447..5633af6a7045 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3356} 3357}
3357 3358
3358/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3359 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3360 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3361 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3364 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3365 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3366 */ 3369 */
3367static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3368 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3369 unsigned long nr_segs) 3372 unsigned long nr_segs)
3370{ 3373{
@@ -3438,6 +3441,198 @@ out:
3438 return ret; 3441 return ret;
3439} 3442}
3440 3443
3444/* Maximum number of blocks we map for direct IO at once. */
3445
3446static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3447 struct buffer_head *bh_result, int create)
3448{
3449 handle_t *handle = NULL;
3450 int ret = 0;
3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3452 int dio_credits;
3453
3454 /*
3455 * DIO VFS code passes create = 0 flag for write to
3456 * the middle of file. It does this to avoid block
3457 * allocation for holes, to prevent expose stale data
3458 * out when there is parallel buffered read (which does
3459 * not hold the i_mutex lock) while direct IO write has
3460 * not completed. DIO request on holes finally falls back
3461 * to buffered IO for this reason.
3462 *
3463 * For ext4 extent based file, since we support fallocate,
3464 * new allocated extent as uninitialized, for holes, we
3465 * could fallocate blocks for holes, thus parallel
3466 * buffered IO read will zero out the page when read on
3467 * a hole while parallel DIO write to the hole has not completed.
3468 *
3469 * when we come here, we know it's a direct IO write to
3470 * to the middle of file (<i_size)
3471 * so it's safe to override the create flag from VFS.
3472 */
3473 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3474
3475 if (max_blocks > DIO_MAX_BLOCKS)
3476 max_blocks = DIO_MAX_BLOCKS;
3477 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3478 handle = ext4_journal_start(inode, dio_credits);
3479 if (IS_ERR(handle)) {
3480 ret = PTR_ERR(handle);
3481 goto out;
3482 }
3483 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3484 create);
3485 if (ret > 0) {
3486 bh_result->b_size = (ret << inode->i_blkbits);
3487 ret = 0;
3488 }
3489 ext4_journal_stop(handle);
3490out:
3491 return ret;
3492}
3493
3494#define DIO_AIO 0x1
3495
3496static void ext4_free_io_end(ext4_io_end_t *io)
3497{
3498 kfree(io);
3499}
3500
3501/*
3502 * IO write completion for unwritten extents.
3503 *
3504 * check a range of space and convert unwritten extents to written.
3505 */
3506static void ext4_end_dio_unwritten(struct work_struct *work)
3507{
3508 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3509 struct inode *inode = io->inode;
3510 loff_t offset = io->offset;
3511 size_t size = io->size;
3512 int ret = 0;
3513 int aio = io->flag & DIO_AIO;
3514
3515 if (aio)
3516 mutex_lock(&inode->i_mutex);
3517 if (offset + size <= i_size_read(inode))
3518 ret = ext4_convert_unwritten_extents(inode, offset, size);
3519
3520 if (ret < 0)
3521 printk(KERN_EMERG "%s: failed to convert unwritten"
3522 "extents to written extents, error is %d\n",
3523 __func__, ret);
3524
3525 ext4_free_io_end(io);
3526 if (aio)
3527 mutex_unlock(&inode->i_mutex);
3528}
3529
3530static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
3531{
3532 ext4_io_end_t *io = NULL;
3533
3534 io = kmalloc(sizeof(*io), GFP_NOFS);
3535
3536 if (io) {
3537 io->inode = inode;
3538 io->flag = flag;
3539 io->offset = 0;
3540 io->size = 0;
3541 io->error = 0;
3542 INIT_WORK(&io->work, ext4_end_dio_unwritten);
3543 }
3544
3545 return io;
3546}
3547
3548static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3549 ssize_t size, void *private)
3550{
3551 ext4_io_end_t *io_end = iocb->private;
3552 struct workqueue_struct *wq;
3553
3554 /* if not hole or unwritten extents, just simple return */
3555 if (!io_end || !size || !iocb->private)
3556 return;
3557 io_end->offset = offset;
3558 io_end->size = size;
3559 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3560
3561 /* We need to convert unwritten extents to written */
3562 queue_work(wq, &io_end->work);
3563
3564 if (is_sync_kiocb(iocb))
3565 flush_workqueue(wq);
3566
3567 iocb->private = NULL;
3568}
3569/*
3570 * For ext4 extent files, ext4 will do direct-io write to holes,
3571 * preallocated extents, and those write extend the file, no need to
3572 * fall back to buffered IO.
3573 *
3574 * For holes, we fallocate those blocks, mark them as unintialized
3575 * If those blocks were preallocated, we mark sure they are splited, but
3576 * still keep the range to write as unintialized.
3577 *
3578 * When end_io call back function called at the last IO complete time,
3579 * those extents will be converted to written extents.
3580 *
3581 * If the O_DIRECT write will extend the file then add this inode to the
3582 * orphan list. So recovery will truncate it back to the original size
3583 * if the machine crashes during the write.
3584 *
3585 */
3586static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3587 const struct iovec *iov, loff_t offset,
3588 unsigned long nr_segs)
3589{
3590 struct file *file = iocb->ki_filp;
3591 struct inode *inode = file->f_mapping->host;
3592 ssize_t ret;
3593 size_t count = iov_length(iov, nr_segs);
3594
3595 loff_t final_size = offset + count;
3596 if (rw == WRITE && final_size <= inode->i_size) {
3597 /*
3598 * For DIO we fallocate blocks for holes, we fallocate blocks
3599 * The fallocated extent for hole is marked as uninitialized
3600 * to prevent paralel buffered read to expose the stale data
3601 * before DIO complete the data IO.
3602 * as for previously fallocated extents, ext4 get_block
3603 * will just simply mark the buffer mapped but still
3604 * keep the extents uninitialized.
3605 *
3606 * At the end of IO, the ext4 end_io callback function
3607 * will convert those unwritten extents to written,
3608 *
3609 */
3610 iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
3611 if (!iocb->private)
3612 return -ENOMEM;
3613 ret = blockdev_direct_IO(rw, iocb, inode,
3614 inode->i_sb->s_bdev, iov,
3615 offset, nr_segs,
3616 ext4_get_block_dio_write,
3617 ext4_end_io_dio);
3618 return ret;
3619 }
3620 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3621}
3622
3623static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3624 const struct iovec *iov, loff_t offset,
3625 unsigned long nr_segs)
3626{
3627 struct file *file = iocb->ki_filp;
3628 struct inode *inode = file->f_mapping->host;
3629
3630 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3631 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3632
3633 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3634}
3635
3441/* 3636/*
3442 * Pages can be marked dirty completely asynchronously from ext4's journalling 3637 * Pages can be marked dirty completely asynchronously from ext4's journalling
3443 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3638 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 16817737ba52..1a03ea98fdd1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 580 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 581 int i, err;
582 582
583 flush_workqueue(sbi->dio_unwritten_wq);
584 destroy_workqueue(sbi->dio_unwritten_wq);
585
583 lock_super(sb); 586 lock_super(sb);
584 lock_kernel(); 587 lock_kernel();
585 if (sb->s_dirt) 588 if (sb->s_dirt)
@@ -2801,6 +2804,12 @@ no_journal:
2801 clear_opt(sbi->s_mount_opt, NOBH); 2804 clear_opt(sbi->s_mount_opt, NOBH);
2802 } 2805 }
2803 } 2806 }
2807 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2808 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2809 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2810 goto failed_mount_wq;
2811 }
2812
2804 /* 2813 /*
2805 * The jbd2_journal_load will have done any necessary log recovery, 2814 * The jbd2_journal_load will have done any necessary log recovery,
2806 * so we can safely mount the rest of the filesystem now. 2815 * so we can safely mount the rest of the filesystem now.
@@ -2913,6 +2922,8 @@ cantfind_ext4:
2913 2922
2914failed_mount4: 2923failed_mount4:
2915 ext4_msg(sb, KERN_ERR, "mount failed"); 2924 ext4_msg(sb, KERN_ERR, "mount failed");
2925 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2926failed_mount_wq:
2916 ext4_release_system_zone(sb); 2927 ext4_release_system_zone(sb);
2917 if (sbi->s_journal) { 2928 if (sbi->s_journal) {
2918 jbd2_journal_destroy(sbi->s_journal); 2929 jbd2_journal_destroy(sbi->s_journal);