aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c197
1 files changed, 196 insertions, 1 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index da4f2ecb5447..5633af6a7045 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3356} 3357}
3357 3358
3358/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3359 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3360 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3361 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3364 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3365 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3366 */ 3369 */
3367static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3368 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3369 unsigned long nr_segs) 3372 unsigned long nr_segs)
3370{ 3373{
@@ -3438,6 +3441,198 @@ out:
3438 return ret; 3441 return ret;
3439} 3442}
3440 3443
3444/* Maximum number of blocks we map for direct IO at once. */
3445
3446static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3447 struct buffer_head *bh_result, int create)
3448{
3449 handle_t *handle = NULL;
3450 int ret = 0;
3451 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3452 int dio_credits;
3453
3454 /*
3455 * DIO VFS code passes create = 0 flag for write to
3456 * the middle of file. It does this to avoid block
3457 * allocation for holes, to prevent expose stale data
3458 * out when there is parallel buffered read (which does
3459 * not hold the i_mutex lock) while direct IO write has
3460 * not completed. DIO request on holes finally falls back
3461 * to buffered IO for this reason.
3462 *
3463 * For ext4 extent based file, since we support fallocate,
3464 * new allocated extent as uninitialized, for holes, we
3465 * could fallocate blocks for holes, thus parallel
3466 * buffered IO read will zero out the page when read on
3467 * a hole while parallel DIO write to the hole has not completed.
3468 *
3469 * when we come here, we know it's a direct IO write to
3470 * to the middle of file (<i_size)
3471 * so it's safe to override the create flag from VFS.
3472 */
3473 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3474
3475 if (max_blocks > DIO_MAX_BLOCKS)
3476 max_blocks = DIO_MAX_BLOCKS;
3477 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3478 handle = ext4_journal_start(inode, dio_credits);
3479 if (IS_ERR(handle)) {
3480 ret = PTR_ERR(handle);
3481 goto out;
3482 }
3483 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3484 create);
3485 if (ret > 0) {
3486 bh_result->b_size = (ret << inode->i_blkbits);
3487 ret = 0;
3488 }
3489 ext4_journal_stop(handle);
3490out:
3491 return ret;
3492}
3493
3494#define DIO_AIO 0x1
3495
3496static void ext4_free_io_end(ext4_io_end_t *io)
3497{
3498 kfree(io);
3499}
3500
3501/*
3502 * IO write completion for unwritten extents.
3503 *
3504 * check a range of space and convert unwritten extents to written.
3505 */
3506static void ext4_end_dio_unwritten(struct work_struct *work)
3507{
3508 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3509 struct inode *inode = io->inode;
3510 loff_t offset = io->offset;
3511 size_t size = io->size;
3512 int ret = 0;
3513 int aio = io->flag & DIO_AIO;
3514
3515 if (aio)
3516 mutex_lock(&inode->i_mutex);
3517 if (offset + size <= i_size_read(inode))
3518 ret = ext4_convert_unwritten_extents(inode, offset, size);
3519
3520 if (ret < 0)
3521 printk(KERN_EMERG "%s: failed to convert unwritten"
3522 "extents to written extents, error is %d\n",
3523 __func__, ret);
3524
3525 ext4_free_io_end(io);
3526 if (aio)
3527 mutex_unlock(&inode->i_mutex);
3528}
3529
3530static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
3531{
3532 ext4_io_end_t *io = NULL;
3533
3534 io = kmalloc(sizeof(*io), GFP_NOFS);
3535
3536 if (io) {
3537 io->inode = inode;
3538 io->flag = flag;
3539 io->offset = 0;
3540 io->size = 0;
3541 io->error = 0;
3542 INIT_WORK(&io->work, ext4_end_dio_unwritten);
3543 }
3544
3545 return io;
3546}
3547
3548static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3549 ssize_t size, void *private)
3550{
3551 ext4_io_end_t *io_end = iocb->private;
3552 struct workqueue_struct *wq;
3553
3554 /* if not hole or unwritten extents, just simple return */
3555 if (!io_end || !size || !iocb->private)
3556 return;
3557 io_end->offset = offset;
3558 io_end->size = size;
3559 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3560
3561 /* We need to convert unwritten extents to written */
3562 queue_work(wq, &io_end->work);
3563
3564 if (is_sync_kiocb(iocb))
3565 flush_workqueue(wq);
3566
3567 iocb->private = NULL;
3568}
3569/*
3570 * For ext4 extent files, ext4 will do direct-io write to holes,
3571 * preallocated extents, and those write extend the file, no need to
3572 * fall back to buffered IO.
3573 *
3574 * For holes, we fallocate those blocks, mark them as unintialized
3575 * If those blocks were preallocated, we mark sure they are splited, but
3576 * still keep the range to write as unintialized.
3577 *
3578 * When end_io call back function called at the last IO complete time,
3579 * those extents will be converted to written extents.
3580 *
3581 * If the O_DIRECT write will extend the file then add this inode to the
3582 * orphan list. So recovery will truncate it back to the original size
3583 * if the machine crashes during the write.
3584 *
3585 */
3586static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3587 const struct iovec *iov, loff_t offset,
3588 unsigned long nr_segs)
3589{
3590 struct file *file = iocb->ki_filp;
3591 struct inode *inode = file->f_mapping->host;
3592 ssize_t ret;
3593 size_t count = iov_length(iov, nr_segs);
3594
3595 loff_t final_size = offset + count;
3596 if (rw == WRITE && final_size <= inode->i_size) {
3597 /*
3598 * For DIO we fallocate blocks for holes, we fallocate blocks
3599 * The fallocated extent for hole is marked as uninitialized
3600 * to prevent paralel buffered read to expose the stale data
3601 * before DIO complete the data IO.
3602 * as for previously fallocated extents, ext4 get_block
3603 * will just simply mark the buffer mapped but still
3604 * keep the extents uninitialized.
3605 *
3606 * At the end of IO, the ext4 end_io callback function
3607 * will convert those unwritten extents to written,
3608 *
3609 */
3610 iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
3611 if (!iocb->private)
3612 return -ENOMEM;
3613 ret = blockdev_direct_IO(rw, iocb, inode,
3614 inode->i_sb->s_bdev, iov,
3615 offset, nr_segs,
3616 ext4_get_block_dio_write,
3617 ext4_end_io_dio);
3618 return ret;
3619 }
3620 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3621}
3622
3623static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3624 const struct iovec *iov, loff_t offset,
3625 unsigned long nr_segs)
3626{
3627 struct file *file = iocb->ki_filp;
3628 struct inode *inode = file->f_mapping->host;
3629
3630 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3631 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3632
3633 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3634}
3635
3441/* 3636/*
3442 * Pages can be marked dirty completely asynchronously from ext4's journalling 3637 * Pages can be marked dirty completely asynchronously from ext4's journalling
3443 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3638 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do