author     Jiaying Zhang <jiayingz@google.com>    2010-03-04 16:14:02 -0500
committer  Theodore Ts'o <tytso@mit.edu>          2010-03-04 16:14:02 -0500
commit     744692dc059845b2a3022119871846e74d4f6e11 (patch)
tree       ed246651aebcb8dae57de8c58dc20983064ee017
parent     c7064ef13b2181a489836349f9baf87df0dab28f (diff)
ext4: use ext4_get_block_write in buffer write
Allocate uninitialized extent before ext4 buffer write and convert the extent
to initialized after io completes. The purpose is to make sure an extent can
only be marked initialized after it has been written with new data so we can
safely drop the i_mutex lock in ext4 DIO read without exposing stale data.
This helps to improve multi-thread DIO read performance on high-speed disks.

Skip the nobh and data=journal mount cases to make things simple for now.

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--   fs/ext4/ext4.h        15
-rw-r--r--   fs/ext4/ext4_jbd2.h   24
-rw-r--r--   fs/ext4/extents.c     22
-rw-r--r--   fs/ext4/inode.c      213
-rw-r--r--   fs/ext4/super.c       37
5 files changed, 256 insertions, 55 deletions
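The commit message above describes an ordering rule rather than a particular API: an extent backing a buffered write stays uninitialized until the new data has actually reached it, and a reader that skips i_mutex treats an uninitialized extent as zeros. The minimal userspace sketch below models only that rule; it is illustrative, not ext4 code, and the toy_extent/toy_read/toy_write names are invented for this example.

/*
 * Minimal userspace model of the ordering this patch enforces
 * (illustrative only; the toy_* names are invented, not ext4 API).
 */
#include <stdio.h>
#include <string.h>

enum extent_state { EXT_UNINIT, EXT_INIT };

struct toy_extent {
        enum extent_state state;
        char data[16];          /* stands in for the on-disk blocks */
};

/* lockless read path: an uninitialized extent reads back as zeros */
static void toy_read(const struct toy_extent *ex, char *buf, size_t len)
{
        if (ex->state == EXT_UNINIT)
                memset(buf, 0, len);
        else
                memcpy(buf, ex->data, len);
}

/* write path: fill the blocks first, convert to initialized last */
static void toy_write(struct toy_extent *ex, const char *buf, size_t len)
{
        memcpy(ex->data, buf, len);     /* step 1: the I/O completes */
        ex->state = EXT_INIT;           /* step 2: end_io conversion */
}

int main(void)
{
        struct toy_extent ex = { .state = EXT_UNINIT, .data = "stale!!" };
        char buf[16];

        toy_read(&ex, buf, sizeof(buf));        /* reader sees zeros, not "stale!!" */
        printf("before write: \"%s\"\n", buf);

        toy_write(&ex, "new data", 9);
        toy_read(&ex, buf, sizeof(buf));
        printf("after write:  \"%s\"\n", buf);
        return 0;
}

Because the state flip happens only after the data is in place, the lock-free reader can observe either zeros or the new data, never the stale bytes that previously occupied the blocks.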
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c831a580bd76..dee45800dc95 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -138,7 +138,7 @@ typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	int			error;		/* I/O error code */
+	struct page		*page;		/* page struct for buffer write */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
@@ -361,7 +361,7 @@ struct ext4_new_group_data {
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_IO_CREATE_EXT)
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
 
 /*
  * Flags used by ext4_free_blocks
@@ -702,6 +702,7 @@ struct ext4_inode_info {
 
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
@@ -752,6 +753,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
@@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+	BH_Uninit	/* blocks are allocated but uninitialized on disk */
+	  = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 	return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+		return 0;
+	if (test_opt(inode->i_sb, NOBH))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	return 1;
+}
+
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 90ba8d9df697..c7f166ab50eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	BUG_ON(path[depth].p_hdr == NULL);
 
 	/* try to insert block into found extent and return */
-	if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 			  ext4_ext_is_uninitialized(newext),
@@ -1740,7 +1740,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	if (flag != EXT4_GET_BLOCKS_PRE_IO)
+	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
 		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
@@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	ext4_ext_show_leaf(inode, path);
 
 	/* get_block() before submit the IO, split the extent */
-	if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 		ret = ext4_split_unwritten_extents(handle,
 						inode, path, iblock,
 						max_blocks, flags);
@@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			io->flag = EXT4_IO_UNWRITTEN;
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		if (ext4_should_dioread_nolock(inode))
+			set_buffer_uninit(bh_result);
 		goto out;
 	}
 	/* IO end_io complete, convert the filled extent to written */
-	if (flags == EXT4_GET_BLOCKS_CONVERT) {
+	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
 		if (ret >= 0)
@@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 			ext4_ext_mark_uninitialized(&newex);
 			/*
-			 * io_end structure was created for every async
-			 * direct IO write to the middle of the file.
-			 * To avoid unecessary convertion for every aio dio rewrite
-			 * to the mid of file, here we flag the IO that is really
-			 * need the convertion.
+			 * io_end structure was created for every IO write to an
+			 * uninitialized extent. To avoid unecessary conversion,
+			 * here we flag the IO that really needs the conversion.
 			 * For non asycn direct IO case, flag the inode state
 			 * that we need to perform convertion when IO is done.
 			 */
-			if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+			if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 				if (io)
 					io->flag = EXT4_IO_UNWRITTEN;
 				else
 					ext4_set_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN);
 			}
+			if (ext4_should_dioread_nolock(inode))
+				set_buffer_uninit(bh_result);
 		}
 
 	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f116bdc405..d291310aef6b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1534,6 +1535,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
 	ext4_truncate(inode);
 }
 
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
@@ -1575,8 +1578,12 @@ retry:
 	}
 	*pagep = page;
 
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ext4_get_block);
+	if (ext4_should_dioread_nolock(inode))
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block_write);
+	else
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -2092,6 +2099,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 			} else if (buffer_mapped(bh))
 				BUG_ON(bh->b_blocknr != pblock);
 
+			if (buffer_uninit(exbh))
+				set_buffer_uninit(bh);
 			cur_logical++;
 			pblock++;
 		} while ((bh = bh->b_this_page) != head);
@@ -2221,6 +2230,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	new.b_state = 0;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+	if (ext4_should_dioread_nolock(mpd->inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
@@ -2636,6 +2647,9 @@ out:
 	return ret;
 }
 
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2683,7 +2697,7 @@ static int ext4_writepage(struct page *page,
 	int ret = 0;
 	loff_t size;
 	unsigned int len;
-	struct buffer_head *page_bufs;
+	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
 
 	trace_ext4_writepage(inode, page);
@@ -2759,7 +2773,11 @@ static int ext4_writepage(struct page *page,
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-	else
+	else if (page_bufs && buffer_uninit(page_bufs)) {
+		ext4_set_bh_endio(page_bufs, inode);
+		ret = block_write_full_page_endio(page, noalloc_get_block_write,
+					    wbc, ext4_end_io_buffer_write);
+	} else
 		ret = block_write_full_page(page, noalloc_get_block_write,
 					    wbc);
 
@@ -3347,11 +3365,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	iput(io->inode);
+	kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	if (!page_has_buffers(page))
+		return;
+	head = bh = page_buffers(page);
+	do {
+		if (offset <= curr_off && test_clear_buffer_uninit(bh)
+					&& bh->b_private) {
+			ext4_free_io_end(bh->b_private);
+			bh->b_private = NULL;
+			bh->b_end_io = NULL;
+		}
+		curr_off = curr_off + bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
 	/*
+	 * free any io_end structure allocated for buffers to be discarded
+	 */
+	if (ext4_should_dioread_nolock(page->mapping->host))
+		ext4_invalidatepage_free_endio(page, offset);
+	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
 	if (offset == 0)
@@ -3471,10 +3523,11 @@ out:
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	handle_t *handle = NULL;
+	handle_t *handle = ext4_journal_current_handle();
 	int ret = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
+	int started = 0;
 
 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
@@ -3485,37 +3538,36 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 	 */
 	create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-	if (max_blocks > DIO_MAX_BLOCKS)
-		max_blocks = DIO_MAX_BLOCKS;
-	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-	handle = ext4_journal_start(inode, dio_credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+	if (!handle) {
+		if (max_blocks > DIO_MAX_BLOCKS)
+			max_blocks = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		handle = ext4_journal_start(inode, dio_credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		started = 1;
 	}
+
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
 			      create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
 	}
-	ext4_journal_stop(handle);
+	if (started)
+		ext4_journal_stop(handle);
 out:
 	return ret;
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	iput(io->inode);
-	kfree(io);
-}
-
 static void dump_completed_IO(struct inode * inode)
 {
 #ifdef	EXT4_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
 
 	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
 		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
@@ -3523,6 +3575,7 @@ static void dump_completed_IO(struct inode * inode)
 	}
 
 	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
 	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
 		cur = &io->list;
 		before = cur->prev;
@@ -3533,6 +3586,7 @@ static void dump_completed_IO(struct inode * inode)
 		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
 			    io, inode->i_ino, io0, io1);
 	}
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }
 
@@ -3556,9 +3610,7 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
 	if (io->flag != EXT4_IO_UNWRITTEN)
 		return ret;
 
-	if (offset + size <= i_size_read(inode))
-		ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
 	if (ret < 0) {
 		printk(KERN_EMERG "%s: failed to convert unwritten"
 			"extents to written extents, error is %d"
@@ -3577,18 +3629,25 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
  */
 static void ext4_end_io_work(struct work_struct *work)
 {
 	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
 	struct inode *inode = io->inode;
-	int ret = 0;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
+	int ret;
 
 	mutex_lock(&inode->i_mutex);
 	ret = ext4_end_io_nolock(io);
-	if (ret >= 0) {
-		if (!list_empty(&io->list))
-			list_del_init(&io->list);
-		ext4_free_io_end(io);
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
+		return;
 	}
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (!list_empty(&io->list))
+		list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	mutex_unlock(&inode->i_mutex);
+	ext4_free_io_end(io);
 }
 
 /*
@@ -3607,15 +3666,18 @@ static void ext4_end_io_work(struct work_struct *work)
 int flush_completed_IO(struct inode *inode)
 {
 	ext4_io_end_t *io;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
 	int ret = 0;
 	int ret2 = 0;
 
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list))
+	if (list_empty(&ei->i_completed_io_list))
 		return ret;
 
 	dump_completed_IO(inode);
-	while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){
-		io = list_entry(EXT4_I(inode)->i_completed_io_list.next,
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&ei->i_completed_io_list)){
+		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		/*
 		 * Calling ext4_end_io_nolock() to convert completed
@@ -3631,20 +3693,23 @@ int flush_completed_IO(struct inode *inode)
 		 * avoid double converting from both fsync and background work
 		 * queue work.
 		 */
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 		ret = ext4_end_io_nolock(io);
+		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 		if (ret < 0)
 			ret2 = ret;
 		else
 			list_del_init(&io->list);
 	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
 	ext4_io_end_t *io = NULL;
 
-	io = kmalloc(sizeof(*io), GFP_NOFS);
+	io = kmalloc(sizeof(*io), flags);
 
 	if (io) {
 		igrab(inode);
@@ -3652,7 +3717,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
 		io->flag = 0;
 		io->offset = 0;
 		io->size = 0;
-		io->error = 0;
+		io->page = NULL;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
@@ -3665,6 +3730,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
 	ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
+	unsigned long flags;
+	struct ext4_inode_info *ei;
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
@@ -3684,17 +3751,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	io_end->offset = offset;
 	io_end->size = size;
+	io_end->flag = EXT4_IO_UNWRITTEN;
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 
 	/* Add the io_end to per-inode completed aio dio list*/
-	list_add_tail(&io_end->list,
-		 &EXT4_I(io_end->inode)->i_completed_io_list);
+	ei = EXT4_I(io_end->inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	iocb->private = NULL;
 }
 
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+	ext4_io_end_t *io_end = bh->b_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+
+	if (!test_clear_buffer_uninit(bh) || !io_end)
+		goto out;
+
+	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+		printk("sb umounted, discard end_io request for inode %lu\n",
+			io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		goto out;
+	}
+
+	io_end->flag = EXT4_IO_UNWRITTEN;
+	inode = io_end->inode;
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+out:
+	bh->b_private = NULL;
+	bh->b_end_io = NULL;
+	clear_buffer_uninit(bh);
+	end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+	size_t size = bh->b_size;
+
+retry:
+	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+	if (!io_end) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: allocation fail\n", __func__);
+		schedule();
+		goto retry;
+	}
+	io_end->offset = offset;
+	io_end->size = size;
+	/*
+	 * We need to hold a reference to the page to make sure it
+	 * doesn't get evicted before ext4_end_io_work() has a chance
+	 * to convert the extent from written to unwritten.
+	 */
+	io_end->page = page;
+	get_page(io_end->page);
+
+	bh->b_private = io_end;
+	bh->b_end_io = ext4_end_io_buffer_write;
+	return 0;
+}
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3748,7 +3883,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		iocb->private = NULL;
 		EXT4_I(inode)->cur_aio_dio = NULL;
 		if (!is_sync_kiocb(iocb)) {
-			iocb->private = ext4_init_io_end(inode);
+			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
 			if (!iocb->private)
 				return -ENOMEM;
 			/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc7a97e79e3b..5e8f9077b0fc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_reserved_quota = 0;
 #endif
 	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	spin_lock_init(&ei->i_completed_io_lock);
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
@@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOLOAD))
 		seq_puts(seq, ",norecovery");
 
+	if (test_opt(sb, DIOREAD_NOLOCK))
+		seq_puts(seq, ",dioread_nolock");
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1109,6 +1113,7 @@ enum {
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
 };
 
@@ -1176,6 +1181,8 @@ static const match_table_t tokens = {
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc"},
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_dioread_nolock, "dioread_nolock"},
+	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL},
@@ -1640,6 +1647,12 @@ set_qf_format:
 	case Opt_nodiscard:
 		clear_opt(sbi->s_mount_opt, DISCARD);
 		break;
+	case Opt_dioread_nolock:
+		set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		break;
+	case Opt_dioread_lock:
+		clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		break;
 	default:
 		ext4_msg(sb, KERN_ERR,
 		       "Unrecognized mount option \"%s\" "
@@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 			ext4_msg(sb, KERN_ERR, "required journal recovery "
 			       "suppressed and not mounted read-only");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		} else {
 			clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 			set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	}
 
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
 			ext4_msg(sb, KERN_ERR, "Journal does not support "
 			       "requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		}
 	default:
 		break;
@@ -2855,13 +2868,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
-
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
 				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+				"not supported with nobh mode");
+			goto failed_mount_wq;
+		}
 	}
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2926,6 +2943,18 @@ no_journal:
 			"requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	}
+	if (test_opt(sb, DIOREAD_NOLOCK)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - requested data journaling mode");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+		if (sb->s_blocksize < PAGE_SIZE) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - block size is too small");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {