about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/ext4/ext4.h 15
-rw-r--r-- fs/ext4/ext4_jbd2.h 24
-rw-r--r-- fs/ext4/extents.c 22
-rw-r--r-- fs/ext4/inode.c 213
-rw-r--r-- fs/ext4/super.c 37
5 files changed, 256 insertions, 55 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c831a580bd76..dee45800dc95 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -138,7 +138,7 @@ typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */ 138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */ 139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */ 140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */ 141 struct page *page; /* page struct for buffer write */
142 loff_t offset; /* offset in the file */ 142 loff_t offset; /* offset in the file */
143 ssize_t size; /* size of the extent */ 143 ssize_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */ 144 struct work_struct work; /* data work queue */
@@ -361,7 +361,7 @@ struct ext4_new_group_data {
361 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) 361 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
362 /* Convert extent to initialized after IO complete */ 362 /* Convert extent to initialized after IO complete */
363#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ 363#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
364 EXT4_GET_BLOCKS_IO_CREATE_EXT) 364 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
365 365
366/* 366/*
367 * Flags used by ext4_free_blocks 367 * Flags used by ext4_free_blocks
@@ -702,6 +702,7 @@ struct ext4_inode_info {
702 702
703 /* completed IOs that might need unwritten extents handling */ 703 /* completed IOs that might need unwritten extents handling */
704 struct list_head i_completed_io_list; 704 struct list_head i_completed_io_list;
705 spinlock_t i_completed_io_lock;
705 /* current io_end structure for async DIO write*/ 706 /* current io_end structure for async DIO write*/
706 ext4_io_end_t *cur_aio_dio; 707 ext4_io_end_t *cur_aio_dio;
707 708
@@ -752,6 +753,7 @@ struct ext4_inode_info {
752#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 753#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
753#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 754#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
754#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 755#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
756#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
755#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 757#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
756#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 758#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
757#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 759#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1781 __u64 len, __u64 *moved_len); 1783 __u64 len, __u64 *moved_len);
1782 1784
1783 1785
1786/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
1787enum ext4_state_bits {
1788 BH_Uninit /* blocks are allocated but uninitialized on disk */
1789 = BH_JBDPrivateStart,
1790};
1791
1792BUFFER_FNS(Uninit, uninit)
1793TAS_BUFFER_FNS(Uninit, uninit)
1794
1784/* 1795/*
1785 * Add new method to test wether block and inode bitmaps are properly 1796 * Add new method to test wether block and inode bitmaps are properly
1786 * initialized. With uninit_bg reading the block from disk is not enough 1797 * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
304 return 0; 304 return 0;
305} 305}
306 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
307#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 90ba8d9df697..c7f166ab50eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1619 BUG_ON(path[depth].p_hdr == NULL); 1619 BUG_ON(path[depth].p_hdr == NULL);
1620 1620
1621 /* try to insert block into found extent and return */ 1621 /* try to insert block into found extent and return */
1622 if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO) 1622 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1623 && ext4_can_extents_be_merged(inode, ex, newext)) { 1623 && ext4_can_extents_be_merged(inode, ex, newext)) {
1624 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1624 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1625 ext4_ext_is_uninitialized(newext), 1625 ext4_ext_is_uninitialized(newext),
@@ -1740,7 +1740,7 @@ has_space:
1740 1740
1741merge: 1741merge:
1742 /* try to merge extents to the right */ 1742 /* try to merge extents to the right */
1743 if (flag != EXT4_GET_BLOCKS_PRE_IO) 1743 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1744 ext4_ext_try_to_merge(inode, path, nearex); 1744 ext4_ext_try_to_merge(inode, path, nearex);
1745 1745
1746 /* try to merge extents to the left */ 1746 /* try to merge extents to the left */
@@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3065 ext4_ext_show_leaf(inode, path); 3065 ext4_ext_show_leaf(inode, path);
3066 3066
3067 /* get_block() before submit the IO, split the extent */ 3067 /* get_block() before submit the IO, split the extent */
3068 if (flags == EXT4_GET_BLOCKS_PRE_IO) { 3068 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3069 ret = ext4_split_unwritten_extents(handle, 3069 ret = ext4_split_unwritten_extents(handle,
3070 inode, path, iblock, 3070 inode, path, iblock,
3071 max_blocks, flags); 3071 max_blocks, flags);
@@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3078 io->flag = EXT4_IO_UNWRITTEN; 3078 io->flag = EXT4_IO_UNWRITTEN;
3079 else 3079 else
3080 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3080 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3081 if (ext4_should_dioread_nolock(inode))
3082 set_buffer_uninit(bh_result);
3081 goto out; 3083 goto out;
3082 } 3084 }
3083 /* IO end_io complete, convert the filled extent to written */ 3085 /* IO end_io complete, convert the filled extent to written */
3084 if (flags == EXT4_GET_BLOCKS_CONVERT) { 3086 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3085 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3087 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3086 path); 3088 path);
3087 if (ret >= 0) 3089 if (ret >= 0)
@@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3351 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3353 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3352 ext4_ext_mark_uninitialized(&newex); 3354 ext4_ext_mark_uninitialized(&newex);
3353 /* 3355 /*
3354 * io_end structure was created for every async 3356 * io_end structure was created for every IO write to an
3355 * direct IO write to the middle of the file. 3357 * uninitialized extent. To avoid unecessary conversion,
3356 * To avoid unecessary convertion for every aio dio rewrite 3358 * here we flag the IO that really needs the conversion.
3357 * to the mid of file, here we flag the IO that is really
3358 * need the convertion.
3359 * For non asycn direct IO case, flag the inode state 3359 * For non asycn direct IO case, flag the inode state
3360 * that we need to perform convertion when IO is done. 3360 * that we need to perform convertion when IO is done.
3361 */ 3361 */
3362 if (flags == EXT4_GET_BLOCKS_PRE_IO) { 3362 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3363 if (io) 3363 if (io)
3364 io->flag = EXT4_IO_UNWRITTEN; 3364 io->flag = EXT4_IO_UNWRITTEN;
3365 else 3365 else
3366 ext4_set_inode_state(inode, 3366 ext4_set_inode_state(inode,
3367 EXT4_STATE_DIO_UNWRITTEN); 3367 EXT4_STATE_DIO_UNWRITTEN);
3368 } 3368 }
3369 if (ext4_should_dioread_nolock(inode))
3370 set_buffer_uninit(bh_result);
3369 } 3371 }
3370 3372
3371 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3373 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f116bdc405..d291310aef6b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
41 42
42#include "ext4_jbd2.h" 43#include "ext4_jbd2.h"
43#include "xattr.h" 44#include "xattr.h"
@@ -1534,6 +1535,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
1534 ext4_truncate(inode); 1535 ext4_truncate(inode);
1535} 1536}
1536 1537
1538static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1539 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1540static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1541 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1542 struct page **pagep, void **fsdata)
@@ -1575,8 +1578,12 @@ retry:
1575 } 1578 }
1576 *pagep = page; 1579 *pagep = page;
1577 1580
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1581 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1582 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1583 fsdata, ext4_get_block_write);
1584 else
1585 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1586 fsdata, ext4_get_block);
1580 1587
1581 if (!ret && ext4_should_journal_data(inode)) { 1588 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1589 ret = walk_page_buffers(handle, page_buffers(page),
@@ -2092,6 +2099,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2092 } else if (buffer_mapped(bh)) 2099 } else if (buffer_mapped(bh))
2093 BUG_ON(bh->b_blocknr != pblock); 2100 BUG_ON(bh->b_blocknr != pblock);
2094 2101
2102 if (buffer_uninit(exbh))
2103 set_buffer_uninit(bh);
2095 cur_logical++; 2104 cur_logical++;
2096 pblock++; 2105 pblock++;
2097 } while ((bh = bh->b_this_page) != head); 2106 } while ((bh = bh->b_this_page) != head);
@@ -2221,6 +2230,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2221 */ 2230 */
2222 new.b_state = 0; 2231 new.b_state = 0;
2223 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2232 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2233 if (ext4_should_dioread_nolock(mpd->inode))
2234 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2224 if (mpd->b_state & (1 << BH_Delay)) 2235 if (mpd->b_state & (1 << BH_Delay))
2225 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2236 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2226 2237
@@ -2636,6 +2647,9 @@ out:
2636 return ret; 2647 return ret;
2637} 2648}
2638 2649
2650static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2651static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2652
2639/* 2653/*
2640 * Note that we don't need to start a transaction unless we're journaling data 2654 * Note that we don't need to start a transaction unless we're journaling data
2641 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2655 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2683,7 +2697,7 @@ static int ext4_writepage(struct page *page,
2683 int ret = 0; 2697 int ret = 0;
2684 loff_t size; 2698 loff_t size;
2685 unsigned int len; 2699 unsigned int len;
2686 struct buffer_head *page_bufs; 2700 struct buffer_head *page_bufs = NULL;
2687 struct inode *inode = page->mapping->host; 2701 struct inode *inode = page->mapping->host;
2688 2702
2689 trace_ext4_writepage(inode, page); 2703 trace_ext4_writepage(inode, page);
@@ -2759,7 +2773,11 @@ static int ext4_writepage(struct page *page,
2759 2773
2760 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2774 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2761 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2775 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2762 else 2776 else if (page_bufs && buffer_uninit(page_bufs)) {
2777 ext4_set_bh_endio(page_bufs, inode);
2778 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2779 wbc, ext4_end_io_buffer_write);
2780 } else
2763 ret = block_write_full_page(page, noalloc_get_block_write, 2781 ret = block_write_full_page(page, noalloc_get_block_write,
2764 wbc); 2782 wbc);
2765 2783
@@ -3347,11 +3365,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3347 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3365 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3348} 3366}
3349 3367
3368static void ext4_free_io_end(ext4_io_end_t *io)
3369{
3370 BUG_ON(!io);
3371 if (io->page)
3372 put_page(io->page);
3373 iput(io->inode);
3374 kfree(io);
3375}
3376
3377static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3378{
3379 struct buffer_head *head, *bh;
3380 unsigned int curr_off = 0;
3381
3382 if (!page_has_buffers(page))
3383 return;
3384 head = bh = page_buffers(page);
3385 do {
3386 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3387 && bh->b_private) {
3388 ext4_free_io_end(bh->b_private);
3389 bh->b_private = NULL;
3390 bh->b_end_io = NULL;
3391 }
3392 curr_off = curr_off + bh->b_size;
3393 bh = bh->b_this_page;
3394 } while (bh != head);
3395}
3396
3350static void ext4_invalidatepage(struct page *page, unsigned long offset) 3397static void ext4_invalidatepage(struct page *page, unsigned long offset)
3351{ 3398{
3352 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3399 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3353 3400
3354 /* 3401 /*
3402 * free any io_end structure allocated for buffers to be discarded
3403 */
3404 if (ext4_should_dioread_nolock(page->mapping->host))
3405 ext4_invalidatepage_free_endio(page, offset);
3406 /*
3355 * If it's a full truncate we just forget about the pending dirtying 3407 * If it's a full truncate we just forget about the pending dirtying
3356 */ 3408 */
3357 if (offset == 0) 3409 if (offset == 0)
@@ -3471,10 +3523,11 @@ out:
3471static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3523static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3472 struct buffer_head *bh_result, int create) 3524 struct buffer_head *bh_result, int create)
3473{ 3525{
3474 handle_t *handle = NULL; 3526 handle_t *handle = ext4_journal_current_handle();
3475 int ret = 0; 3527 int ret = 0;
3476 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3528 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3477 int dio_credits; 3529 int dio_credits;
3530 int started = 0;
3478 3531
3479 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3532 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3480 inode->i_ino, create); 3533 inode->i_ino, create);
@@ -3485,37 +3538,36 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3485 */ 3538 */
3486 create = EXT4_GET_BLOCKS_IO_CREATE_EXT; 3539 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3487 3540
3488 if (max_blocks > DIO_MAX_BLOCKS) 3541 if (!handle) {
3489 max_blocks = DIO_MAX_BLOCKS; 3542 if (max_blocks > DIO_MAX_BLOCKS)
3490 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3543 max_blocks = DIO_MAX_BLOCKS;
3491 handle = ext4_journal_start(inode, dio_credits); 3544 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3492 if (IS_ERR(handle)) { 3545 handle = ext4_journal_start(inode, dio_credits);
3493 ret = PTR_ERR(handle); 3546 if (IS_ERR(handle)) {
3494 goto out; 3547 ret = PTR_ERR(handle);
3548 goto out;
3549 }
3550 started = 1;
3495 } 3551 }
3552
3496 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3553 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3497 create); 3554 create);
3498 if (ret > 0) { 3555 if (ret > 0) {
3499 bh_result->b_size = (ret << inode->i_blkbits); 3556 bh_result->b_size = (ret << inode->i_blkbits);
3500 ret = 0; 3557 ret = 0;
3501 } 3558 }
3502 ext4_journal_stop(handle); 3559 if (started)
3560 ext4_journal_stop(handle);
3503out: 3561out:
3504 return ret; 3562 return ret;
3505} 3563}
3506 3564
3507static void ext4_free_io_end(ext4_io_end_t *io)
3508{
3509 BUG_ON(!io);
3510 iput(io->inode);
3511 kfree(io);
3512}
3513
3514static void dump_completed_IO(struct inode * inode) 3565static void dump_completed_IO(struct inode * inode)
3515{ 3566{
3516#ifdef EXT4_DEBUG 3567#ifdef EXT4_DEBUG
3517 struct list_head *cur, *before, *after; 3568 struct list_head *cur, *before, *after;
3518 ext4_io_end_t *io, *io0, *io1; 3569 ext4_io_end_t *io, *io0, *io1;
3570 unsigned long flags;
3519 3571
3520 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ 3572 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3521 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); 3573 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
@@ -3523,6 +3575,7 @@ static void dump_completed_IO(struct inode * inode)
3523 } 3575 }
3524 3576
3525 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); 3577 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3578 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3526 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ 3579 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3527 cur = &io->list; 3580 cur = &io->list;
3528 before = cur->prev; 3581 before = cur->prev;
@@ -3533,6 +3586,7 @@ static void dump_completed_IO(struct inode * inode)
3533 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3586 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3534 io, inode->i_ino, io0, io1); 3587 io, inode->i_ino, io0, io1);
3535 } 3588 }
3589 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3536#endif 3590#endif
3537} 3591}
3538 3592
@@ -3556,9 +3610,7 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
3556 if (io->flag != EXT4_IO_UNWRITTEN) 3610 if (io->flag != EXT4_IO_UNWRITTEN)
3557 return ret; 3611 return ret;
3558 3612
3559 if (offset + size <= i_size_read(inode)) 3613 ret = ext4_convert_unwritten_extents(inode, offset, size);
3560 ret = ext4_convert_unwritten_extents(inode, offset, size);
3561
3562 if (ret < 0) { 3614 if (ret < 0) {
3563 printk(KERN_EMERG "%s: failed to convert unwritten" 3615 printk(KERN_EMERG "%s: failed to convert unwritten"
3564 "extents to written extents, error is %d" 3616 "extents to written extents, error is %d"
@@ -3577,18 +3629,25 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
3577 */ 3629 */
3578static void ext4_end_io_work(struct work_struct *work) 3630static void ext4_end_io_work(struct work_struct *work)
3579{ 3631{
3580 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3632 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3581 struct inode *inode = io->inode; 3633 struct inode *inode = io->inode;
3582 int ret = 0; 3634 struct ext4_inode_info *ei = EXT4_I(inode);
3635 unsigned long flags;
3636 int ret;
3583 3637
3584 mutex_lock(&inode->i_mutex); 3638 mutex_lock(&inode->i_mutex);
3585 ret = ext4_end_io_nolock(io); 3639 ret = ext4_end_io_nolock(io);
3586 if (ret >= 0) { 3640 if (ret < 0) {
3587 if (!list_empty(&io->list)) 3641 mutex_unlock(&inode->i_mutex);
3588 list_del_init(&io->list); 3642 return;
3589 ext4_free_io_end(io);
3590 } 3643 }
3644
3645 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3646 if (!list_empty(&io->list))
3647 list_del_init(&io->list);
3648 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3591 mutex_unlock(&inode->i_mutex); 3649 mutex_unlock(&inode->i_mutex);
3650 ext4_free_io_end(io);
3592} 3651}
3593 3652
3594/* 3653/*
@@ -3607,15 +3666,18 @@ static void ext4_end_io_work(struct work_struct *work)
3607int flush_completed_IO(struct inode *inode) 3666int flush_completed_IO(struct inode *inode)
3608{ 3667{
3609 ext4_io_end_t *io; 3668 ext4_io_end_t *io;
3669 struct ext4_inode_info *ei = EXT4_I(inode);
3670 unsigned long flags;
3610 int ret = 0; 3671 int ret = 0;
3611 int ret2 = 0; 3672 int ret2 = 0;
3612 3673
3613 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) 3674 if (list_empty(&ei->i_completed_io_list))
3614 return ret; 3675 return ret;
3615 3676
3616 dump_completed_IO(inode); 3677 dump_completed_IO(inode);
3617 while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){ 3678 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3618 io = list_entry(EXT4_I(inode)->i_completed_io_list.next, 3679 while (!list_empty(&ei->i_completed_io_list)){
3680 io = list_entry(ei->i_completed_io_list.next,
3619 ext4_io_end_t, list); 3681 ext4_io_end_t, list);
3620 /* 3682 /*
3621 * Calling ext4_end_io_nolock() to convert completed 3683 * Calling ext4_end_io_nolock() to convert completed
@@ -3631,20 +3693,23 @@ int flush_completed_IO(struct inode *inode)
3631 * avoid double converting from both fsync and background work 3693 * avoid double converting from both fsync and background work
3632 * queue work. 3694 * queue work.
3633 */ 3695 */
3696 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3634 ret = ext4_end_io_nolock(io); 3697 ret = ext4_end_io_nolock(io);
3698 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3635 if (ret < 0) 3699 if (ret < 0)
3636 ret2 = ret; 3700 ret2 = ret;
3637 else 3701 else
3638 list_del_init(&io->list); 3702 list_del_init(&io->list);
3639 } 3703 }
3704 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3640 return (ret2 < 0) ? ret2 : 0; 3705 return (ret2 < 0) ? ret2 : 0;
3641} 3706}
3642 3707
3643static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3708static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3644{ 3709{
3645 ext4_io_end_t *io = NULL; 3710 ext4_io_end_t *io = NULL;
3646 3711
3647 io = kmalloc(sizeof(*io), GFP_NOFS); 3712 io = kmalloc(sizeof(*io), flags);
3648 3713
3649 if (io) { 3714 if (io) {
3650 igrab(inode); 3715 igrab(inode);
@@ -3652,7 +3717,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3652 io->flag = 0; 3717 io->flag = 0;
3653 io->offset = 0; 3718 io->offset = 0;
3654 io->size = 0; 3719 io->size = 0;
3655 io->error = 0; 3720 io->page = NULL;
3656 INIT_WORK(&io->work, ext4_end_io_work); 3721 INIT_WORK(&io->work, ext4_end_io_work);
3657 INIT_LIST_HEAD(&io->list); 3722 INIT_LIST_HEAD(&io->list);
3658 } 3723 }
@@ -3665,6 +3730,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3665{ 3730{
3666 ext4_io_end_t *io_end = iocb->private; 3731 ext4_io_end_t *io_end = iocb->private;
3667 struct workqueue_struct *wq; 3732 struct workqueue_struct *wq;
3733 unsigned long flags;
3734 struct ext4_inode_info *ei;
3668 3735
3669 /* if not async direct IO or dio with 0 bytes write, just return */ 3736 /* if not async direct IO or dio with 0 bytes write, just return */
3670 if (!io_end || !size) 3737 if (!io_end || !size)
@@ -3684,17 +3751,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3684 3751
3685 io_end->offset = offset; 3752 io_end->offset = offset;
3686 io_end->size = size; 3753 io_end->size = size;
3754 io_end->flag = EXT4_IO_UNWRITTEN;
3687 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3755 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3688 3756
3689 /* queue the work to convert unwritten extents to written */ 3757 /* queue the work to convert unwritten extents to written */
3690 queue_work(wq, &io_end->work); 3758 queue_work(wq, &io_end->work);
3691 3759
3692 /* Add the io_end to per-inode completed aio dio list*/ 3760 /* Add the io_end to per-inode completed aio dio list*/
3693 list_add_tail(&io_end->list, 3761 ei = EXT4_I(io_end->inode);
3694 &EXT4_I(io_end->inode)->i_completed_io_list); 3762 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3763 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3764 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3695 iocb->private = NULL; 3765 iocb->private = NULL;
3696} 3766}
3697 3767
3768static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3769{
3770 ext4_io_end_t *io_end = bh->b_private;
3771 struct workqueue_struct *wq;
3772 struct inode *inode;
3773 unsigned long flags;
3774
3775 if (!test_clear_buffer_uninit(bh) || !io_end)
3776 goto out;
3777
3778 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3779 printk("sb umounted, discard end_io request for inode %lu\n",
3780 io_end->inode->i_ino);
3781 ext4_free_io_end(io_end);
3782 goto out;
3783 }
3784
3785 io_end->flag = EXT4_IO_UNWRITTEN;
3786 inode = io_end->inode;
3787
3788 /* Add the io_end to per-inode completed io list*/
3789 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3790 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3791 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3792
3793 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3794 /* queue the work to convert unwritten extents to written */
3795 queue_work(wq, &io_end->work);
3796out:
3797 bh->b_private = NULL;
3798 bh->b_end_io = NULL;
3799 clear_buffer_uninit(bh);
3800 end_buffer_async_write(bh, uptodate);
3801}
3802
3803static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3804{
3805 ext4_io_end_t *io_end;
3806 struct page *page = bh->b_page;
3807 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3808 size_t size = bh->b_size;
3809
3810retry:
3811 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3812 if (!io_end) {
3813 if (printk_ratelimit())
3814 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3815 schedule();
3816 goto retry;
3817 }
3818 io_end->offset = offset;
3819 io_end->size = size;
3820 /*
3821 * We need to hold a reference to the page to make sure it
3822 * doesn't get evicted before ext4_end_io_work() has a chance
3823 * to convert the extent from written to unwritten.
3824 */
3825 io_end->page = page;
3826 get_page(io_end->page);
3827
3828 bh->b_private = io_end;
3829 bh->b_end_io = ext4_end_io_buffer_write;
3830 return 0;
3831}
3832
3698/* 3833/*
3699 * For ext4 extent files, ext4 will do direct-io write to holes, 3834 * For ext4 extent files, ext4 will do direct-io write to holes,
3700 * preallocated extents, and those write extend the file, no need to 3835 * preallocated extents, and those write extend the file, no need to
@@ -3748,7 +3883,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3748 iocb->private = NULL; 3883 iocb->private = NULL;
3749 EXT4_I(inode)->cur_aio_dio = NULL; 3884 EXT4_I(inode)->cur_aio_dio = NULL;
3750 if (!is_sync_kiocb(iocb)) { 3885 if (!is_sync_kiocb(iocb)) {
3751 iocb->private = ext4_init_io_end(inode); 3886 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3752 if (!iocb->private) 3887 if (!iocb->private)
3753 return -ENOMEM; 3888 return -ENOMEM;
3754 /* 3889 /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc7a97e79e3b..5e8f9077b0fc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
709 ei->i_reserved_quota = 0; 709 ei->i_reserved_quota = 0;
710#endif 710#endif
711 INIT_LIST_HEAD(&ei->i_completed_io_list); 711 INIT_LIST_HEAD(&ei->i_completed_io_list);
712 spin_lock_init(&ei->i_completed_io_lock);
712 ei->cur_aio_dio = NULL; 713 ei->cur_aio_dio = NULL;
713 ei->i_sync_tid = 0; 714 ei->i_sync_tid = 0;
714 ei->i_datasync_tid = 0; 715 ei->i_datasync_tid = 0;
@@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
926 if (test_opt(sb, NOLOAD)) 927 if (test_opt(sb, NOLOAD))
927 seq_puts(seq, ",norecovery"); 928 seq_puts(seq, ",norecovery");
928 929
930 if (test_opt(sb, DIOREAD_NOLOCK))
931 seq_puts(seq, ",dioread_nolock");
932
929 ext4_show_quota_options(seq, sb); 933 ext4_show_quota_options(seq, sb);
930 934
931 return 0; 935 return 0;
@@ -1109,6 +1113,7 @@ enum {
1109 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1113 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1110 Opt_block_validity, Opt_noblock_validity, 1114 Opt_block_validity, Opt_noblock_validity,
1111 Opt_inode_readahead_blks, Opt_journal_ioprio, 1115 Opt_inode_readahead_blks, Opt_journal_ioprio,
1116 Opt_dioread_nolock, Opt_dioread_lock,
1112 Opt_discard, Opt_nodiscard, 1117 Opt_discard, Opt_nodiscard,
1113}; 1118};
1114 1119
@@ -1176,6 +1181,8 @@ static const match_table_t tokens = {
1176 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1181 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1177 {Opt_auto_da_alloc, "auto_da_alloc"}, 1182 {Opt_auto_da_alloc, "auto_da_alloc"},
1178 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1183 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1184 {Opt_dioread_nolock, "dioread_nolock"},
1185 {Opt_dioread_lock, "dioread_lock"},
1179 {Opt_discard, "discard"}, 1186 {Opt_discard, "discard"},
1180 {Opt_nodiscard, "nodiscard"}, 1187 {Opt_nodiscard, "nodiscard"},
1181 {Opt_err, NULL}, 1188 {Opt_err, NULL},
@@ -1640,6 +1647,12 @@ set_qf_format:
1640 case Opt_nodiscard: 1647 case Opt_nodiscard:
1641 clear_opt(sbi->s_mount_opt, DISCARD); 1648 clear_opt(sbi->s_mount_opt, DISCARD);
1642 break; 1649 break;
1650 case Opt_dioread_nolock:
1651 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1652 break;
1653 case Opt_dioread_lock:
1654 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1655 break;
1643 default: 1656 default:
1644 ext4_msg(sb, KERN_ERR, 1657 ext4_msg(sb, KERN_ERR,
1645 "Unrecognized mount option \"%s\" " 1658 "Unrecognized mount option \"%s\" "
@@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2795 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2808 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2796 ext4_msg(sb, KERN_ERR, "required journal recovery " 2809 ext4_msg(sb, KERN_ERR, "required journal recovery "
2797 "suppressed and not mounted read-only"); 2810 "suppressed and not mounted read-only");
2798 goto failed_mount4; 2811 goto failed_mount_wq;
2799 } else { 2812 } else {
2800 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2813 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2801 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2814 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2808 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2821 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2809 JBD2_FEATURE_INCOMPAT_64BIT)) { 2822 JBD2_FEATURE_INCOMPAT_64BIT)) {
2810 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2823 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2811 goto failed_mount4; 2824 goto failed_mount_wq;
2812 } 2825 }
2813 2826
2814 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2827 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2847 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2860 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2848 ext4_msg(sb, KERN_ERR, "Journal does not support " 2861 ext4_msg(sb, KERN_ERR, "Journal does not support "
2849 "requested data journaling mode"); 2862 "requested data journaling mode");
2850 goto failed_mount4; 2863 goto failed_mount_wq;
2851 } 2864 }
2852 default: 2865 default:
2853 break; 2866 break;
@@ -2855,13 +2868,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2855 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2868 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2856 2869
2857no_journal: 2870no_journal:
2858
2859 if (test_opt(sb, NOBH)) { 2871 if (test_opt(sb, NOBH)) {
2860 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2872 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2861 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2873 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2862 "its supported only with writeback mode"); 2874 "its supported only with writeback mode");
2863 clear_opt(sbi->s_mount_opt, NOBH); 2875 clear_opt(sbi->s_mount_opt, NOBH);
2864 } 2876 }
2877 if (test_opt(sb, DIOREAD_NOLOCK)) {
2878 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2879 "not supported with nobh mode");
2880 goto failed_mount_wq;
2881 }
2865 } 2882 }
2866 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2883 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2867 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2884 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2926,6 +2943,18 @@ no_journal:
2926 "requested data journaling mode"); 2943 "requested data journaling mode");
2927 clear_opt(sbi->s_mount_opt, DELALLOC); 2944 clear_opt(sbi->s_mount_opt, DELALLOC);
2928 } 2945 }
2946 if (test_opt(sb, DIOREAD_NOLOCK)) {
2947 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2948 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2949 "option - requested data journaling mode");
2950 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2951 }
2952 if (sb->s_blocksize < PAGE_SIZE) {
2953 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2954 "option - block size is too small");
2955 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2956 }
2957 }
2929 2958
2930 err = ext4_setup_system_zone(sb); 2959 err = ext4_setup_system_zone(sb);
2931 if (err) { 2960 if (err) {