path: root/fs/ext4/inode.c
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c  1198
1 file changed, 532 insertions(+), 666 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..e3126c051006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
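Stripped of the side-by-side markup, the patched helper reads as follows (a
restatement using only the identifiers visible in the hunk above):

    static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                                  loff_t new_size)
    {
            trace_ext4_begin_ordered_truncate(inode, new_size);
            /*
             * A NULL jinode means the file was never opened for writing,
             * so there are no ordered-mode writes to flush before the
             * truncate.
             */
            if (!EXT4_I(inode)->jinode)
                    return 0;
            return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
                                                       EXT4_I(inode)->jinode,
                                                       new_size);
    }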
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -157,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
157 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
158 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
159 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
160 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
161 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
162 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
163 179
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks need to be allocated for the given branch. 565 * of direct blocks need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
586 * @indirect_blks: the number of blocks need to allocate for indirect 607 * @indirect_blks: the number of blocks need to allocate for indirect
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -616,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
616 while (target > 0) { 639 while (target > 0) {
617 count = target; 640 count = target;
618 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
619 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
620 goal, &count, err); 643 0, &count, err);
621 if (*err) 644 if (*err)
622 goto failed_out; 645 goto failed_out;
623 646
@@ -697,15 +720,17 @@ allocated:
697 return ret; 720 return ret;
698failed_out: 721failed_out:
699 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
700 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
701 return ret; 724 return ret;
702} 725}
703 726
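The 0 -> NULL switch in the cleanup loop above (repeated at the
ext4_free_blocks() call sites further down) is type hygiene rather than a
behaviour change: the third argument is a struct buffer_head pointer, so the
null-pointer literal belongs there rather than the integer 0, which sparse
flags. The corrected call shape:

    /* bh is NULL when freeing by block number alone */
    ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);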
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
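sb_getblk() can return NULL under memory pressure, and the old code went on
to dereference the result unconditionally. The added guard follows the usual
pattern, unwinding through the function's existing failed: label:

    bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
    if (unlikely(!bh)) {
            err = -EIO;     /* report the lost buffer as an I/O error */
            goto failed;    /* free the blocks allocated so far */
    }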
@@ -793,26 +823,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
793 return err; 823 return err;
794failed: 824failed:
795 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
797 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
798 /* 828 /*
799 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
800 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
801 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
802 */ 832 */
803 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
804 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
805 } 835 }
806 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
807 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
808 838
809 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
810 840
811 return err; 841 return err;
812} 842}
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -893,7 +924,7 @@ err_out:
893 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
894 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
895 } 926 }
896 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
897 blks, 0); 928 blks, 0);
898 929
899 return err; 930 return err;
@@ -942,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
942 int count = 0; 973 int count = 0;
943 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
944 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
947 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1027,6 +1059,8 @@ cleanup:
1027 partial--; 1059 partial--;
1028 } 1060 }
1029out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1030 return err; 1064 return err;
1031} 1065}
1032 1066
@@ -1068,7 +1102,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks need to reserve 1102 * Calculate the number of metadata blocks need to reserve
1069 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1070 */ 1104 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1106{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1241,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1241 break;
1208 idx++; 1242 idx++;
1209 num++; 1243 num++;
1210 if (num >= max_pages) 1244 if (num >= max_pages) {
1245 done = 1;
1211 break; 1246 break;
1247 }
1212 } 1248 }
1213 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1214 } 1250 }
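The old code broke out of the inner for loop once max_pages was reached, but
the enclosing while kept refilling the pagevec and scanning further. Setting
done lets the outer loop observe the limit as well. In outline (a sketch of
this function's loop structure, with the per-page checks abbreviated):

    while (!done) {
            nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                          PAGECACHE_TAG_DIRTY,
                                          (pgoff_t)PAGEVEC_SIZE);
            if (nr_pages == 0)
                    break;
            for (i = 0; i < nr_pages; i++) {
                    /* ... stop at the first non-dirty page ... */
                    num++;
                    if (num >= max_pages) {
                            done = 1;       /* terminate the outer loop too */
                            break;
                    }
            }
            pagevec_release(&pvec);
    }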
@@ -1305,7 +1341,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1341 * avoid double accounting
1306 */ 1342 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1345 /*
1310 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1347 * could have changed the inode type in between
@@ -1335,7 +1371,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1372 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1375
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
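i_delalloc_reserved_flag was a dedicated field in struct ext4_inode_info;
the patch retires it in favour of a bit in the generic inode state set. The
bracket around the allocation, assembled from the two hunks above:

    if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
            ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
    /* ... perform the allocation with i_data_sem held ... */
    if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
            ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
    up_write((&EXT4_I(inode)->i_data_sem));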
@@ -1538,10 +1574,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1575 return 0;
1540 /* 1576 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1582 * ever write the buffer.
1547 */ 1583 */
@@ -1863,7 +1899,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1899/*
1864 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1865 */ 1901 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1902static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1903{
1868 int retries = 0; 1904 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1894,7 +1930,7 @@ repeat:
1894 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1896 */ 1932 */
1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1898 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1900 yield(); 1936 yield();
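ext4_claim_free_blocks() grows a flags argument here (passed as 0). The
surrounding logic is the usual reserve-then-claim dance: quota is charged
first and must be released again if the filesystem cannot actually supply
the blocks. A sketch of the flow, assuming the dquot_reserve_block() charge
that opens this function:

    repeat:
            if (dquot_reserve_block(inode, 1))
                    return -EDQUOT;         /* quota refused the charge */
            if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
                    /* undo the quota charge before retrying or failing */
                    dquot_release_reservation_block(inode, 1);
                    if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                            yield();
                            goto repeat;
                    }
                    return -ENOSPC;
            }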
@@ -1995,16 +2031,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2031 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2033 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2034static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map)
1999{ 2036{
2000 long pages_skipped;
2001 struct pagevec pvec; 2037 struct pagevec pvec;
2002 unsigned long index, end; 2038 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit;
2006 2048
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2051 /*
2009 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,124 +2063,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2063 if (nr_pages == 0)
2021 break; 2064 break;
2022 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0;
2023 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2024 2068
2025 index = page->index; 2069 index = page->index;
2026 if (index > end) 2070 if (index > end)
2027 break; 2071 break;
2072
2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK;
2075 else
2076 len = PAGE_CACHE_SIZE;
2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk);
2082 }
2028 index++; 2083 index++;
2029 2084
2030 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2032 2087
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2088 /*
2042 * In error case, we have to continue because 2089 * If the page does not have buffers (for
2043 * remaining pages are still locked 2090 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on.
2045 */ 2093 */
2046 if (ret == 0) 2094 if (!page_has_buffers(page)) {
2047 ret = err; 2095 if (__block_write_begin(page, 0, len,
2048 } 2096 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2097 skip_page:
2050 } 2098 unlock_page(page);
2051 return ret; 2099 continue;
2052} 2100 }
2053 2101 commit_write = 1;
2054/* 2102 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2056 *
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2103
2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0;
2105 do { 2106 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2107 if (!bh)
2107 break; 2108 goto skip_page;
2108 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2110 (cur_logical <= (map->m_lblk +
2110 2111 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2115 }
2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh);
2122 }
2124 2123
2125 } else if (buffer_mapped(bh)) 2124 /* skip page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2126 skip_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2127 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2128 block_start += bh->b_size;
2130 cur_logical++; 2129 cur_logical++;
2131 pblock++; 2130 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2131 } while (bh != page_bufs);
2132
2133 if (skip_page)
2134 goto skip_page;
2135
2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len);
2139
2140 clear_page_dirty_for_io(page);
2141 /*
2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this
2144 * restriction.
2145 */
2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc);
2151 else
2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc);
2154
2155 if (!err)
2156 mpd->pages_written++;
2157 /*
2158 * In error case, we have to continue because
2159 * remaining pages are still locked
2160 */
2161 if (ret == 0)
2162 ret = err;
2133 } 2163 }
2134 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2135 } 2165 }
2166 ext4_io_submit(&io_submit);
2167 return ret;
2136} 2168}
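This rewrite folds two old passes into one: the removed mpage_put_bnr_to_bhs()
walk, which patched physical block numbers into delayed buffers, and the
per-page ->writepage() call. Condensed, the new per-page flow is roughly as
below; in_map_range() is a hypothetical shorthand for the m_lblk/m_len bounds
test spelled out in the hunk, and error bookkeeping is elided:

    if (!page_has_buffers(page)) {
            if (__block_write_begin(page, 0, len, noalloc_get_block_write))
                    goto skip_page;         /* cannot even attach buffers */
            commit_write = 1;
    }
    bh = page_bufs = page_buffers(page);
    do {
            if (map && in_map_range(cur_logical, map)) {
                    if (buffer_delay(bh)) {
                            clear_buffer_delay(bh);
                            bh->b_blocknr = pblock; /* real block number */
                    }
                    if (map->m_flags & EXT4_MAP_UNINIT)
                            set_buffer_uninit(bh);
                    clear_buffer_unwritten(bh);
            }
            /* any buffer still delayed/unwritten => allocation undone */
            if (buffer_delay(bh) || buffer_unwritten(bh))
                    skip_page = 1;
            cur_logical++;
            pblock++;
    } while ((bh = bh->b_this_page) != page_bufs);
    if (skip_page)
            goto skip_page;                 /* unlock and move on */
    if (commit_write)
            block_commit_write(page, 0, len);
    clear_page_dirty_for_io(page);
    if (journal_data && PageChecked(page))
            err = __ext4_journalled_writepage(page, len);
    else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
            err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc);
    else
            err = block_write_full_page(page, noalloc_get_block_write,
                                        mpd->wbc);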
2137 2169
2138 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt)
2141{ 2171{
2142 int nr_pages, i; 2172 int nr_pages, i;
2143 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2145,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2145 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2146 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2147 2177
2148 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2149 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2150 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2151 while (index <= end) { 2180 while (index <= end) {
2152 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2153 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in questions were delalloc blocks, set 2263 * If the blocks in questions were delalloc blocks, set
@@ -2252,17 +2278,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appears to be free blocks we will just let
2257 * writepages will find the dirty page again 2283 * mpage_da_submit_io() unlock all of the pages.
2258 */ 2284 */
2259 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2260 return 0; 2286 goto submit_io;
2261 2287
2262 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2290 mpd->retval = err;
2265 return 0; 2291 goto submit_io;
2266 } 2292 }
2267 2293
2268 /* 2294 /*
@@ -2285,12 +2311,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2285 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2286 } 2312 }
2287 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2289 mpd->b_size >> mpd->inode->i_blkbits); 2315
2290 return err; 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2318 return;
2291 } 2319 }
2292 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2293 2321
2322 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2325 int i;
@@ -2299,18 +2328,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2329 }
2301 2330
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2333 if (err)
2313 return err; 2334 /* This only happens if the journal is aborted */
2335 return;
2314 } 2336 }
2315 2337
2316 /* 2338 /*
@@ -2321,10 +2343,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err)
2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino);
2325 } 2351 }
2326 2352
2327 return 0; 2353submit_io:
2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1;
2328} 2356}
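Where mpage_da_map_blocks() returned a status and left submission to its
callers, mpage_da_map_and_submit() owns the whole sequence: every path ends
either in submission or in invalidating the accumulated pages. A skeleton,
with the already-mapped test abbreviated as nothing_to_map() (a hypothetical
name; the hunk spells out the b_size/b_state checks):

    static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
    {
            struct ext4_map_blocks map, *mapp = NULL;

            if (nothing_to_map(mpd))        /* empty, or already mapped */
                    goto submit_io;

            blks = ext4_map_blocks(handle, mpd->inode, &map,
                                   get_blocks_flags);
            if (blks < 0) {
                    err = blks;
                    /* transient: leave mapp NULL so that submit_io just
                     * unlocks the pages for a later pass */
                    if (err == -EAGAIN ||
                        (err == -ENOSPC && ext4_count_free_blocks(sb)))
                            goto submit_io;
                    /* fatal: throw the pages away and give up the range */
                    ext4_da_block_invalidatepages(mpd);
                    mpd->io_done = 1;
                    return;
            }
            mapp = &map;
    submit_io:
            mpage_da_submit_io(mpd, mapp);
            mpd->io_done = 1;
    }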
2329 2357
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2429,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2403 */ 2431 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2432 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2433 return;
2408} 2434}
2409 2435
@@ -2413,104 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2413} 2439}
2414 2440
2415/* 2441/*
2416 * __mpage_da_writepage - finds extent of pages and blocks
2417 *
2418 * @page: page to consider
2419 * @wbc: not used, we just follow rules
2420 * @data: context
2421 *
2422 * The function finds extents of pages and scan them for all blocks.
2423 */
2424static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data)
2426{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head;
2430 sector_t logical;
2431
2432 /*
2433 * Can we merge this page to current extent?
2434 */
2435 if (mpd->next_page != page->index) {
2436 /*
2437 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage()
2439 */
2440 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0)
2442 mpage_da_submit_io(mpd);
2443 /*
2444 * skip rest of the page in the page_vec
2445 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL;
2450 }
2451
2452 /*
2453 * Start next extent of pages ...
2454 */
2455 mpd->first_page = page->index;
2456
2457 /*
2458 * ... and blocks
2459 */
2460 mpd->b_size = 0;
2461 mpd->b_state = 0;
2462 mpd->b_blocknr = 0;
2463 }
2464
2465 mpd->next_page = page->index + 1;
2466 logical = (sector_t) page->index <<
2467 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2468
2469 if (!page_has_buffers(page)) {
2470 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2471 (1 << BH_Dirty) | (1 << BH_Uptodate));
2472 if (mpd->io_done)
2473 return MPAGE_DA_EXTENT_TAIL;
2474 } else {
2475 /*
2476 * Page with regular buffer heads, just add all dirty ones
2477 */
2478 head = page_buffers(page);
2479 bh = head;
2480 do {
2481 BUG_ON(buffer_locked(bh));
2482 /*
2483 * We need to try to allocate
2484 * unmapped blocks in the same page.
2485 * Otherwise we won't make progress
2486 * with the page in ext4_writepage
2487 */
2488 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2489 mpage_add_bh_to_extent(mpd, logical,
2490 bh->b_size,
2491 bh->b_state);
2492 if (mpd->io_done)
2493 return MPAGE_DA_EXTENT_TAIL;
2494 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2495 /*
2496 * mapped dirty buffer. We need to update
2497 * the b_state because we look at
2498 * b_state in mpage_da_map_blocks. We don't
2499 * update b_size because if we find an
2500 * unmapped buffer_head later we need to
2501 * use the b_state flag of that buffer_head.
2502 */
2503 if (mpd->b_size == 0)
2504 mpd->b_state = bh->b_state & BH_FLAGS;
2505 }
2506 logical++;
2507 } while ((bh = bh->b_this_page) != head);
2508 }
2509
2510 return 0;
2511}
2512
2513/*
2514 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2515 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2516 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2550,8 +2478,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2552 /* 2480 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2482 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2484 if (ret)
@@ -2583,7 +2510,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2510/*
2584 * This function is used as a standard get_block_t calback function 2511 * This function is used as a standard get_block_t calback function
2585 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2588 * 2515 *
2589 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2550,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2550 int ret = 0;
2624 int err; 2551 int err;
2625 2552
2553 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2661,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2661 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2662 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2663 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2664 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2665 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2666 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2667 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2700,84 +2628,57 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2628static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2702{ 2630{
2703 int ret = 0; 2631 int ret = 0, commit_write = 0;
2704 loff_t size; 2632 loff_t size;
2705 unsigned int len; 2633 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2707 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2708 2636
2709 trace_ext4_writepage(inode, page); 2637 trace_ext4_writepage(page);
2710 size = i_size_read(inode); 2638 size = i_size_read(inode);
2711 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2712 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2713 else 2641 else
2714 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2715 2643
2716 if (page_has_buffers(page)) { 2644 /*
2717 page_bufs = page_buffers(page); 2645 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2646 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2647 * fails, redirty the page and move on.
2720 /* 2648 */
2721 * We don't want to do block allocation 2649 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2650 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2651 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2652 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2654 unlock_page(page);
2730 return 0; 2655 return 0;
2731 } 2656 }
2732 } else { 2657 commit_write = 1;
2658 }
2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) {
2733 /* 2662 /*
2734 * The test for page_has_buffers() is subtle: 2663 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2664 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2665 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2666 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2667 */
2746 ret = block_prepare_write(page, 0, len, 2668 goto redirty_page;
2747 noalloc_get_block_write); 2669 }
2748 if (!ret) { 2670 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redity the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2769 }
2770 2673
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2675 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2775 */ 2678 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2778 }
2779 2680
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
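The rewrite collapses three separate redirty-and-return blocks into a single
label, and ClearPageChecked() moves into __ext4_journalled_writepage() itself
(see the earlier hunk). Buffers are created on demand with
__block_write_begin(), handed a get_block callback that never allocates, and
any page still holding delayed or unwritten buffers is simply redirtied for a
later pass. In outline (label placement rearranged for readability):

    if (!page_has_buffers(page)) {
            if (__block_write_begin(page, 0, len, noalloc_get_block_write))
                    goto redirty_page;      /* can't map without allocating */
            commit_write = 1;
    }
    page_bufs = page_buffers(page);
    if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                          ext4_bh_delay_or_unwritten))
            goto redirty_page;              /* needs allocation; not here */
    if (commit_write)
            block_commit_write(page, 0, len); /* dirty + uptodate buffers */
    if (PageChecked(page) && ext4_should_journal_data(inode))
            return __ext4_journalled_writepage(page, len);
    /* ... otherwise submit via the endio-aware or plain writeout path ... */

    redirty_page:
            redirty_page_for_writepage(wbc, page);
            unlock_page(page);
            return 0;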
@@ -2790,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2790 2691
2791/* 2692/*
2792 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2793 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2794 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2795 * ext4_da_writpeages() will loop calling this before 2696 * ext4_da_writpeages() will loop calling this before
2796 * the block allocation. 2697 * the block allocation.
@@ -2815,37 +2716,42 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2815 2716
2816/* 2717/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2819 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2820 * 2721 * and then write them.
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */ 2722 */
2825static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index)
2828{ 2727{
2829 int ret = 0; 2728 struct buffer_head *bh, *head;
2830 int done = 0; 2729 struct inode *inode = mapping->host;
2831 struct pagevec pvec; 2730 struct pagevec pvec;
2832 int nr_pages; 2731 unsigned int nr_pages;
2833 pgoff_t index; 2732 sector_t logical;
2834 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2835 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2836 2735 int i, tag, ret = 0;
2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2837 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2743
2841 while (!done && (index <= end)) { 2744 if (wbc->sync_mode == WB_SYNC_ALL)
2842 int i; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else
2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2749 *done_index = index;
2845 PAGECACHE_TAG_DIRTY, 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2753 if (nr_pages == 0)
2848 break; 2754 return 0;
2849 2755
2850 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2857,58 +2763,98 @@ static int write_cache_pages_da(struct address_space *mapping,
2857 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2858 * because we have a reference on the page. 2764 * because we have a reference on the page.
2859 */ 2765 */
2860 if (page->index > end) { 2766 if (page->index > end)
2861 done = 1; 2767 goto out;
2862 break; 2768
2769 *done_index = page->index + 1;
2770
2771 /*
2772 * If we can't merge this page, and we have
2773 * accumulated an contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2863 } 2779 }
2864 2780
2865 lock_page(page); 2781 lock_page(page);
2866 2782
2867 /* 2783 /*
2868 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2869 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to inode we
2870 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2871 * real expectation of this data interity operation 2787 * truncated or invalidated), or the page is
2872 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2873 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2874 */ 2790 */
2875 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2876continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2877 unlock_page(page); 2795 unlock_page(page);
2878 continue; 2796 continue;
2879 } 2797 }
2880 2798
2881 if (!PageDirty(page)) { 2799 wait_on_page_writeback(page);
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896 2801
2897 ret = __mpage_da_writepage(page, wbc, mpd); 2802 if (mpd->next_page != page->index)
2898 if (unlikely(ret)) { 2803 mpd->first_page = page->index;
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2804 mpd->next_page = page->index + 1;
2900 unlock_page(page); 2805 logical = (sector_t) page->index <<
2901 ret = 0; 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2902 } else { 2807
2903 done = 1; 2808 if (!page_has_buffers(page)) {
2904 break; 2809 mpage_add_bh_to_extent(mpd, logical,
2905 } 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done)
2813 goto ret_extent_tail;
2814 } else {
2815 /*
2816 * Page with regular buffer heads,
2817 * just add all dirty ones
2818 */
2819 head = page_buffers(page);
2820 bh = head;
2821 do {
2822 BUG_ON(buffer_locked(bh));
2823 /*
2824 * We need to try to allocate
2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage
2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size,
2832 bh->b_state);
2833 if (mpd->io_done)
2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /*
2837 * mapped dirty buffer. We need
2838 * to update the b_state
2839 * because we look at b_state
2840 * in mpage_da_map_blocks. We
2841 * don't update b_size because
2842 * if we find an unmapped
2843 * buffer_head later we need to
2844 * use the b_state flag of that
2845 * buffer_head.
2846 */
2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 }
2850 logical++;
2851 } while ((bh = bh->b_this_page) != head);
2906 } 2852 }
2907 2853
2908 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2909 nr_to_write--; 2855 nr_to_write--;
2910 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) { 2857 wbc->sync_mode == WB_SYNC_NONE)
2912 /* 2858 /*
2913 * We stop writing back only if we are 2859 * We stop writing back only if we are
2914 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
@@ -2919,14 +2865,18 @@ continue_unlock:
2919 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2920 * old dirty pages. 2866 * old dirty pages.
2921 */ 2867 */
2922 done = 1; 2868 goto out;
2923 break;
2924 }
2925 } 2869 }
2926 } 2870 }
2927 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2928 cond_resched(); 2872 cond_resched();
2929 } 2873 }
2874 return 0;
2875ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL;
2877out:
2878 pagevec_release(&pvec);
2879 cond_resched();
2930 return ret; 2880 return ret;
2931} 2881}
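Two behavioural points in this fork of write_cache_pages(): the radix-tree
lookup tag now depends on the sync mode (WB_SYNC_ALL walks pages pre-tagged
TOWRITE, so pages dirtied during the sweep are not dragged in), and the skip
tests are consolidated into one condition. Directly from the hunk:

    if (wbc->sync_mode == WB_SYNC_ALL)
            tag = PAGECACHE_TAG_TOWRITE;
    else
            tag = PAGECACHE_TAG_DIRTY;
    /* ... for each page found by pagevec_lookup_tag() ... */
    lock_page(page);
    if (!PageDirty(page) ||                   /* someone wrote it for us */
        (PageWriteback(page) &&
         (wbc->sync_mode == WB_SYNC_NONE)) || /* best effort: don't wait */
        unlikely(page->mapping != mapping)) { /* truncated or invalidated */
            unlock_page(page);
            continue;
    }
    wait_on_page_writeback(page);             /* integrity sync waits */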
2932 2882
@@ -2940,13 +2890,14 @@ static int ext4_da_writepages(struct address_space *mapping,
2940 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2941 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2942 int pages_written = 0; 2892 int pages_written = 0;
2943 long pages_skipped;
2944 unsigned int max_pages; 2893 unsigned int max_pages;
2945 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2946 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2947 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0;
2900 pgoff_t end;
2950 2901
2951 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2952 2903
@@ -2982,8 +2933,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2985 } else 2936 end = -1;
2937 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 }
2987 2941
2988 /* 2942 /*
2989 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
@@ -3002,9 +2956,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 2956 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 2957 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 2959 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 2960 if (wbc->nr_to_write == LONG_MAX)
3007 else 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 2966 max_pages);
3010 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
@@ -3015,12 +2972,10 @@ static int ext4_da_writepages(struct address_space *mapping,
3015 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
3016 } 2973 }
3017 2974
3018 mpd.wbc = wbc;
3019 mpd.inode = mapping->host;
3020
3021 pages_skipped = wbc->pages_skipped;
3022
3023retry: 2975retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL)
2977 tag_pages_for_writeback(mapping, index, end);
2978
3024 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
3025 2980
3026 /* 2981 /*
@@ -3043,32 +2998,18 @@ retry:
3043 } 2998 }
3044 2999
3045 /* 3000 /*
3046 * Now call __mpage_da_writepage to find the next 3001 * Now call write_cache_pages_da() to find the next
3047 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3048 * blocks to be allocated by ext4. We don't actually 3003 * blocks to be allocated by ext4 and submit them.
3049 * submit the blocks for I/O here, even though
3050 * write_cache_pages thinks it will, and will set the
3051 * pages as clean for write before calling
3052 * __mpage_da_writepage().
3053 */ 3004 */
3054 mpd.b_size = 0; 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3055 mpd.b_state = 0;
3056 mpd.b_blocknr = 0;
3057 mpd.first_page = 0;
3058 mpd.next_page = 0;
3059 mpd.io_done = 0;
3060 mpd.pages_written = 0;
3061 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
3063 /* 3006 /*
3064 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3009 * them for I/O.
3067 */ 3010 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3012 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3014 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
@@ -3082,7 +3023,6 @@ retry:
3082 * and try again 3023 * and try again
3083 */ 3024 */
3084 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3085 wbc->pages_skipped = pages_skipped;
3086 ret = 0; 3026 ret = 0;
3087 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3088 /* 3028 /*
@@ -3090,7 +3030,6 @@ retry:
3090 * rest of the pages 3030 * rest of the pages
3091 */ 3031 */
3092 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3093 wbc->pages_skipped = pages_skipped;
3094 ret = 0; 3033 ret = 0;
3095 io_done = 1; 3034 io_done = 1;
3096 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
@@ -3108,21 +3047,15 @@ retry:
3108 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3109 goto retry; 3048 goto retry;
3110 } 3049 }
3111 if (pages_skipped != wbc->pages_skipped)
3112 ext4_msg(inode->i_sb, KERN_CRIT,
3113 "This should not happen leaving %s "
3114 "with nr_to_write = %ld ret = %d",
3115 __func__, wbc->nr_to_write, ret);
3116 3050
3117 /* Update index */ 3051 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3054 /*
3122 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3056 * mode will write it back later
3124 */ 3057 */
3125 mapping->writeback_index = index; 3058 mapping->writeback_index = done_index;
3126 3059
3127out_writepages: 3060out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
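Two related changes close the loop with the tagged walker above: in
data-integrity mode the dirty pages are tagged once with
tag_pages_for_writeback() before the transaction loop, pairing with the
PAGECACHE_TAG_TOWRITE lookup in write_cache_pages_da(), and the cyclic
writeback resume point is now the done_index reported back by the walker
rather than a locally maintained page count. Schematically:

    retry:
            if (wbc->sync_mode == WB_SYNC_ALL)
                    tag_pages_for_writeback(mapping, index, end);
            while (!ret && wbc->nr_to_write > 0) {
                    /* start a transaction, then walk and submit pages */
                    ret = write_cache_pages_da(mapping, wbc, &mpd,
                                               &done_index);
                    /* map any trailing extent; on ENOSPC force a commit
                     * and retry */
            }
            /* ... */
            if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                    mapping->writeback_index = done_index;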
@@ -3367,10 +3300,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3300 * doing I/O at all.
3368 * 3301 *
3369 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3373 * simplifying them becuase we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3374 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3375 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3376 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
@@ -3447,6 +3380,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3447 3380
3448static int ext4_readpage(struct file *file, struct page *page) 3381static int ext4_readpage(struct file *file, struct page *page)
3449{ 3382{
3383 trace_ext4_readpage(page);
3450 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3451} 3385}
3452 3386
@@ -3457,15 +3391,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3392}
3459 3393
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3395{
3471 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
@@ -3490,6 +3415,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3490{ 3415{
3491 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3492 3417
3418 trace_ext4_invalidatepage(page, offset);
3419
3493 /* 3420 /*
3494 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3495 */ 3422 */
@@ -3511,6 +3438,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3511{ 3438{
3512 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3513 3440
3441 trace_ext4_releasepage(page);
3442
3514 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3515 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3516 return 0; 3445 return 0;
@@ -3582,7 +3511,7 @@ retry:
3582 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3583 3512
3584 if (end > isize) 3513 if (end > isize)
3585 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3586 } 3515 }
3587 } 3516 }
3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3642,173 +3571,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3572}
3644 3573
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten "
3696 "extents to written extents, error is %d, "
3697 "io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to written extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns 0 on success and a negative error code on failure.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already
3769 * be about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure valid here after the
3774 * conversion has finished. The io structure has a flag to
3775 * avoid double converting from both fsync and background work
3776 * queue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
3789
3790static ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
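The io_end helpers deleted above (ext4_free_io_end, ext4_end_io_nolock, ext4_end_io_work, flush_completed_IO, ext4_init_io_end) are not simply dropped; in this series they appear to move to fs/ext4/page-io.c. The locking shape flush_completed_IO() used is still worth noting: the spinlock is released around the blocking conversion and retaken before touching the list again. A user-space pthread analogue, all names illustrative:

    #include <pthread.h>
    #include <stddef.h>

    struct io_end_sketch {
            struct io_end_sketch *next;
            int (*convert)(struct io_end_sketch *);  /* may block */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct io_end_sketch *completed_list;

    static int flush_completed(void)
    {
            int err, ret = 0;

            pthread_mutex_lock(&list_lock);
            while (completed_list) {
                    struct io_end_sketch *io = completed_list;
                    struct io_end_sketch **pp;

                    /* Drop the lock around the blocking conversion; new
                     * completions may be appended to the list meanwhile. */
                    pthread_mutex_unlock(&list_lock);
                    err = io->convert(io);
                    pthread_mutex_lock(&list_lock);
                    if (err < 0) {
                            ret = err;
                            break;  /* leave the entry in place for a retry */
                    }
                    /* unlink io wherever it sits now */
                    for (pp = &completed_list; *pp && *pp != io; )
                            pp = &(*pp)->next;
                    if (*pp)
                            *pp = io->next;
            }
            pthread_mutex_unlock(&list_lock);
            return ret;
    }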
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3814 bool is_async) 3576 bool is_async)
@@ -3828,7 +3590,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3590 size);
3829 3591
3830 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3595 iocb->private = NULL;
3834out: 3596out:
@@ -3845,14 +3607,14 @@ out:
3845 } 3607 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3609
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list */ 3610 /* Add the io_end to per-inode completed aio dio list */
3852 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615
3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3618 iocb->private = NULL;
3857} 3619}
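The reorder in the hunk above follows the publish-then-wake rule: once queue_work() has run, the worker may execute immediately on another CPU and consume (and free) the io_end, so linking it into the per-inode list afterwards would race with that free. The same rule as a minimal pthread sketch, names illustrative:

    #include <pthread.h>
    #include <stddef.h>

    struct work_item {
            struct work_item *next;
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t kicked = PTHREAD_COND_INITIALIZER;
    static struct work_item *pending;

    static void submit(struct work_item *w)
    {
            pthread_mutex_lock(&lock);
            w->next = pending;
            pending = w;                    /* 1: publish on the list */
            pthread_mutex_unlock(&lock);
            pthread_cond_signal(&kicked);   /* 2: only then wake the worker */
    }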
3858 3620
@@ -3873,7 +3635,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3635 goto out;
3874 } 3636 }
3875 3637
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3639 inode = io_end->inode;
3878 3640
3879 /* Add the io_end to per-inode completed io list*/ 3641 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3663,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3663retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3665 if (!io_end) {
3904 if (printk_ratelimit()) 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3667 schedule();
3907 goto retry; 3668 goto retry;
3908 } 3669 }
@@ -3926,13 +3687,13 @@ retry:
3926 * preallocated extents, and those writes extend the file, no need to 3687 * preallocated extents, and those writes extend the file, no need to
3927 * fall back to buffered IO. 3688 * fall back to buffered IO.
3928 * 3689 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3690 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3692 * still keep the range to write as uninitialized.
3932 * 3693 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3694 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3695 * For async direct IO, since the IO may still be pending when we return, we
3935 * set up an end_io callback function, which will do the conversion 3696 * set up an end_io callback function, which will do the conversion
3936 * when the async direct IO is completed. 3697 * when the async direct IO is completed.
3937 * 3698 *
3938 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
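The comment block above describes a small extent state machine. A purely illustrative sketch of it:

    enum extent_state { EXT_HOLE, EXT_UNWRITTEN, EXT_WRITTEN };

    /* DIO into a hole first allocates blocks but leaves them unwritten,
     * so a racing buffered read sees zeroes rather than stale disk data. */
    static enum extent_state dio_write_begin(enum extent_state s)
    {
            return s == EXT_HOLE ? EXT_UNWRITTEN : s;
    }

    /* Only when the data IO has completed (synchronously, or in the
     * end_io callback for AIO) does the extent become written. */
    static enum extent_state dio_write_complete(enum extent_state s)
    {
            return s == EXT_UNWRITTEN ? EXT_WRITTEN : s;
    }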
@@ -3955,7 +3716,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3955 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3956 * 3717 *
3957 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3958 * to prevent a parallel buffered read from exposing the stale data 3719 * to prevent a parallel buffered read from exposing the stale data
3959 * before DIO complete the data IO. 3720 * before DIO complete the data IO.
3960 * 3721 *
3961 * As to previously fallocated extents, ext4 get_block 3722 * As to previously fallocated extents, ext4 get_block
@@ -4016,7 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
4016 int err; 3777 int err;
4017 /* 3778 /*
4018 * for non AIO case, since the IO is already 3779 * for non AIO case, since the IO is already
4019 * completed, we could do the convertion right here 3780 * completed, we could do the conversion right here
4020 */ 3781 */
4021 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
4022 offset, ret); 3783 offset, ret);
@@ -4037,11 +3798,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
4037{ 3798{
4038 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
4039 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret;
4040 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
4043 3806 else
4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret;
4045} 3811}
4046 3812
4047/* 3813/*
@@ -4067,7 +3833,6 @@ static const struct address_space_operations ext4_ordered_aops = {
4067 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
4068 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
4069 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
4070 .sync_page = block_sync_page,
4071 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
4072 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
4073 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
@@ -4083,7 +3848,6 @@ static const struct address_space_operations ext4_writeback_aops = {
4083 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
4084 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
4085 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
4086 .sync_page = block_sync_page,
4087 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
4088 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
4089 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
@@ -4099,7 +3863,6 @@ static const struct address_space_operations ext4_journalled_aops = {
4099 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
4100 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
4101 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
4102 .sync_page = block_sync_page,
4103 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
4104 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
4105 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -4115,7 +3878,6 @@ static const struct address_space_operations ext4_da_aops = {
4115 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
4116 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
4117 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
4118 .sync_page = block_sync_page,
4119 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
4120 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
4121 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
@@ -4152,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
4152int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
4153 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
4154{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
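The new wrapper reduces ext4_block_truncate_page() to "zero from 'from' to the end of its block"; the length arithmetic relies on the block size being a power of two. A small self-contained check of that arithmetic, with sample numbers only:

    #include <assert.h>

    static unsigned tail_len(unsigned long long from, unsigned blocksize)
    {
            /* bytes from 'from' to the end of the block containing it */
            return blocksize - (from & (blocksize - 1));
    }

    int main(void)
    {
            assert(tail_len(10000, 4096) == 2288); /* zeroes 10000..12287 */
            assert(tail_len(8192, 4096) == 4096);  /* aligned: whole block */
            return 0;
    }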
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block, it will be shortened to the end of the block
3933 * that corresponds to 'from'.
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
4155 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4156 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
4157 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
4158 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
4159 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
4160 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -4167,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
4167 return -EINVAL; 3950 return -EINVAL;
4168 3951
4169 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
4170 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4172 3963
4173 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4226,7 +4017,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4019 } else {
4229 if (ext4_should_order_data(inode)) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4232 } 4023 }
@@ -4262,7 +4053,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4262 * 4053 *
4263 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4264 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4265 * partially truncated if some data below the new i_size is refered 4056 * partially truncated if some data below the new i_size is referred
4266 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4267 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4268 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
@@ -4341,6 +4132,9 @@ no_top:
4341 * 4132 *
4342 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4343 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 *
4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error.
4344 */ 4138 */
4345static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4346 struct buffer_head *bh, 4140 struct buffer_head *bh,
@@ -4350,6 +4144,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4144{
4351 __le32 *p; 4145 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err;
4353 4148
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,22 +4160,33 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4161 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err))
4165 goto out_err;
4369 } 4166 }
4370 ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode, 4168 if (unlikely(err))
4372 blocks_for_truncate(inode)); 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode));
4172 if (unlikely(err))
4173 goto out_err;
4373 if (bh) { 4174 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err))
4178 goto out_err;
4376 } 4179 }
4377 } 4180 }
4378 4181
4379 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4380 *p = 0; 4183 *p = 0;
4381 4184
4382 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4383 return 0; 4186 return 0;
4187out_err:
4188 ext4_std_error(inode->i_sb, err);
4189 return err;
4384} 4190}
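The rewritten ext4_clear_blocks() funnels every failing journal call to a single out_err label, so the first error is both reported via ext4_std_error() and returned to the caller. The shape of that pattern, reduced to a sketch with hypothetical step functions:

    static int do_steps(int (*a)(void), int (*b)(void), int (*c)(void))
    {
            int err;

            err = a();
            if (err)
                    goto out_err;
            err = b();
            if (err)
                    goto out_err;
            err = c();
            if (err)
                    goto out_err;
            return 0;
    out_err:
            /* single exit: report once, return the first failure */
            return err;
    }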
4385 4191
4386/** 4192/**
@@ -4391,7 +4197,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4391 * @first: array of block numbers 4197 * @first: array of block numbers
4392 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4393 * 4199 *
4394 * We are freeing all blocks refered from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4395 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4396 * 4202 *
4397 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4414,7 +4220,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4414 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4415 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4416 for current block */ 4222 for current block */
4417 int err; 4223 int err = 0;
4418 4224
4419 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4420 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4436,9 +4242,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4436 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4437 count++; 4243 count++;
4438 } else { 4244 } else {
4439 if (ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4440 block_to_free, count, 4246 block_to_free, count,
4441 block_to_free_p, p)) 4247 block_to_free_p, p);
4248 if (err)
4442 break; 4249 break;
4443 block_to_free = nr; 4250 block_to_free = nr;
4444 block_to_free_p = p; 4251 block_to_free_p = p;
@@ -4447,9 +4254,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4447 } 4254 }
4448 } 4255 }
4449 4256
4450 if (count > 0) 4257 if (!err && count > 0)
4451 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4452 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0)
4261 /* fatal error */
4262 return;
4453 4263
4454 if (this_bh) { 4264 if (this_bh) {
4455 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4479,7 +4289,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4479 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4480 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4481 * 4291 *
4482 * We are freeing all blocks refered from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4483 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4484 * appropriately. 4294 * appropriately.
4485 */ 4295 */
@@ -4530,6 +4340,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4342 depth);
4343 brelse(bh);
4533 4344
4534 /* 4345 /*
4535 * Everything below this pointer has been 4346 * Everything below this pointer has been
@@ -4566,7 +4377,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4566 * transaction where the data blocks are 4377 * transaction where the data blocks are
4567 * actually freed. 4378 * actually freed.
4568 */ 4379 */
4569 ext4_free_blocks(handle, inode, 0, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4570 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4572 4383
@@ -4596,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4596 4407
4597int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4598{ 4409{
4599 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4600 return 0;
4601 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4602 return 1; 4411 return 1;
4603 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4608,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4608} 4417}
4609 4418
4610/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
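From user space, the new ext4_punch_hole() is reached through fallocate(2) with FALLOC_FL_PUNCH_HOLE, which must be combined with FALLOC_FL_KEEP_SIZE; at this point only extent-mapped files are supported, so anything else fails. A minimal caller, with an assumed test file name:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/falloc.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR);      /* assumed test file */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* punch a 1 MiB hole at offset 4 MiB, keeping i_size */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          4 << 20, 1 << 20))
                    perror("fallocate");    /* e.g. indirect-mapped file */
            close(fd);
            return 0;
    }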
4444/*
4611 * ext4_truncate() 4445 * ext4_truncate()
4612 * 4446 *
4613 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4646,10 +4480,12 @@ void ext4_truncate(struct inode *inode)
4646 Indirect chain[4]; 4480 Indirect chain[4];
4647 Indirect *partial; 4481 Indirect *partial;
4648 __le32 nr = 0; 4482 __le32 nr = 0;
4649 int n; 4483 int n = 0;
4650 ext4_lblk_t last_block; 4484 ext4_lblk_t last_block, max_block;
4651 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4652 4486
4487 trace_ext4_truncate_enter(inode);
4488
4653 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4654 return; 4490 return;
4655 4491
@@ -4660,6 +4496,7 @@ void ext4_truncate(struct inode *inode)
4660 4496
4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4662 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode);
4663 return; 4500 return;
4664 } 4501 }
4665 4502
@@ -4669,14 +4506,18 @@ void ext4_truncate(struct inode *inode)
4669 4506
4670 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4671 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4672 4511
4673 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4674 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4675 goto out_stop; 4514 goto out_stop;
4676 4515
4677 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4516 if (last_block != max_block) {
4678 if (n == 0) 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4679 goto out_stop; /* error */ 4518 if (n == 0)
4519 goto out_stop; /* error */
4520 }
4680 4521
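last_block and max_block above are both round-up divisions by the block size; when the file already ends at the indirect-addressing limit there is no data block left to free. Worked numbers for the rounding, illustrative only:

    #include <assert.h>

    static unsigned long blocks_covering(unsigned long long size,
                                         unsigned blkbits)
    {
            /* (size + blocksize - 1) >> blkbits */
            return (size + (1ULL << blkbits) - 1) >> blkbits;
    }

    int main(void)
    {
            assert(blocks_covering(10000, 12) == 3); /* 4 KiB blocks */
            assert(blocks_covering(8192, 12) == 2);  /* exact multiple */
            return 0;
    }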
4681 /* 4522 /*
4682 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
@@ -4707,7 +4548,13 @@ void ext4_truncate(struct inode *inode)
4707 */ 4548 */
4708 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4709 4550
4710 if (n == 1) { /* direct blocks */ 4551 if (last_block == max_block) {
4552 /*
4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit.
4555 */
4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */
4711 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4712 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4713 goto do_indirects; 4560 goto do_indirects;
@@ -4767,6 +4614,7 @@ do_indirects:
4767 ; 4614 ;
4768 } 4615 }
4769 4616
4617out_unlock:
4770 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4771 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4772 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
@@ -4789,6 +4637,7 @@ out_stop:
4789 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4790 4638
4791 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode);
4792} 4641}
4793 4642
4794/* 4643/*
@@ -4818,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4818 /* 4667 /*
4819 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4820 */ 4669 */
4821 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4822 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4823 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4824 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -4920,6 +4769,7 @@ make_io:
4920 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4921 * Read the block from disk. 4770 * Read the block from disk.
4922 */ 4771 */
4772 trace_ext4_load_inode(inode);
4923 get_bh(bh); 4773 get_bh(bh);
4924 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4925 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
@@ -5025,7 +4875,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5025 return inode; 4875 return inode;
5026 4876
5027 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
5028 iloc.bh = 0; 4878 iloc.bh = NULL;
5029 4879
5030 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
5031 if (ret < 0) 4881 if (ret < 0)
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5510,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5510 5361
5511 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5512 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5513 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5515 handle_t *handle; 5365 handle_t *handle;
5516 5366
5517 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5519,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5369 error = PTR_ERR(handle);
5520 goto err_out; 5370 goto err_out;
5521 } 5371 }
5522 5372 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5373 error = ext4_orphan_add(handle, inode);
5374 orphan = 1;
5375 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5376 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5377 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5378 if (!error)
@@ -5538,18 +5390,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5390 goto err_out;
5539 } 5391 }
5540 ext4_orphan_del(handle, inode); 5392 ext4_orphan_del(handle, inode);
5393 orphan = 0;
5541 ext4_journal_stop(handle); 5394 ext4_journal_stop(handle);
5542 goto err_out; 5395 goto err_out;
5543 } 5396 }
5544 } 5397 }
5545 /* ext4_truncate will clear the flag */
5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5547 ext4_truncate(inode);
5548 } 5398 }
5549 5399
5550 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5551 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5552 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5553 5407
5554 if (!rc) { 5408 if (!rc) {
5555 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
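stat->blocks is counted in 512-byte sectors, hence the shift left by the block-size bits and right by 9 in the line above. Sample arithmetic with illustrative values:

    #include <assert.h>

    static unsigned long long delalloc_sectors(unsigned long blocks,
                                               unsigned blkbits)
    {
            return ((unsigned long long)blocks << blkbits) >> 9;
    }

    int main(void)
    {
            /* ten delayed-allocation 4 KiB blocks = 80 sectors */
            assert(delalloc_sectors(10, 12) == 80);
            return 0;
    }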
@@ -5608,13 +5460,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5608 /* if nrblocks are contiguous */ 5460 /* if nrblocks are contiguous */
5609 if (chunk) { 5461 if (chunk) {
5610 /* 5462 /*
5611 * With N contiguous data blocks, it need at most 5463 * With N contiguous data blocks, we need at most
5612 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5613 * 2 dindirect blocks 5465 * 2 dindirect blocks, and 1 tindirect block
5614 * 1 tindirect block
5615 */ 5466 */
5616 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5467 return DIV_ROUND_UP(nrblocks,
5617 return indirects + 3; 5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5618 } 5469 }
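The contiguous case now uses DIV_ROUND_UP and budgets one extra indirect block for misalignment on top of the 2 dindirect and 1 tindirect blocks. With illustrative numbers (4 KiB blocks, so 1024 block pointers per indirect block):

    #include <assert.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned addr_per_block = 4096 / 4;     /* 1024 pointers */
            unsigned nrblocks = 1500;

            /* ceil(1500/1024) = 2 indirect blocks, +1 for misalignment,
             * +2 dindirect, +1 tindirect = 6 metadata blocks */
            assert(DIV_ROUND_UP(nrblocks, addr_per_block) + 4 == 6);
            return 0;
    }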
5619 /* 5470 /*
5620 * if nrblocks are not contiguous, worst case, each block touches 5471 * if nrblocks are not contiguous, worst case, each block touches
@@ -5643,7 +5494,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5494 *
5644 * Also account for superblock, inode, quota and xattr blocks 5495 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5496 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5497static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5498{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5500 int gdpblocks;
@@ -5688,7 +5539,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5688} 5539}
5689 5540
5690/* 5541/*
5691 * Calulate the total number of credits to reserve to fit 5542 * Calculate the total number of credits to reserve to fit
5692 * the modification of a single page into a single transaction, 5543 * the modification of a single page into a single transaction,
5693 * which may include multiple chunks of block allocations. 5544 * which may include multiple chunks of block allocations.
5694 * 5545 *
@@ -5831,6 +5682,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5682 int err, ret;
5832 5683
5833 might_sleep(); 5684 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5686 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5687 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
@@ -5881,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5881 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5882 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5883 */ 5735 */
5884void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5885{ 5737{
5886 handle_t *handle; 5738 handle_t *handle;
5887 5739
@@ -6009,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6009 goto out_unlock; 5861 goto out_unlock;
6010 } 5862 }
6011 ret = 0; 5863 ret = 0;
6012 if (PageMappedToDisk(page)) 5864
6013 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
6014 5871
6015 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
6016 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
6017 else 5874 else
6018 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
6019 5876
6020 lock_page(page);
6021 /* 5877 /*
6022 * return if we have all the buffers mapped. This avoids 5878 * return if we have all the buffers mapped. This avoids
6023 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -6027,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6027 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
6028 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
6029 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
6030 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
6031 goto out_unlock; 5887 return VM_FAULT_LOCKED;
6032 } 5888 }
6033 } 5889 }
6034 unlock_page(page); 5890 unlock_page(page);
@@ -6048,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6048 if (ret < 0) 5904 if (ret < 0)
6049 goto out_unlock; 5905 goto out_unlock;
6050 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
6051out_unlock: 5917out_unlock:
6052 if (ret) 5918 if (ret)
6053 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
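The page_mkwrite() rewrite returns VM_FAULT_LOCKED with the page still locked after waiting out writeback, and re-takes the lock after write_begin/write_end precisely because IO could have started on the freshly dirtied page in between. The same prepare-then-recheck-under-lock shape as a pthread sketch, names illustrative:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t writeback_done = PTHREAD_COND_INITIALIZER;
    static bool under_writeback;

    /* Returns with page_lock held, like returning VM_FAULT_LOCKED. */
    static void make_page_writable(void (*prepare)(void))
    {
            prepare();      /* write_begin/write_end analogue; unlocked */

            pthread_mutex_lock(&page_lock);
            while (under_writeback)         /* IO may have raced in */
                    pthread_cond_wait(&writeback_done, &page_lock);
            /* caller now owns a stable, locked page */
    }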