path: root/fs/ext4/inode.c
author    Theodore Ts'o <tytso@mit.edu>    2010-10-27 23:44:47 -0400
committer Theodore Ts'o <tytso@mit.edu>    2010-10-27 23:44:47 -0400
commit    a107e5a3a473a2ea62bd5af24e11b84adf1486ff (patch)
tree      d36c2cb38d8be88d4d75cdebc354aa140aa0e470 /fs/ext4/inode.c
parent    e3e1288e86a07cdeb0aee5860a2dff111c6eff79 (diff)
parent    a269029d0e2192046be4c07ed78a45022469ee4c (diff)
Merge branch 'next' into upstream-merge
Conflicts:
    fs/ext4/inode.c
    fs/ext4/mballoc.c
    include/trace/events/ext4.h
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c  587
1 files changed, 204 insertions, 383 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 49635ef236f8..2d6c6c8c036d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -755,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
          * parent to disk.
          */
         bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+        if (unlikely(!bh)) {
+            err = -EIO;
+            goto failed;
+        }
+
         branch[n].bh = bh;
         lock_buffer(bh);
         BUFFER_TRACE(bh, "call get_create_access");
@@ -1207,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                 break;
             idx++;
             num++;
-            if (num >= max_pages)
+            if (num >= max_pages) {
+                done = 1;
                 break;
+            }
         }
         pagevec_release(&pvec);
     }
@@ -1995,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                              struct ext4_map_blocks *map)
 {
-    long pages_skipped;
     struct pagevec pvec;
     unsigned long index, end;
     int ret = 0, err, nr_pages, i;
     struct inode *inode = mpd->inode;
     struct address_space *mapping = inode->i_mapping;
+    loff_t size = i_size_read(inode);
+    unsigned int len, block_start;
+    struct buffer_head *bh, *page_bufs = NULL;
+    int journal_data = ext4_should_journal_data(inode);
+    sector_t pblock = 0, cur_logical = 0;
+    struct ext4_io_submit io_submit;
 
     BUG_ON(mpd->next_page <= mpd->first_page);
+    memset(&io_submit, 0, sizeof(io_submit));
     /*
      * We need to start from the first_page to the next_page - 1
      * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
         if (nr_pages == 0)
             break;
         for (i = 0; i < nr_pages; i++) {
+            int commit_write = 0, redirty_page = 0;
             struct page *page = pvec.pages[i];
 
             index = page->index;
             if (index > end)
                 break;
+
+            if (index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+            else
+                len = PAGE_CACHE_SIZE;
+            if (map) {
+                cur_logical = index << (PAGE_CACHE_SHIFT -
+                                        inode->i_blkbits);
+                pblock = map->m_pblk + (cur_logical -
+                                        map->m_lblk);
+            }
             index++;
 
             BUG_ON(!PageLocked(page));
             BUG_ON(PageWriteback(page));
 
-            pages_skipped = mpd->wbc->pages_skipped;
-            err = mapping->a_ops->writepage(page, mpd->wbc);
-            if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                /*
-                 * have successfully written the page
-                 * without skipping the same
-                 */
-                mpd->pages_written++;
             /*
-             * In error case, we have to continue because
-             * remaining pages are still locked
-             * XXX: unlock and re-dirty them?
+             * If the page does not have buffers (for
+             * whatever reason), try to create them using
+             * __block_write_begin.  If this fails,
+             * redirty the page and move on.
              */
-            if (ret == 0)
-                ret = err;
-        }
-        pagevec_release(&pvec);
-    }
-    return ret;
-}
-
-/*
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                 struct ext4_map_blocks *map)
-{
-    struct inode *inode = mpd->inode;
-    struct address_space *mapping = inode->i_mapping;
-    int blocks = map->m_len;
-    sector_t pblock = map->m_pblk, cur_logical;
-    struct buffer_head *head, *bh;
-    pgoff_t index, end;
-    struct pagevec pvec;
-    int nr_pages, i;
-
-    index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-    end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-    cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-    pagevec_init(&pvec, 0);
-
-    while (index <= end) {
-        /* XXX: optimize tail */
-        nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-        if (nr_pages == 0)
-            break;
-        for (i = 0; i < nr_pages; i++) {
-            struct page *page = pvec.pages[i];
-
-            index = page->index;
-            if (index > end)
-                break;
-            index++;
-
-            BUG_ON(!PageLocked(page));
-            BUG_ON(PageWriteback(page));
-            BUG_ON(!page_has_buffers(page));
-
-            bh = page_buffers(page);
-            head = bh;
-
-            /* skip blocks out of the range */
-            do {
-                if (cur_logical >= map->m_lblk)
-                    break;
-                cur_logical++;
-            } while ((bh = bh->b_this_page) != head);
+            if (!page_has_buffers(page)) {
+                if (__block_write_begin(page, 0, len,
+                                        noalloc_get_block_write)) {
+                redirty_page:
+                    redirty_page_for_writepage(mpd->wbc,
+                                               page);
+                    unlock_page(page);
+                    continue;
+                }
+                commit_write = 1;
+            }
 
+            bh = page_bufs = page_buffers(page);
+            block_start = 0;
             do {
-                if (cur_logical >= map->m_lblk + blocks)
-                    break;
-
-                if (buffer_delay(bh) || buffer_unwritten(bh)) {
-
-                    BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-
+                if (!bh)
+                    goto redirty_page;
+                if (map && (cur_logical >= map->m_lblk) &&
+                    (cur_logical <= (map->m_lblk +
+                                     (map->m_len - 1)))) {
                     if (buffer_delay(bh)) {
                         clear_buffer_delay(bh);
                         bh->b_blocknr = pblock;
-                    } else {
-                        /*
-                         * unwritten already should have
-                         * blocknr assigned. Verify that
-                         */
-                        clear_buffer_unwritten(bh);
-                        BUG_ON(bh->b_blocknr != pblock);
                     }
+                    if (buffer_unwritten(bh) ||
+                        buffer_mapped(bh))
+                        BUG_ON(bh->b_blocknr != pblock);
+                    if (map->m_flags & EXT4_MAP_UNINIT)
+                        set_buffer_uninit(bh);
+                    clear_buffer_unwritten(bh);
+                }
 
-                } else if (buffer_mapped(bh))
-                    BUG_ON(bh->b_blocknr != pblock);
-
-                if (map->m_flags & EXT4_MAP_UNINIT)
-                    set_buffer_uninit(bh);
+                /* redirty page if block allocation undone */
+                if (buffer_delay(bh) || buffer_unwritten(bh))
+                    redirty_page = 1;
+                bh = bh->b_this_page;
+                block_start += bh->b_size;
                 cur_logical++;
                 pblock++;
-            } while ((bh = bh->b_this_page) != head);
+            } while (bh != page_bufs);
+
+            if (redirty_page)
+                goto redirty_page;
+
+            if (commit_write)
+                /* mark the buffer_heads as dirty & uptodate */
+                block_commit_write(page, 0, len);
+
+            /*
+             * Delalloc doesn't support data journalling,
+             * but eventually maybe we'll lift this
+             * restriction.
+             */
+            if (unlikely(journal_data && PageChecked(page)))
+                err = __ext4_journalled_writepage(page, len);
+            else
+                err = ext4_bio_write_page(&io_submit, page,
+                                          len, mpd->wbc);
+
+            if (!err)
+                mpd->pages_written++;
+            /*
+             * In error case, we have to continue because
+             * remaining pages are still locked
+             */
+            if (ret == 0)
+                ret = err;
         }
         pagevec_release(&pvec);
     }
+    ext4_io_submit(&io_submit);
+    return ret;
 }
 
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                           sector_t logical, long blk_cnt)
 {
@@ -2187,35 +2193,32 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ * if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
     int err, blks, get_blocks_flags;
-    struct ext4_map_blocks map;
+    struct ext4_map_blocks map, *mapp = NULL;
     sector_t next = mpd->b_blocknr;
     unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
     loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
     handle_t *handle = NULL;
 
     /*
-     * We consider only non-mapped and non-allocated blocks
-     */
-    if ((mpd->b_state & (1 << BH_Mapped)) &&
-        !(mpd->b_state & (1 << BH_Delay)) &&
-        !(mpd->b_state & (1 << BH_Unwritten)))
-        return 0;
-
-    /*
-     * If we didn't accumulate anything to write simply return
+     * If the blocks are mapped already, or we couldn't accumulate
+     * any blocks, then proceed immediately to the submission stage.
      */
-    if (!mpd->b_size)
-        return 0;
+    if ((mpd->b_size == 0) ||
+        ((mpd->b_state & (1 << BH_Mapped)) &&
+         !(mpd->b_state & (1 << BH_Delay)) &&
+         !(mpd->b_state & (1 << BH_Unwritten))))
+        goto submit_io;
 
     handle = ext4_journal_current_handle();
     BUG_ON(!handle);
@@ -2252,17 +2255,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 
     err = blks;
     /*
-     * If get block returns with error we simply
-     * return. Later writepage will redirty the page and
-     * writepages will find the dirty page again
+     * If get block returns EAGAIN or ENOSPC and there
+     * appears to be free blocks we will call
+     * ext4_writepage() for all of the pages which will
+     * just redirty the pages.
      */
     if (err == -EAGAIN)
-        return 0;
+        goto submit_io;
 
     if (err == -ENOSPC &&
         ext4_count_free_blocks(sb)) {
         mpd->retval = err;
-        return 0;
+        goto submit_io;
     }
 
     /*
@@ -2287,10 +2291,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         /* invalidate all the pages */
         ext4_da_block_invalidatepages(mpd, next,
                 mpd->b_size >> mpd->inode->i_blkbits);
-        return err;
+        return;
     }
     BUG_ON(blks == 0);
 
+    mapp = &map;
     if (map.m_flags & EXT4_MAP_NEW) {
         struct block_device *bdev = mpd->inode->i_sb->s_bdev;
         int i;
@@ -2299,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
             unmap_underlying_metadata(bdev, map.m_pblk + i);
     }
 
-    /*
-     * If blocks are delayed marked, we need to
-     * put actual blocknr and drop delayed bit
-     */
-    if ((mpd->b_state & (1 << BH_Delay)) ||
-        (mpd->b_state & (1 << BH_Unwritten)))
-        mpage_put_bnr_to_bhs(mpd, &map);
-
     if (ext4_should_order_data(mpd->inode)) {
         err = ext4_jbd2_file_inode(handle, mpd->inode);
         if (err)
-            return err;
+            /* This only happens if the journal is aborted */
+            return;
     }
 
     /*
@@ -2321,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
     disksize = i_size_read(mpd->inode);
     if (disksize > EXT4_I(mpd->inode)->i_disksize) {
         ext4_update_i_disksize(mpd->inode, disksize);
-        return ext4_mark_inode_dirty(handle, mpd->inode);
+        err = ext4_mark_inode_dirty(handle, mpd->inode);
+        if (err)
+            ext4_error(mpd->inode->i_sb,
+                       "Failed to mark inode %lu dirty",
+                       mpd->inode->i_ino);
     }
 
-    return 0;
+submit_io:
+    mpage_da_submit_io(mpd, mapp);
+    mpd->io_done = 1;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2405,7 @@ flush_it:
      * We couldn't merge the block to our extent, so we
      * need to flush current extent and start new one
      */
-    if (mpage_da_map_blocks(mpd) == 0)
-        mpage_da_submit_io(mpd);
-    mpd->io_done = 1;
+    mpage_da_map_and_submit(mpd);
     return;
 }
 
@@ -2422,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
  * The function finds extents of pages and scan them for all blocks.
  */
 static int __mpage_da_writepage(struct page *page,
-                                struct writeback_control *wbc, void *data)
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd)
 {
-    struct mpage_da_data *mpd = data;
     struct inode *inode = mpd->inode;
     struct buffer_head *bh, *head;
     sector_t logical;
@@ -2435,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
     if (mpd->next_page != page->index) {
         /*
          * Nope, we can't. So, we map non-allocated blocks
-         * and start IO on them using writepage()
+         * and start IO on them
          */
         if (mpd->next_page != mpd->first_page) {
-            if (mpage_da_map_blocks(mpd) == 0)
-                mpage_da_submit_io(mpd);
+            mpage_da_map_and_submit(mpd);
             /*
              * skip rest of the page in the page_vec
              */
-            mpd->io_done = 1;
             redirty_page_for_writepage(wbc, page);
             unlock_page(page);
             return MPAGE_DA_EXTENT_TAIL;
@@ -2622,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
     int ret = 0;
     int err;
 
+    ClearPageChecked(page);
     page_bufs = page_buffers(page);
     BUG_ON(!page_bufs);
     walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2699,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
                           struct writeback_control *wbc)
 {
-    int ret = 0;
+    int ret = 0, commit_write = 0;
     loff_t size;
     unsigned int len;
     struct buffer_head *page_bufs = NULL;
@@ -2712,71 +2713,46 @@ static int ext4_writepage(struct page *page,
     else
         len = PAGE_CACHE_SIZE;
 
-    if (page_has_buffers(page)) {
-        page_bufs = page_buffers(page);
-        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                              ext4_bh_delay_or_unwritten)) {
-            /*
-             * We don't want to do block allocation
-             * So redirty the page and return
-             * We may reach here when we do a journal commit
-             * via journal_submit_inode_data_buffers.
-             * If we don't have mapping block we just ignore
-             * them. We can also reach here via shrink_page_list
-             */
+    /*
+     * If the page does not have buffers (for whatever reason),
+     * try to create them using __block_write_begin.  If this
+     * fails, redirty the page and move on.
+     */
+    if (!page_buffers(page)) {
+        if (__block_write_begin(page, 0, len,
+                                noalloc_get_block_write)) {
+        redirty_page:
             redirty_page_for_writepage(wbc, page);
             unlock_page(page);
             return 0;
         }
-    } else {
+        commit_write = 1;
+    }
+    page_bufs = page_buffers(page);
+    if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                          ext4_bh_delay_or_unwritten)) {
         /*
-         * The test for page_has_buffers() is subtle:
-         * We know the page is dirty but it lost buffers. That means
-         * that at some moment in time after write_begin()/write_end()
-         * has been called all buffers have been clean and thus they
-         * must have been written at least once. So they are all
-         * mapped and we can happily proceed with mapping them
-         * and writing the page.
-         *
-         * Try to initialize the buffer_heads and check whether
-         * all are mapped and non delay. We don't want to
-         * do block allocation here.
+         * We don't want to do block allocation So redirty the
+         * page and return We may reach here when we do a
+         * journal commit via
+         * journal_submit_inode_data_buffers.  If we don't
+         * have mapping block we just ignore them. We can also
+         * reach here via shrink_page_list
          */
-        ret = __block_write_begin(page, 0, len,
-                                  noalloc_get_block_write);
-        if (!ret) {
-            page_bufs = page_buffers(page);
-            /* check whether all are mapped and non delay */
-            if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                  ext4_bh_delay_or_unwritten)) {
-                redirty_page_for_writepage(wbc, page);
-                unlock_page(page);
-                return 0;
-            }
-        } else {
-            /*
-             * We can't do block allocation here
-             * so just redity the page and unlock
-             * and return
-             */
-            redirty_page_for_writepage(wbc, page);
-            unlock_page(page);
-            return 0;
-        }
+        goto redirty_page;
+    }
+    if (commit_write)
         /* now mark the buffer_heads as dirty and uptodate */
         block_commit_write(page, 0, len);
-    }
 
-    if (PageChecked(page) && ext4_should_journal_data(inode)) {
+    if (PageChecked(page) && ext4_should_journal_data(inode))
         /*
          * It's mmapped pagecache.  Add buffers and journal it.  There
          * doesn't seem much point in redirtying the page here.
          */
-        ClearPageChecked(page);
         return __ext4_journalled_writepage(page, len);
-    }
 
-    if (page_bufs && buffer_uninit(page_bufs)) {
+    if (buffer_uninit(page_bufs)) {
         ext4_set_bh_endio(page_bufs, inode);
         ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                           wbc, ext4_end_io_buffer_write);
@@ -2823,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  */
 static int write_cache_pages_da(struct address_space *mapping,
                                 struct writeback_control *wbc,
-                                struct mpage_da_data *mpd)
+                                struct mpage_da_data *mpd,
+                                pgoff_t *done_index)
 {
     int ret = 0;
     int done = 0;
     struct pagevec pvec;
-    int nr_pages;
+    unsigned nr_pages;
     pgoff_t index;
     pgoff_t end;        /* Inclusive */
     long nr_to_write = wbc->nr_to_write;
+    int tag;
 
     pagevec_init(&pvec, 0);
     index = wbc->range_start >> PAGE_CACHE_SHIFT;
     end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
+    if (wbc->sync_mode == WB_SYNC_ALL)
+        tag = PAGECACHE_TAG_TOWRITE;
+    else
+        tag = PAGECACHE_TAG_DIRTY;
+
+    *done_index = index;
     while (!done && (index <= end)) {
         int i;
 
-        nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                      PAGECACHE_TAG_DIRTY,
+        nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
         if (nr_pages == 0)
             break;
@@ -2861,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping,
                 break;
             }
 
+            *done_index = page->index + 1;
+
             lock_page(page);
 
             /*
@@ -2946,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping,
     long desired_nr_to_write, nr_to_writebump = 0;
     loff_t range_start = wbc->range_start;
     struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+    pgoff_t done_index = 0;
+    pgoff_t end;
 
     trace_ext4_da_writepages(inode, wbc);
 
@@ -2981,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping,
         wbc->range_start = index << PAGE_CACHE_SHIFT;
         wbc->range_end = LLONG_MAX;
         wbc->range_cyclic = 0;
-    } else
+        end = -1;
+    } else {
         index = wbc->range_start >> PAGE_CACHE_SHIFT;
+        end = wbc->range_end >> PAGE_CACHE_SHIFT;
+    }
 
     /*
      * This works around two forms of stupidity. The first is in
@@ -3001,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping,
      * sbi->max_writeback_mb_bump whichever is smaller.
      */
     max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-    if (!range_cyclic && range_whole)
-        desired_nr_to_write = wbc->nr_to_write * 8;
-    else
+    if (!range_cyclic && range_whole) {
+        if (wbc->nr_to_write == LONG_MAX)
+            desired_nr_to_write = wbc->nr_to_write;
+        else
+            desired_nr_to_write = wbc->nr_to_write * 8;
+    } else
         desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                    max_pages);
     if (desired_nr_to_write > max_pages)
@@ -3020,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping,
     pages_skipped = wbc->pages_skipped;
 
 retry:
+    if (wbc->sync_mode == WB_SYNC_ALL)
+        tag_pages_for_writeback(mapping, index, end);
+
     while (!ret && wbc->nr_to_write > 0) {
 
         /*
@@ -3058,16 +3054,14 @@ retry:
         mpd.io_done = 0;
         mpd.pages_written = 0;
         mpd.retval = 0;
-        ret = write_cache_pages_da(mapping, wbc, &mpd);
+        ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
         /*
          * If we have a contiguous extent of pages and we
          * haven't done the I/O yet, map the blocks and submit
          * them for I/O.
          */
         if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-            if (mpage_da_map_blocks(&mpd) == 0)
-                mpage_da_submit_io(&mpd);
-            mpd.io_done = 1;
+            mpage_da_map_and_submit(&mpd);
             ret = MPAGE_DA_EXTENT_TAIL;
         }
         trace_ext4_da_write_pages(inode, &mpd);
@@ -3114,14 +3108,13 @@ retry:
              __func__, wbc->nr_to_write, ret);
 
     /* Update index */
-    index += pages_written;
     wbc->range_cyclic = range_cyclic;
     if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
         /*
          * set the writeback_index so that range_cyclic
          * mode will write it back later
          */
-        mapping->writeback_index = index;
+        mapping->writeback_index = done_index;
 
 out_writepages:
     wbc->nr_to_write -= nr_to_writebump;
@@ -3456,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
     return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-    BUG_ON(!io);
-    if (io->page)
-        put_page(io->page);
-    iput(io->inode);
-    kfree(io);
-}
-
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
     struct buffer_head *head, *bh;
@@ -3641,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4_DEBUG
-    struct list_head *cur, *before, *after;
-    ext4_io_end_t *io, *io0, *io1;
-    unsigned long flags;
-
-    if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-        ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-        return;
-    }
-
-    ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-    spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-    list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-        cur = &io->list;
-        before = cur->prev;
-        io0 = container_of(before, ext4_io_end_t, list);
-        after = cur->next;
-        io1 = container_of(after, ext4_io_end_t, list);
-
-        ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                   io, inode->i_ino, io0, io1);
-    }
-    spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-    struct inode *inode = io->inode;
-    loff_t offset = io->offset;
-    ssize_t size = io->size;
-    int ret = 0;
-
-    ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-               "list->prev 0x%p\n",
-               io, inode->i_ino, io->list.next, io->list.prev);
-
-    if (list_empty(&io->list))
-        return ret;
-
-    if (io->flag != EXT4_IO_UNWRITTEN)
-        return ret;
-
-    ret = ext4_convert_unwritten_extents(inode, offset, size);
-    if (ret < 0) {
-        printk(KERN_EMERG "%s: failed to convert unwritten"
-               "extents to written extents, error is %d"
-               " io is still on inode %lu aio dio list\n",
-               __func__, ret, inode->i_ino);
-        return ret;
-    }
-
-    if (io->iocb)
-        aio_complete(io->iocb, io->result, 0);
-    /* clear the DIO AIO unwritten flag */
-    io->flag = 0;
-    return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-    ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
-    struct inode *inode = io->inode;
-    struct ext4_inode_info *ei = EXT4_I(inode);
-    unsigned long flags;
-    int ret;
-
-    mutex_lock(&inode->i_mutex);
-    ret = ext4_end_io_nolock(io);
-    if (ret < 0) {
-        mutex_unlock(&inode->i_mutex);
-        return;
-    }
-
-    spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-    if (!list_empty(&io->list))
-        list_del_init(&io->list);
-    spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-    mutex_unlock(&inode->i_mutex);
-    ext4_free_io_end(io);
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-    ext4_io_end_t *io;
-    struct ext4_inode_info *ei = EXT4_I(inode);
-    unsigned long flags;
-    int ret = 0;
-    int ret2 = 0;
-
-    if (list_empty(&ei->i_completed_io_list))
-        return ret;
-
-    dump_completed_IO(inode);
-    spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-    while (!list_empty(&ei->i_completed_io_list)){
-        io = list_entry(ei->i_completed_io_list.next,
-                        ext4_io_end_t, list);
-        /*
-         * Calling ext4_end_io_nolock() to convert completed
-         * IO to written.
-         *
-         * When ext4_sync_file() is called, run_queue() may already
-         * about to flush the work corresponding to this io structure.
-         * It will be upset if it founds the io structure related
-         * to the work-to-be schedule is freed.
-         *
-         * Thus we need to keep the io structure still valid here after
-         * convertion finished. The io structure has a flag to
-         * avoid double converting from both fsync and background work
-         * queue work.
-         */
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        ret = ext4_end_io_nolock(io);
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        if (ret < 0)
-            ret2 = ret;
-        else
-            list_del_init(&io->list);
-    }
-    spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-    return (ret2 < 0) ? ret2 : 0;
-}
-
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-    ext4_io_end_t *io = NULL;
-
-    io = kmalloc(sizeof(*io), flags);
-
-    if (io) {
-        igrab(inode);
-        io->inode = inode;
-        io->flag = 0;
-        io->offset = 0;
-        io->size = 0;
-        io->page = NULL;
-        io->iocb = NULL;
-        io->result = 0;
-        INIT_WORK(&io->work, ext4_end_io_work);
-        INIT_LIST_HEAD(&io->list);
-    }
-
-    return io;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                             ssize_t size, void *private, int ret,
                             bool is_async)
@@ -3827,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
               size);
 
     /* if not aio dio with unwritten extents, just free io and return */
-    if (io_end->flag != EXT4_IO_UNWRITTEN){
+    if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
         ext4_free_io_end(io_end);
         iocb->private = NULL;
 out:
@@ -3844,14 +3661,14 @@ out:
     }
     wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-    /* queue the work to convert unwritten extents to written */
-    queue_work(wq, &io_end->work);
-
     /* Add the io_end to per-inode completed aio dio list*/
     ei = EXT4_I(io_end->inode);
     spin_lock_irqsave(&ei->i_completed_io_lock, flags);
     list_add_tail(&io_end->list, &ei->i_completed_io_list);
     spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+    /* queue the work to convert unwritten extents to written */
+    queue_work(wq, &io_end->work);
     iocb->private = NULL;
 }
 
@@ -3872,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
         goto out;
     }
 
-    io_end->flag = EXT4_IO_UNWRITTEN;
+    io_end->flag = EXT4_IO_END_UNWRITTEN;
     inode = io_end->inode;
 
     /* Add the io_end to per-inode completed io list*/
@@ -5463,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
     struct inode *inode = dentry->d_inode;
     int error, rc = 0;
+    int orphan = 0;
     const unsigned int ia_valid = attr->ia_valid;
 
     error = inode_change_ok(inode, attr);
@@ -5518,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
             error = PTR_ERR(handle);
             goto err_out;
         }
-
+        if (ext4_handle_valid(handle)) {
             error = ext4_orphan_add(handle, inode);
+            orphan = 1;
+        }
         EXT4_I(inode)->i_disksize = attr->ia_size;
         rc = ext4_mark_inode_dirty(handle, inode);
         if (!error)
@@ -5537,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                 goto err_out;
             }
             ext4_orphan_del(handle, inode);
+            orphan = 0;
             ext4_journal_stop(handle);
             goto err_out;
         }
@@ -5559,7 +5380,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
      * If the call to ext4_truncate failed to get a transaction handle at
      * all, we need to clean up the in-core orphan list manually.
      */
-    if (inode->i_nlink)
+    if (orphan && inode->i_nlink)
         ext4_orphan_del(NULL, inode);
 
     if (!rc && (ia_valid & ATTR_MODE))
@@ -5642,7 +5463,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
     ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
     int gdpblocks;