path: root/fs/ext4/inode.c
author		Linus Torvalds <torvalds@linux-foundation.org>	2013-02-26 17:52:45 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-26 17:52:45 -0500
commit		6515925b8259549b7f2187e25d3260306e3e85e5 (patch)
tree		7d51487f308f8f0ac95d3113606c39ba592111ba /fs/ext4/inode.c
parent		bbbd27e694ce2c5fde9c8fcedbea618dd9153fe7 (diff)
parent		304e220f0879198b1f5309ad6f0be862b4009491 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Theodore Ts'o:
 "The one new feature added in this patch series is the ability to use
  the "punch hole" functionality for inodes that are not using extent
  maps.

  In the bug fix category, we fixed some races in the AIO and fstrim
  code, and some potential NULL pointer dereferences and memory leaks
  in error handling code paths.

  In the optimization category, we fixed a performance regression in
  the jbd2 layer introduced by commit d9b01934d56a ("jbd: fix fsync()
  tid wraparound bug", introduced in v3.0) which shows up in the AIM7
  benchmark.  We also further optimized jbd2 by minimizing the amount
  of time that transaction handles are held active.

  This patch series also features some additional enhancement of the
  extent status tree, which is now used to cache extent information in
  a more efficient/compact form than what we use on-disk."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (65 commits)
  ext4: fix free clusters calculation in bigalloc filesystem
  ext4: no need to remove extent if len is 0 in ext4_es_remove_extent()
  ext4: fix xattr block allocation/release with bigalloc
  ext4: reclaim extents from extent status tree
  ext4: adjust some functions for reclaiming extents from extent status tree
  ext4: remove single extent cache
  ext4: lookup block mapping in extent status tree
  ext4: track all extent status in extent status tree
  ext4: let ext4_ext_map_blocks return EXT4_MAP_UNWRITTEN flag
  ext4: rename and improbe ext4_es_find_extent()
  ext4: add physical block and status member into extent status tree
  ext4: refine extent status tree
  ext4: use ERR_PTR() abstraction for ext4_append()
  ext4: refactor code to read directory blocks into ext4_read_dirblock()
  ext4: add debugging context for warning in ext4_da_update_reserve_space()
  ext4: use KERN_WARNING for warning messages
  jbd2: use module parameters instead of debugfs for jbd_debug
  ext4: use module parameters instead of debugfs for mballoc_debug
  ext4: start handle at the last possible moment when creating inodes
  ext4: fix the number of credits needed for acl ops with inline data
  ...
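For readers new to the extent status tree mentioned above: in the diff below, ext4_map_blocks() gains a cache-first lookup, so a hit in the in-memory tree avoids taking i_data_sem and re-walking the on-disk extent tree. The following standalone C sketch models only that control flow; all names are invented except where they match identifiers visible in the diff (a single cached extent stands in for the real red-black tree in fs/ext4/extents_status.c). It is an illustration, not kernel code.

/* Toy model of the "lookup extent status tree firstly" pattern. */
#include <stdio.h>
#include <stdbool.h>

struct extent {			/* cached logical -> physical run */
	unsigned lblk;		/* first logical block */
	unsigned len;		/* number of blocks */
	unsigned pblk;		/* first physical block */
	bool written;		/* written vs. unwritten */
};

/* One-entry "tree" standing in for the red-black tree. */
static struct extent cache = { .lblk = 100, .len = 8, .pblk = 5000, .written = true };

static bool es_lookup(unsigned lblk, struct extent *es)
{
	if (lblk >= cache.lblk && lblk < cache.lblk + cache.len) {
		*es = cache;
		return true;
	}
	return false;		/* miss: caller falls back to the on-disk tree */
}

int main(void)
{
	struct extent es;
	unsigned lblk = 103;

	if (es_lookup(lblk, &es))	/* hit: no lock, no disk access */
		printf("lblk %u -> pblk %u (%s)\n", lblk,
		       es.pblk + (lblk - es.lblk),
		       es.written ? "written" : "unwritten");
	else
		printf("miss: fall back to extent tree lookup\n");
	return 0;
}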
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	664
1 file changed, 297 insertions(+), 367 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cd818d8bb221..88049d8d30cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
@@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	}
 
 	if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
-			 "with only %d reserved metadata blocks\n", __func__,
-			 inode->i_ino, ei->i_allocated_meta_blocks,
-			 ei->i_reserved_meta_blocks);
+		ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+			     "with only %d reserved metadata blocks "
+			     "(releasing %d blocks with reserved %d data blocks)",
+			     inode->i_ino, ei->i_allocated_meta_blocks,
+			     ei->i_reserved_meta_blocks, used,
+			     ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
 	}
@@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
+	struct extent_status es;
 	int retval;
 
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+		goto found;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret;
-		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-			/* delayed alloc may be allocated by fallocate and
-			 * coverted to initialized by directIO.
-			 * we need to handle delayed extent here.
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		return retval;
 
 	/*
-	 * When we call get_blocks without the create flag, the
-	 * BH_Unwritten flag could have gotten set if the blocks
-	 * requested were part of a uninitialized extent.  We need to
-	 * clear this flag now that we are committed to convert all or
-	 * part of the uninitialized extent to be an initialized
-	 * extent.  This is because we need to avoid the combination
-	 * of BH_Unwritten and BH_Mapped flags being simultaneously
-	 * set on the buffer_head.
+	 * Here we clear m_flags because after allocating an new extent,
+	 * it will be set again.
 	 */
-	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+	if (retval > 0) {
 		int ret;
-delayed_mapped:
-		/* delayed allocation blocks has been allocated */
-		ret = ext4_es_remove_extent(inode, map->m_lblk,
-					    map->m_len);
-		if (ret < 0)
-			retval = ret;
-	}
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 	if (map.m_len > DIO_MAX_BLOCKS)
 		map.m_len = DIO_MAX_BLOCKS;
 	dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-	handle = ext4_journal_start(inode, dio_credits);
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+				    dio_credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	/* ensure we send some value back into *errp */
 	*errp = 0;
 
+	if (create && err == 0)
+		err = -ENOSPC;	/* should never happen */
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (!bh) {
-		*errp = -EIO;
+	if (unlikely(!bh)) {
+		*errp = -ENOMEM;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
  * and the commit_write().  So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page.  So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage().  In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page.  So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes.  If we were to commit the transaction while thus
@@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
 						    flags, pagep);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
-	handle = ext4_journal_start(inode, needed_blocks);
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
 
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-
-	*pagep = page;
+	wait_on_page_writeback(page);
 
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +954,6 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -942,11 +977,14 @@ retry:
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 		}
-	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		page_cache_release(page);
+		return ret;
+	}
+	*pagep = page;
 	return ret;
 }
 
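The hunk above reorders ext4_write_begin(): the page is grabbed (a step that may block or allocate memory) before the journal handle is started, and the mapping is re-validated after relocking. A minimal userspace model of that retry scheme follows; every name is an invented stand-in (journal_start() for ext4_journal_start(), the struct for the page cache), so treat it as a sketch of the control flow only.

#include <stdio.h>

struct page { const void *mapping; };

static int journal_start(void) { return 0; }	/* ext4_journal_start() stand-in */
static void journal_stop(void) { }		/* ext4_journal_stop() stand-in  */

static int write_begin(struct page *page, const void *mapping)
{
	int tries = 2;

retry_grab:
	if (tries-- == 0)
		return -1;
	/* 1. Take the page first: this step may block or allocate memory,
	 *    which must not happen while a handle pins a transaction. */

	/* 2. Only now start the (cheap to hold, costly to block on) handle. */
	if (journal_start() != 0)
		return -1;

	/* 3. Re-validate: the page may have been truncated while unlocked. */
	if (page->mapping != mapping) {
		journal_stop();
		goto retry_grab;
	}
	return 0;
}

int main(void)
{
	static const int inode_mapping = 0;
	struct page p = { .mapping = &inode_mapping };

	printf("write_begin: %d\n", write_begin(&p, &inode_mapping));
	return 0;
}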
@@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * function is called from invalidate page, it's
 		 * harmless to return without any action.
 		 */
-		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+		ext4_warning(inode->i_sb, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
 			 "data blocks", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
@@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	loff_t size = i_size_read(inode);
 	unsigned int len, block_start;
 	struct buffer_head *bh, *page_bufs = NULL;
-	int journal_data = ext4_should_journal_data(inode);
 	sector_t pblock = 0, cur_logical = 0;
 	struct ext4_io_submit io_submit;
 
@@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, skip_page = 0;
+			int skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If the page does not have buffers (for
-			 * whatever reason), try to create them using
-			 * __block_write_begin.  If this fails,
-			 * skip the page and move on.
-			 */
-			if (!page_has_buffers(page)) {
-				if (__block_write_begin(page, 0, len,
-						noalloc_get_block_write)) {
-				skip_page:
-					unlock_page(page);
-					continue;
-				}
-				commit_write = 1;
-			}
-
 			bh = page_bufs = page_buffers(page);
 			block_start = 0;
 			do {
-				if (!bh)
-					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						(map->m_len - 1)))) {
@@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					pblock++;
 			} while (bh != page_bufs);
 
-			if (skip_page)
-				goto skip_page;
-
-			if (commit_write)
-				/* mark the buffer_heads as dirty & uptodate */
-				block_commit_write(page, 0, len);
+			if (skip_page) {
+				unlock_page(page);
+				continue;
+			}
 
 			clear_page_dirty_for_io(page);
-			/*
-			 * Delalloc doesn't support data journalling,
-			 * but eventually maybe we'll lift this
-			 * restriction.
-			 */
-			if (unlikely(journal_data && PageChecked(page)))
-				err = __ext4_journalled_writepage(page, len);
-			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
-				err = ext4_bio_write_page(&io_submit, page,
-							len, mpd->wbc);
-			else if (buffer_uninit(page_bufs)) {
-				ext4_set_bh_endio(page_bufs, inode);
-				err = block_write_full_page_endio(page,
-					noalloc_get_block_write,
-					mpd->wbc, ext4_end_io_buffer_write);
-			} else
-				err = block_write_full_page(page,
-						noalloc_get_block_write, mpd->wbc);
-
+			err = ext4_bio_write_page(&io_submit, page, len,
+						  mpd->wbc);
 			if (!err)
 				mpd->pages_written++;
 			/*
@@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			       (unsigned long long) next,
 			       mpd->b_size >> mpd->inode->i_blkbits, err);
 			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost\n");
+				"This should not happen!! Data will be lost");
 			if (err == -ENOSPC)
 				ext4_print_free_blocks(mpd->inode);
 		}
@@ -1690,16 +1690,16 @@ submit_io:
  *
  * @mpd->lbh - extent of blocks
  * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
  *
  * the function is used to collect contig. blocks in same state
  */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
 				   unsigned long b_state)
 {
 	sector_t next;
-	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+	int blkbits = mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> blkbits;
 
 	/*
 	 * XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * mpage_da_submit_io() into this function and then call
 	 * ext4_map_blocks() multiple times in a loop
 	 */
-	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+	if (nrblocks >= (8*1024*1024 >> blkbits))
 		goto flush_it;
 
-	/* check if thereserved journal credits might overflow */
-	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+	/* check if the reserved journal credits might overflow */
+	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 			 * nrblocks.  So limit nrblocks.
 			 */
 			goto flush_it;
-		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
-			   EXT4_MAX_TRANS_DATA) {
-			/*
-			 * Adding the new buffer_head would make it cross the
-			 * allowed limit for which we have journal credit
-			 * reserved. So limit the new bh->b_size
-			 */
-			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
-						mpd->inode->i_blkbits;
-			/* we will do mpage_da_submit_io in the next loop */
 		}
 	}
 	/*
@@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 */
 	if (mpd->b_size == 0) {
 		mpd->b_blocknr = logical;
-		mpd->b_size = b_size;
+		mpd->b_size = 1 << blkbits;
 		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
@@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * Can we merge the block to our big extent?
 	 */
 	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += b_size;
+		mpd->b_size += 1 << blkbits;
 		return;
 	}
 
@@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read((&EXT4_I(inode)->i_data_sem));
+			goto add_delayed;
+		}
+
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+		return retval;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		retval = 0;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ext_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 	else
-		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ind_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
 	if (retval == 0) {
+		int ret;
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		/* If the block was allocated from previously allocated cluster,
 		 * then we dont need to reserve it again. */
 		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			retval = ext4_da_reserve_space(inode, iblock);
-			if (retval)
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
 				/* not enough space to reserve */
+				retval = ret;
 				goto out_unlock;
+			}
 		}
 
-		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
-		if (retval)
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
 			goto out_unlock;
+		}
 
 		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
 		 * and it should not appear on the bh->b_state.
@@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
 		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
 	}
 
 out_unlock:
@@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling  block_write_full_page().  Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-	return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	get_bh(bh);
@@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,
 	 * references to buffers so we are safe */
 	unlock_page(page);
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -2035,11 +2061,12 @@ out:
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0, commit_write = 0;
+	int ret = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
+	struct ext4_io_submit io_submit;
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
@@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
+	page_bufs = page_buffers(page);
 	/*
-	 * If the page does not have buffers (for whatever reason),
-	 * try to create them using __block_write_begin.  If this
-	 * fails, redirty the page and move on.
+	 * We cannot do block allocation or other extent handling in this
+	 * function.  If there are buffers needing that, we have to redirty
+	 * the page.  But we may reach here when we do a journal commit via
+	 * journal_submit_inode_data_buffers() and in that case we must write
+	 * allocated buffers to achieve data=ordered mode guarantees.
 	 */
-	if (!page_has_buffers(page)) {
-		if (__block_write_begin(page, 0, len,
-					noalloc_get_block_write)) {
-		redirty_page:
-			redirty_page_for_writepage(wbc, page);
+	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+				   ext4_bh_delay_or_unwritten)) {
+		redirty_page_for_writepage(wbc, page);
+		if (current->flags & PF_MEMALLOC) {
+			/*
+			 * For memory cleaning there's no point in writing only
+			 * some buffers. So just bail out. Warn if we came here
+			 * from direct reclaim.
+			 */
+			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+							== PF_MEMALLOC);
 			unlock_page(page);
 			return 0;
 		}
-		commit_write = 1;
 	}
-	page_bufs = page_buffers(page);
-	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-				   ext4_bh_delay_or_unwritten)) {
-		/*
-		 * We don't want to do block allocation, so redirty
-		 * the page and return.  We may reach here when we do
-		 * a journal commit via journal_submit_inode_data_buffers.
-		 * We can also reach here via shrink_page_list but it
-		 * should never be for direct reclaim so warn if that
-		 * happens
-		 */
-		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-			     PF_MEMALLOC);
-		goto redirty_page;
-	}
-	if (commit_write)
-		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, len);
 
 	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
@@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	if (buffer_uninit(page_bufs)) {
-		ext4_set_bh_endio(page_bufs, inode);
-		ret = block_write_full_page_endio(page, noalloc_get_block_write,
-						  wbc, ext4_end_io_buffer_write);
-	} else
-		ret = block_write_full_page(page, noalloc_get_block_write,
-					    wbc);
-
+	memset(&io_submit, 0, sizeof(io_submit));
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+	ext4_io_submit(&io_submit);
 	return ret;
 }
 
@@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,
 			logical = (sector_t) page->index <<
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
 
-			if (!page_has_buffers(page)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       PAGE_CACHE_SIZE,
-					(1 << BH_Dirty) | (1 << BH_Uptodate));
-				if (mpd->io_done)
-					goto ret_extent_tail;
-			} else {
-				/*
-				 * Page with regular buffer heads,
-				 * just add all dirty ones
-				 */
-				head = page_buffers(page);
-				bh = head;
-				do {
-					BUG_ON(buffer_locked(bh));
-					/*
-					 * We need to try to allocate
-					 * unmapped blocks in the same page.
-					 * Otherwise we won't make progress
-					 * with the page in ext4_writepage
-					 */
-					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-						mpage_add_bh_to_extent(mpd, logical,
-								       bh->b_size,
-								       bh->b_state);
-						if (mpd->io_done)
-							goto ret_extent_tail;
-					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-						/*
-						 * mapped dirty buffer. We need
-						 * to update the b_state
-						 * because we look at b_state
-						 * in mpage_da_map_blocks. We
-						 * don't update b_size because
-						 * if we find an unmapped
-						 * buffer_head later we need to
-						 * use the b_state flag of that
-						 * buffer_head.
-						 */
-						if (mpd->b_size == 0)
-							mpd->b_state = bh->b_state & BH_FLAGS;
-					}
-					logical++;
-				} while ((bh = bh->b_this_page) != head);
-			}
+			/* Add all dirty buffers to mpd */
+			head = page_buffers(page);
+			bh = head;
+			do {
+				BUG_ON(buffer_locked(bh));
+				/*
+				 * We need to try to allocate unmapped blocks
+				 * in the same page.  Otherwise we won't make
+				 * progress with the page in ext4_writepage
+				 */
+				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+					mpage_add_bh_to_extent(mpd, logical,
+							       bh->b_state);
+					if (mpd->io_done)
+						goto ret_extent_tail;
+				} else if (buffer_dirty(bh) &&
+					   buffer_mapped(bh)) {
+					/*
+					 * mapped dirty buffer. We need to
+					 * update the b_state because we look
+					 * at b_state in mpage_da_map_blocks.
+					 * We don't update b_size because if we
+					 * find an unmapped buffer_head later
+					 * we need to use the b_state flag of
+					 * that buffer_head.
+					 */
+					if (mpd->b_size == 0)
+						mpd->b_state =
+							bh->b_state & BH_FLAGS;
+				}
+				logical++;
+			} while ((bh = bh->b_this_page) != head);
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
@@ -2413,7 +2412,8 @@ retry:
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
 		/* start a new transaction*/
-		handle = ext4_journal_start(inode, needed_blocks);
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2555,42 +2555,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 						      pos, len, flags,
 						      pagep, fsdata);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
 	 * if there is delayed block allocation. But we still need
 	 * to journalling the i_disksize update if writes to the end
 	 * of file which has an already mapped buffer.
 	 */
-	handle = ext4_journal_start(inode, 1);
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
 
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-	*pagep = page;
+	/* In case writeback began while the page was unlocked */
+	wait_on_page_writeback(page);
 
 	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -2598,11 +2608,16 @@ retry:
 		 */
 		if (pos + len > inode->i_size)
 			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+
+		page_cache_release(page);
+		return ret;
 	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+	*pagep = page;
 	return ret;
 }
 
@@ -2858,36 +2873,10 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
-{
-	struct buffer_head *head, *bh;
-	unsigned int curr_off = 0;
-
-	if (!page_has_buffers(page))
-		return;
-	head = bh = page_buffers(page);
-	do {
-		if (offset <= curr_off && test_clear_buffer_uninit(bh)
-					&& bh->b_private) {
-			ext4_free_io_end(bh->b_private);
-			bh->b_private = NULL;
-			bh->b_end_io = NULL;
-		}
-		curr_off = curr_off + bh->b_size;
-		bh = bh->b_this_page;
-	} while (bh != head);
-}
-
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	trace_ext4_invalidatepage(page, offset);
 
-	/*
-	 * free any io_end structure allocated for buffers to be discarded
-	 */
-	if (ext4_should_dioread_nolock(page->mapping->host))
-		ext4_invalidatepage_free_endio(page, offset);
-
 	/* No journalling happens on data buffers when this function is used */
 	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
@@ -2977,9 +2966,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 out:
+		inode_dio_done(inode);
 		if (is_async)
 			aio_complete(iocb, ret, 0);
-		inode_dio_done(inode);
 		return;
 	}
 
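The one-line move above is an ordering fix: once aio_complete() runs, userspace may close the file and the inode can be torn down, so the outstanding-DIO count must be dropped first. A userspace sketch of that shape follows; the struct and helpers are invented analogues, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct inode_stub { int dio_count; };

static struct inode_stub *inode;

static void complete_io(void)
{
	/* After completion, the other side may tear everything down. */
	free(inode);
	inode = NULL;
}

int main(void)
{
	inode = calloc(1, sizeof(*inode));
	inode->dio_count = 1;

	inode->dio_count--;	/* inode_dio_done() analogue: run FIRST */
	complete_io();		/* aio_complete() analogue: run LAST   */
	/* Reversing the two lines would touch freed memory. */
	return 0;
}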
@@ -2993,65 +2982,6 @@ out:
 	ext4_add_complete_io(io_end);
 }
 
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
-	ext4_io_end_t *io_end = bh->b_private;
-	struct inode *inode;
-
-	if (!test_clear_buffer_uninit(bh) || !io_end)
-		goto out;
-
-	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
-		ext4_msg(io_end->inode->i_sb, KERN_INFO,
-			 "sb umounted, discard end_io request for inode %lu",
-			 io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		goto out;
-	}
-
-	/*
-	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
-	 * but being more careful is always safe for the future change.
-	 */
-	inode = io_end->inode;
-	ext4_set_io_unwritten_flag(inode, io_end);
-	ext4_add_complete_io(io_end);
-out:
-	bh->b_private = NULL;
-	bh->b_end_io = NULL;
-	clear_buffer_uninit(bh);
-	end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
-	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
-	size_t size = bh->b_size;
-
-retry:
-	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
-	if (!io_end) {
-		pr_warn_ratelimited("%s: allocation fail\n", __func__);
-		schedule();
-		goto retry;
-	}
-	io_end->offset = offset;
-	io_end->size = size;
-	/*
-	 * We need to hold a reference to the page to make sure it
-	 * doesn't get evicted before ext4_end_io_work() has a chance
-	 * to convert the extent from written to unwritten.
-	 */
-	io_end->page = page;
-	get_page(io_end->page);
-
-	bh->b_private = io_end;
-	bh->b_end_io = ext4_end_io_buffer_write;
-	return 0;
-}
-
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3557,16 +3487,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-		/* TODO: Add support for non extent hole punching */
-		return -EOPNOTSUPP;
-	}
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return ext4_ind_punch_hole(file, offset, length);
 
 	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
 		/* TODO: Add support for bigalloc file systems */
 		return -EOPNOTSUPP;
 	}
 
+	trace_ext4_punch_hole(inode, offset, length);
+
 	return ext4_ext_punch_hole(file, offset, length);
 }
 
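The ext4_ind_punch_hole() call above is where the series' headline feature lands: hole punching for indirect-block (non-extent) inodes, which previously returned -EOPNOTSUPP. From userspace the feature is exercised through the standard fallocate(2) interface. A minimal demonstration follows; the file name and sizes are arbitrary, and whether the indirect path is taken depends on the inode lacking the extents flag (e.g. a filesystem created or mounted without extents).

#define _GNU_SOURCE	/* for fallocate() and FALLOC_FL_* in <fcntl.h> */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Make sure there is something to punch. */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }

	/* Deallocate 256 KiB starting at 128 KiB; the file size is kept
	 * (PUNCH_HOLE requires KEEP_SIZE). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 * 1024, 256 * 1024) < 0)
		perror("fallocate");	/* EOPNOTSUPP before this series
					 * on non-extent ext4 inodes */

	close(fd);
	return 0;
}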
@@ -3660,11 +3590,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
 
 	bh = sb_getblk(sb, block);
-	if (!bh) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
-		return -EIO;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -3696,7 +3623,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 			/* Is the inode bitmap in cache? */
 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
-			if (!bitmap_bh)
+			if (unlikely(!bitmap_bh))
 				goto make_io;
 
 			/*
@@ -4404,8 +4331,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4440,7 +4368,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
-		handle = ext4_journal_start(inode, 3);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4460,7 +4388,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 							    attr->ia_size);
 			if (error) {
 				/* Do as much error cleanup as possible */
-				handle = ext4_journal_start(inode, 3);
+				handle = ext4_journal_start(inode,
+							    EXT4_HT_INODE, 3);
 				if (IS_ERR(handle)) {
 					ext4_orphan_del(NULL, inode);
 					goto err_out;
@@ -4801,7 +4730,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
-	handle = ext4_journal_start(inode, 2);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
 
@@ -4902,7 +4831,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	/* Finally we can mark the inode as dirty. */
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -4980,7 +4909,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		get_block = ext4_get_block;
retry_alloc:
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;