Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c | 968
1 files changed, 590 insertions(+), 378 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..81d605412844 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,8 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>

 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -71,58 +73,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }

 /*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
-        int err;
-
-        might_sleep();
-
-        BUFFER_TRACE(bh, "enter");
-
-        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                  "data mode %x\n",
-                  bh, is_metadata, inode->i_mode,
-                  test_opt(inode->i_sb, DATA_FLAGS));
-
-        /* Never use the revoke function if we are doing full data
-         * journaling: there is no need to, and a V1 superblock won't
-         * support it.  Otherwise, only skip the revoke on un-journaled
-         * data blocks. */
-
-        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (bh) {
-                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                        return ext4_journal_forget(handle, bh);
-                }
-                return 0;
-        }
-
-        /*
-         * data!=journal && (is_metadata || should_journal_data(inode))
-         */
-        BUFFER_TRACE(bh, "call ext4_journal_revoke");
-        err = ext4_journal_revoke(handle, blocknr, bh);
-        if (err)
-                ext4_abort(inode->i_sb, __func__,
-                           "error %d when attempting revoke", err);
-        BUFFER_TRACE(bh, "exit");
-        return err;
-}
-
-/*
  * Work out how many blocks we need to proceed with the next chunk of a
  * truncate transaction.
  */
@@ -222,6 +172,9 @@ void ext4_delete_inode(struct inode *inode)
         handle_t *handle;
         int err;

+        if (!is_bad_inode(inode))
+                dquot_initialize(inode);
+
         if (ext4_should_order_data(inode))
                 ext4_begin_ordered_truncate(inode, 0);
         truncate_inode_pages(&inode->i_data, 0);
@@ -246,7 +199,7 @@ void ext4_delete_inode(struct inode *inode)
         inode->i_size = 0;
         err = ext4_mark_inode_dirty(handle, inode);
         if (err) {
-                ext4_warning(inode->i_sb, __func__,
+                ext4_warning(inode->i_sb,
                              "couldn't mark inode dirty (err %d)", err);
                 goto stop_handle;
         }
@@ -264,7 +217,7 @@ void ext4_delete_inode(struct inode *inode)
         if (err > 0)
                 err = ext4_journal_restart(handle, 3);
         if (err != 0) {
-                ext4_warning(inode->i_sb, __func__,
+                ext4_warning(inode->i_sb,
                              "couldn't extend journal (err %d)", err);
         stop_handle:
                 ext4_journal_stop(handle);
@@ -375,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode,
                 offsets[n++] = i_block & (ptrs - 1);
                 final = ptrs;
         } else {
-                ext4_warning(inode->i_sb, "ext4_block_to_path",
-                             "block %lu > max in inode %lu",
+                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
                              i_block + direct_blocks +
                              indirect_blocks + double_blocks, inode->i_ino);
         }
@@ -396,7 +348,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
                 if (blk &&
                     unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                     blk, 1))) {
-                        ext4_error(inode->i_sb, function,
+                        __ext4_error(inode->i_sb, function,
                                    "invalid block reference %u "
                                    "in inode #%lu", blk, inode->i_ino);
                         return -EIO;
@@ -659,7 +611,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
         if (*err)
                 goto failed_out;

-        BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+        if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+                EXT4_ERROR_INODE(inode,
+                                 "current_block %llu + count %lu > %d!",
+                                 current_block, count,
+                                 EXT4_MAX_BLOCK_FILE_PHYS);
+                *err = -EIO;
+                goto failed_out;
+        }

         target -= count;
         /* allocate blocks for indirect blocks */
@@ -695,7 +654,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                 ar.flags = EXT4_MB_HINT_DATA;

                 current_block = ext4_mb_new_blocks(handle, &ar, err);
-                BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+                if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+                        EXT4_ERROR_INODE(inode,
+                                         "current_block %llu + ar.len %d > %d!",
+                                         current_block, ar.len,
+                                         EXT4_MAX_BLOCK_FILE_PHYS);
+                        *err = -EIO;
+                        goto failed_out;
+                }

                 if (*err && (target == blks)) {
                         /*
@@ -721,7 +687,7 @@ allocated:
         return ret;
 failed_out:
         for (i = 0; i < index; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
         return ret;
 }

@@ -817,14 +783,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
         return err;
 failed:
         /* Allocation failed, free what we already allocated */
+        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
         for (i = 1; i <= n ; i++) {
-                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
-                ext4_journal_forget(handle, branch[i].bh);
+                /*
+                 * branch[i].bh is newly allocated, so there is no
+                 * need to revoke the block, which is why we don't
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
         }
-        for (i = 0; i < indirect_blks; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+        for (i = n+1; i < indirect_blks; i++)
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);

-        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+        ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);

         return err;
 }
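
The call sites in this hunk and its neighbours are the first users of the widened ext4_free_blocks() interface: the old (handle, inode, block, count, metadata) form gains a buffer_head pointer and a flags word. The declaration itself lives in ext4.h and is not part of this diff; the following prototype is inferred from the calls above, for orientation only:

    /* Inferred from the call sites in this patch, not quoted from ext4.h.
     * Callers pass either a buffer_head or a starting physical block,
     * a block count, and EXT4_FREE_BLOCKS_* flags such as
     * EXT4_FREE_BLOCKS_FORGET (forget the buffer in the journal) or
     * EXT4_FREE_BLOCKS_METADATA (the freed blocks need a revoke record). */
    void ext4_free_blocks(handle_t *handle, struct inode *inode,
                          struct buffer_head *bh, ext4_fsblk_t block,
                          unsigned long count, int flags);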
@@ -903,12 +875,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,

 err_out:
         for (i = 1; i <= num; i++) {
-                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
-                ext4_journal_forget(handle, where[i].bh);
-                ext4_free_blocks(handle, inode,
-                                 le32_to_cpu(where[i-1].key), 1, 0);
+                /*
+                 * branch[i].bh is newly allocated, so there is no
+                 * need to revoke the block, which is why we don't
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
         }
-        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                         blks, 0);

         return err;
 }
@@ -1021,10 +997,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         if (!err)
                 err = ext4_splice_branch(handle, inode, iblock,
                                          partial, indirect_blks, count);
-        else
+        if (err)
                 goto cleanup;

         set_buffer_new(bh_result);
+
+        ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
         if (count > blocks_to_boundary)
@@ -1043,92 +1021,121 @@ out:
         return err;
 }

-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
 {
-        unsigned long long total;
-
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks +
-                EXT4_I(inode)->i_reserved_meta_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
-        return total;
+        return &EXT4_I(inode)->i_reserved_quota;
 }
+#endif

 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblock for non extent file based file
  */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+                                              sector_t lblock)
 {
-        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-        int ind_blks, dind_blks, tind_blks;
-
-        /* number of new indirect blocks needed */
-        ind_blks = (blocks + icap - 1) / icap;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+        int blk_bits;

-        dind_blks = (ind_blks + icap - 1) / icap;
+        if (lblock < EXT4_NDIR_BLOCKS)
+                return 0;

-        tind_blks = 1;
+        lblock -= EXT4_NDIR_BLOCKS;

-        return ind_blks + dind_blks + tind_blks;
+        if (ei->i_da_metadata_calc_len &&
+            (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+                ei->i_da_metadata_calc_len++;
+                return 0;
+        }
+        ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+        ei->i_da_metadata_calc_len = 1;
+        blk_bits = order_base_2(lblock);
+        return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }

 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
  */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-        if (!blocks)
-                return 0;
-
         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-                return ext4_ext_calc_metadata_amount(inode, blocks);
+                return ext4_ext_calc_metadata_amount(inode, lblock);

-        return ext4_indirect_calc_metadata_amount(inode, blocks);
+        return ext4_indirect_calc_metadata_amount(inode, lblock);
 }

-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+                                        int used, int quota_claim)
 {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free;
-
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        /* recalculate the number of metablocks still need to be reserved */
-        total = EXT4_I(inode)->i_reserved_data_blocks - used;
-        mdb = ext4_calc_metadata_amount(inode, total);
-
-        /* figure out how many metablocks to release */
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-        if (mdb_free) {
-                /* Account for allocated meta_blocks */
-                mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-                /* update fs dirty blocks counter */
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        int mdb_free = 0, allocated_meta_blocks = 0;
+
+        spin_lock(&ei->i_block_reservation_lock);
+        trace_ext4_da_update_reserve_space(inode, used);
+        if (unlikely(used > ei->i_reserved_data_blocks)) {
+                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                         "with only %d reserved data blocks\n",
+                         __func__, inode->i_ino, used,
+                         ei->i_reserved_data_blocks);
+                WARN_ON(1);
+                used = ei->i_reserved_data_blocks;
+        }
+
+        /* Update per-inode reservations */
+        ei->i_reserved_data_blocks -= used;
+        used += ei->i_allocated_meta_blocks;
+        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+        allocated_meta_blocks = ei->i_allocated_meta_blocks;
+        ei->i_allocated_meta_blocks = 0;
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+        if (ei->i_reserved_data_blocks == 0) {
+                /*
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                mdb_free = ei->i_reserved_meta_blocks;
+                ei->i_reserved_meta_blocks = 0;
+                ei->i_da_metadata_calc_len = 0;
                 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-                EXT4_I(inode)->i_allocated_meta_blocks = 0;
-                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
         }
-
-        /* update per-inode reservations */
-        BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= used;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

-        /*
-         * free those over-booking quota for metadata blocks
-         */
-        if (mdb_free)
-                vfs_dq_release_reservation_block(inode, mdb_free);
+        /* Update quota subsystem */
+        if (quota_claim) {
+                dquot_claim_block(inode, used);
+                if (mdb_free)
+                        dquot_release_reservation_block(inode, mdb_free);
+        } else {
+                /*
+                 * We did fallocate with an offset that is already delayed
+                 * allocated. So on delayed allocated writeback we should
+                 * not update the quota for allocated blocks. But then
+                 * converting an fallocate region to initialized region would
+                 * have caused a metadata allocation. So claim quota for
+                 * that
+                 */
+                if (allocated_meta_blocks)
+                        dquot_claim_block(inode, allocated_meta_blocks);
+                dquot_release_reservation_block(inode, mdb_free + used);
+        }

         /*
          * If we have done all the pending block allocations and if
          * there aren't any writers on the inode, we can discard the
          * inode's preallocations.
          */
-        if (!total && (atomic_read(&inode->i_writecount) == 0))
+        if ((ei->i_reserved_data_blocks == 0) &&
+            (atomic_read(&inode->i_writecount) == 0))
                 ext4_discard_preallocations(inode);
 }

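The estimator above charges metadata per logical block rather than per run of blocks: a block past the twelve direct slots needs, at worst, one new block for each indirect level on its lookup path, which is what order_base_2(lblock) / EXT4_ADDR_PER_BLOCK_BITS + 1 bounds, and the i_da_metadata_calc_last_lblock cache then charges each group of blocks sharing a double-indirect leaf only once. A rough standalone rendition of that arithmetic, assuming 4KiB blocks (1024 pointers per indirect block); the names and constants are illustrative, not the kernel's:

    #include <stdio.h>

    #define NDIR_BLOCKS         12  /* direct pointers in the inode */
    #define ADDR_PER_BLOCK_BITS 10  /* log2(pointers per indirect block) */

    /* smallest b with 2^b >= n, like the kernel's order_base_2() */
    static int order_base_2(unsigned long n)
    {
            int bits = 0;

            while ((1UL << bits) < n)
                    bits++;
            return bits;
    }

    /* worst-case metadata blocks for one new block at logical lblock */
    static int worst_case_metadata(unsigned long lblock)
    {
            if (lblock < NDIR_BLOCKS)
                    return 0;       /* direct block: no indirect blocks */
            lblock -= NDIR_BLOCKS;
            return order_base_2(lblock) / ADDR_PER_BLOCK_BITS + 1;
    }

    int main(void)
    {
            unsigned long samples[] = { 5, 12, 1036, 1049612 };

            for (unsigned i = 0; i < 4; i++)
                    printf("lblock %7lu -> reserve %d metadata block(s)\n",
                           samples[i], worst_case_metadata(samples[i]));
            return 0;       /* prints 0, 1, 2, 3 for the samples above */
    }
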
@@ -1136,7 +1143,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
                               sector_t logical, sector_t phys, int len)
 {
         if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-                ext4_error(inode->i_sb, msg,
+                __ext4_error(inode->i_sb, msg,
                            "inode #%lu logical block %llu mapped to %llu "
                            "(size %d)", inode->i_ino,
                            (unsigned long long) logical,
@@ -1318,20 +1325,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                          * i_data's format changing. Force the migrate
                          * to fail by clearing migrate flags
                          */
-                        EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                 }
-        }

+                /*
+                 * Update reserved blocks/metadata blocks after successful
+                 * block allocation which had been deferred till now. We don't
+                 * support fallocate for non extent files. So we can update
+                 * reserve space here.
+                 */
+                if ((retval > 0) &&
+                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                        ext4_da_update_reserve_space(inode, retval, 1);
+        }
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                 EXT4_I(inode)->i_delalloc_reserved_flag = 0;

-        /*
-         * Update reserved blocks/metadata blocks after successful
-         * block allocation which had been deferred till now.
-         */
-        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-                ext4_da_update_reserve_space(inode, retval);
-
         up_write((&EXT4_I(inode)->i_data_sem));
         if (retval > 0 && buffer_mapped(bh)) {
                 int ret = check_block_validity(inode, "file system "
@@ -1534,6 +1543,18 @@ static int do_journal_get_write_access(handle_t *handle,
         return ext4_journal_get_write_access(handle, bh);
 }

+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+        truncate_inode_pages(inode->i_mapping, inode->i_size);
+        ext4_truncate(inode);
+}
+
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata)
@@ -1575,8 +1596,12 @@ retry:
         }
         *pagep = page;

-        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                                ext4_get_block);
+        if (ext4_should_dioread_nolock(inode))
+                ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                                fsdata, ext4_get_block_write);
+        else
+                ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                                fsdata, ext4_get_block);

         if (!ret && ext4_should_journal_data(inode)) {
                 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1599,7 +1624,7 @@ retry:

         ext4_journal_stop(handle);
         if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might
                  * still be on the orphan list; we need to
@@ -1709,7 +1734,7 @@ static int ext4_ordered_write_end(struct file *file,
                 ret = ret2;

         if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@ -1751,7 +1776,7 @@ static int ext4_writeback_write_end(struct file *file,
                 ret = ret2;

         if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@ -1793,7 +1818,7 @@ static int ext4_journalled_write_end(struct file *file,
         new_i_size = pos + copied;
         if (new_i_size > inode->i_size)
                 i_size_write(inode, pos+copied);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
         if (new_i_size > EXT4_I(inode)->i_disksize) {
                 ext4_update_i_disksize(inode, new_i_size);
                 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1814,7 +1839,7 @@ static int ext4_journalled_write_end(struct file *file,
         if (!ret)
                 ret = ret2;
         if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@ -1827,11 +1852,16 @@ static int ext4_journalled_write_end(struct file *file,
         return ret ? ret : copied;
 }

-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 {
         int retries = 0;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        unsigned long md_needed, mdblocks, total = 0;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long md_needed, md_reserved;
+        int ret;

         /*
          * recalculate the amount of metadata blocks to reserve
@@ -1839,86 +1869,80 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
          * worse case is one extent per block
          */
 repeat:
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
-        mdblocks = ext4_calc_metadata_amount(inode, total);
-        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
-        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
-        total = md_needed + nrblocks;
+        spin_lock(&ei->i_block_reservation_lock);
+        md_reserved = ei->i_reserved_meta_blocks;
+        md_needed = ext4_calc_metadata_amount(inode, lblock);
+        trace_ext4_da_reserve_space(inode, md_needed);
+        spin_unlock(&ei->i_block_reservation_lock);

         /*
          * Make quota reservation here to prevent quota overflow
          * later. Real quota accounting is done at pages writeout
          * time.
          */
-        if (vfs_dq_reserve_block(inode, total)) {
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-                return -EDQUOT;
-        }
+        ret = dquot_reserve_block(inode, md_needed + 1);
+        if (ret)
+                return ret;

-        if (ext4_claim_free_blocks(sbi, total)) {
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-                vfs_dq_release_reservation_block(inode, total);
+        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+                dquot_release_reservation_block(inode, md_needed + 1);
                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                         yield();
                         goto repeat;
                 }
                 return -ENOSPC;
         }
-        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+        spin_lock(&ei->i_block_reservation_lock);
+        ei->i_reserved_data_blocks++;
+        ei->i_reserved_meta_blocks += md_needed;
+        spin_unlock(&ei->i_block_reservation_lock);

-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
         return 0;       /* success */
 }

 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free, release;
+        struct ext4_inode_info *ei = EXT4_I(inode);

         if (!to_free)
                 return;         /* Nothing to release, exit */

         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

-        if (!EXT4_I(inode)->i_reserved_data_blocks) {
+        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                 /*
-                 * if there is no reserved blocks, but we try to free some
-                 * then the counter is messed up somewhere.
-                 * but since this function is called from invalidate
-                 * page, it's harmless to return without any action
+                 * if there aren't enough reserved blocks, then the
+                 * counter is messed up somewhere.  Since this
+                 * function is called from invalidate page, it's
+                 * harmless to return without any action.
                  */
-                printk(KERN_INFO "ext4 delalloc try to release %d reserved "
-                            "blocks for inode %lu, but there is no reserved "
-                            "data blocks\n", to_free, inode->i_ino);
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-                return;
+                ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+                         "ino %lu, to_free %d with only %d reserved "
+                         "data blocks\n", inode->i_ino, to_free,
+                         ei->i_reserved_data_blocks);
+                WARN_ON(1);
+                to_free = ei->i_reserved_data_blocks;
         }
+        ei->i_reserved_data_blocks -= to_free;

-        /* recalculate the number of metablocks still need to be reserved */
-        total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
-        mdb = ext4_calc_metadata_amount(inode, total);
-
-        /* figure out how many metablocks to release */
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-        release = to_free + mdb_free;
-
-        /* update fs dirty blocks counter for truncate case */
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+        if (ei->i_reserved_data_blocks == 0) {
+                /*
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                to_free += ei->i_reserved_meta_blocks;
+                ei->i_reserved_meta_blocks = 0;
+                ei->i_da_metadata_calc_len = 0;
+        }

-        /* update per-inode reservations */
-        BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+        /* update fs dirty blocks counter */
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);

-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

-        vfs_dq_release_reservation_block(inode, release);
+        dquot_release_reservation_block(inode, to_free);
 }

 static void ext4_da_page_release_reservation(struct page *page,
@@ -2095,6 +2119,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                 } else if (buffer_mapped(bh))
                         BUG_ON(bh->b_blocknr != pblock);

+                if (buffer_uninit(exbh))
+                        set_buffer_uninit(bh);
                 cur_logical++;
                 pblock++;
         } while ((bh = bh->b_this_page) != head);
@@ -2137,17 +2163,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                         break;
                 for (i = 0; i < nr_pages; i++) {
                         struct page *page = pvec.pages[i];
-                        index = page->index;
-                        if (index > end)
+                        if (page->index > end)
                                 break;
-                        index++;
-
                         BUG_ON(!PageLocked(page));
                         BUG_ON(PageWriteback(page));
                         block_invalidatepage(page, 0);
                         ClearPageUptodate(page);
                         unlock_page(page);
                 }
+                index = pvec.pages[nr_pages - 1]->index + 1;
+                pagevec_release(&pvec);
         }
         return;
 }
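
The loop above had two problems that the replacement fixes: the local index was advanced once per page instead of past the whole batch (so it never actually drove the next lookup), and the pagevec reference was never released. A standalone C analogue of the corrected pattern, resuming each pass from one past the last item the previous batch returned (all names here are made up for the demo):

    #include <stdio.h>

    #define NR_ITEMS 10
    #define BATCH    4

    /* stand-in for pagevec_lookup(): up to BATCH indices in [start, NR_ITEMS) */
    static int lookup(unsigned start, unsigned out[BATCH])
    {
            int n = 0;

            for (unsigned i = start; i < NR_ITEMS && n < BATCH; i++)
                    out[n++] = i;
            return n;
    }

    int main(void)
    {
            unsigned index = 0, end = NR_ITEMS - 1, batch[BATCH];
            int nr;

            while ((nr = lookup(index, batch)) > 0) {
                    for (int i = 0; i < nr; i++) {
                            if (batch[i] > end)
                                    return 0;
                            printf("invalidate page %u\n", batch[i]);
                    }
                    /* resume after the last item of the batch, not per item */
                    index = batch[nr - 1] + 1;
            }
            return 0;
    }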
@@ -2223,10 +2248,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
          * variables are updated after the blocks have been allocated.
          */
         new.b_state = 0;
-        get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
-                            EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+        if (ext4_should_dioread_nolock(mpd->inode))
+                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
         if (mpd->b_state & (1 << BH_Delay))
-                get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
         blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
                                &new, get_blocks_flags);
         if (blks < 0) {
@@ -2524,7 +2551,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                  * XXX: __block_prepare_write() unmaps passed block,
                  * is it OK?
                  */
-                ret = ext4_da_reserve_space(inode, 1);
+                ret = ext4_da_reserve_space(inode, iblock);
                 if (ret)
                         /* not enough space to reserve */
                         return ret;
@@ -2600,7 +2627,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 }

 static int __ext4_journalled_writepage(struct page *page,
-                                       struct writeback_control *wbc,
                                        unsigned int len)
 {
         struct address_space *mapping = page->mapping;
@@ -2635,11 +2661,14 @@ static int __ext4_journalled_writepage(struct page *page,
                 ret = err;

         walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
         return ret;
 }

+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2687,7 +2716,7 @@ static int ext4_writepage(struct page *page,
         int ret = 0;
         loff_t size;
         unsigned int len;
-        struct buffer_head *page_bufs;
+        struct buffer_head *page_bufs = NULL;
         struct inode *inode = page->mapping->host;

         trace_ext4_writepage(inode, page);
@@ -2758,12 +2787,16 @@ static int ext4_writepage(struct page *page,
                  * doesn't seem much point in redirtying the page here.
                  */
                 ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc, len);
+                return __ext4_journalled_writepage(page, len);
         }

         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
                 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-        else
+        else if (page_bufs && buffer_uninit(page_bufs)) {
+                ext4_set_bh_endio(page_bufs, inode);
+                ret = block_write_full_page_endio(page, noalloc_get_block_write,
+                                            wbc, ext4_end_io_buffer_write);
+        } else
                 ret = block_write_full_page(page, noalloc_get_block_write,
                                             wbc);

@@ -2788,7 +2821,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
          * number of contiguous block. So we will limit
          * number of contiguous block to a sane value
          */
-        if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
             (max_blocks > EXT4_MAX_TRANS_DATA))
                 max_blocks = EXT4_MAX_TRANS_DATA;

@@ -2933,7 +2966,7 @@ retry:
                 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                         &mpd);
                 /*
-                 * If we have a contigous extent of pages and we
+                 * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
                  * them for I/O.
                  */
@@ -2999,8 +3032,7 @@ retry:
 out_writepages:
         if (!no_nrwrite_index_update)
                 wbc->no_nrwrite_index_update = 0;
-        if (wbc->nr_to_write > nr_to_writebump)
-                wbc->nr_to_write -= nr_to_writebump;
+        wbc->nr_to_write -= nr_to_writebump;
         wbc->range_start = range_start;
         trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
         return ret;
@@ -3025,11 +3057,18 @@ static int ext4_nonda_switch(struct super_block *sb)
         if (2 * free_blocks < 3 * dirty_blocks ||
             free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
                 /*
-                 * free block count is less that 150% of dirty blocks
-                 * or free blocks is less that watermark
+                 * free block count is less than 150% of dirty blocks
+                 * or free blocks is less than watermark
                  */
                 return 1;
         }
+        /*
+         * Even if we don't switch but are nearing capacity,
+         * start pushing delalloc when 1/2 of free blocks are dirty.
+         */
+        if (free_blocks < 2 * dirty_blocks)
+                writeback_inodes_sb_if_idle(sb);
+
         return 0;
 }

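Restating the arithmetic above: the function switches to non-delayed allocation when free blocks fall below 150% of dirty blocks (2*free < 3*dirty) or below dirty plus a watermark, and the new tail of the hunk starts background writeback earlier, once free drops under twice dirty. A small self-contained decision table (the watermark value is made up for the demo):

    #include <stdio.h>

    #define WATERMARK 1024  /* stand-in for EXT4_FREEBLOCKS_WATERMARK */

    static const char *decide(long long free_blocks, long long dirty_blocks)
    {
            if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < dirty_blocks + WATERMARK)
                    return "switch to non-delalloc";
            if (free_blocks < 2 * dirty_blocks)
                    return "stay delalloc, start pushing writeback";
            return "stay delalloc";
    }

    int main(void)
    {
            printf("%s\n", decide(10000, 8000)); /* 20000 < 24000: switch */
            printf("%s\n", decide(15000, 8000)); /* 15000 < 16000: push   */
            printf("%s\n", decide(20000, 8000)); /* comfortable: delalloc */
            return 0;
    }
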
@@ -3037,7 +3076,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                   loff_t pos, unsigned len, unsigned flags,
                                   struct page **pagep, void **fsdata)
 {
-        int ret, retries = 0;
+        int ret, retries = 0, quota_retries = 0;
         struct page *page;
         pgoff_t index;
         unsigned from, to;
@@ -3091,11 +3130,27 @@ retry:
                  * i_size_read because we hold i_mutex.
                  */
                 if (pos + len > inode->i_size)
-                        ext4_truncate(inode);
+                        ext4_truncate_failed_write(inode);
         }

         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                 goto retry;
+
+        if ((ret == -EDQUOT) &&
+            EXT4_I(inode)->i_reserved_meta_blocks &&
+            (quota_retries++ < 3)) {
+                /*
+                 * Since we often over-estimate the number of meta
+                 * data blocks required, we may sometimes get a
+                 * spurious out of quota error even though there would
+                 * be enough space once we write the data blocks and
+                 * find out how many meta data blocks were _really_
+                 * required.  So try forcing the inode write to see if
+                 * that helps.
+                 */
+                write_inode_now(inode, (quota_retries == 3));
+                goto retry;
+        }
 out:
         return ret;
 }
@@ -3284,7 +3339,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                 filemap_write_and_wait(mapping);
         }

-        if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+        if (EXT4_JOURNAL(inode) &&
+            ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
                 /*
                  * This is a REALLY heavyweight approach, but the use of
                  * bmap on dirty files is expected to be extremely rare:
@@ -3303,7 +3359,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                  * everything they get.
                  */

-                EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+                ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
                 journal = EXT4_JOURNAL(inode);
                 jbd2_journal_lock_updates(journal);
                 err = jbd2_journal_flush(journal);
@@ -3328,11 +3384,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }

+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+        BUG_ON(!io);
+        if (io->page)
+                put_page(io->page);
+        iput(io->inode);
+        kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+
+        if (!page_has_buffers(page))
+                return;
+        head = bh = page_buffers(page);
+        do {
+                if (offset <= curr_off && test_clear_buffer_uninit(bh)
+                    && bh->b_private) {
+                        ext4_free_io_end(bh->b_private);
+                        bh->b_private = NULL;
+                        bh->b_end_io = NULL;
+                }
+                curr_off = curr_off + bh->b_size;
+                bh = bh->b_this_page;
+        } while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
         journal_t *journal = EXT4_JOURNAL(page->mapping->host);

         /*
+         * free any io_end structure allocated for buffers to be discarded
+         */
+        if (ext4_should_dioread_nolock(page->mapping->host))
+                ext4_invalidatepage_free_endio(page, offset);
+        /*
          * If it's a full truncate we just forget about the pending dirtying
          */
         if (offset == 0)
@@ -3403,7 +3493,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
         }

 retry:
-        ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+        if (rw == READ && ext4_should_dioread_nolock(inode))
+                ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+                                 inode->i_sb->s_bdev, iov,
+                                 offset, nr_segs,
+                                 ext4_get_block, NULL);
+        else
+                ret = blockdev_direct_IO(rw, iocb, inode,
+                                 inode->i_sb->s_bdev, iov,
                                  offset, nr_segs,
                                  ext4_get_block, NULL);
         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3419,6 +3516,9 @@ retry:
                          * but cannot extend i_size. Bail out and pretend
                          * the write failed... */
                         ret = PTR_ERR(handle);
+                        if (inode->i_nlink)
+                                ext4_orphan_del(NULL, inode);
+
                         goto out;
                 }
                 if (inode->i_nlink)
@@ -3446,75 +3546,63 @@ out:
         return ret;
 }

-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                    struct buffer_head *bh_result, int create)
 {
-        handle_t *handle = NULL;
+        handle_t *handle = ext4_journal_current_handle();
         int ret = 0;
         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
         int dio_credits;
+        int started = 0;

-        ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
                    inode->i_ino, create);
         /*
-         * DIO VFS code passes create = 0 flag for write to
-         * the middle of file. It does this to avoid block
-         * allocation for holes, to prevent expose stale data
-         * out when there is parallel buffered read (which does
-         * not hold the i_mutex lock) while direct IO write has
-         * not completed. DIO request on holes finally falls back
-         * to buffered IO for this reason.
-         *
-         * For ext4 extent based file, since we support fallocate,
-         * new allocated extent as uninitialized, for holes, we
-         * could fallocate blocks for holes, thus parallel
-         * buffered IO read will zero out the page when read on
-         * a hole while parallel DIO write to the hole has not completed.
-         *
-         * when we come here, we know it's a direct IO write to
-         * to the middle of file (<i_size)
-         * so it's safe to override the create flag from VFS.
+         * ext4_get_block in preparation for a DIO write or buffer write.
+         * We allocate an uninitialized extent if blocks haven't been allocated.
+         * The extent will be converted to initialized after IO completes.
          */
-        create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+        create = EXT4_GET_BLOCKS_IO_CREATE_EXT;

-        if (max_blocks > DIO_MAX_BLOCKS)
-                max_blocks = DIO_MAX_BLOCKS;
-        dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-        handle = ext4_journal_start(inode, dio_credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto out;
+        if (!handle) {
+                if (max_blocks > DIO_MAX_BLOCKS)
+                        max_blocks = DIO_MAX_BLOCKS;
+                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+                handle = ext4_journal_start(inode, dio_credits);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto out;
+                }
+                started = 1;
         }
+
         ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
                               create);
         if (ret > 0) {
                 bh_result->b_size = (ret << inode->i_blkbits);
                 ret = 0;
         }
-        ext4_journal_stop(handle);
+        if (started)
+                ext4_journal_stop(handle);
 out:
         return ret;
 }

-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-        BUG_ON(!io);
-        iput(io->inode);
-        kfree(io);
-}
-static void dump_aio_dio_list(struct inode * inode)
+static void dump_completed_IO(struct inode * inode)
 {
 #ifdef  EXT4_DEBUG
         struct list_head *cur, *before, *after;
         ext4_io_end_t *io, *io0, *io1;
+        unsigned long flags;

-        if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-                ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+        if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
                 return;
         }

-        ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
-        list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
                 cur = &io->list;
                 before = cur->prev;
                 io0 = container_of(before, ext4_io_end_t, list);
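
The reworked ext4_get_block_write() in the hunk above is now reached both from direct I/O (no transaction running) and from buffered writeback (the caller already holds one), so it only starts -- and therefore only stops -- a journal handle when ext4_journal_current_handle() found none. A runnable toy version of that "started" pattern, with stand-in types in place of jbd2:

    #include <stdio.h>
    #include <stdlib.h>

    struct handle { int credits; };

    static struct handle *running;  /* the caller's transaction, if any */

    static struct handle *start_handle(int credits)
    {
            struct handle *h = malloc(sizeof(*h));

            if (h)
                    h->credits = credits;
            return h;
    }

    static void stop_handle(struct handle *h)
    {
            free(h);
    }

    /* Borrow the running handle when there is one; otherwise start our
     * own and remember that, so we never stop a borrowed handle. */
    static int do_mapping(int credits)
    {
            struct handle *h = running;
            int started = 0;

            if (!h) {
                    h = start_handle(credits);
                    if (!h)
                            return -1;
                    started = 1;
            }
            printf("mapping under a %s handle (%d credits)\n",
                   started ? "private" : "borrowed", h->credits);
            if (started)
                    stop_handle(h);
            return 0;
    }

    int main(void)
    {
            do_mapping(8);                  /* DIO-like: starts its own */
            running = start_handle(32);     /* writeback-like: reuses   */
            do_mapping(8);
            stop_handle(running);
            return 0;
    }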
@@ -3524,32 +3612,31 @@ static void dump_aio_dio_list(struct inode * inode)
                 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                             io, inode->i_ino, io0, io1);
         }
+        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }

 /*
  * check a range of space and convert unwritten extents to written.
  */
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
 {
         struct inode *inode = io->inode;
         loff_t offset = io->offset;
-        size_t size = io->size;
+        ssize_t size = io->size;
         int ret = 0;

-        ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
                    "list->prev 0x%p\n",
                    io, inode->i_ino, io->list.next, io->list.prev);

         if (list_empty(&io->list))
                 return ret;

-        if (io->flag != DIO_AIO_UNWRITTEN)
+        if (io->flag != EXT4_IO_UNWRITTEN)
                 return ret;

-        if (offset + size <= i_size_read(inode))
-                ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+        ret = ext4_convert_unwritten_extents(inode, offset, size);
         if (ret < 0) {
                 printk(KERN_EMERG "%s: failed to convert unwritten"
                         "extents to written extents, error is %d"
@@ -3562,50 +3649,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
         io->flag = 0;
         return ret;
 }
+
 /*
  * work on completed aio dio IO, to convert unwritten extents to extents
  */
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
 {
         ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
         struct inode *inode = io->inode;
-        int ret = 0;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long flags;
+        int ret;

         mutex_lock(&inode->i_mutex);
-        ret = ext4_end_aio_dio_nolock(io);
-        if (ret >= 0) {
-                if (!list_empty(&io->list))
-                        list_del_init(&io->list);
-                ext4_free_io_end(io);
+        ret = ext4_end_io_nolock(io);
+        if (ret < 0) {
+                mutex_unlock(&inode->i_mutex);
+                return;
         }
+
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        if (!list_empty(&io->list))
+                list_del_init(&io->list);
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
         mutex_unlock(&inode->i_mutex);
+        ext4_free_io_end(io);
 }
+
 /*
  * This function is called from ext4_sync_file().
  *
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
  * scheduled. When fsync is called, we need to ensure the
  * conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might need to do the conversion. This function walks through
+ * the list and converts the related unwritten extents for completed IO
+ * to written.
+ * The function returns the number of pending IOs on success.
  */
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
 {
         ext4_io_end_t *io;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long flags;
         int ret = 0;
         int ret2 = 0;

-        if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+        if (list_empty(&ei->i_completed_io_list))
                 return ret;

-        dump_aio_dio_list(inode);
-        while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-                io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+        dump_completed_IO(inode);
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        while (!list_empty(&ei->i_completed_io_list)){
+                io = list_entry(ei->i_completed_io_list.next,
                                 ext4_io_end_t, list);
                 /*
-                 * Calling ext4_end_aio_dio_nolock() to convert completed
+                 * Calling ext4_end_io_nolock() to convert completed
                  * IO to written.
                  *
                  * When ext4_sync_file() is called, run_queue() may already
@@ -3618,20 +3719,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
                  * avoid double converting from both fsync and background work
                  * queue work.
                  */
-                ret = ext4_end_aio_dio_nolock(io);
+                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+                ret = ext4_end_io_nolock(io);
+                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
                 if (ret < 0)
                         ret2 = ret;
                 else
                         list_del_init(&io->list);
         }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
         return (ret2 < 0) ? ret2 : 0;
 }

-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
         ext4_io_end_t *io = NULL;

-        io = kmalloc(sizeof(*io), GFP_NOFS);
+        io = kmalloc(sizeof(*io), flags);

         if (io) {
                 igrab(inode);
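
Note the shape of the drain loop in flush_completed_IO() above: ext4_end_io_nolock() can sleep while converting extents, so i_completed_io_lock is dropped around the conversion and retaken before the list head is examined again. A simplified userspace analogue, with a pthread mutex standing in for the irq-safe spinlock and every conversion assumed to succeed:

    #include <pthread.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *head;

    static void convert(struct node *n)     /* may sleep in the real code */
    {
            printf("converting io_end %d\n", n->id);
    }

    static void drain(void)
    {
            pthread_mutex_lock(&list_lock);
            while (head) {
                    struct node *n = head;

                    /* never sleep while holding the list lock */
                    pthread_mutex_unlock(&list_lock);
                    convert(n);
                    pthread_mutex_lock(&list_lock);
                    head = n->next;         /* dequeue the converted entry */
            }
            pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
            struct node b = { NULL, 2 }, a = { &b, 1 };

            head = &a;
            drain();
            return 0;
    }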
@@ -3639,8 +3743,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
                 io->flag = 0;
                 io->offset = 0;
                 io->size = 0;
-                io->error = 0;
-                INIT_WORK(&io->work, ext4_end_aio_dio_work);
+                io->page = NULL;
+                INIT_WORK(&io->work, ext4_end_io_work);
                 INIT_LIST_HEAD(&io->list);
         }

@@ -3652,6 +3756,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
         ext4_io_end_t *io_end = iocb->private;
         struct workqueue_struct *wq;
+        unsigned long flags;
+        struct ext4_inode_info *ei;

         /* if not async direct IO or dio with 0 bytes write, just return */
         if (!io_end || !size)
@@ -3663,7 +3769,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                   size);

         /* if not aio dio with unwritten extents, just free io and return */
-        if (io_end->flag != DIO_AIO_UNWRITTEN){
+        if (io_end->flag != EXT4_IO_UNWRITTEN){
                 ext4_free_io_end(io_end);
                 iocb->private = NULL;
                 return;
@@ -3671,16 +3777,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3671 3777
3672 io_end->offset = offset; 3778 io_end->offset = offset;
3673 io_end->size = size; 3779 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675 3782
3676 /* queue the work to convert unwritten extents to written */ 3783 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work); 3784 queue_work(wq, &io_end->work);
3678 3785
3679 /* Add the io_end to per-inode completed aio dio list*/ 3786 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list, 3787 ei = EXT4_I(io_end->inode);
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3788 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3789 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3790 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3682 iocb->private = NULL; 3791 iocb->private = NULL;
3683} 3792}
3793
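On the completion side, ext4_end_io_dio() now appends the io_end to the per-inode list under the same lock the flusher takes, then pokes the dio_unwritten_wq worker. A condensed pthread analogue of that producer half, with queue_work() modelled as a condition-variable signal and all names invented:

	#include <pthread.h>
	#include <stddef.h>

	struct done_io { struct done_io *next; };

	static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  worker_wake = PTHREAD_COND_INITIALIZER;
	static struct done_io *done_head, **done_tail = &done_head;

	static void io_completed(struct done_io *io)
	{
		pthread_mutex_lock(&done_lock);
		io->next = NULL;
		*done_tail = io;                 /* list_add_tail() equivalent */
		done_tail = &io->next;
		pthread_mutex_unlock(&done_lock);
		pthread_cond_signal(&worker_wake);  /* ~queue_work() */
	}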
3794static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3795{
3796 ext4_io_end_t *io_end = bh->b_private;
3797 struct workqueue_struct *wq;
3798 struct inode *inode;
3799 unsigned long flags;
3800
3801 if (!test_clear_buffer_uninit(bh) || !io_end)
3802 goto out;
3803
3804 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3805 printk("sb umounted, discard end_io request for inode %lu\n",
3806 io_end->inode->i_ino);
3807 ext4_free_io_end(io_end);
3808 goto out;
3809 }
3810
3811 io_end->flag = EXT4_IO_UNWRITTEN;
3812 inode = io_end->inode;
3813
3814 /* Add the io_end to per-inode completed io list*/
3815 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3816 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3817 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3818
3819 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3820 /* queue the work to convert unwritten extents to written */
3821 queue_work(wq, &io_end->work);
3822out:
3823 bh->b_private = NULL;
3824 bh->b_end_io = NULL;
3825 clear_buffer_uninit(bh);
3826 end_buffer_async_write(bh, uptodate);
3827}
3828
3829static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3830{
3831 ext4_io_end_t *io_end;
3832 struct page *page = bh->b_page;
3833 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3834 size_t size = bh->b_size;
3835
3836retry:
3837 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3838 if (!io_end) {
3839 if (printk_ratelimit())
3840 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3841 schedule();
3842 goto retry;
3843 }
3844 io_end->offset = offset;
3845 io_end->size = size;
3846 /*
3847 * We need to hold a reference to the page to make sure it
3848 * doesn't get evicted before ext4_end_io_work() has a chance
3849 * to convert the extent from unwritten to written.
3850 */
3851 io_end->page = page;
3852 get_page(io_end->page);
3853
3854 bh->b_private = io_end;
3855 bh->b_end_io = ext4_end_io_buffer_write;
3856 return 0;
3857}
3858
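ext4_set_bh_endio() asks for GFP_ATOMIC, presumably so the allocation cannot recurse into the filesystem under memory pressure, and rather than failing the write it yields and retries until an io_end arrives. A hedged user-space model of that never-fail allocation loop (names invented; sched_yield() stands in for schedule()):

	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void *alloc_must_succeed(size_t sz)
	{
		void *p;

		while (!(p = malloc(sz))) {
			/* kernel version: printk_ratelimit() + schedule() */
			fprintf(stderr, "allocation failed, retrying\n");
			sched_yield();
		}
		return p;
	}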
3684/* 3859/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes, 3860 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to 3861 * preallocated extents, and those write extend the file, no need to
@@ -3734,7 +3909,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3734 iocb->private = NULL; 3909 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL; 3910 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) { 3911 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode); 3912 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3738 if (!iocb->private) 3913 if (!iocb->private)
3739 return -ENOMEM; 3914 return -ENOMEM;
3740 /* 3915 /*
@@ -3750,7 +3925,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3750 ret = blockdev_direct_IO(rw, iocb, inode, 3925 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov, 3926 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs, 3927 offset, nr_segs,
3753 ext4_get_block_dio_write, 3928 ext4_get_block_write,
3754 ext4_end_io_dio); 3929 ext4_end_io_dio);
3755 if (iocb->private) 3930 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL; 3931 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3771,8 +3946,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3946 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private); 3947 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL; 3948 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3949 } else if (ret > 0 && ext4_test_inode_state(inode,
3775 EXT4_STATE_DIO_UNWRITTEN)) { 3950 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err; 3951 int err;
3777 /* 3952 /*
3778 * for non AIO case, since the IO is already 3953 * for non AIO case, since the IO is already
@@ -3782,7 +3957,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3782 offset, ret); 3957 offset, ret);
3783 if (err < 0) 3958 if (err < 0)
3784 ret = err; 3959 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3960 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3786 } 3961 }
3787 return ret; 3962 return ret;
3788 } 3963 }
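The tail of ext4_ext_direct_IO() distinguishes the two outcomes: an async write returns -EIOCBQUEUED and leaves the conversion to the completion path, while a synchronous write that produced unwritten extents (flagged via EXT4_STATE_DIO_UNWRITTEN) converts them before returning. A small model of that branch under invented types, with convert_unwritten() standing in for ext4_convert_unwritten_extents():

	#include <stdbool.h>

	struct dio_result {
		long bytes;          /* blockdev_direct_IO() return value */
		bool queued_async;   /* the -EIOCBQUEUED path */
		bool unwritten;      /* EXT4_STATE_DIO_UNWRITTEN analogue */
	};

	static long finish_direct_write(const struct dio_result *r, long offset,
					long (*convert_unwritten)(long off, long len))
	{
		if (r->queued_async)
			return r->bytes;   /* end_io + workqueue convert later */
		if (r->bytes > 0 && r->unwritten) {
			long err = convert_unwritten(offset, r->bytes);
			if (err < 0)
				return err;
		}
		return r->bytes;
	}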
@@ -4064,7 +4239,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
4064 int k, err; 4239 int k, err;
4065 4240
4066 *top = 0; 4241 *top = 0;
4067 /* Make k index the deepest non-null offest + 1 */ 4242 /* Make k index the deepest non-null offset + 1 */
4068 for (k = depth; k > 1 && !offsets[k-1]; k--) 4243 for (k = depth; k > 1 && !offsets[k-1]; k--)
4069 ; 4244 ;
4070 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4245 partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4113,13 +4288,27 @@ no_top:
4113 * We release `count' blocks on disk, but (last - first) may be greater 4288 * We release `count' blocks on disk, but (last - first) may be greater
4114 * than `count' because there can be holes in there. 4289 * than `count' because there can be holes in there.
4115 */ 4290 */
4116static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4291static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4117 struct buffer_head *bh, 4292 struct buffer_head *bh,
4118 ext4_fsblk_t block_to_free, 4293 ext4_fsblk_t block_to_free,
4119 unsigned long count, __le32 *first, 4294 unsigned long count, __le32 *first,
4120 __le32 *last) 4295 __le32 *last)
4121{ 4296{
4122 __le32 *p; 4297 __le32 *p;
4298 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4299
4300 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4301 flags |= EXT4_FREE_BLOCKS_METADATA;
4302
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: "
4306 "attempt to clear blocks %llu len %lu, invalid",
4307 inode->i_ino, (unsigned long long) block_to_free,
4308 count);
4309 return 1;
4310 }
4311
4123 if (try_to_extend_transaction(handle, inode)) { 4312 if (try_to_extend_transaction(handle, inode)) {
4124 if (bh) { 4313 if (bh) {
4125 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4314 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4323,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 } 4323 }
4135 } 4324 }
4136 4325
4137 /* 4326 for (p = first; p < last; p++)
4138 * Any buffers which are on the journal will be in memory. We 4327 *p = 0;
4139 * find them on the hash table so jbd2_journal_revoke() will
4140 * run jbd2_journal_forget() on them. We've already detached
4141 * each block from the file, so bforget() in
4142 * jbd2_journal_forget() should be safe.
4143 *
4144 * AKPM: turn on bforget in jbd2_journal_forget()!!!
4145 */
4146 for (p = first; p < last; p++) {
4147 u32 nr = le32_to_cpu(*p);
4148 if (nr) {
4149 struct buffer_head *tbh;
4150
4151 *p = 0;
4152 tbh = sb_find_get_block(inode->i_sb, nr);
4153 ext4_forget(handle, 0, inode, tbh, nr);
4154 }
4155 }
4156 4328
4157 ext4_free_blocks(handle, inode, block_to_free, count, 0); 4329 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4330 return 0;
4158} 4331}
4159 4332
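ext4_clear_blocks() now refuses to act on a run that ext4_data_block_valid() rejects, returning nonzero so the caller abandons the truncate rather than freeing blocks outside the data area. A self-contained sketch of that validate-before-free check, with made-up geometry constants:

	#include <stdio.h>

	/* invented geometry: first usable data block and total block count */
	#define FIRST_DATA_BLOCK 1024ULL
	#define BLOCKS_COUNT     (1ULL << 32)

	static int data_block_valid(unsigned long long start,
				    unsigned long long count)
	{
		return start >= FIRST_DATA_BLOCK &&
		       start <  BLOCKS_COUNT &&
		       count <= BLOCKS_COUNT - start;
	}

	static int clear_blocks(unsigned long long start, unsigned long long count)
	{
		if (!data_block_valid(start, count)) {
			fprintf(stderr,
				"attempt to clear blocks %llu len %llu, invalid\n",
				start, count);
			return 1;  /* caller stops instead of corrupting */
		}
		/* ... zero the pointers and free the run ... */
		return 0;
	}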
4160/** 4333/**
@@ -4210,9 +4383,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4210 } else if (nr == block_to_free + count) { 4383 } else if (nr == block_to_free + count) {
4211 count++; 4384 count++;
4212 } else { 4385 } else {
4213 ext4_clear_blocks(handle, inode, this_bh, 4386 if (ext4_clear_blocks(handle, inode, this_bh,
4214 block_to_free, 4387 block_to_free, count,
4215 count, block_to_free_p, p); 4388 block_to_free_p, p))
4389 break;
4216 block_to_free = nr; 4390 block_to_free = nr;
4217 block_to_free_p = p; 4391 block_to_free_p = p;
4218 count = 1; 4392 count = 1;
@@ -4236,7 +4410,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4236 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4237 ext4_handle_dirty_metadata(handle, inode, this_bh); 4411 ext4_handle_dirty_metadata(handle, inode, this_bh);
4238 else 4412 else
4239 ext4_error(inode->i_sb, __func__, 4413 ext4_error(inode->i_sb,
4240 "circular indirect block detected, " 4414 "circular indirect block detected, "
4241 "inode=%lu, block=%llu", 4415 "inode=%lu, block=%llu",
4242 inode->i_ino, 4416 inode->i_ino,
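ext4_free_data() keeps batching adjacent block numbers into one (block_to_free, count) extent and now stops on the first run that ext4_clear_blocks() rejects. The merging itself is unchanged; a stand-alone sketch of the run-batching loop, with flush_run() as a stand-in for the clear-and-free step:

	#include <stddef.h>

	static void free_runs(const unsigned long *blocks, size_t n,
			      void (*flush_run)(unsigned long start,
						unsigned long count))
	{
		unsigned long start = 0, count = 0;
		size_t i;

		for (i = 0; i < n; i++) {
			unsigned long nr = blocks[i];

			if (!nr)
				continue;          /* holes are skipped */
			if (count && nr == start + count) {
				count++;           /* extends the current run */
			} else {
				if (count)
					flush_run(start, count);
				start = nr;
				count = 1;
			}
		}
		if (count)
			flush_run(start, count);   /* trailing run */
	}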
@@ -4276,6 +4450,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4276 if (!nr) 4450 if (!nr)
4277 continue; /* A hole */ 4451 continue; /* A hole */
4278 4452
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) {
4455 ext4_error(inode->i_sb,
4456 "indirect mapped block in inode "
4457 "#%lu invalid (level %d, blk #%lu)",
4458 inode->i_ino, depth,
4459 (unsigned long) nr);
4460 break;
4461 }
4462
4279 /* Go read the buffer for the next level down */ 4463 /* Go read the buffer for the next level down */
4280 bh = sb_bread(inode->i_sb, nr); 4464 bh = sb_bread(inode->i_sb, nr);
4281 4465
@@ -4284,7 +4468,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4284 * (should be rare). 4468 * (should be rare).
4285 */ 4469 */
4286 if (!bh) { 4470 if (!bh) {
4287 ext4_error(inode->i_sb, "ext4_free_branches", 4471 ext4_error(inode->i_sb,
4288 "Read failure, inode=%lu, block=%llu", 4472 "Read failure, inode=%lu, block=%llu",
4289 inode->i_ino, nr); 4473 inode->i_ino, nr);
4290 continue; 4474 continue;
@@ -4342,7 +4526,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4342 blocks_for_truncate(inode)); 4526 blocks_for_truncate(inode));
4343 } 4527 }
4344 4528
4345 ext4_free_blocks(handle, inode, nr, 1, 1); 4529 ext4_free_blocks(handle, inode, 0, nr, 1,
4530 EXT4_FREE_BLOCKS_METADATA);
4346 4531
4347 if (parent_bh) { 4532 if (parent_bh) {
4348 /* 4533 /*
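Both free sites above show the new ext4_free_blocks() contract: instead of hand-rolled ext4_forget() loops, callers pass behaviour flags (FORGET, METADATA, VALIDATED) and let the allocator decide whether journaled buffers must be revoked. A toy model of flag-driven freeing; the flag names follow the patch, the body is invented:

	#include <stdio.h>

	#define MODEL_FREE_FORGET    0x01  /* EXT4_FREE_BLOCKS_FORGET */
	#define MODEL_FREE_METADATA  0x02  /* EXT4_FREE_BLOCKS_METADATA */
	#define MODEL_FREE_VALIDATED 0x04  /* EXT4_FREE_BLOCKS_VALIDATED */

	static void model_free_blocks(unsigned long long start,
				      unsigned long count, int flags)
	{
		if (!(flags & MODEL_FREE_VALIDATED))
			printf("re-check range %llu+%lu first\n", start, count);
		if (flags & (MODEL_FREE_FORGET | MODEL_FREE_METADATA))
			printf("forget/revoke journaled buffers in %llu+%lu\n",
			       start, count);
		/* ... hand the extent back to the block allocator ... */
	}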
@@ -4427,8 +4612,10 @@ void ext4_truncate(struct inode *inode)
4427 if (!ext4_can_truncate(inode)) 4612 if (!ext4_can_truncate(inode))
4428 return; 4613 return;
4429 4614
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4616
4430 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4431 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4432 4619
4433 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4434 ext4_ext_truncate(inode); 4621 ext4_ext_truncate(inode);
@@ -4598,9 +4785,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4598 4785
4599 bh = sb_getblk(sb, block); 4786 bh = sb_getblk(sb, block);
4600 if (!bh) { 4787 if (!bh) {
4601 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4788 ext4_error(sb, "unable to read inode block - "
4602 "inode block - inode=%lu, block=%llu", 4789 "inode=%lu, block=%llu", inode->i_ino, block);
4603 inode->i_ino, block);
4604 return -EIO; 4790 return -EIO;
4605 } 4791 }
4606 if (!buffer_uptodate(bh)) { 4792 if (!buffer_uptodate(bh)) {
@@ -4698,9 +4884,8 @@ make_io:
4698 submit_bh(READ_META, bh); 4884 submit_bh(READ_META, bh);
4699 wait_on_buffer(bh); 4885 wait_on_buffer(bh);
4700 if (!buffer_uptodate(bh)) { 4886 if (!buffer_uptodate(bh)) {
4701 ext4_error(sb, __func__, 4887 ext4_error(sb, "unable to read inode block - inode=%lu,"
4702 "unable to read inode block - inode=%lu, " 4888 " block=%llu", inode->i_ino, block);
4703 "block=%llu", inode->i_ino, block);
4704 brelse(bh); 4889 brelse(bh);
4705 return -EIO; 4890 return -EIO;
4706 } 4891 }
@@ -4714,7 +4899,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4714{ 4899{
4715 /* We have all inode data except xattrs in memory here. */ 4900 /* We have all inode data except xattrs in memory here. */
4716 return __ext4_get_inode_loc(inode, iloc, 4901 return __ext4_get_inode_loc(inode, iloc,
4717 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4902 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4718} 4903}
4719 4904
4720void ext4_set_inode_flags(struct inode *inode) 4905void ext4_set_inode_flags(struct inode *inode)
@@ -4781,8 +4966,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4781 struct ext4_iloc iloc; 4966 struct ext4_iloc iloc;
4782 struct ext4_inode *raw_inode; 4967 struct ext4_inode *raw_inode;
4783 struct ext4_inode_info *ei; 4968 struct ext4_inode_info *ei;
4784 struct buffer_head *bh;
4785 struct inode *inode; 4969 struct inode *inode;
4970 journal_t *journal = EXT4_SB(sb)->s_journal;
4786 long ret; 4971 long ret;
4787 int block; 4972 int block;
4788 4973
@@ -4793,11 +4978,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4793 return inode; 4978 return inode;
4794 4979
4795 ei = EXT4_I(inode); 4980 ei = EXT4_I(inode);
4981 iloc.bh = 0;
4796 4982
4797 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4983 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4798 if (ret < 0) 4984 if (ret < 0)
4799 goto bad_inode; 4985 goto bad_inode;
4800 bh = iloc.bh;
4801 raw_inode = ext4_raw_inode(&iloc); 4986 raw_inode = ext4_raw_inode(&iloc);
4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4987 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4988 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4808,7 +4993,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4808 } 4993 }
4809 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4994 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4810 4995
4811 ei->i_state = 0; 4996 ei->i_state_flags = 0;
4812 ei->i_dir_start_lookup = 0; 4997 ei->i_dir_start_lookup = 0;
4813 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4998 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4814 /* We now have enough fields to check if the inode was active or not. 4999 /* We now have enough fields to check if the inode was active or not.
@@ -4820,7 +5005,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4820 if (inode->i_mode == 0 || 5005 if (inode->i_mode == 0 ||
4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 5006 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4822 /* this inode is deleted */ 5007 /* this inode is deleted */
4823 brelse(bh);
4824 ret = -ESTALE; 5008 ret = -ESTALE;
4825 goto bad_inode; 5009 goto bad_inode;
4826 } 5010 }
@@ -4837,6 +5021,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4837 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 5021 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4838 inode->i_size = ext4_isize(raw_inode); 5022 inode->i_size = ext4_isize(raw_inode);
4839 ei->i_disksize = inode->i_size; 5023 ei->i_disksize = inode->i_size;
5024#ifdef CONFIG_QUOTA
5025 ei->i_reserved_quota = 0;
5026#endif
4840 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 5027 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4841 ei->i_block_group = iloc.block_group; 5028 ei->i_block_group = iloc.block_group;
4842 ei->i_last_alloc_group = ~0; 5029 ei->i_last_alloc_group = ~0;
@@ -4848,11 +5035,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4848 ei->i_data[block] = raw_inode->i_block[block]; 5035 ei->i_data[block] = raw_inode->i_block[block];
4849 INIT_LIST_HEAD(&ei->i_orphan); 5036 INIT_LIST_HEAD(&ei->i_orphan);
4850 5037
5038 /*
5039 * Set transaction id's of transactions that have to be committed
5040 * to finish f[data]sync. We set them to currently running transaction
5041 * as we cannot be sure that the inode or some of its metadata isn't
5042 * part of the transaction - the inode could have been reclaimed and
5043 * now it is reread from disk.
5044 */
5045 if (journal) {
5046 transaction_t *transaction;
5047 tid_t tid;
5048
5049 spin_lock(&journal->j_state_lock);
5050 if (journal->j_running_transaction)
5051 transaction = journal->j_running_transaction;
5052 else
5053 transaction = journal->j_committing_transaction;
5054 if (transaction)
5055 tid = transaction->t_tid;
5056 else
5057 tid = journal->j_commit_sequence;
5058 spin_unlock(&journal->j_state_lock);
5059 ei->i_sync_tid = tid;
5060 ei->i_datasync_tid = tid;
5061 }
5062
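The snapshot taken under j_state_lock gives fsync a cheap test later on: if the recorded tid has already been committed, no commit needs to be forced for this inode. A condensed model of the snapshot and the comparison (invented fields; the wraparound-safe compare mirrors the journal's tid arithmetic, and treating tid 0 as "no running transaction" is an assumption of the sketch):

	#include <stdint.h>

	struct journal_state {
		uint32_t running_tid;    /* 0 here means none running */
		uint32_t committed_tid;  /* last committed sequence */
	};

	static uint32_t snapshot_sync_tid(const struct journal_state *j)
	{
		/* prefer the running transaction, else the last commit */
		return j->running_tid ? j->running_tid : j->committed_tid;
	}

	static int fsync_needs_commit(const struct journal_state *j,
				      uint32_t sync_tid)
	{
		/* "tid not yet committed", safe across sequence wraparound */
		return (int32_t)(sync_tid - j->committed_tid) > 0;
	}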
4851 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5063 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4852 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 5064 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4853 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 5065 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4854 EXT4_INODE_SIZE(inode->i_sb)) { 5066 EXT4_INODE_SIZE(inode->i_sb)) {
4855 brelse(bh);
4856 ret = -EIO; 5067 ret = -EIO;
4857 goto bad_inode; 5068 goto bad_inode;
4858 } 5069 }
@@ -4865,7 +5076,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4865 EXT4_GOOD_OLD_INODE_SIZE + 5076 EXT4_GOOD_OLD_INODE_SIZE +
4866 ei->i_extra_isize; 5077 ei->i_extra_isize;
4867 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5078 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4868 ei->i_state |= EXT4_STATE_XATTR; 5079 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4869 } 5080 }
4870 } else 5081 } else
4871 ei->i_extra_isize = 0; 5082 ei->i_extra_isize = 0;
@@ -4884,12 +5095,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4884 5095
4885 ret = 0; 5096 ret = 0;
4886 if (ei->i_file_acl && 5097 if (ei->i_file_acl &&
4887 ((ei->i_file_acl < 5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4888 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4889 EXT4_SB(sb)->s_gdb_count)) ||
4890 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4891 ext4_error(sb, __func__,
4892 "bad extended attribute block %llu in inode #%lu",
4893 ei->i_file_acl, inode->i_ino); 5100 ei->i_file_acl, inode->i_ino);
4894 ret = -EIO; 5101 ret = -EIO;
4895 goto bad_inode; 5102 goto bad_inode;
@@ -4905,10 +5112,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4905 /* Validate block references which are part of inode */ 5112 /* Validate block references which are part of inode */
4906 ret = ext4_check_inode_blockref(inode); 5113 ret = ext4_check_inode_blockref(inode);
4907 } 5114 }
4908 if (ret) { 5115 if (ret)
4909 brelse(bh);
4910 goto bad_inode; 5116 goto bad_inode;
4911 }
4912 5117
4913 if (S_ISREG(inode->i_mode)) { 5118 if (S_ISREG(inode->i_mode)) {
4914 inode->i_op = &ext4_file_inode_operations; 5119 inode->i_op = &ext4_file_inode_operations;
@@ -4936,10 +5141,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4936 init_special_inode(inode, inode->i_mode, 5141 init_special_inode(inode, inode->i_mode,
4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4938 } else { 5143 } else {
4939 brelse(bh);
4940 ret = -EIO; 5144 ret = -EIO;
4941 ext4_error(inode->i_sb, __func__, 5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4942 "bogus i_mode (%o) for inode=%lu",
4943 inode->i_mode, inode->i_ino); 5146 inode->i_mode, inode->i_ino);
4944 goto bad_inode; 5147 goto bad_inode;
4945 } 5148 }
@@ -4949,6 +5152,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4949 return inode; 5152 return inode;
4950 5153
4951bad_inode: 5154bad_inode:
5155 brelse(iloc.bh);
4952 iget_failed(inode); 5156 iget_failed(inode);
4953 return ERR_PTR(ret); 5157 return ERR_PTR(ret);
4954} 5158}
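With iloc.bh initialised up front, every failure in ext4_iget() can funnel through the single bad_inode exit, which releases the buffer exactly once; the scattered brelse(bh) calls removed above were the leak-prone alternative. A condensed model of the idiom with invented helpers, release() standing in for brelse() (both tolerate NULL):

	#include <errno.h>
	#include <stdlib.h>

	struct buf { char data[64]; };

	static void release(struct buf *b) { free(b); }  /* brelse() stand-in */

	static int load_object(struct buf **out)
	{
		struct buf *bh = NULL;  /* safe to release on any early exit */
		int ret;

		bh = malloc(sizeof(*bh));
		if (!bh) {
			ret = -ENOMEM;
			goto bad;
		}
		/* ... each later validation step may also 'goto bad' ... */
		*out = bh;
		return 0;
	bad:
		release(bh);  /* one cleanup point: no double-free, no leak */
		return ret;
	}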
@@ -5010,7 +5214,7 @@ static int ext4_do_update_inode(handle_t *handle,
5010 5214
5011 /* For fields not tracked in the in-memory inode, 5215 /* For fields not tracked in the in-memory inode,
5012 * initialise them to zero for new inodes. */ 5216 * initialise them to zero for new inodes. */
5013 if (ei->i_state & EXT4_STATE_NEW) 5217 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5014 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5218 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5015 5219
5016 ext4_get_inode_flags(ei); 5220 ext4_get_inode_flags(ei);
@@ -5074,7 +5278,7 @@ static int ext4_do_update_inode(handle_t *handle,
5074 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5278 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5075 sb->s_dirt = 1; 5279 sb->s_dirt = 1;
5076 ext4_handle_sync(handle); 5280 ext4_handle_sync(handle);
5077 err = ext4_handle_dirty_metadata(handle, inode, 5281 err = ext4_handle_dirty_metadata(handle, NULL,
5078 EXT4_SB(sb)->s_sbh); 5282 EXT4_SB(sb)->s_sbh);
5079 } 5283 }
5080 } 5284 }
@@ -5103,11 +5307,12 @@ static int ext4_do_update_inode(handle_t *handle,
5103 } 5307 }
5104 5308
5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5309 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5106 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5310 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5107 if (!err) 5311 if (!err)
5108 err = rc; 5312 err = rc;
5109 ei->i_state &= ~EXT4_STATE_NEW; 5313 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5110 5314
5315 ext4_update_inode_fsync_trans(handle, inode, 0);
5111out_brelse: 5316out_brelse:
5112 brelse(bh); 5317 brelse(bh);
5113 ext4_std_error(inode->i_sb, err); 5318 ext4_std_error(inode->i_sb, err);
@@ -5149,7 +5354,7 @@ out_brelse:
5149 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5354 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5150 * will no longer be on the superblock's dirty inode list. 5355 * will no longer be on the superblock's dirty inode list.
5151 */ 5356 */
5152int ext4_write_inode(struct inode *inode, int wait) 5357int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5153{ 5358{
5154 int err; 5359 int err;
5155 5360
@@ -5163,26 +5368,25 @@ int ext4_write_inode(struct inode *inode, int wait)
5163 return -EIO; 5368 return -EIO;
5164 } 5369 }
5165 5370
5166 if (!wait) 5371 if (wbc->sync_mode != WB_SYNC_ALL)
5167 return 0; 5372 return 0;
5168 5373
5169 err = ext4_force_commit(inode->i_sb); 5374 err = ext4_force_commit(inode->i_sb);
5170 } else { 5375 } else {
5171 struct ext4_iloc iloc; 5376 struct ext4_iloc iloc;
5172 5377
5173 err = ext4_get_inode_loc(inode, &iloc); 5378 err = __ext4_get_inode_loc(inode, &iloc, 0);
5174 if (err) 5379 if (err)
5175 return err; 5380 return err;
5176 if (wait) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
5177 sync_dirty_buffer(iloc.bh); 5382 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__, 5384 ext4_error(inode->i_sb, "IO error syncing inode, "
5180 "IO error syncing inode, " 5385 "inode=%lu, block=%llu", inode->i_ino,
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO; 5387 err = -EIO;
5185 } 5388 }
5389 brelse(iloc.bh);
5186 } 5390 }
5187 return err; 5391 return err;
5188} 5392}
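ext4_write_inode() now receives the caller's writeback_control instead of a bare wait flag, so whether to block is derived from wbc->sync_mode in both the journaled and non-journaled branches. A minimal model of that gating (types invented):

	enum sync_mode { WB_SYNC_NONE_MODEL, WB_SYNC_ALL_MODEL };
	struct wbc_model { enum sync_mode sync_mode; };

	static int write_inode_model(const struct wbc_model *wbc,
				     int (*force_commit)(void))
	{
		if (wbc->sync_mode != WB_SYNC_ALL_MODEL)
			return 0;          /* non-waiting writeback: skip */
		return force_commit();     /* WB_SYNC_ALL: wait for commit */
	}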
@@ -5221,19 +5425,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5221 if (error) 5425 if (error)
5222 return error; 5426 return error;
5223 5427
5428 if (ia_valid & ATTR_SIZE)
5429 dquot_initialize(inode);
5224 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5225 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5226 handle_t *handle; 5432 handle_t *handle;
5227 5433
5228 /* (user+group)*(old+new) structure, inode write (sb, 5434 /* (user+group)*(old+new) structure, inode write (sb,
5229 * inode block, ? - but truncate inode update has it) */ 5435 * inode block, ? - but truncate inode update has it) */
5230 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5436 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5231 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5437 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5232 if (IS_ERR(handle)) { 5438 if (IS_ERR(handle)) {
5233 error = PTR_ERR(handle); 5439 error = PTR_ERR(handle);
5234 goto err_out; 5440 goto err_out;
5235 } 5441 }
5236 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 5442 error = dquot_transfer(inode, attr);
5237 if (error) { 5443 if (error) {
5238 ext4_journal_stop(handle); 5444 ext4_journal_stop(handle);
5239 return error; 5445 return error;
@@ -5260,7 +5466,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5260 } 5466 }
5261 5467
5262 if (S_ISREG(inode->i_mode) && 5468 if (S_ISREG(inode->i_mode) &&
5263 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5469 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5264 handle_t *handle; 5472 handle_t *handle;
5265 5473
5266 handle = ext4_journal_start(inode, 3); 5474 handle = ext4_journal_start(inode, 3);
@@ -5291,6 +5499,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5291 goto err_out; 5499 goto err_out;
5292 } 5500 }
5293 } 5501 }
5502 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5504 ext4_truncate(inode);
5294 } 5505 }
5295 5506
5296 rc = inode_setattr(inode, attr); 5507 rc = inode_setattr(inode, attr);
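The widened condition above means a size-changing setattr enters the truncate path not only when shrinking, but also when EXT4_EOFBLOCKS_FL indicates blocks were preallocated past i_size; ext4_truncate() then clears the flag. The predicate, restated stand-alone with invented parameter names:

	#include <stdbool.h>

	static bool needs_truncate_path(bool is_regular, bool size_change,
					long long new_size, long long cur_size,
					bool eofblocks_flag)
	{
		return is_regular && size_change &&
		       (new_size < cur_size || eofblocks_flag);
	}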
@@ -5376,7 +5587,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5376 * worst case, the index blocks spread over different block groups 5587 * worst case, the index blocks spread over different block groups
5377 * 5588 *
5378 * If datablocks are discontiguous, they may spread over 5589 * If datablocks are discontiguous, they may spread over
5379 * different block groups too. If they are contiugous, with flexbg, 5590 * different block groups too. If they are contiguous, with flexbg,
5380 * they could still cross a block group boundary. 5591 * they could still cross a block group boundary.
5381 * 5592 *
5382 * Also account for superblock, inode, quota and xattr blocks 5593 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5663,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5452 * Calculate the journal credits for a chunk of data modification. 5663 * Calculate the journal credits for a chunk of data modification.
5453 * 5664 *
5454 * This is called from DIO, fallocate or whoever calls 5665 * This is called from DIO, fallocate or whoever calls
5455 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 5666 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5456 * 5667 *
5457 * journal buffers for data blocks are not included here, as DIO 5668 * journal buffers for data blocks are not included here, as DIO
5458 * and fallocate do not need to journal data buffers. 5669 * and fallocate do not need to journal data buffers.
@@ -5529,8 +5740,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5529 entry = IFIRST(header); 5740 entry = IFIRST(header);
5530 5741
5531 /* No extended attributes present */ 5742 /* No extended attributes present */
5532 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5743 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5533 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5744 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5534 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5745 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5535 new_extra_isize); 5746 new_extra_isize);
5536 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5747 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5574,7 +5785,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5574 err = ext4_reserve_inode_write(handle, inode, &iloc); 5785 err = ext4_reserve_inode_write(handle, inode, &iloc);
5575 if (ext4_handle_valid(handle) && 5786 if (ext4_handle_valid(handle) &&
5576 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5787 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5577 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5788 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5578 /* 5789 /*
5579 * We need extra buffer credits since we may write into EA block 5790 * We need extra buffer credits since we may write into EA block
5580 * with this same handle. If journal_extend fails, then it will 5791 * with this same handle. If journal_extend fails, then it will
@@ -5588,10 +5799,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5588 sbi->s_want_extra_isize, 5799 sbi->s_want_extra_isize,
5589 iloc, handle); 5800 iloc, handle);
5590 if (ret) { 5801 if (ret) {
5591 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5802 ext4_set_inode_state(inode,
5803 EXT4_STATE_NO_EXPAND);
5592 if (mnt_count != 5804 if (mnt_count !=
5593 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5805 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5594 ext4_warning(inode->i_sb, __func__, 5806 ext4_warning(inode->i_sb,
5595 "Unable to expand inode %lu. Delete" 5807 "Unable to expand inode %lu. Delete"
5596 " some EAs or run e2fsck.", 5808 " some EAs or run e2fsck.",
5597 inode->i_ino); 5809 inode->i_ino);
@@ -5613,7 +5825,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5613 * i_size has been changed by generic_commit_write() and we thus need 5825 * i_size has been changed by generic_commit_write() and we thus need
5614 * to include the updated inode in the current transaction. 5826 * to include the updated inode in the current transaction.
5615 * 5827 *
5616 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5828 * Also, dquot_alloc_block() will always dirty the inode when blocks
5617 * are allocated to the file. 5829 * are allocated to the file.
5618 * 5830 *
5619 * If the inode is marked synchronous, we don't honour that here - doing 5831 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5655,7 +5867,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5655 err = jbd2_journal_get_write_access(handle, iloc.bh); 5867 err = jbd2_journal_get_write_access(handle, iloc.bh);
5656 if (!err) 5868 if (!err)
5657 err = ext4_handle_dirty_metadata(handle, 5869 err = ext4_handle_dirty_metadata(handle,
5658 inode, 5870 NULL,
5659 iloc.bh); 5871 iloc.bh);
5660 brelse(iloc.bh); 5872 brelse(iloc.bh);
5661 } 5873 }