Diffstat:

 Documentation/filesystems/ext4.txt |   41
 fs/ext4/balloc.c                   |  345
 fs/ext4/ext4.h                     |  141
 fs/ext4/ext4_extents.h             |    2
 fs/ext4/ext4_jbd2.c                |    8
 fs/ext4/extents.c                  | 1168
 fs/ext4/file.c                     |    4
 fs/ext4/fsync.c                    |   10
 fs/ext4/ialloc.c                   |  204
 fs/ext4/indirect.c                 |   20
 fs/ext4/inode.c                    |  512
 fs/ext4/ioctl.c                    |   65
 fs/ext4/mballoc.c                  |  331
 fs/ext4/mballoc.h                  |   11
 fs/ext4/migrate.c                  |  109
 fs/ext4/mmp.c                      |   10
 fs/ext4/move_extent.c              |    1
 fs/ext4/namei.c                    |   21
 fs/ext4/page-io.c                  |   66
 fs/ext4/resize.c                   |   10
 fs/ext4/super.c                    |  263
 fs/ext4/xattr.c                    |   12
 fs/jbd/journal.c                   |    8
 fs/jbd2/commit.c                   |   26
 fs/jbd2/journal.c                  |   44
 fs/jbd2/recovery.c                 |   28
 fs/jbd2/transaction.c              |   68
 include/linux/ext2_fs.h            |    4
 include/linux/ext3_fs.h            |    4
 include/linux/fs.h                 |   10
 include/linux/jbd.h                |   64
 include/linux/jbd2.h               |   69
 include/linux/jbd_common.h         |   68
 include/trace/events/ext4.h        |  480
 34 files changed, 2898 insertions(+), 1329 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 232a575a0c48..4917cf24a5e0 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -160,7 +160,9 @@ noload if the filesystem was not unmounted cleanly,
 			lead to any number of problems.
 
 data=journal		All data are committed into the journal prior to being
-			written into the main file system.
+			written into the main file system.  Enabling
+			this mode will disable delayed allocation and
+			O_DIRECT support.
 
 data=ordered	(*)	All data are forced directly out to the main file
 			system prior to its metadata being committed to the
@@ -201,30 +203,19 @@ inode_readahead_blks=n This tuning parameter controls the maximum
 			table readahead algorithm will pre-read into
 			the buffer cache.  The default value is 32 blocks.
 
-orlov		(*)	This enables the new Orlov block allocator. It is
-			enabled by default.
-
-oldalloc		This disables the Orlov block allocator and enables
-			the old block allocator.  Orlov should have better
-			performance - we'd like to get some feedback if it's
-			the contrary for you.
-
-user_xattr		Enables Extended User Attributes.  Additionally, you
-			need to have extended attribute support enabled in the
-			kernel configuration (CONFIG_EXT4_FS_XATTR).  See the
-			attr(5) manual page and http://acl.bestbits.at/ to
-			learn more about extended attributes.
-
-nouser_xattr		Disables Extended User Attributes.
-
-acl			Enables POSIX Access Control Lists support.
-			Additionally, you need to have ACL support enabled in
-			the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL).
-			See the acl(5) manual page and http://acl.bestbits.at/
-			for more information.
+nouser_xattr		Disables Extended User Attributes.  If you have extended
+			attribute support enabled in the kernel configuration
+			(CONFIG_EXT4_FS_XATTR), extended attribute support
+			is enabled by default on mount.  See the attr(5) manual
+			page and http://acl.bestbits.at/ for more information
+			about extended attributes.
 
 noacl			This option disables POSIX Access Control List
-			support.
+			support.  If ACL support is enabled in the kernel
+			configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
+			enabled by default on mount.  See the acl(5) manual
+			page and http://acl.bestbits.at/ for more information
+			about acl.
 
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
@@ -419,8 +410,8 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state.  This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.  Currently ext4 does not have delayed
-allocation support if this data journalling mode is selected.
+outperforms all others modes.  Enabling this mode will disable delayed
+allocation and O_DIRECT support.
 
 /proc entries
 =============
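
The data=journal section above describes a user-visible behavior change. As a
minimal illustration (the device and mount point below are hypothetical, not
part of this patch), the mode is selected at mount time via the option string
passed to mount(2):

/* Hypothetical device and mount point; must be run as root. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The last argument carries the filesystem-specific options. */
	if (mount("/dev/sdb1", "/mnt/test", "ext4", 0, "data=journal") != 0) {
		perror("mount");
		return 1;
	}
	/* In this mode, delayed allocation and O_DIRECT are disabled. */
	return 0;
}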
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f8224adf496e..f6dba4505f1c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -28,7 +28,8 @@
  */
 
 /*
- * Calculate the block group number and offset, given a block number
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
  */
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 	ext4_grpblk_t offset;
 
 	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
-	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+		EXT4_SB(sb)->s_cluster_bits;
 	if (offsetp)
 		*offsetp = offset;
 	if (blockgrpp)
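
The new calculation reports the offset within the group in cluster units
rather than block units. A small userspace sketch of the same arithmetic
(the sample geometry is assumed, not taken from a real filesystem):

/* Illustrative model of ext4_get_group_no_and_offset() after this change. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocknr = 100000;	/* example physical block number */
	uint64_t first_data_block = 0;	/* s_first_data_block */
	uint32_t blocks_per_group = 32768;
	uint32_t cluster_bits = 4;	/* 16 blocks per cluster */

	blocknr -= first_data_block;
	uint32_t group  = blocknr / blocks_per_group;
	/* Remainder in blocks, then shifted down to a cluster offset. */
	uint32_t offset = (blocknr % blocks_per_group) >> cluster_bits;

	printf("group %u, cluster offset %u\n", group, offset); /* 3, 106 */
	return 0;
}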
@@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 	return 0;
 }
 
-static int ext4_group_used_meta_blocks(struct super_block *sb,
-				       ext4_group_t block_group,
-				       struct ext4_group_desc *gdp)
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+unsigned ext4_num_overhead_clusters(struct super_block *sb,
+				    ext4_group_t block_group,
+				    struct ext4_group_desc *gdp)
 {
-	ext4_fsblk_t	tmp;
+	unsigned num_clusters;
+	int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+	ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+	ext4_fsblk_t itbl_blk;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	/* block bitmap, inode bitmap, and inode table blocks */
-	int used_blocks = sbi->s_itb_per_group + 2;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
-					block_group))
-			used_blocks--;
+	/* This is the number of clusters used by the superblock,
+	 * block group descriptors, and reserved block group
+	 * descriptor blocks */
+	num_clusters = ext4_num_base_meta_clusters(sb, block_group);
 
-		if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
-					block_group))
-			used_blocks--;
-
-		tmp = ext4_inode_table(sb, gdp);
-		for (; tmp < ext4_inode_table(sb, gdp) +
-				sbi->s_itb_per_group; tmp++) {
-			if (!ext4_block_in_group(sb, tmp, block_group))
-				used_blocks -= 1;
+	/*
+	 * For the allocation bitmaps and inode table, we first need
+	 * to check to see if the block is in the block group.  If it
+	 * is, then check to see if the cluster is already accounted
+	 * for in the clusters used for the base metadata cluster, or
+	 * if we can increment the base metadata cluster to include
+	 * that block.  Otherwise, we will have to track the cluster
+	 * used for the allocation bitmap or inode table explicitly.
+	 * Normally all of these blocks are contiguous, so the special
+	 * case handling shouldn't be necessary except for *very*
+	 * unusual file system layouts.
+	 */
+	if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+		block_cluster = EXT4_B2C(sbi, (start -
+					       ext4_block_bitmap(sb, gdp)));
+		if (block_cluster < num_clusters)
+			block_cluster = -1;
+		else if (block_cluster == num_clusters) {
+			num_clusters++;
+			block_cluster = -1;
 		}
 	}
-	return used_blocks;
-}
 
-/* Initializes an uninitialized block bitmap if given, and returns the
- * number of blocks free in the group. */
-unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
-		 ext4_group_t block_group, struct ext4_group_desc *gdp)
-{
-	int bit, bit_max;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	unsigned free_blocks, group_blocks;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (bh) {
-		J_ASSERT_BH(bh, buffer_locked(bh));
-
-		/* If checksum is bad mark all blocks used to prevent allocation
-		 * essentially implementing a per-group read-only flag. */
-		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-			ext4_error(sb, "Checksum bad for group %u",
-					block_group);
-			ext4_free_blks_set(sb, gdp, 0);
-			ext4_free_inodes_set(sb, gdp, 0);
-			ext4_itable_unused_set(sb, gdp, 0);
-			memset(bh->b_data, 0xff, sb->s_blocksize);
-			return 0;
+	if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+		inode_cluster = EXT4_B2C(sbi,
+					 start - ext4_inode_bitmap(sb, gdp));
+		if (inode_cluster < num_clusters)
+			inode_cluster = -1;
+		else if (inode_cluster == num_clusters) {
+			num_clusters++;
+			inode_cluster = -1;
 		}
-		memset(bh->b_data, 0, sb->s_blocksize);
 	}
 
-	/* Check for superblock and gdt backups in this group */
-	bit_max = ext4_bg_has_super(sb, block_group);
-
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
-	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
-			  sbi->s_desc_per_block) {
-		if (bit_max) {
-			bit_max += ext4_bg_num_gdb(sb, block_group);
-			bit_max +=
-				le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+	itbl_blk = ext4_inode_table(sb, gdp);
+	for (i = 0; i < sbi->s_itb_per_group; i++) {
+		if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+			c = EXT4_B2C(sbi, start - itbl_blk + i);
+			if ((c < num_clusters) || (c == inode_cluster) ||
+			    (c == block_cluster) || (c == itbl_cluster))
+				continue;
+			if (c == num_clusters) {
+				num_clusters++;
+				continue;
+			}
+			num_clusters++;
+			itbl_cluster = c;
 		}
-	} else { /* For META_BG_BLOCK_GROUPS */
-		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
-	if (block_group == ngroups - 1) {
+	if (block_cluster != -1)
+		num_clusters++;
+	if (inode_cluster != -1)
+		num_clusters++;
+
+	return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+					  ext4_group_t block_group)
+{
+	unsigned int blocks;
+
+	if (block_group == ext4_get_groups_count(sb) - 1) {
 		/*
-		 * Even though mke2fs always initialize first and last group
-		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
-		 * to make sure we calculate the right free blocks
+		 * Even though mke2fs always initializes the first and
+		 * last group, just in case some other tool was used,
+		 * we need to make sure we calculate the right free
+		 * blocks.
 		 */
-		group_blocks = ext4_blocks_count(sbi->s_es) -
-			ext4_group_first_block_no(sb, ngroups - 1);
-	} else {
-		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
-	}
+		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+			ext4_group_first_block_no(sb, block_group);
+	} else
+		blocks = EXT4_BLOCKS_PER_GROUP(sb);
+	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
 
-	free_blocks = group_blocks - bit_max;
+/* Initializes an uninitialized block bitmap */
+void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+			    ext4_group_t block_group,
+			    struct ext4_group_desc *gdp)
+{
+	unsigned int bit, bit_max;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t start, tmp;
+	int flex_bg = 0;
+
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	/* If checksum is bad mark all blocks used to prevent allocation
+	 * essentially implementing a per-group read-only flag. */
+	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+		ext4_error(sb, "Checksum bad for group %u", block_group);
+		ext4_free_group_clusters_set(sb, gdp, 0);
+		ext4_free_inodes_set(sb, gdp, 0);
+		ext4_itable_unused_set(sb, gdp, 0);
+		memset(bh->b_data, 0xff, sb->s_blocksize);
+		return;
+	}
+	memset(bh->b_data, 0, sb->s_blocksize);
 
-	if (bh) {
-		ext4_fsblk_t start, tmp;
-		int flex_bg = 0;
+	bit_max = ext4_num_base_meta_clusters(sb, block_group);
+	for (bit = 0; bit < bit_max; bit++)
+		ext4_set_bit(bit, bh->b_data);
 
-		for (bit = 0; bit < bit_max; bit++)
-			ext4_set_bit(bit, bh->b_data);
+	start = ext4_group_first_block_no(sb, block_group);
 
-		start = ext4_group_first_block_no(sb, block_group);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		flex_bg = 1;
 
-		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-					      EXT4_FEATURE_INCOMPAT_FLEX_BG))
-			flex_bg = 1;
+	/* Set bits for block and inode bitmaps, and inode table */
+	tmp = ext4_block_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
 
-		/* Set bits for block and inode bitmaps, and inode table */
-		tmp = ext4_block_bitmap(sb, gdp);
-		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
-			ext4_set_bit(tmp - start, bh->b_data);
+	tmp = ext4_inode_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
 
-		tmp = ext4_inode_bitmap(sb, gdp);
+	tmp = ext4_inode_table(sb, gdp);
+	for (; tmp < ext4_inode_table(sb, gdp) +
+		     sbi->s_itb_per_group; tmp++) {
 		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
-			ext4_set_bit(tmp - start, bh->b_data);
-
-		tmp = ext4_inode_table(sb, gdp);
-		for (; tmp < ext4_inode_table(sb, gdp) +
-				sbi->s_itb_per_group; tmp++) {
-			if (!flex_bg ||
-			    ext4_block_in_group(sb, tmp, block_group))
-				ext4_set_bit(tmp - start, bh->b_data);
-		}
-		/*
-		 * Also if the number of blocks within the group is
-		 * less than the blocksize * 8 ( which is the size
-		 * of bitmap ), set rest of the block bitmap to 1
-		 */
-		ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
-				     bh->b_data);
+			ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
+
+	/*
+	 * Also if the number of blocks within the group is less than
+	 * the blocksize * 8 ( which is the size of bitmap ), set rest
+	 * of the block bitmap to 1
+	 */
+	ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+			     sb->s_blocksize * 8, bh->b_data);
 }
 
+/* Return the number of free blocks in a block group.  It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	return num_clusters_in_group(sb, block_group) -
+		ext4_num_overhead_clusters(sb, block_group, gdp);
+}
 
 /*
  * The free blocks are managed by bitmaps. A file system contains several
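
num_clusters_in_group() above rounds a per-group block count up to whole
clusters with EXT4_NUM_B2C(). A self-contained sketch of that round-up, using
a hypothetical partial last group:

/* Illustrative only; the geometry below is invented. */
#include <stdio.h>

int main(void)
{
	unsigned cluster_ratio = 16;		/* blocks per cluster */
	unsigned cluster_bits = 4;		/* log2(cluster_ratio) */
	unsigned blocks_in_last_group = 1000;	/* partial final group */

	/* EXT4_NUM_B2C(): round a block count up to whole clusters. */
	unsigned clusters = (blocks_in_last_group + cluster_ratio - 1)
				>> cluster_bits;

	printf("%u blocks -> %u clusters\n", blocks_in_last_group, clusters);
	/* prints: 1000 blocks -> 63 clusters (62 full + 1 partial) */
	return 0;
}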
@@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_has_free_blocks()
+ * ext4_has_free_clusters()
  * @sbi:	in-core super block structure.
- * @nblocks:	number of needed blocks
+ * @nclusters:	number of needed blocks
+ * @flags:	flags from ext4_mb_new_blocks()
  *
- * Check if filesystem has nblocks free & available for allocation.
+ * Check if filesystem has nclusters free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
-				s64 nblocks, unsigned int flags)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+				  s64 nclusters, unsigned int flags)
 {
-	s64 free_blocks, dirty_blocks, root_blocks;
-	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
+	s64 free_clusters, dirty_clusters, root_clusters;
+	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
-	free_blocks  = percpu_counter_read_positive(fbc);
-	dirty_blocks = percpu_counter_read_positive(dbc);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
+	free_clusters  = percpu_counter_read_positive(fcc);
+	dirty_clusters = percpu_counter_read_positive(dcc);
+	root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
 
-	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum_positive(fbc);
-		dirty_blocks = percpu_counter_sum_positive(dbc);
+	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+					EXT4_FREECLUSTERS_WATERMARK) {
+		free_clusters  = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+		dirty_clusters = percpu_counter_sum_positive(dcc);
 	}
-	/* Check whether we have space after
-	 * accounting for current dirty blocks & root reserved blocks.
+	/* Check whether we have space after accounting for current
+	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
 		return 1;
 
-	/* Hm, nope. Are (enough) root reserved blocks available? */
+	/* Hm, nope. Are (enough) root reserved clusters available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
 	    capable(CAP_SYS_RESOURCE) ||
 		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
-		if (free_blocks >= (nblocks + dirty_blocks))
+		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
 
 	return 0;
 }
 
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks, unsigned int flags)
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+			     s64 nclusters, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
-		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
+	if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+		percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
 		return 0;
 	} else
 		return -ENOSPC;
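
ext4_has_free_clusters() reads the cheap, possibly stale percpu counters
first and only pays for an exact percpu_counter_sum_positive() when the
approximate answer lands within the watermark. A simplified userspace model
of that fast-path/slow-path structure (the counter values are invented; the
real code uses the kernel's percpu_counter API):

#include <stdio.h>

static long long approx_free = 5000;	/* cheap, possibly stale read */
static long long exact_free  = 4200;	/* expensive exact sum */
#define WATERMARK 4096			/* ~ EXT4_FREECLUSTERS_WATERMARK */

static int has_free_clusters(long long nclusters, long long reserved)
{
	long long free = approx_free;

	/* Near the edge the cheap read can't be trusted: recount exactly. */
	if (free - (nclusters + reserved) < WATERMARK)
		free = exact_free;

	return free >= nclusters + reserved;
}

int main(void)
{
	printf("%d\n", has_free_clusters(100, 1024));	/* prints 1 */
	return 0;
}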
@@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
+	if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * @handle:	handle to this transaction
  * @inode:	file inode
  * @goal:	given target block(filesystem wide)
- * @count:	pointer to total number of blocks needed
+ * @count:	pointer to total number of clusters needed
  * @errp:	error code
  *
  * Return 1st allocated block number on success, *count stores total account
@@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-		dquot_alloc_block_nofail(inode, ar.len);
+		dquot_alloc_block_nofail(inode,
+				EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
 	}
 	return ret;
 }
 
 /**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
  * @sb:		superblock
  *
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
  */
-ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
@@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += ext4_free_blks_count(sb, gdp);
+		desc_count += ext4_free_group_clusters(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
@@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
 		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
-			i, ext4_free_blks_count(sb, gdp), x);
+			i, ext4_free_group_clusters(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
-		", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+	printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+	       ", computed = %llu, %llu\n",
+	       EXT4_B2C(sbi, ext4_free_blocks_count(es)),
 	       desc_count, bitmap_count);
 	return bitmap_count;
 #else
@@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += ext4_free_blks_count(sb, gdp);
+		desc_count += ext4_free_group_clusters(sb, gdp);
 	}
 
 	return desc_count;
@@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
 }
 
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+				     ext4_group_t block_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned num;
+
+	/* Check for superblock and gdt backups in this group */
+	num = ext4_bg_has_super(sb, block_group);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+			  sbi->s_desc_per_block) {
+		if (num) {
+			num += ext4_bg_num_gdb(sb, block_group);
+			num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		}
+	} else { /* For META_BG_BLOCK_GROUPS */
+		num += ext4_bg_num_gdb(sb, block_group);
+	}
+	return EXT4_NUM_B2C(sbi, num);
+}
 /**
  * ext4_inode_to_goal_block - return a hint for block allocation
  * @inode: inode for block allocation
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cec3145e532c..5b0e26a1272d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
 #define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
 #define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
 #define EXT4_MAP_UNINIT	(1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER	(1 << BH_AllocFromCluster)
 #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
 				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
-				 EXT4_MAP_UNINIT)
+				 EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
 
 struct ext4_map_blocks {
 	ext4_fsblk_t m_pblk;
@@ -239,8 +247,11 @@ struct ext4_io_submit {
 # define EXT4_BLOCK_SIZE(s)	(EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
 #endif
 #define	EXT4_ADDR_PER_BLOCK(s)	(EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s)	(EXT4_BLOCK_SIZE(s) << \
+				 EXT4_SB(s)->s_cluster_bits)
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s)		(EXT4_SB(s)->s_cluster_bits)
 #else
 # define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_log_block_size + 10)
 #endif
@@ -258,6 +269,14 @@ struct ext4_io_submit {
 #endif
 #define EXT4_BLOCK_ALIGN(size, blkbits)		ALIGN((size), (1 << (blkbits)))
 
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk)	((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster)	((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
+				 (sbi)->s_cluster_bits)
+
 /*
  * Structure of a blocks group descriptor
  */
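
A quick numeric check of the three translation macros just added, modelled in
plain C with an assumed cluster ratio of 16 (s_cluster_bits == 4):

#include <stdio.h>

#define CLUSTER_BITS	4
#define CLUSTER_RATIO	(1 << CLUSTER_BITS)

#define B2C(blk)	((blk) >> CLUSTER_BITS)		/* EXT4_B2C */
#define C2B(cluster)	((cluster) << CLUSTER_BITS)	/* EXT4_C2B */
#define NUM_B2C(blks)	(((blks) + CLUSTER_RATIO - 1) >> CLUSTER_BITS)

int main(void)
{
	printf("B2C(100)     = %d\n", B2C(100));     /* 6: block 100 lives in cluster 6 */
	printf("C2B(6)       = %d\n", C2B(6));	     /* 96: first block of cluster 6 */
	printf("NUM_B2C(100) = %d\n", NUM_B2C(100)); /* 7: 100 blocks need 7 clusters */
	return 0;
}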
@@ -289,7 +308,7 @@ struct ext4_group_desc
 
 struct flex_groups {
 	atomic_t free_inodes;
-	atomic_t free_blocks;
+	atomic_t free_clusters;
 	atomic_t used_dirs;
 };
 
@@ -306,6 +325,7 @@ struct flex_groups {
 #define EXT4_DESC_SIZE(s)		(EXT4_SB(s)->s_desc_size)
 #ifdef __KERNEL__
 # define EXT4_BLOCKS_PER_GROUP(s)	(EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s)	(EXT4_SB(s)->s_clusters_per_group)
 # define EXT4_DESC_PER_BLOCK(s)	(EXT4_SB(s)->s_desc_per_block)
 # define EXT4_INODES_PER_GROUP(s)	(EXT4_SB(s)->s_inodes_per_group)
 # define EXT4_DESC_PER_BLOCK_BITS(s)	(EXT4_SB(s)->s_desc_per_block_bits)
@@ -358,8 +378,7 @@ struct flex_groups {
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
-			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
-			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
 			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
 			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
 
@@ -520,6 +539,8 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
+	/* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
 
 /*
  * Flags used by ext4_free_blocks
@@ -528,6 +549,13 @@ struct ext4_new_group_data {
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
 #define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE	0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER	0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
+
+/*
+ * Flags used by ext4_discard_partial_page_buffers
+ */
+#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
 
 /*
  * ioctl commands
@@ -538,9 +566,6 @@ struct ext4_new_group_data {
 #define EXT4_IOC_SETVERSION		_IOW('f', 4, long)
 #define EXT4_IOC_GETVERSION_OLD	FS_IOC_GETVERSION
 #define EXT4_IOC_SETVERSION_OLD	FS_IOC_SETVERSION
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
-#endif
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
@@ -563,9 +588,6 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
 #define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
-#endif
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
@@ -837,6 +859,7 @@ struct ext4_inode_info {
 	ext4_group_t	i_last_alloc_group;
 
 	/* allocation reservation info for delalloc */
+	/* In case of bigalloc, these refer to clusters rather than blocks */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
@@ -886,7 +909,6 @@ struct ext4_inode_info {
 /*
  * Mount flags
  */
-#define EXT4_MOUNT_OLDALLOC		0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
@@ -918,6 +940,9 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
+						      specified delalloc */
+
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
@@ -968,9 +993,9 @@ struct ext4_super_block {
 /*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
 	__le32	s_first_data_block;	/* First Data Block */
 	__le32	s_log_block_size;	/* Block size */
-	__le32	s_obso_log_frag_size;	/* Obsoleted fragment size */
+	__le32	s_log_cluster_size;	/* Allocation cluster size */
 /*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
-	__le32	s_obso_frags_per_group;	/* Obsoleted fragments per group */
+	__le32	s_clusters_per_group;	/* # Clusters per group */
 	__le32	s_inodes_per_group;	/* # Inodes per group */
 	__le32	s_mtime;		/* Mount time */
 /*30*/	__le32	s_wtime;		/* Write time */
@@ -1066,7 +1091,10 @@ struct ext4_super_block {
 	__u8	s_last_error_func[32];	/* function where the error happened */
 #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
 	__u8	s_mount_opts[64];
-	__le32	s_reserved[112];	/* Padding to the end of the block */
+	__le32	s_usr_quota_inum;	/* inode for tracking user quota */
+	__le32	s_grp_quota_inum;	/* inode for tracking group quota */
+	__le32	s_overhead_clusters;	/* overhead blocks/clusters in fs */
+	__le32	s_reserved[109];	/* Padding to the end of the block */
 };
 
 #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1086,6 +1114,7 @@ struct ext4_sb_info {
 	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
 	unsigned long s_inodes_per_block;/* Number of inodes per block */
 	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_clusters_per_group; /* Number of clusters in a group */
 	unsigned long s_inodes_per_group;/* Number of inodes in a group */
 	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
@@ -1094,6 +1123,8 @@ struct ext4_sb_info {
 	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
 	unsigned long s_overhead_last;  /* Last calculated overhead */
 	unsigned long s_blocks_last;    /* Last seen block count */
+	unsigned int s_cluster_ratio;	/* Number of blocks per cluster */
+	unsigned int s_cluster_bits;	/* log2 of s_cluster_ratio */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
@@ -1117,10 +1148,10 @@ struct ext4_sb_info {
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
 	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
-	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeclusters_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
-	struct percpu_counter s_dirtyblocks_counter;
+	struct percpu_counter s_dirtyclusters_counter;
 	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
 	struct kobject s_kobj;
@@ -1136,10 +1167,6 @@ struct ext4_sb_info {
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
-	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
-#endif
 #ifdef CONFIG_QUOTA
 	char *s_qf_names[MAXQUOTAS];	/* Names of quota files with journalled quota */
 	int s_jquota_fmt;		/* Format of quota to use */
@@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+					      struct ext4_io_end *io_end)
+{
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		io_end->flag |= EXT4_IO_END_UNWRITTEN;
+		atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+	}
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
 #define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC		0x0200
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
 					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
-					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+					 EXT4_FEATURE_RO_COMPAT_BIGALLOC)
 
 /*
  * Default values for user and/or group using reserved blocks
@@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 					 unsigned int flags,
 					 unsigned long *count,
 					 int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-				  s64 nblocks, unsigned int flags);
-extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+				    s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
@@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
 				      ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+extern void ext4_init_block_bitmap(struct super_block *sb,
 				       struct buffer_head *bh,
 				       ext4_group_t group,
 				       struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)			\
-		ext4_init_block_bitmap(sb, NULL, group, desc)
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+					      ext4_group_t block_group,
+					      struct ext4_group_desc *gdp);
+extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+					    ext4_group_t block_group);
+extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
+					   ext4_group_t block_group,
+					   struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
 /* dir.c */
@@ -1776,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
 
 /* ialloc.c */
 extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
-				    const struct qstr *qstr, __u32 goal);
+				    const struct qstr *qstr, __u32 goal,
+				    uid_t *owner);
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_discard_partial_page_buffers(handle_t *handle,
+		struct address_space *mapping, loff_t from,
+		loff_t length, int flags);
+extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+		struct inode *inode, struct page *page, loff_t from,
+		loff_t length, int flags);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 				      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 				     struct ext4_group_desc *bg);
-extern __u32 ext4_free_blks_count(struct super_block *sb,
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
 				  struct ext4_group_desc *bg);
 extern __u32 ext4_free_inodes_count(struct super_block *sb,
 				    struct ext4_group_desc *bg);
 extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
 				     struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
 				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
-extern void ext4_free_blks_set(struct super_block *sb,
-			       struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+					 struct ext4_group_desc *bg,
+					 __u32 count);
 extern void ext4_free_inodes_set(struct super_block *sb,
 				struct ext4_group_desc *bg, __u32 count);
 extern void ext4_used_dirs_set(struct super_block *sb,
@@ -2051,13 +2103,13 @@ do { \
 } while (0)
 
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate percpu_counter_batch blocks in their local
- * counters. So we need to make sure we have free blocks more
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
  * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
  */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
-#define EXT4_FREEBLOCKS_WATERMARK 0
+#define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
@@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	  = BH_JBDPrivateStart,
+	BH_AllocFromCluster,	/* allocated blocks were part of already
+				 * allocated cluster. Note that this flag will
+				 * never, ever appear in a buffer_head's state
+				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
+				 * this is used. */
+	BH_Da_Mapped,	/* Delayed allocated block that now has a mapping. This
+			 * flag is set when ext4_map_blocks is called on a
+			 * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
  * Add new method to test wether block and inode bitmaps are properly
@@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif	/* __KERNEL__ */
 
+#include "ext4_extents.h"
+
 #endif	/* _EXT4_H */
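
ext4_set_io_unwritten_flag(), added above, only increments i_aiodio_unwritten
on the flag's first transition, so repeated calls on the same io_end are
harmless. A userspace model of that set-once pattern (a plain int stands in
for the kernel's atomic_t):

#include <stdio.h>

#define IO_END_UNWRITTEN 0x0001

struct io_end { unsigned int flag; };

static int aiodio_unwritten;	/* models the atomic_t in the inode */

static void set_io_unwritten_flag(struct io_end *io_end)
{
	if (!(io_end->flag & IO_END_UNWRITTEN)) {
		io_end->flag |= IO_END_UNWRITTEN;
		aiodio_unwritten++;	/* counted exactly once */
	}
}

int main(void)
{
	struct io_end io = { 0 };

	set_io_unwritten_flag(&io);
	set_io_unwritten_flag(&io);	/* second call is a no-op */
	printf("pending unwritten io_ends: %d\n", aiodio_unwritten); /* 1 */
	return 0;
}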
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 095c36f3b612..a52db3a69a30 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 						  struct ext4_ext_path *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+				      int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index f5240aa15601..aca179017582 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__,
-						  bh, handle, err);
+		if (err) {
+			/* Errors can only happen if there is a bug */
+			handle->h_err = err;
+			__ext4_journal_stop(where, line, handle);
+		}
 	} else {
 		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
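
The reworked error path records the (bug-only) failure in handle->h_err and
stops the handle instead of aborting the whole journal. A simplified model of
that control flow (userspace sketch, not the kernel API):

#include <stdio.h>

struct handle { int h_err; int stopped; };

static int journal_dirty_metadata(void)
{
	return -22;	/* pretend the bug-only failure happened */
}

static void handle_dirty_metadata(struct handle *h)
{
	int err = journal_dirty_metadata();

	if (err) {
		h->h_err = err;		/* remember why this handle failed */
		h->stopped = 1;		/* stop just this handle */
	}
}

int main(void)
{
	struct handle h = { 0, 0 };

	handle_dirty_metadata(&h);
	printf("h_err=%d stopped=%d\n", h.h_err, h.stopped);
	return 0;
}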
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568a98ab..61fa9e1614af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -42,7 +42,6 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 
 #include <trace/events/ext4.h>
 
@@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path)
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+			    handle_t *handle, struct inode *inode,
+			    struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
 		/* path points to block */
-		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
+		err = __ext4_handle_dirty_metadata(where, line, handle,
+						   inode, path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
@@ -114,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 					      struct ext4_ext_path *path,
 					      ext4_lblk_t block)
 {
-	int depth;
-
 	if (path) {
+		int depth = path->p_depth;
 		struct ext4_extent *ex;
-		depth = path->p_depth;
 
 		/*
 		 * Try to predict block placement assuming that we are
@@ -180,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 6)
+	if (!check && size > 6)
 		size = 6;
 #endif
-	}
 	return size;
 }
 
@@ -195,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent_idx);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 5)
+	if (!check && size > 5)
 		size = 5;
 #endif
-	}
 	return size;
 }
 
@@ -211,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 3)
+	if (!check && size > 3)
 		size = 3;
 #endif
-	}
 	return size;
 }
 
@@ -227,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent_idx);
-	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 4)
+	if (!check && size > 4)
 		size = 4;
 #endif
-	}
 	return size;
 }
 
@@ -244,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int idxs, num = 0;
+	int idxs;
 
 	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 		/ sizeof(struct ext4_extent_idx));
@@ -259,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
259 */ 252 */
260 if (ei->i_da_metadata_calc_len && 253 if (ei->i_da_metadata_calc_len &&
261 ei->i_da_metadata_calc_last_lblock+1 == lblock) { 254 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
255 int num = 0;
256
262 if ((ei->i_da_metadata_calc_len % idxs) == 0) 257 if ((ei->i_da_metadata_calc_len % idxs) == 0)
263 num++; 258 num++;
264 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) 259 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
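
The modulo checks above implement the worst-case estimate for how much extra extent-tree metadata one more delayed-allocated block can require: a new leaf every idxs blocks, a new index block every idxs*idxs blocks, and so on, like carries in a base-idxs counter. A hedged sketch of the first two levels (idxs = 340 matches the assumed 4 KiB block size from the previous example):

#include <stdio.h>

/* Worst-case incremental metadata when a contiguous run of delayed
 * allocations grows by one block; the kernel also handles a third
 * level, omitted here. */
static int extra_metadata(unsigned run_len, unsigned idxs)
{
	int num = 0;

	if (run_len % idxs == 0)
		num++;			/* crossed a leaf boundary */
	if (run_len % (idxs * idxs) == 0)
		num++;			/* crossed an index boundary */
	return num;
}

int main(void)
{
	printf("%d %d\n", extra_metadata(340, 340), extra_metadata(341, 340));
	return 0;			/* prints "1 0" */
}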
@@ -321,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
321 struct ext4_extent_header *eh, 316 struct ext4_extent_header *eh,
322 int depth) 317 int depth)
323{ 318{
324 struct ext4_extent *ext;
325 struct ext4_extent_idx *ext_idx;
326 unsigned short entries; 319 unsigned short entries;
327 if (eh->eh_entries == 0) 320 if (eh->eh_entries == 0)
328 return 1; 321 return 1;
@@ -331,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
331 324
332 if (depth == 0) { 325 if (depth == 0) {
333 /* leaf entries */ 326 /* leaf entries */
334 ext = EXT_FIRST_EXTENT(eh); 327 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
335 while (entries) { 328 while (entries) {
336 if (!ext4_valid_extent(inode, ext)) 329 if (!ext4_valid_extent(inode, ext))
337 return 0; 330 return 0;
@@ -339,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
339 entries--; 332 entries--;
340 } 333 }
341 } else { 334 } else {
342 ext_idx = EXT_FIRST_INDEX(eh); 335 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
343 while (entries) { 336 while (entries) {
344 if (!ext4_valid_extent_idx(inode, ext_idx)) 337 if (!ext4_valid_extent_idx(inode, ext_idx))
345 return 0; 338 return 0;
@@ -751,31 +744,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
751 return -EIO; 744 return -EIO;
752 } 745 }
753 746
754 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
755 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 747 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756 /* insert after */ 748 /* insert after */
757 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 749 ext_debug("insert new index %d after: %llu\n", logical, ptr);
758 len = (len - 1) * sizeof(struct ext4_extent_idx);
759 len = len < 0 ? 0 : len;
760 ext_debug("insert new index %d after: %llu. "
761 "move %d from 0x%p to 0x%p\n",
762 logical, ptr, len,
763 (curp->p_idx + 1), (curp->p_idx + 2));
764 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
765 }
766 ix = curp->p_idx + 1; 750 ix = curp->p_idx + 1;
767 } else { 751 } else {
768 /* insert before */ 752 /* insert before */
769 len = len * sizeof(struct ext4_extent_idx); 753 ext_debug("insert new index %d before: %llu\n", logical, ptr);
770 len = len < 0 ? 0 : len;
771 ext_debug("insert new index %d before: %llu. "
772 "move %d from 0x%p to 0x%p\n",
773 logical, ptr, len,
774 curp->p_idx, (curp->p_idx + 1));
775 memmove(curp->p_idx + 1, curp->p_idx, len);
776 ix = curp->p_idx; 754 ix = curp->p_idx;
777 } 755 }
778 756
757 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
758 BUG_ON(len < 0);
759 if (len > 0) {
760 ext_debug("insert new index %d: "
761 "move %d indices from 0x%p to 0x%p\n",
762 logical, len, ix, ix + 1);
763 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
764 }
765
766 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
767 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
768 return -EIO;
769 }
770
779 ix->ei_block = cpu_to_le32(logical); 771 ix->ei_block = cpu_to_le32(logical);
780 ext4_idx_store_pblock(ix, ptr); 772 ext4_idx_store_pblock(ix, ptr);
781 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 773 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
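
The rewritten ext4_ext_insert_index() first picks the slot (before or after p_idx) and then performs a single memmove sized by the count of live entries from the slot through the last one, instead of the two hand-computed byte lengths that were removed. The generic array-insertion pattern, as a small sketch with illustrative names:

#include <stdio.h>
#include <string.h>

/* Choose the slot first, then shift everything from the slot through
 * the last live entry up by one position, mirroring
 * "len = EXT_LAST_INDEX(hdr) - ix + 1" followed by one memmove. */
static void insert_at(int *arr, int *nr, int slot, int val)
{
	int tail = *nr - slot;		/* entries from slot..last */

	memmove(&arr[slot + 1], &arr[slot], tail * sizeof(arr[0]));
	arr[slot] = val;
	(*nr)++;
}

int main(void)
{
	int a[8] = { 10, 20, 40 };
	int n = 3;
	int i;

	insert_at(a, &n, 2, 30);	/* insert before the 40 */
	for (i = 0; i < n; i++)
		printf("%d ", a[i]);	/* 10 20 30 40 */
	printf("\n");
	return 0;
}

Computing the shift length from the insertion point rather than from EXT_MAX_INDEX also removes the old "len < 0 ? 0 : len" clamping.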
@@ -1042,16 +1034,14 @@ cleanup:
1042 */ 1034 */
1043static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1035static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1044 unsigned int flags, 1036 unsigned int flags,
1045 struct ext4_ext_path *path,
1046 struct ext4_extent *newext) 1037 struct ext4_extent *newext)
1047{ 1038{
1048 struct ext4_ext_path *curp = path;
1049 struct ext4_extent_header *neh; 1039 struct ext4_extent_header *neh;
1050 struct buffer_head *bh; 1040 struct buffer_head *bh;
1051 ext4_fsblk_t newblock; 1041 ext4_fsblk_t newblock;
1052 int err = 0; 1042 int err = 0;
1053 1043
1054 newblock = ext4_ext_new_meta_block(handle, inode, path, 1044 newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1055 newext, &err, flags); 1045 newext, &err, flags);
1056 if (newblock == 0) 1046 if (newblock == 0)
1057 return err; 1047 return err;
@@ -1071,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1071 } 1061 }
1072 1062
1073 /* move top-level index/leaf into new block */ 1063 /* move top-level index/leaf into new block */
1074 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); 1064 memmove(bh->b_data, EXT4_I(inode)->i_data,
1065 sizeof(EXT4_I(inode)->i_data));
1075 1066
1076 /* set size of new block */ 1067 /* set size of new block */
1077 neh = ext_block_hdr(bh); 1068 neh = ext_block_hdr(bh);
@@ -1089,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1089 if (err) 1080 if (err)
1090 goto out; 1081 goto out;
1091 1082
1092 /* create index in new top-level index: num,max,pointer */ 1083 /* Update top-level index: num,max,pointer */
1093 err = ext4_ext_get_access(handle, inode, curp);
1094 if (err)
1095 goto out;
1096
1097 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1098 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1099 curp->p_hdr->eh_entries = cpu_to_le16(1);
1100 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1101
1102 if (path[0].p_hdr->eh_depth)
1103 curp->p_idx->ei_block =
1104 EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1105 else
1106 curp->p_idx->ei_block =
1107 EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1108 ext4_idx_store_pblock(curp->p_idx, newblock);
1109
1110 neh = ext_inode_hdr(inode); 1084 neh = ext_inode_hdr(inode);
1085 neh->eh_entries = cpu_to_le16(1);
1086 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1087 if (neh->eh_depth == 0) {
1088 /* Root extent block becomes index block */
1089 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1090 EXT_FIRST_INDEX(neh)->ei_block =
1091 EXT_FIRST_EXTENT(neh)->ee_block;
1092 }
1111 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1093 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1112 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1094 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1113 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1095 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1114 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1096 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1115 1097
1116 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1098 neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
1117 err = ext4_ext_dirty(handle, inode, curp); 1099 ext4_mark_inode_dirty(handle, inode);
1118out: 1100out:
1119 brelse(bh); 1101 brelse(bh);
1120 1102
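
With the path argument gone, ext4_ext_grow_indepth() works directly on the inode: the whole root node stored in i_data is copied into a freshly allocated block, and the root is then rewritten in place as a one-entry index with its depth bumped. A toy model of that data movement, with an invented field layout and sizes:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Illustrative layout only; not the on-disk format. */
struct hdr { uint16_t entries, max, depth; };
struct root_area { struct hdr h; unsigned char body[54]; };

int main(void)
{
	struct root_area root = { .h = { .entries = 4, .max = 4, .depth = 0 } };
	unsigned char newblock[4096];

	memcpy(newblock, &root, sizeof(root));	/* old root becomes a child */

	root.h.entries = 1;		/* root now holds a single index */
	root.h.depth += 1;		/* tree is one level deeper */
	printf("depth %d, entries %d\n", root.h.depth, root.h.entries);
	return 0;
}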
@@ -1162,8 +1144,7 @@ repeat:
1162 err = PTR_ERR(path); 1144 err = PTR_ERR(path);
1163 } else { 1145 } else {
1164 /* tree is full, time to grow in depth */ 1146 /* tree is full, time to grow in depth */
1165 err = ext4_ext_grow_indepth(handle, inode, flags, 1147 err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1166 path, newext);
1167 if (err) 1148 if (err)
1168 goto out; 1149 goto out;
1169 1150
@@ -1235,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode,
1235 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1216 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1236 EXT4_ERROR_INODE(inode, 1217 EXT4_ERROR_INODE(inode,
1237 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1218 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1238 ix != NULL ? ix->ei_block : 0, 1219 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1239 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 1220 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1240 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, 1221 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1241 depth); 1222 depth);
1242 return -EIO; 1223 return -EIO;
1243 } 1224 }
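
The fix above routes ei_block through le32_to_cpu() before formatting, since extent fields are stored little-endian on disk and printing them raw gives garbage on big-endian hosts. A self-contained stand-in for the conversion (le32_to_cpu_sketch() is a local illustration, not the kernel macro):

#include <stdio.h>
#include <stdint.h>

/* Decode a 4-byte little-endian on-disk field regardless of host
 * byte order. */
static uint32_t le32_to_cpu_sketch(uint32_t v)
{
	const uint8_t *b = (const uint8_t *)&v;

	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
	uint32_t on_disk = 0x04030201;	/* field as loaded from disk */

	printf("%u\n", le32_to_cpu_sketch(on_disk));
	return 0;
}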
@@ -1260,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode,
1260/* 1241/*
1261 * search the closest allocated block to the right for *logical 1242 * search the closest allocated block to the right for *logical
1262 * and returns it at @logical + its physical address at @phys 1243
1263 * if *logical is the smallest allocated block, the function 1244 * if *logical is the largest allocated block, the function
1264 * returns 0 at @phys 1245 * returns 0 at @phys
1265 * return value contains 0 (success) or error code 1246 * return value contains 0 (success) or error code
1266 */ 1247 */
1267static int ext4_ext_search_right(struct inode *inode, 1248static int ext4_ext_search_right(struct inode *inode,
1268 struct ext4_ext_path *path, 1249 struct ext4_ext_path *path,
1269 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1250 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1251 struct ext4_extent **ret_ex)
1270{ 1252{
1271 struct buffer_head *bh = NULL; 1253 struct buffer_head *bh = NULL;
1272 struct ext4_extent_header *eh; 1254 struct ext4_extent_header *eh;
@@ -1308,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode,
1308 return -EIO; 1290 return -EIO;
1309 } 1291 }
1310 } 1292 }
1311 *logical = le32_to_cpu(ex->ee_block); 1293 goto found_extent;
1312 *phys = ext4_ext_pblock(ex);
1313 return 0;
1314 } 1294 }
1315 1295
1316 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1296 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1323,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode,
1323 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1303 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1324 /* next allocated block in this leaf */ 1304 /* next allocated block in this leaf */
1325 ex++; 1305 ex++;
1326 *logical = le32_to_cpu(ex->ee_block); 1306 goto found_extent;
1327 *phys = ext4_ext_pblock(ex);
1328 return 0;
1329 } 1307 }
1330 1308
1331 /* go up and search for index to the right */ 1309 /* go up and search for index to the right */
@@ -1368,9 +1346,12 @@ got_index:
1368 return -EIO; 1346 return -EIO;
1369 } 1347 }
1370 ex = EXT_FIRST_EXTENT(eh); 1348 ex = EXT_FIRST_EXTENT(eh);
1349found_extent:
1371 *logical = le32_to_cpu(ex->ee_block); 1350 *logical = le32_to_cpu(ex->ee_block);
1372 *phys = ext4_ext_pblock(ex); 1351 *phys = ext4_ext_pblock(ex);
1373 put_bh(bh); 1352 *ret_ex = ex;
1353 if (bh)
1354 put_bh(bh);
1374 return 0; 1355 return 0;
1375} 1356}
1376 1357
@@ -1395,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1395 while (depth >= 0) { 1376 while (depth >= 0) {
1396 if (depth == path->p_depth) { 1377 if (depth == path->p_depth) {
1397 /* leaf */ 1378 /* leaf */
1398 if (path[depth].p_ext != 1379 if (path[depth].p_ext &&
1380 path[depth].p_ext !=
1399 EXT_LAST_EXTENT(path[depth].p_hdr)) 1381 EXT_LAST_EXTENT(path[depth].p_hdr))
1400 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1382 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1401 } else { 1383 } else {
@@ -1623,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1623 * such that there will be no overlap, and then returns 1. 1605 * such that there will be no overlap, and then returns 1.
1624 * If there is no overlap found, it returns 0. 1606 * If there is no overlap found, it returns 0.
1625 */ 1607 */
1626static unsigned int ext4_ext_check_overlap(struct inode *inode, 1608static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1609 struct inode *inode,
1627 struct ext4_extent *newext, 1610 struct ext4_extent *newext,
1628 struct ext4_ext_path *path) 1611 struct ext4_ext_path *path)
1629{ 1612{
@@ -1637,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1637 if (!path[depth].p_ext) 1620 if (!path[depth].p_ext)
1638 goto out; 1621 goto out;
1639 b2 = le32_to_cpu(path[depth].p_ext->ee_block); 1622 b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1623 b2 &= ~(sbi->s_cluster_ratio - 1);
1640 1624
1641 /* 1625 /*
1642 * get the next allocated block if the extent in the path 1626 * get the next allocated block if the extent in the path
@@ -1646,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
1646 b2 = ext4_ext_next_allocated_block(path); 1630 b2 = ext4_ext_next_allocated_block(path);
1647 if (b2 == EXT_MAX_BLOCKS) 1631 if (b2 == EXT_MAX_BLOCKS)
1648 goto out; 1632 goto out;
1633 b2 &= ~(sbi->s_cluster_ratio - 1);
1649 } 1634 }
1650 1635
1651 /* check for wrap through zero on extent logical start block*/ 1636 /* check for wrap through zero on extent logical start block*/
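
Both added lines round a logical block down to its cluster boundary so the overlap test operates on whole bigalloc clusters. The mask form only works because s_cluster_ratio is a power of two; a one-line demonstration under that assumption:

#include <stdio.h>

int main(void)
{
	unsigned ratio = 16;		/* assumed blocks per cluster, 2^4 */
	unsigned blk = 1000003;
	unsigned cluster_start = blk & ~(ratio - 1);

	printf("block %u belongs to the cluster starting at %u\n",
	       blk, cluster_start);
	return 0;			/* ...starting at 1000000 */
}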
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1697 /* try to insert block into found extent and return */ 1682 /* try to insert block into found extent and return */
1698 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1683 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1699 && ext4_can_extents_be_merged(inode, ex, newext)) { 1684 && ext4_can_extents_be_merged(inode, ex, newext)) {
1700 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1685 ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
1701 ext4_ext_is_uninitialized(newext), 1686 ext4_ext_is_uninitialized(newext),
1702 ext4_ext_get_actual_len(newext), 1687 ext4_ext_get_actual_len(newext),
1703 le32_to_cpu(ex->ee_block), 1688 le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1735 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 1720 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736 next = ext4_ext_next_leaf_block(path); 1721 next = ext4_ext_next_leaf_block(path);
1737 if (next != EXT_MAX_BLOCKS) { 1722 if (next != EXT_MAX_BLOCKS) {
1738 ext_debug("next leaf block - %d\n", next); 1723 ext_debug("next leaf block - %u\n", next);
1739 BUG_ON(npath != NULL); 1724 BUG_ON(npath != NULL);
1740 npath = ext4_ext_find_extent(inode, next, NULL); 1725 npath = ext4_ext_find_extent(inode, next, NULL);
1741 if (IS_ERR(npath)) 1726 if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
1773 1758
1774 if (!nearex) { 1759 if (!nearex) {
1775 /* there is no extent in this leaf, create first one */ 1760 /* there is no extent in this leaf, create first one */
1776 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1761 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1777 le32_to_cpu(newext->ee_block), 1762 le32_to_cpu(newext->ee_block),
1778 ext4_ext_pblock(newext), 1763 ext4_ext_pblock(newext),
1779 ext4_ext_is_uninitialized(newext), 1764 ext4_ext_is_uninitialized(newext),
1780 ext4_ext_get_actual_len(newext)); 1765 ext4_ext_get_actual_len(newext));
1781 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1766 nearex = EXT_FIRST_EXTENT(eh);
1782 } else if (le32_to_cpu(newext->ee_block) 1767 } else {
1768 if (le32_to_cpu(newext->ee_block)
1783 > le32_to_cpu(nearex->ee_block)) { 1769 > le32_to_cpu(nearex->ee_block)) {
1784/* BUG_ON(newext->ee_block == nearex->ee_block); */ 1770 /* Insert after */
1785 if (nearex != EXT_LAST_EXTENT(eh)) { 1771 ext_debug("insert %u:%llu:[%d]%d before: "
1786 len = EXT_MAX_EXTENT(eh) - nearex; 1772 "nearest %p\n",
1787 len = (len - 1) * sizeof(struct ext4_extent);
1788 len = len < 0 ? 0 : len;
1789 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1790 "move %d from 0x%p to 0x%p\n",
1791 le32_to_cpu(newext->ee_block), 1773 le32_to_cpu(newext->ee_block),
1792 ext4_ext_pblock(newext), 1774 ext4_ext_pblock(newext),
1793 ext4_ext_is_uninitialized(newext), 1775 ext4_ext_is_uninitialized(newext),
1794 ext4_ext_get_actual_len(newext), 1776 ext4_ext_get_actual_len(newext),
1795 nearex, len, nearex + 1, nearex + 2); 1777 nearex);
1796 memmove(nearex + 2, nearex + 1, len); 1778 nearex++;
1779 } else {
1780 /* Insert before */
1781 BUG_ON(newext->ee_block == nearex->ee_block);
 1782 ext_debug("insert %u:%llu:[%d]%d before: "
1783 "nearest %p\n",
1784 le32_to_cpu(newext->ee_block),
1785 ext4_ext_pblock(newext),
1786 ext4_ext_is_uninitialized(newext),
1787 ext4_ext_get_actual_len(newext),
1788 nearex);
1789 }
1790 len = EXT_LAST_EXTENT(eh) - nearex + 1;
1791 if (len > 0) {
1792 ext_debug("insert %u:%llu:[%d]%d: "
1793 "move %d extents from 0x%p to 0x%p\n",
1794 le32_to_cpu(newext->ee_block),
1795 ext4_ext_pblock(newext),
1796 ext4_ext_is_uninitialized(newext),
1797 ext4_ext_get_actual_len(newext),
1798 len, nearex, nearex + 1);
1799 memmove(nearex + 1, nearex,
1800 len * sizeof(struct ext4_extent));
1797 } 1801 }
1798 path[depth].p_ext = nearex + 1;
1799 } else {
1800 BUG_ON(newext->ee_block == nearex->ee_block);
1801 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1802 len = len < 0 ? 0 : len;
1803 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1804 "move %d from 0x%p to 0x%p\n",
1805 le32_to_cpu(newext->ee_block),
1806 ext4_ext_pblock(newext),
1807 ext4_ext_is_uninitialized(newext),
1808 ext4_ext_get_actual_len(newext),
1809 nearex, len, nearex, nearex + 1);
1810 memmove(nearex + 1, nearex, len);
1811 path[depth].p_ext = nearex;
1812 } 1802 }
1813 1803
1814 le16_add_cpu(&eh->eh_entries, 1); 1804 le16_add_cpu(&eh->eh_entries, 1);
1815 nearex = path[depth].p_ext; 1805 path[depth].p_ext = nearex;
1816 nearex->ee_block = newext->ee_block; 1806 nearex->ee_block = newext->ee_block;
1817 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 1807 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1818 nearex->ee_len = newext->ee_len; 1808 nearex->ee_len = newext->ee_len;
@@ -1962,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1962 struct ext4_ext_cache *cex; 1952 struct ext4_ext_cache *cex;
1963 BUG_ON(len == 0); 1953 BUG_ON(len == 0);
1964 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1954 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1955 trace_ext4_ext_put_in_cache(inode, block, len, start);
1965 cex = &EXT4_I(inode)->i_cached_extent; 1956 cex = &EXT4_I(inode)->i_cached_extent;
1966 cex->ec_block = block; 1957 cex->ec_block = block;
1967 cex->ec_len = len; 1958 cex->ec_len = len;
@@ -2063,6 +2054,7 @@ errout:
2063 sbi->extent_cache_misses++; 2054 sbi->extent_cache_misses++;
2064 else 2055 else
2065 sbi->extent_cache_hits++; 2056 sbi->extent_cache_hits++;
2057 trace_ext4_ext_in_cache(inode, block, ret);
2066 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2067 return ret; 2059 return ret;
2068} 2060}
@@ -2130,6 +2122,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2130 if (err) 2122 if (err)
2131 return err; 2123 return err;
2132 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2124 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2125 trace_ext4_ext_rm_idx(inode, leaf);
2126
2133 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2127 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2134 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2128 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2135 return err; 2129 return err;
@@ -2158,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2158 * need to account for leaf block credit 2152 * need to account for leaf block credit
2159 * 2153 *
2160 * bitmaps and block group descriptor blocks 2154 * bitmaps and block group descriptor blocks
2161 * and other metadat blocks still need to be 2155 * and other metadata blocks still need to be
2162 * accounted. 2156 * accounted.
2163 */ 2157 */
2164 /* 1 bitmap, 1 block group descriptor */ 2158 /* 1 bitmap, 1 block group descriptor */
@@ -2195,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2195} 2189}
2196 2190
2197static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2191static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2198 struct ext4_extent *ex, 2192 struct ext4_extent *ex,
2199 ext4_lblk_t from, ext4_lblk_t to) 2193 ext4_fsblk_t *partial_cluster,
2194 ext4_lblk_t from, ext4_lblk_t to)
2200{ 2195{
2196 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2201 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2197 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2198 ext4_fsblk_t pblk;
2202 int flags = EXT4_FREE_BLOCKS_FORGET; 2199 int flags = EXT4_FREE_BLOCKS_FORGET;
2203 2200
2204 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2201 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2205 flags |= EXT4_FREE_BLOCKS_METADATA; 2202 flags |= EXT4_FREE_BLOCKS_METADATA;
2203 /*
2204 * For bigalloc file systems, we never free a partial cluster
2205 * at the beginning of the extent. Instead, we make a note
2206 * that we tried freeing the cluster, and check to see if we
2207 * need to free it on a subsequent call to ext4_remove_blocks,
2208 * or at the end of the ext4_truncate() operation.
2209 */
2210 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2211
2212 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2213 /*
2214 * If we have a partial cluster, and it's different from the
2215 * cluster of the last block, we need to explicitly free the
2216 * partial cluster here.
2217 */
2218 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2219 if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2220 ext4_free_blocks(handle, inode, NULL,
2221 EXT4_C2B(sbi, *partial_cluster),
2222 sbi->s_cluster_ratio, flags);
2223 *partial_cluster = 0;
2224 }
2225
2206#ifdef EXTENTS_STATS 2226#ifdef EXTENTS_STATS
2207 { 2227 {
2208 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2228 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
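
The deferral logic above is the heart of the bigalloc change: the first cluster of a removed range may be shared with an extent that has not been visited yet, so it is only remembered in *partial_cluster and freed once a later removal proves it is no longer referenced. A simplified model, with EXT4_B2C() reduced to a divide by an assumed cluster ratio of 16:

#include <stdio.h>

#define RATIO	16
#define B2C(b)	((b) / RATIO)	/* stand-in for EXT4_B2C() */

static void remove_tail(unsigned long long last_pblk,
			unsigned long long *partial)
{
	if (*partial && B2C(last_pblk) != *partial) {
		printf("free deferred cluster %llu\n", *partial);
		*partial = 0;
	}
	/* ...free this extent's whole clusters, then set *partial to
	 * B2C(first freed block) if that block wasn't cluster-aligned */
}

int main(void)
{
	unsigned long long partial = 100;	/* cluster 100 still pending */

	remove_tail(1590, &partial);		/* B2C(1590) == 99 != 100 */
	return 0;
}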
@@ -2222,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2222 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2242 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2223 /* tail removal */ 2243 /* tail removal */
2224 ext4_lblk_t num; 2244 ext4_lblk_t num;
2225 ext4_fsblk_t start;
2226 2245
2227 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2246 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2228 start = ext4_ext_pblock(ex) + ee_len - num; 2247 pblk = ext4_ext_pblock(ex) + ee_len - num;
2229 ext_debug("free last %u blocks starting %llu\n", num, start); 2248 ext_debug("free last %u blocks starting %llu\n", num, pblk);
2230 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2249 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2250 /*
2251 * If the block range to be freed didn't start at the
2252 * beginning of a cluster, and we removed the entire
2253 * extent, save the partial cluster here, since we
2255 * might need to delete it if we determine that the
2255 * truncate operation has removed all of the blocks in
2256 * the cluster.
2257 */
2258 if (pblk & (sbi->s_cluster_ratio - 1) &&
2259 (ee_len == num))
2260 *partial_cluster = EXT4_B2C(sbi, pblk);
2261 else
2262 *partial_cluster = 0;
2231 } else if (from == le32_to_cpu(ex->ee_block) 2263 } else if (from == le32_to_cpu(ex->ee_block)
2232 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2264 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2233 /* head removal */ 2265 /* head removal */
@@ -2238,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2238 start = ext4_ext_pblock(ex); 2270 start = ext4_ext_pblock(ex);
2239 2271
2240 ext_debug("free first %u blocks starting %llu\n", num, start); 2272 ext_debug("free first %u blocks starting %llu\n", num, start);
2241 ext4_free_blocks(handle, inode, 0, start, num, flags); 2273 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2242 2274
2243 } else { 2275 } else {
2244 printk(KERN_INFO "strange request: removal(2) " 2276 printk(KERN_INFO "strange request: removal(2) "
@@ -2262,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2262 */ 2294 */
2263static int 2295static int
2264ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2296ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2265 struct ext4_ext_path *path, ext4_lblk_t start, 2297 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
2266 ext4_lblk_t end) 2298 ext4_lblk_t start, ext4_lblk_t end)
2267{ 2299{
2300 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2268 int err = 0, correct_index = 0; 2301 int err = 0, correct_index = 0;
2269 int depth = ext_depth(inode), credits; 2302 int depth = ext_depth(inode), credits;
2270 struct ext4_extent_header *eh; 2303 struct ext4_extent_header *eh;
2271 ext4_lblk_t a, b, block; 2304 ext4_lblk_t a, b;
2272 unsigned num; 2305 unsigned num;
2273 ext4_lblk_t ex_ee_block; 2306 ext4_lblk_t ex_ee_block;
2274 unsigned short ex_ee_len; 2307 unsigned short ex_ee_len;
2275 unsigned uninitialized = 0; 2308 unsigned uninitialized = 0;
2276 struct ext4_extent *ex; 2309 struct ext4_extent *ex;
2277 struct ext4_map_blocks map;
2278 2310
2279 /* the header must be checked already in ext4_ext_remove_space() */ 2311 /* the header must be checked already in ext4_ext_remove_space() */
2280 ext_debug("truncate since %u in leaf\n", start); 2312 ext_debug("truncate since %u in leaf\n", start);
@@ -2291,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2291 ex_ee_block = le32_to_cpu(ex->ee_block); 2323 ex_ee_block = le32_to_cpu(ex->ee_block);
2292 ex_ee_len = ext4_ext_get_actual_len(ex); 2324 ex_ee_len = ext4_ext_get_actual_len(ex);
2293 2325
2326 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2327
2294 while (ex >= EXT_FIRST_EXTENT(eh) && 2328 while (ex >= EXT_FIRST_EXTENT(eh) &&
2295 ex_ee_block + ex_ee_len > start) { 2329 ex_ee_block + ex_ee_len > start) {
2296 2330
@@ -2315,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2349 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2350 ex_ee_len = ext4_ext_get_actual_len(ex);
2317 continue; 2351 continue;
2318 } else if (a != ex_ee_block && 2352 } else if (b != ex_ee_block + ex_ee_len - 1) {
2319 b != ex_ee_block + ex_ee_len - 1) { 2353 EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
2320 /* 2354 start, end);
2321 * If this is a truncate, then this condition should 2355 err = -EIO;
2322 * never happen because at least one of the end points 2356 goto out;
2323 * needs to be on the edge of the extent.
2324 */
2325 if (end == EXT_MAX_BLOCKS - 1) {
2326 ext_debug(" bad truncate %u:%u\n",
2327 start, end);
2328 block = 0;
2329 num = 0;
2330 err = -EIO;
2331 goto out;
2332 }
2333 /*
2334 * else this is a hole punch, so the extent needs to
2335 * be split since neither edge of the hole is on the
2336 * extent edge
2337 */
2338 else{
2339 map.m_pblk = ext4_ext_pblock(ex);
2340 map.m_lblk = ex_ee_block;
2341 map.m_len = b - ex_ee_block;
2342
2343 err = ext4_split_extent(handle,
2344 inode, path, &map, 0,
2345 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2346 EXT4_GET_BLOCKS_PRE_IO);
2347
2348 if (err < 0)
2349 goto out;
2350
2351 ex_ee_len = ext4_ext_get_actual_len(ex);
2352
2353 b = ex_ee_block+ex_ee_len - 1 < end ?
2354 ex_ee_block+ex_ee_len - 1 : end;
2355
2356 /* Then remove tail of this extent */
2357 block = ex_ee_block;
2358 num = a - block;
2359 }
2360 } else if (a != ex_ee_block) { 2357 } else if (a != ex_ee_block) {
2361 /* remove tail of the extent */ 2358 /* remove tail of the extent */
2362 block = ex_ee_block; 2359 num = a - ex_ee_block;
2363 num = a - block;
2364 } else if (b != ex_ee_block + ex_ee_len - 1) {
2365 /* remove head of the extent */
2366 block = b;
2367 num = ex_ee_block + ex_ee_len - b;
2368
2369 /*
2370 * If this is a truncate, this condition
2371 * should never happen
2372 */
2373 if (end == EXT_MAX_BLOCKS - 1) {
2374 ext_debug(" bad truncate %u:%u\n",
2375 start, end);
2376 err = -EIO;
2377 goto out;
2378 }
2379 } else { 2360 } else {
2380 /* remove whole extent: excellent! */ 2361 /* remove whole extent: excellent! */
2381 block = ex_ee_block;
2382 num = 0; 2362 num = 0;
2383 if (a != ex_ee_block) {
2384 ext_debug(" bad truncate %u:%u\n",
2385 start, end);
2386 err = -EIO;
2387 goto out;
2388 }
2389
2390 if (b != ex_ee_block + ex_ee_len - 1) {
2391 ext_debug(" bad truncate %u:%u\n",
2392 start, end);
2393 err = -EIO;
2394 goto out;
2395 }
2396 } 2363 }
2397
2398 /* 2364 /*
2399 * 3 for leaf, sb, and inode plus 2 (bmap and group 2365 * 3 for leaf, sb, and inode plus 2 (bmap and group
2400 * descriptor) for each block group; assume two block 2366 * descriptor) for each block group; assume two block
@@ -2416,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2416 if (err) 2382 if (err)
2417 goto out; 2383 goto out;
2418 2384
2419 err = ext4_remove_blocks(handle, inode, ex, a, b); 2385 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2386 a, b);
2420 if (err) 2387 if (err)
2421 goto out; 2388 goto out;
2422 2389
2423 if (num == 0) { 2390 if (num == 0)
2424 /* this extent is removed; mark slot entirely unused */ 2391 /* this extent is removed; mark slot entirely unused */
2425 ext4_ext_store_pblock(ex, 0); 2392 ext4_ext_store_pblock(ex, 0);
2426 } else if (block != ex_ee_block) {
2427 /*
2428 * If this was a head removal, then we need to update
2429 * the physical block since it is now at a different
2430 * location
2431 */
2432 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2433 }
2434 2393
2435 ex->ee_block = cpu_to_le32(block);
2436 ex->ee_len = cpu_to_le16(num); 2394 ex->ee_len = cpu_to_le16(num);
2437 /* 2395 /*
2438 * Do not mark uninitialized if all the blocks in the 2396 * Do not mark uninitialized if all the blocks in the
@@ -2440,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2440 */ 2398 */
2441 if (uninitialized && num) 2399 if (uninitialized && num)
2442 ext4_ext_mark_uninitialized(ex); 2400 ext4_ext_mark_uninitialized(ex);
2443
2444 err = ext4_ext_dirty(handle, inode, path + depth);
2445 if (err)
2446 goto out;
2447
2448 /* 2401 /*
2449 * If the extent was completely released, 2402 * If the extent was completely released,
2450 * we need to remove it from the leaf 2403 * we need to remove it from the leaf
@@ -2464,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2464 sizeof(struct ext4_extent)); 2417 sizeof(struct ext4_extent));
2465 } 2418 }
2466 le16_add_cpu(&eh->eh_entries, -1); 2419 le16_add_cpu(&eh->eh_entries, -1);
2467 } 2420 } else
2421 *partial_cluster = 0;
2468 2422
2469 ext_debug("new extent: %u:%u:%llu\n", block, num, 2423 err = ext4_ext_dirty(handle, inode, path + depth);
2424 if (err)
2425 goto out;
2426
2427 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2470 ext4_ext_pblock(ex)); 2428 ext4_ext_pblock(ex));
2471 ex--; 2429 ex--;
2472 ex_ee_block = le32_to_cpu(ex->ee_block); 2430 ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2476,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2476 if (correct_index && eh->eh_entries) 2434 if (correct_index && eh->eh_entries)
2477 err = ext4_ext_correct_indexes(handle, inode, path); 2435 err = ext4_ext_correct_indexes(handle, inode, path);
2478 2436
2437 /*
2438 * If there is still an entry in the leaf node, check to see if
2439 * it references the partial cluster. This is the only place
2440 * where it could; if it doesn't, we can free the cluster.
2441 */
2442 if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
2443 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2444 *partial_cluster)) {
2445 int flags = EXT4_FREE_BLOCKS_FORGET;
2446
2447 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2448 flags |= EXT4_FREE_BLOCKS_METADATA;
2449
2450 ext4_free_blocks(handle, inode, NULL,
2451 EXT4_C2B(sbi, *partial_cluster),
2452 sbi->s_cluster_ratio, flags);
2453 *partial_cluster = 0;
2454 }
2455
2479 /* if this leaf is free, then we should 2456 /* if this leaf is free, then we should
2480 * remove it from index block above */ 2457 * remove it from index block above */
2481 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2458 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2511,6 +2488,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2511 struct super_block *sb = inode->i_sb; 2488 struct super_block *sb = inode->i_sb;
2512 int depth = ext_depth(inode); 2489 int depth = ext_depth(inode);
2513 struct ext4_ext_path *path; 2490 struct ext4_ext_path *path;
2491 ext4_fsblk_t partial_cluster = 0;
2514 handle_t *handle; 2492 handle_t *handle;
2515 int i, err; 2493 int i, err;
2516 2494
@@ -2524,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2524again: 2502again:
2525 ext4_ext_invalidate_cache(inode); 2503 ext4_ext_invalidate_cache(inode);
2526 2504
2505 trace_ext4_ext_remove_space(inode, start, depth);
2506
2527 /* 2507 /*
2528 * We start scanning from right side, freeing all the blocks 2508 * We start scanning from right side, freeing all the blocks
2529 * after i_size and walking into the tree depth-wise. 2509 * after i_size and walking into the tree depth-wise.
@@ -2546,7 +2526,8 @@ again:
2546 if (i == depth) { 2526 if (i == depth) {
2547 /* this is leaf block */ 2527 /* this is leaf block */
2548 err = ext4_ext_rm_leaf(handle, inode, path, 2528 err = ext4_ext_rm_leaf(handle, inode, path,
2549 start, EXT_MAX_BLOCKS - 1); 2529 &partial_cluster, start,
2530 EXT_MAX_BLOCKS - 1);
2550 /* root level has p_bh == NULL, brelse() eats this */ 2531 /* root level has p_bh == NULL, brelse() eats this */
2551 brelse(path[i].p_bh); 2532 brelse(path[i].p_bh);
2552 path[i].p_bh = NULL; 2533 path[i].p_bh = NULL;
@@ -2618,6 +2599,24 @@ again:
2618 } 2599 }
2619 } 2600 }
2620 2601
2602 trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
2603 path->p_hdr->eh_entries);
2604
2605 /* If we still have something in the partial cluster and we have removed
2606 * even the first extent, then we should free the blocks in the partial
2607 * cluster as well. */
2608 if (partial_cluster && path->p_hdr->eh_entries == 0) {
2609 int flags = EXT4_FREE_BLOCKS_FORGET;
2610
2611 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2612 flags |= EXT4_FREE_BLOCKS_METADATA;
2613
2614 ext4_free_blocks(handle, inode, NULL,
2615 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2616 EXT4_SB(sb)->s_cluster_ratio, flags);
2617 partial_cluster = 0;
2618 }
2619
2621 /* TODO: flexible tree reduction should be here */ 2620 /* TODO: flexible tree reduction should be here */
2622 if (path->p_hdr->eh_entries == 0) { 2621 if (path->p_hdr->eh_entries == 0) {
2623 /* 2622 /*
@@ -2909,17 +2908,29 @@ out:
2909 * a> There is no split required: Entire extent should be initialized 2908 * a> There is no split required: Entire extent should be initialized
2910 * b> Splits in two extents: Write is happening at either end of the extent 2909 * b> Splits in two extents: Write is happening at either end of the extent
2911 * c> Splits in three extents: Someone is writing in the middle of the extent 2910
2911 *
2912 * Pre-conditions:
2913 * - The extent pointed to by 'path' is uninitialized.
2914 * - The extent pointed to by 'path' contains a superset
2915 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2916 *
2917 * Post-conditions on success:
2918 * - the returned value is the number of blocks beyond map->m_lblk
2919 * that are allocated and initialized.
2920 * It is guaranteed to be >= map->m_len.
2912 */ 2921 */
2913static int ext4_ext_convert_to_initialized(handle_t *handle, 2922static int ext4_ext_convert_to_initialized(handle_t *handle,
2914 struct inode *inode, 2923 struct inode *inode,
2915 struct ext4_map_blocks *map, 2924 struct ext4_map_blocks *map,
2916 struct ext4_ext_path *path) 2925 struct ext4_ext_path *path)
2917{ 2926{
2927 struct ext4_extent_header *eh;
2918 struct ext4_map_blocks split_map; 2928 struct ext4_map_blocks split_map;
2919 struct ext4_extent zero_ex; 2929 struct ext4_extent zero_ex;
2920 struct ext4_extent *ex; 2930 struct ext4_extent *ex;
2921 ext4_lblk_t ee_block, eof_block; 2931 ext4_lblk_t ee_block, eof_block;
2922 unsigned int allocated, ee_len, depth; 2932 unsigned int ee_len, depth;
2933 int allocated;
2923 int err = 0; 2934 int err = 0;
2924 int split_flag = 0; 2935 int split_flag = 0;
2925 2936
@@ -2933,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2933 eof_block = map->m_lblk + map->m_len; 2944 eof_block = map->m_lblk + map->m_len;
2934 2945
2935 depth = ext_depth(inode); 2946 depth = ext_depth(inode);
2947 eh = path[depth].p_hdr;
2936 ex = path[depth].p_ext; 2948 ex = path[depth].p_ext;
2937 ee_block = le32_to_cpu(ex->ee_block); 2949 ee_block = le32_to_cpu(ex->ee_block);
2938 ee_len = ext4_ext_get_actual_len(ex); 2950 ee_len = ext4_ext_get_actual_len(ex);
2939 allocated = ee_len - (map->m_lblk - ee_block); 2951 allocated = ee_len - (map->m_lblk - ee_block);
2940 2952
2953 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2954
2955 /* Pre-conditions */
2956 BUG_ON(!ext4_ext_is_uninitialized(ex));
2957 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2958 BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2959
2960 /*
2961 * Attempt to transfer newly initialized blocks from the currently
2962 * uninitialized extent to its left neighbor. This is much cheaper
2963 * than an insertion followed by a merge as those involve costly
2964 * memmove() calls. This is the common case in steady state for
2965 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2966 * writes.
2967 *
2968 * Limitations of the current logic:
2969 * - L1: we only deal with writes at the start of the extent.
2970 * The approach could be extended to writes at the end
2971 * of the extent but this scenario was deemed less common.
2972 * - L2: we do not deal with writes covering the whole extent.
2973 * This would require removing the extent if the transfer
2974 * is possible.
2975 * - L3: we only attempt to merge with an extent stored in the
2976 * same extent tree node.
2977 */
2978 if ((map->m_lblk == ee_block) && /*L1*/
2979 (map->m_len < ee_len) && /*L2*/
2980 (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
2981 struct ext4_extent *prev_ex;
2982 ext4_lblk_t prev_lblk;
2983 ext4_fsblk_t prev_pblk, ee_pblk;
2984 unsigned int prev_len, write_len;
2985
2986 prev_ex = ex - 1;
2987 prev_lblk = le32_to_cpu(prev_ex->ee_block);
2988 prev_len = ext4_ext_get_actual_len(prev_ex);
2989 prev_pblk = ext4_ext_pblock(prev_ex);
2990 ee_pblk = ext4_ext_pblock(ex);
2991 write_len = map->m_len;
2992
2993 /*
2994 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
2995 * upon those conditions:
2996 * - C1: prev_ex is initialized,
2997 * - C2: prev_ex is logically abutting ex,
2998 * - C3: prev_ex is physically abutting ex,
2999 * - C4: prev_ex can receive the additional blocks without
3000 * overflowing the (initialized) length limit.
3001 */
3002 if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
3003 ((prev_lblk + prev_len) == ee_block) && /*C2*/
3004 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
3005 (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
3006 err = ext4_ext_get_access(handle, inode, path + depth);
3007 if (err)
3008 goto out;
3009
3010 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3011 map, ex, prev_ex);
3012
3013 /* Shift the start of ex by 'write_len' blocks */
3014 ex->ee_block = cpu_to_le32(ee_block + write_len);
3015 ext4_ext_store_pblock(ex, ee_pblk + write_len);
3016 ex->ee_len = cpu_to_le16(ee_len - write_len);
3017 ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3018
3019 /* Extend prev_ex by 'write_len' blocks */
3020 prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3021
3022 /* Mark the block containing both extents as dirty */
3023 ext4_ext_dirty(handle, inode, path + depth);
3024
3025 /* Update path to point to the right extent */
3026 path[depth].p_ext = prev_ex;
3027
3028 /* Result: number of initialized blocks past m_lblk */
3029 allocated = write_len;
3030 goto out;
3031 }
3032 }
3033
2941 WARN_ON(map->m_lblk < ee_block); 3034 WARN_ON(map->m_lblk < ee_block);
2942 /* 3035 /*
2943 * It is safe to convert extent to initialized via explicit 3036 * It is safe to convert extent to initialized via explicit
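
The new fast path avoids a split followed by a merge (two memmove-heavy operations) by donating the just-written blocks from the head of the uninitialized extent to an initialized left neighbor that abuts it both logically and physically (conditions C2/C3 above). A sketch of the three field adjustments, with invented values:

#include <stdio.h>
#include <stdint.h>

/* Illustrative in-memory form of an extent; not the on-disk layout. */
struct ext { uint32_t lblk; uint64_t pblk; uint16_t len; };

int main(void)
{
	struct ext prev = { .lblk = 100, .pblk = 5000, .len = 8 };
	struct ext ex   = { .lblk = 108, .pblk = 5008, .len = 16 };
	uint16_t write_len = 4;		/* blocks just written at ex start */

	ex.lblk += write_len;		/* shrink ex from the front */
	ex.pblk += write_len;
	ex.len  -= write_len;		/* stays marked uninitialized */
	prev.len += write_len;		/* prev now covers the write */

	printf("prev: %u+%u, ex: %u+%u\n", prev.lblk, (unsigned)prev.len,
	       ex.lblk, (unsigned)ex.len);
	return 0;			/* prev: 100+12, ex: 112+12 */
}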
@@ -3165,6 +3258,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3165 return ext4_mark_inode_dirty(handle, inode); 3258 return ext4_mark_inode_dirty(handle, inode);
3166} 3259}
3167 3260
3261/**
3262 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3263 *
3264 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
3265 * whether there are any buffers marked for delayed allocation. It returns '1'
3266 * on the first delalloc'ed buffer head found. If no buffer head in the given
3267 * range is marked for delalloc, it returns 0.
3268 * lblk_start should always be <= lblk_end.
3269 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3270 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3271 * block sooner). This is useful when blocks are truncated sequentially from
3272 * lblk_start towards lblk_end.
3273 */
3274static int ext4_find_delalloc_range(struct inode *inode,
3275 ext4_lblk_t lblk_start,
3276 ext4_lblk_t lblk_end,
3277 int search_hint_reverse)
3278{
3279 struct address_space *mapping = inode->i_mapping;
3280 struct buffer_head *head, *bh = NULL;
3281 struct page *page;
3282 ext4_lblk_t i, pg_lblk;
3283 pgoff_t index;
3284
 3285 /* reverse search won't work if fs block size is less than page size */
3286 if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3287 search_hint_reverse = 0;
3288
3289 if (search_hint_reverse)
3290 i = lblk_end;
3291 else
3292 i = lblk_start;
3293
3294 index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3295
3296 while ((i >= lblk_start) && (i <= lblk_end)) {
3297 page = find_get_page(mapping, index);
3298 if (!page)
3299 goto nextpage;
3300
3301 if (!page_has_buffers(page))
3302 goto nextpage;
3303
3304 head = page_buffers(page);
3305 if (!head)
3306 goto nextpage;
3307
3308 bh = head;
3309 pg_lblk = index << (PAGE_CACHE_SHIFT -
3310 inode->i_blkbits);
3311 do {
3312 if (unlikely(pg_lblk < lblk_start)) {
3313 /*
3314 * This is possible when fs block size is less
 3315 * than page size and our cluster starts/ends in the
 3316 * middle of the page. So we need to skip the
3317 * initial few blocks till we reach the 'lblk'
3318 */
3319 pg_lblk++;
3320 continue;
3321 }
3322
3323 /* Check if the buffer is delayed allocated and that it
3324 * is not yet mapped. (when da-buffers are mapped during
3325 * their writeout, their da_mapped bit is set.)
3326 */
3327 if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3328 page_cache_release(page);
3329 trace_ext4_find_delalloc_range(inode,
3330 lblk_start, lblk_end,
3331 search_hint_reverse,
3332 1, i);
3333 return 1;
3334 }
3335 if (search_hint_reverse)
3336 i--;
3337 else
3338 i++;
3339 } while ((i >= lblk_start) && (i <= lblk_end) &&
3340 ((bh = bh->b_this_page) != head));
3341nextpage:
3342 if (page)
3343 page_cache_release(page);
3344 /*
3345 * Move to next page. 'i' will be the first lblk in the next
3346 * page.
3347 */
3348 if (search_hint_reverse)
3349 index--;
3350 else
3351 index++;
3352 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3353 }
3354
3355 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3356 search_hint_reverse, 0, 0);
3357 return 0;
3358}
3359
3360int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
3361 int search_hint_reverse)
3362{
3363 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3364 ext4_lblk_t lblk_start, lblk_end;
3365 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3366 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3367
3368 return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3369 search_hint_reverse);
3370}
3371
3372/**
3373 * Determines how many complete clusters (out of those specified by the 'map')
 3374 * are under delalloc and had quota reserved for them.
3375 * This function is called when we are writing out the blocks that were
3376 * originally written with their allocation delayed, but then the space was
3377 * allocated using fallocate() before the delayed allocation could be resolved.
3378 * The cases to look for are:
3379 * ('=' indicated delayed allocated blocks
3380 * '-' indicates non-delayed allocated blocks)
3381 * (a) partial clusters towards beginning and/or end outside of allocated range
3382 * are not delalloc'ed.
3383 * Ex:
3384 * |----c---=|====c====|====c====|===-c----|
3385 * |++++++ allocated ++++++|
3386 * ==> 4 complete clusters in above example
3387 *
3388 * (b) partial cluster (outside of allocated range) towards either end is
3389 * marked for delayed allocation. In this case, we will exclude that
3390 * cluster.
3391 * Ex:
3392 * |----====c========|========c========|
3393 * |++++++ allocated ++++++|
 3394 * ==> 1 complete cluster in above example
3395 *
3396 * Ex:
3397 * |================c================|
3398 * |++++++ allocated ++++++|
3399 * ==> 0 complete clusters in above example
3400 *
3401 * The ext4_da_update_reserve_space will be called only if we
3402 * determine here that there were some "entire" clusters that span
3403 * this 'allocated' range.
3404 * In the non-bigalloc case, this function will just end up returning num_blks
3405 * without ever calling ext4_find_delalloc_range.
3406 */
3407static unsigned int
3408get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3409 unsigned int num_blks)
3410{
3411 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3412 ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3413 ext4_lblk_t lblk_from, lblk_to, c_offset;
3414 unsigned int allocated_clusters = 0;
3415
3416 alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3417 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3418
3419 /* max possible clusters for this allocation */
3420 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3421
3422 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3423
3424 /* Check towards left side */
3425 c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3426 if (c_offset) {
3427 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3428 lblk_to = lblk_from + c_offset - 1;
3429
3430 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3431 allocated_clusters--;
3432 }
3433
3434 /* Now check towards right. */
3435 c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3436 if (allocated_clusters && c_offset) {
3437 lblk_from = lblk_start + num_blks;
3438 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3439
3440 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
3441 allocated_clusters--;
3442 }
3443
3444 return allocated_clusters;
3445}
3446
3168static int 3447static int
3169ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3448ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3170 struct ext4_map_blocks *map, 3449 struct ext4_map_blocks *map,
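
get_reserved_cluster_alloc() starts from the span in clusters and subtracts an edge cluster whenever its portion outside the allocation is still delalloc'ed, since the reservation for such a cluster was already accounted when the delayed buffer was dirtied. A sketch of the arithmetic with the delalloc lookup stubbed out and an assumed cluster ratio of 16:

#include <stdio.h>

#define RATIO	16
#define B2C(b)	((b) / RATIO)	/* stand-in for EXT4_B2C() */

/* stub for ext4_find_delalloc_range() */
static int edge_is_delalloc(unsigned from, unsigned to)
{
	(void)from;
	(void)to;
	return 0;
}

int main(void)
{
	unsigned lblk_start = 10, num_blks = 50;
	unsigned end = lblk_start + num_blks;
	unsigned clusters = B2C(end - 1) - B2C(lblk_start) + 1;

	/* partial cluster on the left already carried a reservation? */
	if (lblk_start % RATIO &&
	    edge_is_delalloc(lblk_start - lblk_start % RATIO, lblk_start - 1))
		clusters--;
	/* partial cluster on the right already carried a reservation? */
	if (clusters && end % RATIO &&
	    edge_is_delalloc(end, end + RATIO - end % RATIO - 1))
		clusters--;

	printf("%u clusters carried this allocation's reservation\n", clusters);
	return 0;			/* B2C(59) - B2C(10) + 1 == 4 */
}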
@@ -3181,6 +3460,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3181 flags, allocated); 3460 flags, allocated);
3182 ext4_ext_show_leaf(inode, path); 3461 ext4_ext_show_leaf(inode, path);
3183 3462
3463 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
3464 newblock);
3465
3184 /* get_block() before submit the IO, split the extent */ 3466 /* get_block() before submit the IO, split the extent */
3185 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3467 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3186 ret = ext4_split_unwritten_extents(handle, inode, map, 3468 ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3190,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3190 * that this IO needs conversion to written when the IO is 3472
3191 * completed 3473 * completed
3192 */ 3474 */
3193 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3475 if (io)
3194 io->flag = EXT4_IO_END_UNWRITTEN; 3476 ext4_set_io_unwritten_flag(inode, io);
3195 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3477 else
3196 } else
3197 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3478 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3198 if (ext4_should_dioread_nolock(inode)) 3479 if (ext4_should_dioread_nolock(inode))
3199 map->m_flags |= EXT4_MAP_UNINIT; 3480 map->m_flags |= EXT4_MAP_UNINIT;
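
The open-coded flag update is folded into ext4_set_io_unwritten_flag(); judging from the removed lines, the helper tests and sets EXT4_IO_END_UNWRITTEN and bumps the pending-unwritten counter exactly once. A userspace approximation, with simplified types and a plain increment in place of atomic_inc():

#include <stdio.h>

#define EXT4_IO_END_UNWRITTEN	0x1

struct io_end { unsigned flag; };
struct inode_info { int aiodio_unwritten; };

static void set_io_unwritten_flag(struct inode_info *ei, struct io_end *io)
{
	if (!(io->flag & EXT4_IO_END_UNWRITTEN)) {
		io->flag |= EXT4_IO_END_UNWRITTEN;
		ei->aiodio_unwritten++;	/* atomic_inc() in the kernel */
	}
}

int main(void)
{
	struct inode_info ei = { 0 };
	struct io_end io = { 0 };

	set_io_unwritten_flag(&ei, &io);
	set_io_unwritten_flag(&ei, &io);	/* second call is a no-op */
	printf("pending unwritten: %d\n", ei.aiodio_unwritten);	/* 1 */
	return 0;
}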
@@ -3234,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3234 3515
3235 /* buffered write, writepage time, convert*/ 3516 /* buffered write, writepage time, convert*/
3236 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3517 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3237 if (ret >= 0) { 3518 if (ret >= 0)
3238 ext4_update_inode_fsync_trans(handle, inode, 1); 3519 ext4_update_inode_fsync_trans(handle, inode, 1);
3239 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3240 map->m_len);
3241 if (err < 0)
3242 goto out2;
3243 }
3244
3245out: 3520out:
3246 if (ret <= 0) { 3521 if (ret <= 0) {
3247 err = ret; 3522 err = ret;
@@ -3270,11 +3545,24 @@ out:
3270 * But fallocate would have already updated quota and block 3545 * But fallocate would have already updated quota and block
3271 * count for this offset. So cancel these reservation 3546 * count for this offset. So cancel these reservation
3272 */ 3547 */
3273 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3548 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3274 ext4_da_update_reserve_space(inode, allocated, 0); 3549 unsigned int reserved_clusters;
3550 reserved_clusters = get_reserved_cluster_alloc(inode,
3551 map->m_lblk, map->m_len);
3552 if (reserved_clusters)
3553 ext4_da_update_reserve_space(inode,
3554 reserved_clusters,
3555 0);
3556 }
3275 3557
3276map_out: 3558map_out:
3277 map->m_flags |= EXT4_MAP_MAPPED; 3559 map->m_flags |= EXT4_MAP_MAPPED;
3560 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3561 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3562 map->m_len);
3563 if (err < 0)
3564 goto out2;
3565 }
3278out1: 3566out1:
3279 if (allocated > map->m_len) 3567 if (allocated > map->m_len)
3280 allocated = map->m_len; 3568 allocated = map->m_len;
@@ -3290,6 +3578,111 @@ out2:
3290} 3578}
3291 3579
3292/* 3580/*
3581 * get_implied_cluster_alloc - check to see if the requested
3582 * allocation (in the map structure) overlaps with a cluster already
3583 * allocated in an extent.
3584 * @sb The filesystem superblock structure
3585 * @map The requested lblk->pblk mapping
3586 * @ex The extent structure which might contain an implied
3587 * cluster allocation
3588 *
3589 * This function is called by ext4_ext_map_blocks() after we failed to
3590 * find blocks that were already in the inode's extent tree. Hence,
3591 * we know that the beginning of the requested region cannot overlap
3592 * the extent from the inode's extent tree. There are three cases we
3593 * want to catch. The first is this case:
3594 *
3595 * |--- cluster # N--|
3596 * |--- extent ---| |---- requested region ---|
3597 * |==========|
3598 *
3599 * The second case that we need to test for is this one:
3600 *
3601 * |--------- cluster # N ----------------|
3602 * |--- requested region --| |------- extent ----|
3603 * |=======================|
3604 *
3605 * The third case is when the requested region lies between two extents
3606 * within the same cluster:
3607 * |------------- cluster # N-------------|
3608 * |----- ex -----| |---- ex_right ----|
3609 * |------ requested region ------|
3610 * |================|
3611 *
3612 * In each of the above cases, we need to set the map->m_pblk and
 3613 * map->m_len so they correspond to the extent labelled as
3614 * "|====|" from cluster #N, since it is already in use for data in
3615 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
3616 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3617 * as a new "allocated" block region. Otherwise, we will return 0 and
3618 * ext4_ext_map_blocks() will then allocate one or more new clusters
3619 * by calling ext4_mb_new_blocks().
3620 */
3621static int get_implied_cluster_alloc(struct super_block *sb,
3622 struct ext4_map_blocks *map,
3623 struct ext4_extent *ex,
3624 struct ext4_ext_path *path)
3625{
3626 struct ext4_sb_info *sbi = EXT4_SB(sb);
3627 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3628 ext4_lblk_t ex_cluster_start, ex_cluster_end;
3629 ext4_lblk_t rr_cluster_start, rr_cluster_end;
3630 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3631 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3632 unsigned short ee_len = ext4_ext_get_actual_len(ex);
3633
3634 /* The extent passed in that we are trying to match */
3635 ex_cluster_start = EXT4_B2C(sbi, ee_block);
3636 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3637
3638 /* The requested region passed into ext4_map_blocks() */
3639 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3640 rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3641
3642 if ((rr_cluster_start == ex_cluster_end) ||
3643 (rr_cluster_start == ex_cluster_start)) {
3644 if (rr_cluster_start == ex_cluster_end)
3645 ee_start += ee_len - 1;
3646 map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3647 c_offset;
3648 map->m_len = min(map->m_len,
3649 (unsigned) sbi->s_cluster_ratio - c_offset);
3650 /*
3651 * Check for and handle this case:
3652 *
3653 * |--------- cluster # N-------------|
3654 * |------- extent ----|
3655 * |--- requested region ---|
3656 * |===========|
3657 */
3658
3659 if (map->m_lblk < ee_block)
3660 map->m_len = min(map->m_len, ee_block - map->m_lblk);
3661
3662 /*
3663 * Check for the case where there is already another allocated
3664 * block to the right of 'ex' but before the end of the cluster.
3665 *
3666 * |------------- cluster # N-------------|
3667 * |----- ex -----| |---- ex_right ----|
3668 * |------ requested region ------|
3669 * |================|
3670 */
3671 if (map->m_lblk > ee_block) {
3672 ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3673 map->m_len = min(map->m_len, next - map->m_lblk);
3674 }
3675
3676 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3677 return 1;
3678 }
3679
3680 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3681 return 0;
3682}
3683
3684
3685/*
3293 * Block allocation/map/preallocation routine for extents based files 3686 * Block allocation/map/preallocation routine for extents based files
3294 * 3687 *
3295 * 3688 *
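
get_implied_cluster_alloc() boils down to one cluster-number comparison: the request can be satisfied from an already-allocated cluster only if its first cluster coincides with the first or last cluster of the neighboring extent. A compact version of the check under an assumed ratio of 16 (the extent ends mid-cluster at block 109, so a request at block 110 lands in the same cluster):

#include <stdio.h>

#define RATIO	16
#define B2C(b)	((b) / RATIO)	/* stand-in for EXT4_B2C() */

int main(void)
{
	unsigned ee_block = 100, ee_len = 10;	/* extent: blocks 100..109 */
	unsigned req = 110;			/* requested logical block */
	unsigned ex_first = B2C(ee_block);
	unsigned ex_last = B2C(ee_block + ee_len - 1);
	unsigned rr_first = B2C(req);

	if (rr_first == ex_first || rr_first == ex_last)
		printf("block %u shares cluster %u with the extent\n",
		       req, rr_first);
	return 0;			/* shares cluster 6 */
}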
@@ -3311,15 +3704,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3311 struct ext4_map_blocks *map, int flags) 3704 struct ext4_map_blocks *map, int flags)
3312{ 3705{
3313 struct ext4_ext_path *path = NULL; 3706 struct ext4_ext_path *path = NULL;
3314 struct ext4_extent newex, *ex; 3707 struct ext4_extent newex, *ex, *ex2;
3708 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3315 ext4_fsblk_t newblock = 0; 3709 ext4_fsblk_t newblock = 0;
3316 int err = 0, depth, ret; 3710 int free_on_err = 0, err = 0, depth, ret;
3317 unsigned int allocated = 0; 3711 unsigned int allocated = 0, offset = 0;
3712 unsigned int allocated_clusters = 0;
3318 unsigned int punched_out = 0; 3713 unsigned int punched_out = 0;
3319 unsigned int result = 0; 3714 unsigned int result = 0;
3320 struct ext4_allocation_request ar; 3715 struct ext4_allocation_request ar;
3321 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3716 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3322 struct ext4_map_blocks punch_map; 3717 ext4_lblk_t cluster_offset;
3323 3718
3324 ext_debug("blocks %u/%u requested for inode %lu\n", 3719 ext_debug("blocks %u/%u requested for inode %lu\n",
3325 map->m_lblk, map->m_len, inode->i_ino); 3720 map->m_lblk, map->m_len, inode->i_ino);
@@ -3329,6 +3724,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3329 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && 3724 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3330 ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3725 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3331 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3726 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3727 if ((sbi->s_cluster_ratio > 1) &&
3728 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3729 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3730
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3731 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3732 /*
3334 * block isn't allocated yet and 3733 * block isn't allocated yet and
@@ -3339,6 +3738,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3339 /* we should allocate requested block */ 3738 /* we should allocate requested block */
3340 } else { 3739 } else {
3341 /* block is already allocated */ 3740 /* block is already allocated */
3741 if (sbi->s_cluster_ratio > 1)
3742 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3342 newblock = map->m_lblk 3743 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3744 - le32_to_cpu(newex.ee_block)
3344 + ext4_ext_pblock(&newex); 3745 + ext4_ext_pblock(&newex);
@@ -3384,8 +3785,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3384 * we split out initialized portions during a write. 3785 * we split out initialized portions during a write.
3385 */ 3786 */
3386 ee_len = ext4_ext_get_actual_len(ex); 3787 ee_len = ext4_ext_get_actual_len(ex);
3788
3789 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
3790
3387 /* if found extent covers block, simply return it */ 3791 /* if found extent covers block, simply return it */
3388 if (in_range(map->m_lblk, ee_block, ee_len)) { 3792 if (in_range(map->m_lblk, ee_block, ee_len)) {
3793 struct ext4_map_blocks punch_map;
3794 ext4_fsblk_t partial_cluster = 0;
3795
3389 newblock = map->m_lblk - ee_block + ee_start; 3796 newblock = map->m_lblk - ee_block + ee_start;
3390 /* number of remaining blocks in the extent */ 3797 /* number of remaining blocks in the extent */
3391 allocated = ee_len - (map->m_lblk - ee_block); 3798 allocated = ee_len - (map->m_lblk - ee_block);
@@ -3469,7 +3876,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3469 ext4_ext_invalidate_cache(inode); 3876 ext4_ext_invalidate_cache(inode);
3470 3877
3471 err = ext4_ext_rm_leaf(handle, inode, path, 3878 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out); 3879 &partial_cluster, map->m_lblk,
3880 map->m_lblk + punched_out);
3473 3881
3474 if (!err && path->p_hdr->eh_entries == 0) { 3882 if (!err && path->p_hdr->eh_entries == 0) {
3475 /* 3883 /*
@@ -3492,6 +3900,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3492 } 3900 }
3493 } 3901 }
3494 3902
3903 if ((sbi->s_cluster_ratio > 1) &&
3904 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
3905 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3906
3495 /* 3907 /*
3496 * requested block isn't allocated yet; 3908 * requested block isn't allocated yet;
3497 * we couldn't try to create block if create flag is zero 3909 * we couldn't try to create block if create flag is zero
@@ -3504,9 +3916,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3504 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3916 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3505 goto out2; 3917 goto out2;
3506 } 3918 }
3919
3507 /* 3920 /*
3508 * Okay, we need to do block allocation. 3921 * Okay, we need to do block allocation.
3509 */ 3922 */
3923 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
3924 newex.ee_block = cpu_to_le32(map->m_lblk);
3925 cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3926
3927 /*
3928 * If we are doing bigalloc, check to see if the extent returned
3929 * by ext4_ext_find_extent() implies a cluster we can use.
3930 */
3931 if (cluster_offset && ex &&
3932 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
3933 ar.len = allocated = map->m_len;
3934 newblock = map->m_pblk;
3935 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3936 goto got_allocated_blocks;
3937 }
3510 3938
3511 /* find neighbour allocated blocks */ 3939 /* find neighbour allocated blocks */
3512 ar.lleft = map->m_lblk; 3940 ar.lleft = map->m_lblk;
@@ -3514,10 +3942,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3514 if (err) 3942 if (err)
3515 goto out2; 3943 goto out2;
3516 ar.lright = map->m_lblk; 3944 ar.lright = map->m_lblk;
3517 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3945 ex2 = NULL;
3946 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
3518 if (err) 3947 if (err)
3519 goto out2; 3948 goto out2;
3520 3949
3950 /* Check if the extent after searching to the right implies a
3951 * cluster we can use. */
3952 if ((sbi->s_cluster_ratio > 1) && ex2 &&
3953 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
3954 ar.len = allocated = map->m_len;
3955 newblock = map->m_pblk;
3956 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3957 goto got_allocated_blocks;
3958 }
3959
3521 /* 3960 /*
3522 * See if request is beyond maximum number of blocks we can have in 3961 * See if request is beyond maximum number of blocks we can have in
3523 * a single extent. For an initialized extent this limit is 3962 * a single extent. For an initialized extent this limit is
@@ -3532,9 +3971,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3532 map->m_len = EXT_UNINIT_MAX_LEN; 3971 map->m_len = EXT_UNINIT_MAX_LEN;
3533 3972
3534 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3973 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3535 newex.ee_block = cpu_to_le32(map->m_lblk);
3536 newex.ee_len = cpu_to_le16(map->m_len); 3974 newex.ee_len = cpu_to_le16(map->m_len);
3537 err = ext4_ext_check_overlap(inode, &newex, path); 3975 err = ext4_ext_check_overlap(sbi, inode, &newex, path);
3538 if (err) 3976 if (err)
3539 allocated = ext4_ext_get_actual_len(&newex); 3977 allocated = ext4_ext_get_actual_len(&newex);
3540 else 3978 else
@@ -3544,7 +3982,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3544 ar.inode = inode; 3982 ar.inode = inode;
3545 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3983 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3546 ar.logical = map->m_lblk; 3984 ar.logical = map->m_lblk;
3547 ar.len = allocated; 3985 /*
3986 * We calculate the offset from the beginning of the cluster
3987 * for the logical block number, since when we allocate a
3988 * physical cluster, the physical block should start at the
3989 * same offset from the beginning of the cluster. This is
3990 * needed so that future calls to get_implied_cluster_alloc()
3991 * work correctly.
3992 */
3993 offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
3994 ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
3995 ar.goal -= offset;
3996 ar.logical -= offset;
3548 if (S_ISREG(inode->i_mode)) 3997 if (S_ISREG(inode->i_mode))
3549 ar.flags = EXT4_MB_HINT_DATA; 3998 ar.flags = EXT4_MB_HINT_DATA;
3550 else 3999 else
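
The offset bookkeeping above keeps the allocation request cluster-aligned: the goal and logical start are pulled back to the cluster boundary and the length is rounded up to whole clusters. A worked sketch of the arithmetic, assuming a cluster ratio of 16 and illustrative macros standing in for EXT4_NUM_B2C()/EXT4_C2B():

#include <stdio.h>

#define CLUSTER_RATIO 16U
#define B2C(b) (((b) + CLUSTER_RATIO - 1) / CLUSTER_RATIO)	/* blocks -> clusters, rounded up */
#define C2B(c) ((c) * CLUSTER_RATIO)				/* clusters -> blocks */

int main(void)
{
	unsigned int lblk = 100, allocated = 8;
	unsigned int offset = lblk & (CLUSTER_RATIO - 1);	/* 4: offset inside the cluster */
	unsigned int ask = B2C(offset + allocated);		/* 1 cluster covers blocks 96..111 */
	unsigned int len = C2B(ask) - offset;			/* 12 blocks usable from lblk on */

	if (len > allocated)
		len = allocated;				/* cap at the original request */
	printf("request %u cluster(s), map %u blocks at lblk %u\n", ask, len, lblk);
	return 0;
}
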
@@ -3557,9 +4006,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3557 goto out2; 4006 goto out2;
3558 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 4007 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3559 ar.goal, newblock, allocated); 4008 ar.goal, newblock, allocated);
4009 free_on_err = 1;
4010 allocated_clusters = ar.len;
4011 ar.len = EXT4_C2B(sbi, ar.len) - offset;
4012 if (ar.len > allocated)
4013 ar.len = allocated;
3560 4014
4015got_allocated_blocks:
3561 /* try to insert new extent into found leaf and return */ 4016 /* try to insert new extent into found leaf and return */
3562 ext4_ext_store_pblock(&newex, newblock); 4017 ext4_ext_store_pblock(&newex, newblock + offset);
3563 newex.ee_len = cpu_to_le16(ar.len); 4018 newex.ee_len = cpu_to_le16(ar.len);
3564 /* Mark uninitialized */ 4019 /* Mark uninitialized */
3565 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 4020 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3572,10 +4027,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3572 * that we need to perform conversion when IO is done. 4027 * that we need to perform conversion when IO is done.
3573 */ 4028 */
3574 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4029 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3575 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 4030 if (io)
3576 io->flag = EXT4_IO_END_UNWRITTEN; 4031 ext4_set_io_unwritten_flag(inode, io);
3577 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 4032 else
3578 } else
3579 ext4_set_inode_state(inode, 4033 ext4_set_inode_state(inode,
3580 EXT4_STATE_DIO_UNWRITTEN); 4034 EXT4_STATE_DIO_UNWRITTEN);
3581 } 4035 }
@@ -3583,11 +4037,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3583 map->m_flags |= EXT4_MAP_UNINIT; 4037 map->m_flags |= EXT4_MAP_UNINIT;
3584 } 4038 }
3585 4039
3586 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 4040 err = 0;
4041 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4042 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4043 path, ar.len);
3587 if (!err) 4044 if (!err)
3588 err = ext4_ext_insert_extent(handle, inode, path, 4045 err = ext4_ext_insert_extent(handle, inode, path,
3589 &newex, flags); 4046 &newex, flags);
3590 if (err) { 4047 if (err && free_on_err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4048 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4049 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3593 /* free data blocks we just allocated */ 4050 /* free data blocks we just allocated */
@@ -3610,8 +4067,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3610 * Update reserved blocks/metadata blocks after successful 4067 * Update reserved blocks/metadata blocks after successful
3611 * block allocation which had been deferred till now. 4068 * block allocation which had been deferred till now.
3612 */ 4069 */
3613 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4070 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3614 ext4_da_update_reserve_space(inode, allocated, 1); 4071 unsigned int reserved_clusters;
4072 /*
 4073		 * Check how many clusters we had reserved for this allocated range
4074 */
4075 reserved_clusters = get_reserved_cluster_alloc(inode,
4076 map->m_lblk, allocated);
4077 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4078 if (reserved_clusters) {
4079 /*
4080 * We have clusters reserved for this range.
4081 * But since we are not doing actual allocation
 4082			 * and are simply using blocks from a previously
4083 * allocated cluster, we should release the
4084 * reservation and not claim quota.
4085 */
4086 ext4_da_update_reserve_space(inode,
4087 reserved_clusters, 0);
4088 }
4089 } else {
4090 BUG_ON(allocated_clusters < reserved_clusters);
 4091			/* We will claim quota for all newly allocated blocks. */
4092 ext4_da_update_reserve_space(inode, allocated_clusters,
4093 1);
4094 if (reserved_clusters < allocated_clusters) {
4095 struct ext4_inode_info *ei = EXT4_I(inode);
4096 int reservation = allocated_clusters -
4097 reserved_clusters;
4098 /*
 4099				 * It seems we claimed a few clusters outside of
 4100				 * the range of this allocation. We should give
 4101				 * them back to the reservation pool. This can
4102 * happen in the following case:
4103 *
4104 * * Suppose s_cluster_ratio is 4 (i.e., each
 4105				 *   cluster has 4 blocks). Thus, the clusters
4106 * are [0-3],[4-7],[8-11]...
4107 * * First comes delayed allocation write for
4108 * logical blocks 10 & 11. Since there were no
4109 * previous delayed allocated blocks in the
4110 * range [8-11], we would reserve 1 cluster
4111 * for this write.
4112 * * Next comes write for logical blocks 3 to 8.
4113 * In this case, we will reserve 2 clusters
 4114				 *   (for [0-3] and [4-7]), but not for [8-11], as
 4115				 *   that range already has delayed allocated blocks.
 4116				 *   Thus the total reserved cluster count becomes 3.
4117 * * Now, during the delayed allocation writeout
4118 * time, we will first write blocks [3-8] and
4119 * allocate 3 clusters for writing these
 4120				 *   blocks. Also, we would claim quota for
 4121				 *   all three of these clusters.
 4122				 * * Now, when we come here to write out the
4123 * blocks [10-11], we would expect to claim
4124 * the reservation of 1 cluster we had made
4125 * (and we would claim it since there are no
4126 * more delayed allocated blocks in the range
 4127				 *   [8-11]). But our reserved cluster count had
4128 * already gone to 0.
4129 *
 4130				 * Thus, at step 4 above, when we determine
4131 * that there are still some unwritten delayed
4132 * allocated blocks outside of our current
4133 * block range, we should increment the
4134 * reserved clusters count so that when the
 4135				 * remaining blocks finally get written, we
 4136				 * can claim them.
4137 */
4138 dquot_reserve_block(inode,
4139 EXT4_C2B(sbi, reservation));
4140 spin_lock(&ei->i_block_reservation_lock);
4141 ei->i_reserved_data_blocks += reservation;
4142 spin_unlock(&ei->i_block_reservation_lock);
4143 }
4144 }
4145 }
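
The scenario spelled out in the comment can be traced numerically. A small sketch of the same accounting for s_cluster_ratio = 4, with the reservations tracked as a plain counter:

#include <stdio.h>

int main(void)
{
	int reserved = 0;

	reserved += 1;	/* delayed write of blocks 10-11: reserve cluster [8-11]      */
	reserved += 2;	/* delayed write of blocks 3-8: reserve [0-3] and [4-7] only; */
			/* [8-11] is already covered by the first reservation         */

	/* Writeout of blocks 3-8 allocates clusters [0-3],[4-7],[8-11]:
	 * allocated_clusters = 3 but only 2 reservations matched this
	 * range, so one extra reservation is consumed here ...          */
	reserved -= 3;

	/* ... and is given back, otherwise the later writeout of blocks
	 * 10-11 would find no reservation left to claim.                */
	reserved += 1;

	printf("reserved clusters after writeout: %d\n", reserved);	/* 1 */
	return 0;
}
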
3615 4146
3616 /* 4147 /*
3617 * Cache the extent and update transaction to commit on fdatasync only 4148 * Cache the extent and update transaction to commit on fdatasync only
@@ -3634,12 +4165,12 @@ out2:
3634 ext4_ext_drop_refs(path); 4165 ext4_ext_drop_refs(path);
3635 kfree(path); 4166 kfree(path);
3636 } 4167 }
3637 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3638 newblock, map->m_len, err ? err : allocated);
3639
3640 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? 4168 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3641 punched_out : allocated; 4169 punched_out : allocated;
3642 4170
4171 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4172 newblock, map->m_len, err ? err : result);
4173
3643 return err ? err : result; 4174 return err ? err : result;
3644} 4175}
3645 4176
@@ -3649,6 +4180,7 @@ void ext4_ext_truncate(struct inode *inode)
3649 struct super_block *sb = inode->i_sb; 4180 struct super_block *sb = inode->i_sb;
3650 ext4_lblk_t last_block; 4181 ext4_lblk_t last_block;
3651 handle_t *handle; 4182 handle_t *handle;
4183 loff_t page_len;
3652 int err = 0; 4184 int err = 0;
3653 4185
3654 /* 4186 /*
@@ -3665,8 +4197,16 @@ void ext4_ext_truncate(struct inode *inode)
3665 if (IS_ERR(handle)) 4197 if (IS_ERR(handle))
3666 return; 4198 return;
3667 4199
3668 if (inode->i_size & (sb->s_blocksize - 1)) 4200 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3669 ext4_block_truncate_page(handle, mapping, inode->i_size); 4201 page_len = PAGE_CACHE_SIZE -
4202 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4203
4204 err = ext4_discard_partial_page_buffers(handle,
4205 mapping, inode->i_size, page_len, 0);
4206
4207 if (err)
4208 goto out_stop;
4209 }
3670 4210
3671 if (ext4_orphan_add(handle, inode)) 4211 if (ext4_orphan_add(handle, inode))
3672 goto out_stop; 4212 goto out_stop;
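
The page_len computation above zeroes from i_size to the end of the page that contains it, replacing the old block-granular truncate of the partial tail. The same arithmetic in isolation (4K pages standing in for PAGE_CACHE_SIZE):

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long i_size = 10000;	/* not page aligned */

	if (i_size % PAGE_SIZE != 0) {
		unsigned long long page_len = PAGE_SIZE - (i_size & (PAGE_SIZE - 1));

		/* zero and unmap bytes [i_size, i_size + page_len) */
		printf("discard %llu bytes at offset %llu\n", page_len, i_size);	/* 2288 at 10000 */
	}
	return 0;
}
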
@@ -3760,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3760 int ret = 0; 4300 int ret = 0;
3761 int ret2 = 0; 4301 int ret2 = 0;
3762 int retries = 0; 4302 int retries = 0;
4303 int flags;
3763 struct ext4_map_blocks map; 4304 struct ext4_map_blocks map;
3764 unsigned int credits, blkbits = inode->i_blkbits; 4305 unsigned int credits, blkbits = inode->i_blkbits;
3765 4306
@@ -3796,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3796 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4337 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3797 return ret; 4338 return ret;
3798 } 4339 }
4340 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4341 if (mode & FALLOC_FL_KEEP_SIZE)
4342 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4343 /*
4344 * Don't normalize the request if it can fit in one extent so
4345 * that it doesn't get unnecessarily split into multiple
4346 * extents.
4347 */
4348 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4349 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
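
For reference, EXT4_GET_BLOCKS_KEEP_SIZE is set here in response to the userspace FALLOC_FL_KEEP_SIZE flag; a minimal caller exercising this path might look like the following (error handling mostly elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("testfile", O_CREAT | O_RDWR, 0644);

	if (fd < 0)
		return 1;
	/* Preallocate 1 MiB of uninitialized extents without changing i_size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
		perror("fallocate");
	return 0;
}
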
3799retry: 4350retry:
3800 while (ret >= 0 && ret < max_blocks) { 4351 while (ret >= 0 && ret < max_blocks) {
3801 map.m_lblk = map.m_lblk + ret; 4352 map.m_lblk = map.m_lblk + ret;
@@ -3805,9 +4356,7 @@ retry:
3805 ret = PTR_ERR(handle); 4356 ret = PTR_ERR(handle);
3806 break; 4357 break;
3807 } 4358 }
3808 ret = ext4_map_blocks(handle, inode, &map, 4359 ret = ext4_map_blocks(handle, inode, &map, flags);
3809 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3810 EXT4_GET_BLOCKS_NO_NORMALIZE);
3811 if (ret <= 0) { 4360 if (ret <= 0) {
3812#ifdef EXT4FS_DEBUG 4361#ifdef EXT4FS_DEBUG
3813 WARN_ON(ret <= 0); 4362 WARN_ON(ret <= 0);
@@ -4102,7 +4651,6 @@ found_delayed_extent:
4102 return EXT_BREAK; 4651 return EXT_BREAK;
4103 return EXT_CONTINUE; 4652 return EXT_CONTINUE;
4104} 4653}
4105
4106/* fiemap flags we can handle specified here */ 4654/* fiemap flags we can handle specified here */
4107#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4655#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4108 4656
@@ -4162,17 +4710,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4162 struct address_space *mapping = inode->i_mapping; 4710 struct address_space *mapping = inode->i_mapping;
4163 struct ext4_map_blocks map; 4711 struct ext4_map_blocks map;
4164 handle_t *handle; 4712 handle_t *handle;
4165 loff_t first_block_offset, last_block_offset, block_len; 4713 loff_t first_page, last_page, page_len;
4166 loff_t first_page, last_page, first_page_offset, last_page_offset; 4714 loff_t first_page_offset, last_page_offset;
4167 int ret, credits, blocks_released, err = 0; 4715 int ret, credits, blocks_released, err = 0;
4168 4716
4717 /* No need to punch hole beyond i_size */
4718 if (offset >= inode->i_size)
4719 return 0;
4720
4721 /*
4722 * If the hole extends beyond i_size, set the hole
4723 * to end after the page that contains i_size
4724 */
4725 if (offset + length > inode->i_size) {
4726 length = inode->i_size +
4727 PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4728 offset;
4729 }
4730
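
The clamp above rounds the hole up to the end of the page holding i_size, so the tail is fully handled by the partial-page code below. The arithmetic in isolation (4K pages assumed):

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long i_size = 10000, offset = 8192, length = 100000;

	if (offset + length > i_size)	/* hole runs past EOF */
		length = i_size + PAGE_SIZE - (i_size & (PAGE_SIZE - 1)) - offset;

	/* the hole now ends at 12288, the end of the page containing i_size */
	printf("punch [%llu, %llu)\n", offset, offset + length);
	return 0;
}
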
4169 first_block = (offset + sb->s_blocksize - 1) >> 4731 first_block = (offset + sb->s_blocksize - 1) >>
4170 EXT4_BLOCK_SIZE_BITS(sb); 4732 EXT4_BLOCK_SIZE_BITS(sb);
4171 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4733 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4172 4734
4173 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4174 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4175
4176 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4735 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4177 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4736 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4178 4737
@@ -4185,11 +4744,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4185 */ 4744 */
4186 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4745 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4187 err = filemap_write_and_wait_range(mapping, 4746 err = filemap_write_and_wait_range(mapping,
4188 first_page_offset == 0 ? 0 : first_page_offset-1, 4747 offset, offset + length - 1);
4189 last_page_offset);
4190 4748
4191 if (err) 4749 if (err)
4192 return err; 4750 return err;
4193 } 4751 }
4194 4752
4195 /* Now release the pages */ 4753 /* Now release the pages */
@@ -4211,24 +4769,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4211 goto out; 4769 goto out;
4212 4770
4213 /* 4771 /*
4214 * Now we need to zero out the un block aligned data. 4772 * Now we need to zero out the non-page-aligned data in the
4215 * If the file is smaller than a block, just 4773 * pages at the start and tail of the hole, and unmap the buffer
4216 * zero out the middle 4774 * heads for the block aligned regions of the page that were
4775 * completely zeroed.
4217 */ 4776 */
4218 if (first_block > last_block) 4777 if (first_page > last_page) {
4219 ext4_block_zero_page_range(handle, mapping, offset, length); 4778 /*
4220 else { 4779 * If the file space being truncated is contained within a page
4221 /* zero out the head of the hole before the first block */ 4780 * just zero out and unmap the middle of that page
4222 block_len = first_block_offset - offset; 4781 */
4223 if (block_len > 0) 4782 err = ext4_discard_partial_page_buffers(handle,
4224 ext4_block_zero_page_range(handle, mapping, 4783 mapping, offset, length, 0);
4225 offset, block_len); 4784
4226 4785 if (err)
4227 /* zero out the tail of the hole after the last block */ 4786 goto out;
4228 block_len = offset + length - last_block_offset; 4787 } else {
4229 if (block_len > 0) { 4788 /*
4230 ext4_block_zero_page_range(handle, mapping, 4789 * zero out and unmap the partial page that contains
4231 last_block_offset, block_len); 4790 * the start of the hole
4791 */
4792 page_len = first_page_offset - offset;
4793 if (page_len > 0) {
4794 err = ext4_discard_partial_page_buffers(handle, mapping,
4795 offset, page_len, 0);
4796 if (err)
4797 goto out;
4798 }
4799
4800 /*
4801 * zero out and unmap the partial page that contains
4802 * the end of the hole
4803 */
4804 page_len = offset + length - last_page_offset;
4805 if (page_len > 0) {
4806 err = ext4_discard_partial_page_buffers(handle, mapping,
4807 last_page_offset, page_len, 0);
4808 if (err)
4809 goto out;
4810 }
4811 }
4812
4813
4814 /*
4815 * If i_size is contained in the last page, we need to
4816 * unmap and zero the partial page after i_size
4817 */
4818 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4819 inode->i_size % PAGE_CACHE_SIZE != 0) {
4820
4821 page_len = PAGE_CACHE_SIZE -
4822 (inode->i_size & (PAGE_CACHE_SIZE - 1));
4823
4824 if (page_len > 0) {
4825 err = ext4_discard_partial_page_buffers(handle,
4826 mapping, inode->i_size, page_len, 0);
4827
4828 if (err)
4829 goto out;
4232 } 4830 }
4233 } 4831 }
4234 4832
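
Taken together, the branches above zero at most two partial pages: the one containing the start of the hole and the one containing its end. A compact model of the offsets involved (names mirror the locals above; 4K pages assumed):

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long offset = 5000, length = 20000;
	unsigned long long first_page = (offset + PAGE_SIZE - 1) / PAGE_SIZE;	/* 2 */
	unsigned long long last_page = (offset + length) / PAGE_SIZE;		/* 6 */
	unsigned long long first_page_offset = first_page * PAGE_SIZE;		/* 8192 */
	unsigned long long last_page_offset = last_page * PAGE_SIZE;		/* 24576 */

	/* head: zero [5000, 8192); tail: zero [24576, 25000) */
	printf("head %llu bytes, tail %llu bytes\n",
	       first_page_offset - offset, offset + length - last_page_offset);
	return 0;
}
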
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b9548f477bb8..cb70f1812a70 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
181 path.dentry = mnt->mnt_root; 181 path.dentry = mnt->mnt_root;
182 cp = d_path(&path, buf, sizeof(buf)); 182 cp = d_path(&path, buf, sizeof(buf));
183 if (!IS_ERR(cp)) { 183 if (!IS_ERR(cp)) {
184 memcpy(sbi->s_es->s_last_mounted, cp, 184 strlcpy(sbi->s_es->s_last_mounted, cp,
185 sizeof(sbi->s_es->s_last_mounted)); 185 sizeof(sbi->s_es->s_last_mounted));
186 ext4_mark_super_dirty(sb); 186 ext4_mark_super_dirty(sb);
187 } 187 }
188 } 188 }
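
strlcpy() always NUL-terminates the destination, where the old memcpy() could leave s_last_mounted unterminated whenever the mount path filled the field. A userspace illustration of the behavior (a local work-alike is used, since strlcpy is a kernel/BSD API and its availability here is an assumption):

#include <stdio.h>
#include <string.h>

/* minimal strlcpy work-alike, for illustration only */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = (len >= size) ? size - 1 : len;

		memcpy(dst, src, n);
		dst[n] = '\0';	/* always terminated */
	}
	return len;		/* length the caller tried to copy */
}

int main(void)
{
	char field[8];

	my_strlcpy(field, "/very/long/mount/point", sizeof(field));
	printf("%s\n", field);	/* "/very/l": truncated but terminated */
	return 0;
}
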
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 036f78f7a1ef..00a2cb753efd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
75 * to written. 75 * to written.
 76 * The function returns the number of pending IOs on success. 76 * The function returns the number of pending IOs on success.
77 */ 77 */
78extern int ext4_flush_completed_IO(struct inode *inode) 78int ext4_flush_completed_IO(struct inode *inode)
79{ 79{
80 ext4_io_end_t *io; 80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode); 81 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode)
83 int ret = 0; 83 int ret = 0;
84 int ret2 = 0; 84 int ret2 = 0;
85 85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode); 86 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 87 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){ 88 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list);
94 /* 92 /*
95 * Calling ext4_end_io_nolock() to convert completed 93 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written. 94 * IO to written.
@@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode)
107 */ 105 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 106 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io); 107 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0) 108 if (ret < 0)
112 ret2 = ret; 109 ret2 = ret;
113 else 110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
114 list_del_init(&io->list);
115 } 111 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0; 113 return (ret2 < 0) ? ret2 : 0;
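
Unlinking the entry before dropping the spinlock is what makes the reworked loop safe: once the lock is released around ext4_end_io_nolock(), another thread can enter the same loop, and an entry still on the list would be picked up twice. A schematic of the pattern using a pthread mutex and a bare singly-linked list (illustrative types, not the kernel's list API):

#include <pthread.h>
#include <stddef.h>

struct io_end {
	struct io_end *next;
};

static struct io_end *completed_list;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void process(struct io_end *io)
{
	/* conversion work; may block, so it must run unlocked */
	(void)io;
}

void flush_completed(void)
{
	pthread_mutex_lock(&list_lock);
	while (completed_list) {
		struct io_end *io = completed_list;

		completed_list = io->next;	/* unlink while still holding the lock */
		pthread_mutex_unlock(&list_lock);
		process(io);			/* safe: no other walker can see io now */
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}
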
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9c63f273b550..612bec255c6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
78 * allocation, essentially implementing a per-group read-only flag. */ 78 * allocation, essentially implementing a per-group read-only flag. */
79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
80 ext4_error(sb, "Checksum bad for group %u", block_group); 80 ext4_error(sb, "Checksum bad for group %u", block_group);
81 ext4_free_blks_set(sb, gdp, 0); 81 ext4_free_group_clusters_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 82 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 83 ext4_itable_unused_set(sb, gdp, 0);
84 memset(bh->b_data, 0xff, sb->s_blocksize); 84 memset(bh->b_data, 0xff, sb->s_blocksize);
@@ -293,121 +293,9 @@ error_return:
293 ext4_std_error(sb, fatal); 293 ext4_std_error(sb, fatal);
294} 294}
295 295
296/*
297 * There are two policies for allocating an inode. If the new inode is
298 * a directory, then a forward search is made for a block group with both
299 * free space and a low directory-to-inode ratio; if that fails, then of
300 * the groups with above-average free space, that group with the fewest
301 * directories already is chosen.
302 *
 303 * For other inodes, search forward from the parent directory's block
304 * group to find a free inode.
305 */
306static int find_group_dir(struct super_block *sb, struct inode *parent,
307 ext4_group_t *best_group)
308{
309 ext4_group_t ngroups = ext4_get_groups_count(sb);
310 unsigned int freei, avefreei;
311 struct ext4_group_desc *desc, *best_desc = NULL;
312 ext4_group_t group;
313 int ret = -1;
314
315 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
316 avefreei = freei / ngroups;
317
318 for (group = 0; group < ngroups; group++) {
319 desc = ext4_get_group_desc(sb, group, NULL);
320 if (!desc || !ext4_free_inodes_count(sb, desc))
321 continue;
322 if (ext4_free_inodes_count(sb, desc) < avefreei)
323 continue;
324 if (!best_desc ||
325 (ext4_free_blks_count(sb, desc) >
326 ext4_free_blks_count(sb, best_desc))) {
327 *best_group = group;
328 best_desc = desc;
329 ret = 0;
330 }
331 }
332 return ret;
333}
334
335#define free_block_ratio 10
336
337static int find_group_flex(struct super_block *sb, struct inode *parent,
338 ext4_group_t *best_group)
339{
340 struct ext4_sb_info *sbi = EXT4_SB(sb);
341 struct ext4_group_desc *desc;
342 struct flex_groups *flex_group = sbi->s_flex_groups;
343 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
344 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
345 ext4_group_t ngroups = ext4_get_groups_count(sb);
346 int flex_size = ext4_flex_bg_size(sbi);
347 ext4_group_t best_flex = parent_fbg_group;
348 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
349 int flexbg_free_blocks;
350 int flex_freeb_ratio;
351 ext4_group_t n_fbg_groups;
352 ext4_group_t i;
353
354 n_fbg_groups = (ngroups + flex_size - 1) >>
355 sbi->s_log_groups_per_flex;
356
357find_close_to_parent:
358 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
359 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
360 if (atomic_read(&flex_group[best_flex].free_inodes) &&
361 flex_freeb_ratio > free_block_ratio)
362 goto found_flexbg;
363
364 if (best_flex && best_flex == parent_fbg_group) {
365 best_flex--;
366 goto find_close_to_parent;
367 }
368
369 for (i = 0; i < n_fbg_groups; i++) {
370 if (i == parent_fbg_group || i == parent_fbg_group - 1)
371 continue;
372
373 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
374 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
375
376 if (flex_freeb_ratio > free_block_ratio &&
377 (atomic_read(&flex_group[i].free_inodes))) {
378 best_flex = i;
379 goto found_flexbg;
380 }
381
382 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
383 ((atomic_read(&flex_group[i].free_blocks) >
384 atomic_read(&flex_group[best_flex].free_blocks)) &&
385 atomic_read(&flex_group[i].free_inodes)))
386 best_flex = i;
387 }
388
389 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
390 !atomic_read(&flex_group[best_flex].free_blocks))
391 return -1;
392
393found_flexbg:
394 for (i = best_flex * flex_size; i < ngroups &&
395 i < (best_flex + 1) * flex_size; i++) {
396 desc = ext4_get_group_desc(sb, i, NULL);
397 if (ext4_free_inodes_count(sb, desc)) {
398 *best_group = i;
399 goto out;
400 }
401 }
402
403 return -1;
404out:
405 return 0;
406}
407
408struct orlov_stats { 296struct orlov_stats {
409 __u32 free_inodes; 297 __u32 free_inodes;
410 __u32 free_blocks; 298 __u32 free_clusters;
411 __u32 used_dirs; 299 __u32 used_dirs;
412}; 300};
413 301
@@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
424 312
425 if (flex_size > 1) { 313 if (flex_size > 1) {
426 stats->free_inodes = atomic_read(&flex_group[g].free_inodes); 314 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
427 stats->free_blocks = atomic_read(&flex_group[g].free_blocks); 315 stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
428 stats->used_dirs = atomic_read(&flex_group[g].used_dirs); 316 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
429 return; 317 return;
430 } 318 }
@@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
432 desc = ext4_get_group_desc(sb, g, NULL); 320 desc = ext4_get_group_desc(sb, g, NULL);
433 if (desc) { 321 if (desc) {
434 stats->free_inodes = ext4_free_inodes_count(sb, desc); 322 stats->free_inodes = ext4_free_inodes_count(sb, desc);
435 stats->free_blocks = ext4_free_blks_count(sb, desc); 323 stats->free_clusters = ext4_free_group_clusters(sb, desc);
436 stats->used_dirs = ext4_used_dirs_count(sb, desc); 324 stats->used_dirs = ext4_used_dirs_count(sb, desc);
437 } else { 325 } else {
438 stats->free_inodes = 0; 326 stats->free_inodes = 0;
439 stats->free_blocks = 0; 327 stats->free_clusters = 0;
440 stats->used_dirs = 0; 328 stats->used_dirs = 0;
441 } 329 }
442} 330}
@@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
471 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 359 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
472 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
473 unsigned int freei, avefreei; 361 unsigned int freei, avefreei;
474 ext4_fsblk_t freeb, avefreeb; 362 ext4_fsblk_t freeb, avefreec;
475 unsigned int ndirs; 363 unsigned int ndirs;
476 int max_dirs, min_inodes; 364 int max_dirs, min_inodes;
477 ext4_grpblk_t min_blocks; 365 ext4_grpblk_t min_clusters;
478 ext4_group_t i, grp, g, ngroups; 366 ext4_group_t i, grp, g, ngroups;
479 struct ext4_group_desc *desc; 367 struct ext4_group_desc *desc;
480 struct orlov_stats stats; 368 struct orlov_stats stats;
@@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
490 378
491 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 379 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
492 avefreei = freei / ngroups; 380 avefreei = freei / ngroups;
493 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 381 freeb = EXT4_C2B(sbi,
494 avefreeb = freeb; 382 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
495 do_div(avefreeb, ngroups); 383 avefreec = freeb;
384 do_div(avefreec, ngroups);
496 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 385 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
497 386
498 if (S_ISDIR(mode) && 387 if (S_ISDIR(mode) &&
@@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
518 continue; 407 continue;
519 if (stats.free_inodes < avefreei) 408 if (stats.free_inodes < avefreei)
520 continue; 409 continue;
521 if (stats.free_blocks < avefreeb) 410 if (stats.free_clusters < avefreec)
522 continue; 411 continue;
523 grp = g; 412 grp = g;
524 ret = 0; 413 ret = 0;
@@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
556 min_inodes = avefreei - inodes_per_group*flex_size / 4; 445 min_inodes = avefreei - inodes_per_group*flex_size / 4;
557 if (min_inodes < 1) 446 if (min_inodes < 1)
558 min_inodes = 1; 447 min_inodes = 1;
559 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; 448 min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
560 449
561 /* 450 /*
562 * Start looking in the flex group where we last allocated an 451 * Start looking in the flex group where we last allocated an
@@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
575 continue; 464 continue;
576 if (stats.free_inodes < min_inodes) 465 if (stats.free_inodes < min_inodes)
577 continue; 466 continue;
578 if (stats.free_blocks < min_blocks) 467 if (stats.free_clusters < min_clusters)
579 continue; 468 continue;
580 goto found_flex_bg; 469 goto found_flex_bg;
581 } 470 }
@@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
659 *group = parent_group; 548 *group = parent_group;
660 desc = ext4_get_group_desc(sb, *group, NULL); 549 desc = ext4_get_group_desc(sb, *group, NULL);
661 if (desc && ext4_free_inodes_count(sb, desc) && 550 if (desc && ext4_free_inodes_count(sb, desc) &&
662 ext4_free_blks_count(sb, desc)) 551 ext4_free_group_clusters(sb, desc))
663 return 0; 552 return 0;
664 553
665 /* 554 /*
@@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
683 *group -= ngroups; 572 *group -= ngroups;
684 desc = ext4_get_group_desc(sb, *group, NULL); 573 desc = ext4_get_group_desc(sb, *group, NULL);
685 if (desc && ext4_free_inodes_count(sb, desc) && 574 if (desc && ext4_free_inodes_count(sb, desc) &&
686 ext4_free_blks_count(sb, desc)) 575 ext4_free_group_clusters(sb, desc))
687 return 0; 576 return 0;
688 } 577 }
689 578
@@ -802,7 +691,7 @@ err_ret:
802 * group to find a free inode. 691 * group to find a free inode.
803 */ 692 */
804struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, 693struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
805 const struct qstr *qstr, __u32 goal) 694 const struct qstr *qstr, __u32 goal, uid_t *owner)
806{ 695{
807 struct super_block *sb; 696 struct super_block *sb;
808 struct buffer_head *inode_bitmap_bh = NULL; 697 struct buffer_head *inode_bitmap_bh = NULL;
@@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
816 int ret2, err = 0; 705 int ret2, err = 0;
817 struct inode *ret; 706 struct inode *ret;
818 ext4_group_t i; 707 ext4_group_t i;
819 int free = 0;
820 static int once = 1;
821 ext4_group_t flex_group; 708 ext4_group_t flex_group;
822 709
823 /* Cannot create files in a deleted directory */ 710 /* Cannot create files in a deleted directory */
@@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
843 goto got_group; 730 goto got_group;
844 } 731 }
845 732
846 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 733 if (S_ISDIR(mode))
847 ret2 = find_group_flex(sb, dir, &group); 734 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
848 if (ret2 == -1) { 735 else
849 ret2 = find_group_other(sb, dir, &group, mode);
850 if (ret2 == 0 && once) {
851 once = 0;
852 printk(KERN_NOTICE "ext4: find_group_flex "
853 "failed, fallback succeeded dir %lu\n",
854 dir->i_ino);
855 }
856 }
857 goto got_group;
858 }
859
860 if (S_ISDIR(mode)) {
861 if (test_opt(sb, OLDALLOC))
862 ret2 = find_group_dir(sb, dir, &group);
863 else
864 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
865 } else
866 ret2 = find_group_other(sb, dir, &group, mode); 736 ret2 = find_group_other(sb, dir, &group, mode);
867 737
868got_group: 738got_group:
@@ -950,26 +820,21 @@ got:
950 goto fail; 820 goto fail;
951 } 821 }
952 822
953 free = 0; 823 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
954 ext4_lock_group(sb, group); 824 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
825 brelse(block_bitmap_bh);
826
955 /* recheck and clear flag under lock if we still need to */ 827 /* recheck and clear flag under lock if we still need to */
828 ext4_lock_group(sb, group);
956 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 829 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
957 free = ext4_free_blocks_after_init(sb, group, gdp);
958 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 830 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
959 ext4_free_blks_set(sb, gdp, free); 831 ext4_free_group_clusters_set(sb, gdp,
832 ext4_free_clusters_after_init(sb, group, gdp));
960 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 833 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
961 gdp); 834 gdp);
962 } 835 }
963 ext4_unlock_group(sb, group); 836 ext4_unlock_group(sb, group);
964 837
965 /* Don't need to dirty bitmap block if we didn't change it */
966 if (free) {
967 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
968 err = ext4_handle_dirty_metadata(handle,
969 NULL, block_bitmap_bh);
970 }
971
972 brelse(block_bitmap_bh);
973 if (err) 838 if (err)
974 goto fail; 839 goto fail;
975 } 840 }
@@ -987,8 +852,11 @@ got:
987 flex_group = ext4_flex_group(sbi, group); 852 flex_group = ext4_flex_group(sbi, group);
988 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 853 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
989 } 854 }
990 855 if (owner) {
991 if (test_opt(sb, GRPID)) { 856 inode->i_mode = mode;
857 inode->i_uid = owner[0];
858 inode->i_gid = owner[1];
859 } else if (test_opt(sb, GRPID)) {
992 inode->i_mode = mode; 860 inode->i_mode = mode;
993 inode->i_uid = current_fsuid(); 861 inode->i_uid = current_fsuid();
994 inode->i_gid = dir->i_gid; 862 inode->i_gid = dir->i_gid;
@@ -1005,11 +873,7 @@ got:
1005 ei->i_dir_start_lookup = 0; 873 ei->i_dir_start_lookup = 0;
1006 ei->i_disksize = 0; 874 ei->i_disksize = 0;
1007 875
1008 /* 876 /* Don't inherit extent flag from directory, amongst others. */
1009 * Don't inherit extent flag from directory, amongst others. We set
1010 * extent flag on newly created directory and file only if -o extent
1011 * mount option is specified
1012 */
1013 ei->i_flags = 877 ei->i_flags =
1014 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); 878 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
1015 ei->i_file_acl = 0; 879 ei->i_file_acl = 0;
@@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1235 * inode allocation from the current group, so we take alloc_sem lock, to 1099 * inode allocation from the current group, so we take alloc_sem lock, to
1236 * block ext4_claim_inode until we are finished. 1100 * block ext4_claim_inode until we are finished.
1237 */ 1101 */
1238extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1102int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1239 int barrier) 1103 int barrier)
1240{ 1104{
1241 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1105 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
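
Under bigalloc the Orlov heuristics compare cluster counts, so the free-space average fed to them is derived from the cluster counter as in the hunks above. The conversion in isolation (illustrative values; EXT4_C2B modeled as a shift by s_cluster_bits):

#include <stdio.h>

int main(void)
{
	unsigned long long free_clusters = 51200;	/* s_freeclusters_counter */
	unsigned int cluster_bits = 2;			/* ratio 4 => C2B is << 2 */
	unsigned int ngroups = 100;

	unsigned long long freeb = free_clusters << cluster_bits;	/* clusters -> blocks */
	unsigned long long avefreec = freeb / ngroups;			/* per-group average */

	printf("avefreec = %llu\n", avefreec);	/* 2048 */
	return 0;
}
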
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642119c0..3cfc73fbca8e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
699 /* 699 /*
700 * Okay, we need to do block allocation. 700 * Okay, we need to do block allocation.
701 */ 701 */
702 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
703 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
704 EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
705 "non-extent mapped inodes with bigalloc");
706 return -ENOSPC;
707 }
708
702 goal = ext4_find_goal(inode, map->m_lblk, partial); 709 goal = ext4_find_goal(inode, map->m_lblk, partial);
703 710
704 /* the number of blocks need to allocate for [d,t]indirect blocks */ 711 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1343,7 +1350,9 @@ void ext4_ind_truncate(struct inode *inode)
1343 __le32 nr = 0; 1350 __le32 nr = 0;
1344 int n = 0; 1351 int n = 0;
1345 ext4_lblk_t last_block, max_block; 1352 ext4_lblk_t last_block, max_block;
1353 loff_t page_len;
1346 unsigned blocksize = inode->i_sb->s_blocksize; 1354 unsigned blocksize = inode->i_sb->s_blocksize;
1355 int err;
1347 1356
1348 handle = start_transaction(inode); 1357 handle = start_transaction(inode);
1349 if (IS_ERR(handle)) 1358 if (IS_ERR(handle))
@@ -1354,9 +1363,16 @@ void ext4_ind_truncate(struct inode *inode)
1354 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1363 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1355 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1364 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1356 1365
1357 if (inode->i_size & (blocksize - 1)) 1366 if (inode->i_size % PAGE_CACHE_SIZE != 0) {
1358 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 1367 page_len = PAGE_CACHE_SIZE -
1368 (inode->i_size & (PAGE_CACHE_SIZE - 1));
1369
1370 err = ext4_discard_partial_page_buffers(handle,
1371 mapping, inode->i_size, page_len, 0);
1372
1373 if (err)
1359 goto out_stop; 1374 goto out_stop;
1375 }
1360 1376
1361 if (last_block != max_block) { 1377 if (last_block != max_block) {
1362 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1378 n = ext4_block_to_path(inode, last_block, offsets, NULL);
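
The new guard in the allocation path above simply refuses block allocation for indirect-mapped inodes on bigalloc file systems, since cluster accounting is only implemented for extent-mapped files. A sketch of the check with the feature test modeled as a plain bitmask (the flag value here is illustrative):

#include <errno.h>
#include <stdio.h>

#define RO_COMPAT_BIGALLOC 0x0200	/* illustrative feature bit */

static int ind_try_allocate(unsigned int ro_compat, int create)
{
	if (create && (ro_compat & RO_COMPAT_BIGALLOC))
		return -ENOSPC;	/* no cluster accounting on this path */
	return 0;		/* proceed with indirect allocation */
}

int main(void)
{
	printf("%d\n", ind_try_allocate(RO_COMPAT_BIGALLOC, 1));	/* -ENOSPC */
	return 0;
}
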
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0defe0bfe019..f2419a15b81a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@
42#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
43#include "xattr.h" 43#include "xattr.h"
44#include "acl.h" 44#include "acl.h"
45#include "ext4_extents.h"
46#include "truncate.h" 45#include "truncate.h"
47 46
48#include <trace/events/ext4.h> 47#include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
268 struct ext4_inode_info *ei = EXT4_I(inode); 267 struct ext4_inode_info *ei = EXT4_I(inode);
269 268
270 spin_lock(&ei->i_block_reservation_lock); 269 spin_lock(&ei->i_block_reservation_lock);
271 trace_ext4_da_update_reserve_space(inode, used); 270 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
272 if (unlikely(used > ei->i_reserved_data_blocks)) { 271 if (unlikely(used > ei->i_reserved_data_blocks)) {
273 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 272 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
274 "with only %d reserved data blocks\n", 273 "with only %d reserved data blocks\n",
@@ -281,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
281 /* Update per-inode reservations */ 280 /* Update per-inode reservations */
282 ei->i_reserved_data_blocks -= used; 281 ei->i_reserved_data_blocks -= used;
283 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 282 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
284 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 283 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
285 used + ei->i_allocated_meta_blocks); 284 used + ei->i_allocated_meta_blocks);
286 ei->i_allocated_meta_blocks = 0; 285 ei->i_allocated_meta_blocks = 0;
287 286
@@ -291,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
291 * only when we have written all of the delayed 290 * only when we have written all of the delayed
292 * allocation blocks. 291 * allocation blocks.
293 */ 292 */
294 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 293 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
295 ei->i_reserved_meta_blocks); 294 ei->i_reserved_meta_blocks);
296 ei->i_reserved_meta_blocks = 0; 295 ei->i_reserved_meta_blocks = 0;
297 ei->i_da_metadata_calc_len = 0; 296 ei->i_da_metadata_calc_len = 0;
@@ -300,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
300 299
301 /* Update quota subsystem for data blocks */ 300 /* Update quota subsystem for data blocks */
302 if (quota_claim) 301 if (quota_claim)
303 dquot_claim_block(inode, used); 302 dquot_claim_block(inode, EXT4_C2B(sbi, used));
304 else { 303 else {
305 /* 304 /*
306 * We did fallocate with an offset that is already delayed 305 * We did fallocate with an offset that is already delayed
307 * allocated. So on delayed allocated writeback we should 306 * allocated. So on delayed allocated writeback we should
308 * not re-claim the quota for fallocated blocks. 307 * not re-claim the quota for fallocated blocks.
309 */ 308 */
310 dquot_release_reservation_block(inode, used); 309 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
311 } 310 }
312 311
313 /* 312 /*
@@ -399,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
399} 398}
400 399
401/* 400/*
401 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
402 */
403static void set_buffers_da_mapped(struct inode *inode,
404 struct ext4_map_blocks *map)
405{
406 struct address_space *mapping = inode->i_mapping;
407 struct pagevec pvec;
408 int i, nr_pages;
409 pgoff_t index, end;
410
411 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
412 end = (map->m_lblk + map->m_len - 1) >>
413 (PAGE_CACHE_SHIFT - inode->i_blkbits);
414
415 pagevec_init(&pvec, 0);
416 while (index <= end) {
417 nr_pages = pagevec_lookup(&pvec, mapping, index,
418 min(end - index + 1,
419 (pgoff_t)PAGEVEC_SIZE));
420 if (nr_pages == 0)
421 break;
422 for (i = 0; i < nr_pages; i++) {
423 struct page *page = pvec.pages[i];
424 struct buffer_head *bh, *head;
425
426 if (unlikely(page->mapping != mapping) ||
427 !PageDirty(page))
428 break;
429
430 if (page_has_buffers(page)) {
431 bh = head = page_buffers(page);
432 do {
433 set_buffer_da_mapped(bh);
434 bh = bh->b_this_page;
435 } while (bh != head);
436 }
437 index++;
438 }
439 pagevec_release(&pvec);
440 }
441}
442
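
The index arithmetic at the top of set_buffers_da_mapped() converts the mapped block range into the page range that covers it. In isolation (4K pages, 1K blocks, hence a shift of two):

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12, blkbits = 10;	/* 4K pages, 1K blocks */
	unsigned int m_lblk = 25, m_len = 10;		/* blocks 25..34 */

	unsigned long index = m_lblk >> (page_shift - blkbits);		/* page 6 */
	unsigned long end = (m_lblk + m_len - 1) >> (page_shift - blkbits);	/* page 8 */

	printf("walk pages %lu..%lu\n", index, end);
	return 0;
}
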
443/*
402 * The ext4_map_blocks() function tries to look up the requested blocks, 444 * The ext4_map_blocks() function tries to look up the requested blocks,
403 * and returns if the blocks are already mapped. 445 * and returns if the blocks are already mapped.
404 * 446 *
@@ -416,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
416 * the buffer head is mapped. 458 * the buffer head is mapped.
417 * 459 *
418 * It returns 0 if plain look up failed (blocks have not been allocated), in 460 * It returns 0 if plain look up failed (blocks have not been allocated), in
419 * that casem, buffer head is unmapped 461 * that case, buffer head is unmapped
420 * 462 *
421 * It returns the error in case of allocation failure. 463 * It returns the error in case of allocation failure.
422 */ 464 */
@@ -435,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
435 */ 477 */
436 down_read((&EXT4_I(inode)->i_data_sem)); 478 down_read((&EXT4_I(inode)->i_data_sem));
437 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 479 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
438 retval = ext4_ext_map_blocks(handle, inode, map, 0); 480 retval = ext4_ext_map_blocks(handle, inode, map, flags &
481 EXT4_GET_BLOCKS_KEEP_SIZE);
439 } else { 482 } else {
440 retval = ext4_ind_map_blocks(handle, inode, map, 0); 483 retval = ext4_ind_map_blocks(handle, inode, map, flags &
484 EXT4_GET_BLOCKS_KEEP_SIZE);
441 } 485 }
442 up_read((&EXT4_I(inode)->i_data_sem)); 486 up_read((&EXT4_I(inode)->i_data_sem));
443 487
@@ -455,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
455 * Returns if the blocks have already allocated 499 * Returns if the blocks have already allocated
456 * 500 *
457 * Note that if blocks have been preallocated 501 * Note that if blocks have been preallocated
458 * ext4_ext_get_block() returns th create = 0 502 * ext4_ext_get_block() returns the create = 0
459 * with buffer head unmapped. 503 * with buffer head unmapped.
460 */ 504 */
461 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 505 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
@@ -517,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
517 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 561 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
518 ext4_da_update_reserve_space(inode, retval, 1); 562 ext4_da_update_reserve_space(inode, retval, 1);
519 } 563 }
520 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 564 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
521 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 565 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
522 566
567 /* If we have successfully mapped the delayed allocated blocks,
568 * set the BH_Da_Mapped bit on them. Its important to do this
569 * under the protection of i_data_sem.
570 */
571 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
572 set_buffers_da_mapped(inode, map);
573 }
574
523 up_write((&EXT4_I(inode)->i_data_sem)); 575 up_write((&EXT4_I(inode)->i_data_sem));
524 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
525 int ret = check_block_validity(inode, map); 577 int ret = check_block_validity(inode, map);
@@ -909,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file,
909 ext4_orphan_add(handle, inode); 961 ext4_orphan_add(handle, inode);
910 if (ret2 < 0) 962 if (ret2 < 0)
911 ret = ret2; 963 ret = ret2;
964 } else {
965 unlock_page(page);
966 page_cache_release(page);
912 } 967 }
968
913 ret2 = ext4_journal_stop(handle); 969 ret2 = ext4_journal_stop(handle);
914 if (!ret) 970 if (!ret)
915 ret = ret2; 971 ret = ret2;
@@ -1037,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file,
1037} 1093}
1038 1094
1039/* 1095/*
1040 * Reserve a single block located at lblock 1096 * Reserve a single cluster located at lblock
1041 */ 1097 */
1042static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1098static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1043{ 1099{
1044 int retries = 0; 1100 int retries = 0;
1045 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1101 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1046 struct ext4_inode_info *ei = EXT4_I(inode); 1102 struct ext4_inode_info *ei = EXT4_I(inode);
1047 unsigned long md_needed; 1103 unsigned int md_needed;
1048 int ret; 1104 int ret;
1049 1105
1050 /* 1106 /*
@@ -1054,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1054 */ 1110 */
1055repeat: 1111repeat:
1056 spin_lock(&ei->i_block_reservation_lock); 1112 spin_lock(&ei->i_block_reservation_lock);
1057 md_needed = ext4_calc_metadata_amount(inode, lblock); 1113 md_needed = EXT4_NUM_B2C(sbi,
1114 ext4_calc_metadata_amount(inode, lblock));
1058 trace_ext4_da_reserve_space(inode, md_needed); 1115 trace_ext4_da_reserve_space(inode, md_needed);
1059 spin_unlock(&ei->i_block_reservation_lock); 1116 spin_unlock(&ei->i_block_reservation_lock);
1060 1117
@@ -1063,15 +1120,15 @@ repeat:
1063 * us from metadata over-estimation, though we may go over by 1120 * us from metadata over-estimation, though we may go over by
1064 * a small amount in the end. Here we just reserve for data. 1121 * a small amount in the end. Here we just reserve for data.
1065 */ 1122 */
1066 ret = dquot_reserve_block(inode, 1); 1123 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1067 if (ret) 1124 if (ret)
1068 return ret; 1125 return ret;
1069 /* 1126 /*
1070 * We do still charge estimated metadata to the sb though; 1127 * We do still charge estimated metadata to the sb though;
1071 * we cannot afford to run out of free blocks. 1128 * we cannot afford to run out of free blocks.
1072 */ 1129 */
1073 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1130 if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1074 dquot_release_reservation_block(inode, 1); 1131 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1075 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1132 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1076 yield(); 1133 yield();
1077 goto repeat; 1134 goto repeat;
@@ -1118,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1118 * We can release all of the reserved metadata blocks 1175 * We can release all of the reserved metadata blocks
1119 * only when we have written all of the delayed 1176 * only when we have written all of the delayed
1120 * allocation blocks. 1177 * allocation blocks.
1178 * Note that in case of bigalloc, i_reserved_meta_blocks,
1179 * i_reserved_data_blocks, etc. refer to number of clusters.
1121 */ 1180 */
1122 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1181 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1123 ei->i_reserved_meta_blocks); 1182 ei->i_reserved_meta_blocks);
1124 ei->i_reserved_meta_blocks = 0; 1183 ei->i_reserved_meta_blocks = 0;
1125 ei->i_da_metadata_calc_len = 0; 1184 ei->i_da_metadata_calc_len = 0;
1126 } 1185 }
1127 1186
1128 /* update fs dirty data blocks counter */ 1187 /* update fs dirty data blocks counter */
1129 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1188 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1130 1189
1131 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1190 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1132 1191
1133 dquot_release_reservation_block(inode, to_free); 1192 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1134} 1193}
1135 1194
1136static void ext4_da_page_release_reservation(struct page *page, 1195static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page,
1139 int to_release = 0; 1198 int to_release = 0;
1140 struct buffer_head *head, *bh; 1199 struct buffer_head *head, *bh;
1141 unsigned int curr_off = 0; 1200 unsigned int curr_off = 0;
1201 struct inode *inode = page->mapping->host;
1202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1203 int num_clusters;
1142 1204
1143 head = page_buffers(page); 1205 head = page_buffers(page);
1144 bh = head; 1206 bh = head;
@@ -1148,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page,
1148 if ((offset <= curr_off) && (buffer_delay(bh))) { 1210 if ((offset <= curr_off) && (buffer_delay(bh))) {
1149 to_release++; 1211 to_release++;
1150 clear_buffer_delay(bh); 1212 clear_buffer_delay(bh);
1213 clear_buffer_da_mapped(bh);
1151 } 1214 }
1152 curr_off = next_off; 1215 curr_off = next_off;
1153 } while ((bh = bh->b_this_page) != head); 1216 } while ((bh = bh->b_this_page) != head);
1154 ext4_da_release_space(page->mapping->host, to_release); 1217
1218 /* If we have released all the blocks belonging to a cluster, then we
1219 * need to release the reserved space for that cluster. */
1220 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1221 while (num_clusters > 0) {
1222 ext4_fsblk_t lblk;
1223 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1224 ((num_clusters - 1) << sbi->s_cluster_bits);
1225 if (sbi->s_cluster_ratio == 1 ||
1226 !ext4_find_delalloc_cluster(inode, lblk, 1))
1227 ext4_da_release_space(inode, 1);
1228
1229 num_clusters--;
1230 }
1155} 1231}
1156 1232
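
The release loop above derives, for each cluster spanned by the page, one representative logical block, and only drops the reservation when ext4_find_delalloc_cluster() confirms no other delayed buffer still lives in that cluster. The lblk derivation in isolation (4K pages, 1K blocks, cluster ratio 4):

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12, blkbits = 10, cluster_bits = 2;
	unsigned long page_index = 6;
	int to_release = 4;	/* delayed buffers released on this page */
	int num_clusters = (to_release + (1 << cluster_bits) - 1) >> cluster_bits;

	while (num_clusters > 0) {
		unsigned long long lblk =
			(page_index << (page_shift - blkbits)) +
			((unsigned long long)(num_clusters - 1) << cluster_bits);

		printf("check cluster containing lblk %llu\n", lblk);	/* 24 */
		num_clusters--;
	}
	return 0;
}
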
1157/* 1233/*
@@ -1253,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1253 clear_buffer_delay(bh); 1329 clear_buffer_delay(bh);
1254 bh->b_blocknr = pblock; 1330 bh->b_blocknr = pblock;
1255 } 1331 }
1332 if (buffer_da_mapped(bh))
1333 clear_buffer_da_mapped(bh);
1256 if (buffer_unwritten(bh) || 1334 if (buffer_unwritten(bh) ||
1257 buffer_mapped(bh)) 1335 buffer_mapped(bh))
1258 BUG_ON(bh->b_blocknr != pblock); 1336 BUG_ON(bh->b_blocknr != pblock);
@@ -1346,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode)
1346{ 1424{
1347 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1425 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1348 printk(KERN_CRIT "Total free blocks count %lld\n", 1426 printk(KERN_CRIT "Total free blocks count %lld\n",
1349 ext4_count_free_blocks(inode->i_sb)); 1427 EXT4_C2B(EXT4_SB(inode->i_sb),
1428 ext4_count_free_clusters(inode->i_sb)));
1350 printk(KERN_CRIT "Free/Dirty block details\n"); 1429 printk(KERN_CRIT "Free/Dirty block details\n");
1351 printk(KERN_CRIT "free_blocks=%lld\n", 1430 printk(KERN_CRIT "free_blocks=%lld\n",
1352 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 1431 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1432 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1353 printk(KERN_CRIT "dirty_blocks=%lld\n", 1433 printk(KERN_CRIT "dirty_blocks=%lld\n",
1354 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1434 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1435 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1355 printk(KERN_CRIT "Block reservation details\n"); 1436 printk(KERN_CRIT "Block reservation details\n");
1356 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1437 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
1357 EXT4_I(inode)->i_reserved_data_blocks); 1438 EXT4_I(inode)->i_reserved_data_blocks);
@@ -1430,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1430 if (err == -EAGAIN) 1511 if (err == -EAGAIN)
1431 goto submit_io; 1512 goto submit_io;
1432 1513
1433 if (err == -ENOSPC && 1514 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1434 ext4_count_free_blocks(sb)) {
1435 mpd->retval = err; 1515 mpd->retval = err;
1436 goto submit_io; 1516 goto submit_io;
1437 } 1517 }
@@ -1471,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1471 1551
1472 for (i = 0; i < map.m_len; i++) 1552 for (i = 0; i < map.m_len; i++)
1473 unmap_underlying_metadata(bdev, map.m_pblk + i); 1553 unmap_underlying_metadata(bdev, map.m_pblk + i);
1474 }
1475 1554
1476 if (ext4_should_order_data(mpd->inode)) { 1555 if (ext4_should_order_data(mpd->inode)) {
1477 err = ext4_jbd2_file_inode(handle, mpd->inode); 1556 err = ext4_jbd2_file_inode(handle, mpd->inode);
1478 if (err) 1557 if (err) {
1479 /* This only happens if the journal is aborted */ 1558 /* Only if the journal is aborted */
1480 return; 1559 mpd->retval = err;
1560 goto submit_io;
1561 }
1562 }
1481 } 1563 }
1482 1564
1483 /* 1565 /*
@@ -1584,6 +1666,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1584} 1666}
1585 1667
1586/* 1668/*
 1669 * This function grabs code from the very beginning of
 1670 * ext4_map_blocks, but assumes that the caller is on the delayed
 1671 * write path. It looks up the requested blocks and sets the
1672 * buffer delay bit under the protection of i_data_sem.
1673 */
1674static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1675 struct ext4_map_blocks *map,
1676 struct buffer_head *bh)
1677{
1678 int retval;
1679 sector_t invalid_block = ~((sector_t) 0xffff);
1680
1681 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1682 invalid_block = ~0;
1683
1684 map->m_flags = 0;
1685 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1686 "logical block %lu\n", inode->i_ino, map->m_len,
1687 (unsigned long) map->m_lblk);
1688 /*
1689 * Try to see if we can get the block without requesting a new
1690 * file system block.
1691 */
1692 down_read((&EXT4_I(inode)->i_data_sem));
1693 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1694 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1695 else
1696 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1697
1698 if (retval == 0) {
1699 /*
1700 * XXX: __block_prepare_write() unmaps passed block,
1701 * is it OK?
1702 */
 1703 /* If the block was allocated from a previously allocated cluster,
 1704 * then we don't need to reserve it again. */
1705 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
1706 retval = ext4_da_reserve_space(inode, iblock);
1707 if (retval)
1708 /* not enough space to reserve */
1709 goto out_unlock;
1710 }
1711
 1712 /* Clear the EXT4_MAP_FROM_CLUSTER flag since its purpose is served
 1713 * and it should not appear in bh->b_state.
1714 */
1715 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1716
1717 map_bh(bh, inode->i_sb, invalid_block);
1718 set_buffer_new(bh);
1719 set_buffer_delay(bh);
1720 }
1721
1722out_unlock:
1723 up_read((&EXT4_I(inode)->i_data_sem));
1724
1725 return retval;
1726}
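
A note on the locking here: i_data_sem is taken only for reading because this path never allocates on disk; the per-inode reservation bookkeeping is presumably serialized inside ext4_da_reserve_space() by i_block_reservation_lock, the same spinlock visible in the release path at the top of this patch.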
1727
1728/*
1587 * This is a special get_blocks_t callback which is used by 1729 * This is a special get_blocks_t callback which is used by
1588 * ext4_da_write_begin(). It will either return mapped block or 1730 * ext4_da_write_begin(). It will either return mapped block or
1589 * reserve space for a single block. 1731 * reserve space for a single block.
@@ -1600,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1600{ 1742{
1601 struct ext4_map_blocks map; 1743 struct ext4_map_blocks map;
1602 int ret = 0; 1744 int ret = 0;
1603 sector_t invalid_block = ~((sector_t) 0xffff);
1604
1605 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1606 invalid_block = ~0;
1607 1745
1608 BUG_ON(create == 0); 1746 BUG_ON(create == 0);
1609 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1747 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1616,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1616 * preallocated blocks are unmapped but should be treated 1754 * preallocated blocks are unmapped but should be treated
1617 * the same as allocated blocks. 1755 * the same as allocated blocks.
1618 */ 1756 */
1619 ret = ext4_map_blocks(NULL, inode, &map, 0); 1757 ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1620 if (ret < 0) 1758 if (ret <= 0)
1621 return ret; 1759 return ret;
1622 if (ret == 0) {
1623 if (buffer_delay(bh))
1624 return 0; /* Not sure this could or should happen */
1625 /*
1626 * XXX: __block_write_begin() unmaps passed block, is it OK?
1627 */
1628 ret = ext4_da_reserve_space(inode, iblock);
1629 if (ret)
1630 /* not enough space to reserve */
1631 return ret;
1632
1633 map_bh(bh, inode->i_sb, invalid_block);
1634 set_buffer_new(bh);
1635 set_buffer_delay(bh);
1636 return 0;
1637 }
1638 1760
1639 map_bh(bh, inode->i_sb, map.m_pblk); 1761 map_bh(bh, inode->i_sb, map.m_pblk);
1640 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1762 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -2050,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2050 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2172 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2051 pgoff_t done_index = 0; 2173 pgoff_t done_index = 0;
2052 pgoff_t end; 2174 pgoff_t end;
2175 struct blk_plug plug;
2053 2176
2054 trace_ext4_da_writepages(inode, wbc); 2177 trace_ext4_da_writepages(inode, wbc);
2055 2178
@@ -2128,6 +2251,7 @@ retry:
2128 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2251 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2129 tag_pages_for_writeback(mapping, index, end); 2252 tag_pages_for_writeback(mapping, index, end);
2130 2253
2254 blk_start_plug(&plug);
2131 while (!ret && wbc->nr_to_write > 0) { 2255 while (!ret && wbc->nr_to_write > 0) {
2132 2256
2133 /* 2257 /*
@@ -2178,11 +2302,12 @@ retry:
2178 ret = 0; 2302 ret = 0;
2179 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2303 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2180 /* 2304 /*
2181 * got one extent now try with 2305 * Got one extent, now try with the rest of the pages.
2182 * rest of the pages 2306 * If mpd.retval is set to -EIO, the journal is aborted,
 2307 * so we don't need to write any more.
2183 */ 2308 */
2184 pages_written += mpd.pages_written; 2309 pages_written += mpd.pages_written;
2185 ret = 0; 2310 ret = mpd.retval;
2186 io_done = 1; 2311 io_done = 1;
2187 } else if (wbc->nr_to_write) 2312 } else if (wbc->nr_to_write)
2188 /* 2313 /*
@@ -2192,6 +2317,7 @@ retry:
2192 */ 2317 */
2193 break; 2318 break;
2194 } 2319 }
2320 blk_finish_plug(&plug);
2195 if (!io_done && !cycled) { 2321 if (!io_done && !cycled) {
2196 cycled = 1; 2322 cycled = 1;
2197 index = 0; 2323 index = 0;
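
The two added lines bracket the writeback loop with a block-layer plug. In isolation, the idiom looks like this (a minimal sketch of the standard plugging API, not additional patch code):

	struct blk_plug plug;

	blk_start_plug(&plug);		/* queue bios privately to this task */
	/* ... submit writeback I/O for a batch of pages ... */
	blk_finish_plug(&plug);		/* release the batch to the request queue */

Plugging lets the many small bios generated per writeback pass be merged before they reach the device.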
@@ -2230,10 +2356,11 @@ static int ext4_nonda_switch(struct super_block *sb)
2230 * Delalloc need an accurate free block accounting. So switch 2356 * Delalloc need an accurate free block accounting. So switch
2231 * to non delalloc when we are near to error range. 2357 * to non delalloc when we are near to error range.
2232 */ 2358 */
2233 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2359 free_blocks = EXT4_C2B(sbi,
2234 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2360 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2361 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2235 if (2 * free_blocks < 3 * dirty_blocks || 2362 if (2 * free_blocks < 3 * dirty_blocks ||
2236 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2363 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2237 /* 2364 /*
2238 * free block count is less than 150% of dirty blocks 2365 * free block count is less than 150% of dirty blocks
2239 * or free blocks is less than watermark 2366 * or free blocks is less than watermark
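
A worked example with illustrative numbers: suppose the cluster ratio is 4 and 300 free clusters remain, so free_blocks = EXT4_C2B(sbi, 300) = 1200. If dirty_blocks = 900, then 2 * 1200 = 2400 < 3 * 900 = 2700, i.e. free space has dropped below 150% of the outstanding dirty reservation, and the write falls back to non-delayed allocation.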
@@ -2259,6 +2386,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2259 pgoff_t index; 2386 pgoff_t index;
2260 struct inode *inode = mapping->host; 2387 struct inode *inode = mapping->host;
2261 handle_t *handle; 2388 handle_t *handle;
2389 loff_t page_len;
2262 2390
2263 index = pos >> PAGE_CACHE_SHIFT; 2391 index = pos >> PAGE_CACHE_SHIFT;
2264 2392
@@ -2305,6 +2433,13 @@ retry:
2305 */ 2433 */
2306 if (pos + len > inode->i_size) 2434 if (pos + len > inode->i_size)
2307 ext4_truncate_failed_write(inode); 2435 ext4_truncate_failed_write(inode);
2436 } else {
2437 page_len = pos & (PAGE_CACHE_SIZE - 1);
2438 if (page_len > 0) {
2439 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2440 inode, page, pos - page_len, page_len,
2441 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2442 }
2308 } 2443 }
2309 2444
2310 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2445 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2347,6 +2482,7 @@ static int ext4_da_write_end(struct file *file,
2347 loff_t new_i_size; 2482 loff_t new_i_size;
2348 unsigned long start, end; 2483 unsigned long start, end;
2349 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2485 loff_t page_len;
2350 2486
2351 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2487 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2352 if (ext4_should_order_data(inode)) { 2488 if (ext4_should_order_data(inode)) {
@@ -2395,6 +2531,16 @@ static int ext4_da_write_end(struct file *file,
2395 } 2531 }
2396 ret2 = generic_write_end(file, mapping, pos, len, copied, 2532 ret2 = generic_write_end(file, mapping, pos, len, copied,
2397 page, fsdata); 2533 page, fsdata);
2534
2535 page_len = PAGE_CACHE_SIZE -
2536 ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
2537
2538 if (page_len > 0) {
2539 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2540 inode, page, pos + copied - 1, page_len,
2541 EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2542 }
2543
2398 copied = ret2; 2544 copied = ret2;
2399 if (ret2 < 0) 2545 if (ret2 < 0)
2400 ret = ret2; 2546 ret = ret2;
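
To make the tail arithmetic concrete (illustrative numbers, PAGE_CACHE_SIZE = 4096): if pos + copied ends at byte 3000 of a page, then page_len = 4096 - (2999 & 4095) = 1097, and the call discards from file offset pos + copied - 1 to the end of that page, zeroing only the regions whose buffer heads are still unmapped.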
@@ -2689,10 +2835,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2689 * but being more careful is always safe for future changes. 2835 * but being more careful is always safe for future changes.
2690 */ 2836 */
2691 inode = io_end->inode; 2837 inode = io_end->inode;
2692 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 2838 ext4_set_io_unwritten_flag(inode, io_end);
2693 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2694 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2695 }
2696 2839
2697 /* Add the io_end to per-inode completed io list*/ 2840 /* Add the io_end to per-inode completed io list*/
2698 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2841 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -2858,6 +3001,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2858 struct inode *inode = file->f_mapping->host; 3001 struct inode *inode = file->f_mapping->host;
2859 ssize_t ret; 3002 ssize_t ret;
2860 3003
3004 /*
 3005 * If we are doing data journalling, we don't support O_DIRECT
3006 */
3007 if (ext4_should_journal_data(inode))
3008 return 0;
3009
2861 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3010 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
2862 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3011 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2863 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3012 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
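
Returning 0 from ->direct_IO when the inode is journalling data lets the generic O_DIRECT paths fall back to buffered I/O instead of failing outright, which is why the next hunk also wires .direct_IO into ext4_journalled_aops.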
@@ -2927,6 +3076,7 @@ static const struct address_space_operations ext4_journalled_aops = {
2927 .bmap = ext4_bmap, 3076 .bmap = ext4_bmap,
2928 .invalidatepage = ext4_invalidatepage, 3077 .invalidatepage = ext4_invalidatepage,
2929 .releasepage = ext4_releasepage, 3078 .releasepage = ext4_releasepage,
3079 .direct_IO = ext4_direct_IO,
2930 .is_partially_uptodate = block_is_partially_uptodate, 3080 .is_partially_uptodate = block_is_partially_uptodate,
2931 .error_remove_page = generic_error_remove_page, 3081 .error_remove_page = generic_error_remove_page,
2932}; 3082};
@@ -2963,6 +3113,227 @@ void ext4_set_aops(struct inode *inode)
2963 inode->i_mapping->a_ops = &ext4_journalled_aops; 3113 inode->i_mapping->a_ops = &ext4_journalled_aops;
2964} 3114}
2965 3115
3116
3117/*
3118 * ext4_discard_partial_page_buffers()
3119 * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
3120 * This function finds and locks the page containing the offset
3121 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
3122 * Calling functions that already have the page locked should call
3123 * ext4_discard_partial_page_buffers_no_lock directly.
3124 */
3125int ext4_discard_partial_page_buffers(handle_t *handle,
3126 struct address_space *mapping, loff_t from,
3127 loff_t length, int flags)
3128{
3129 struct inode *inode = mapping->host;
3130 struct page *page;
3131 int err = 0;
3132
3133 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3134 mapping_gfp_mask(mapping) & ~__GFP_FS);
3135 if (!page)
3136 return -ENOMEM;
3137
3138 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3139 from, length, flags);
3140
3141 unlock_page(page);
3142 page_cache_release(page);
3143 return err;
3144}
3145
3146/*
3147 * ext4_discard_partial_page_buffers_no_lock()
3148 * Zeros a page range of length 'length' starting from offset 'from'.
 3149 * Buffer heads that correspond to the block-aligned regions of the
 3150 * zeroed range will be unmapped. Non-block-aligned regions
3151 * will have the corresponding buffer head mapped if needed so that
3152 * that region of the page can be updated with the partial zero out.
3153 *
 3154 * This function assumes that the page has already been locked.
 3155 * The range to be discarded must be contained within the given page.
 3156 * If the specified range exceeds the end of the page, it will be shortened
 3157 * to the end of the page that corresponds to 'from'. This function is
 3158 * appropriate for updating a page and its buffer heads to be unmapped and
3159 * zeroed for blocks that have been either released, or are going to be
3160 * released.
3161 *
3162 * handle: The journal handle
3163 * inode: The files inode
3164 * page: A locked page that contains the offset "from"
 3165 * from: The starting byte offset (from the beginning of the file)
3166 * to begin discarding
 3167 * length: The number of bytes to discard
3168 * flags: Optional flags that may be used:
3169 *
3170 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3171 * Only zero the regions of the page whose buffer heads
3172 * have already been unmapped. This flag is appropriate
 3173 * for updating the contents of a page whose blocks may
3174 * have already been released, and we only want to zero
3175 * out the regions that correspond to those released blocks.
3176 *
 3177 * Returns zero on success or negative on failure.
3178 */
3179int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3180 struct inode *inode, struct page *page, loff_t from,
3181 loff_t length, int flags)
3182{
3183 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3184 unsigned int offset = from & (PAGE_CACHE_SIZE-1);
3185 unsigned int blocksize, max, pos;
3186 ext4_lblk_t iblock;
3187 struct buffer_head *bh;
3188 int err = 0;
3189
3190 blocksize = inode->i_sb->s_blocksize;
3191 max = PAGE_CACHE_SIZE - offset;
3192
3193 if (index != page->index)
3194 return -EINVAL;
3195
3196 /*
3197 * correct length if it does not fall between
3198 * 'from' and the end of the page
3199 */
3200 if (length > max || length < 0)
3201 length = max;
3202
3203 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3204
3205 if (!page_has_buffers(page)) {
3206 /*
3207 * If the range to be discarded covers a partial block
3208 * we need to get the page buffers. This is because
3209 * partial blocks cannot be released and the page needs
3210 * to be updated with the contents of the block before
3211 * we write the zeros on top of it.
3212 */
3213 if ((from & (blocksize - 1)) ||
3214 ((from + length) & (blocksize - 1))) {
3215 create_empty_buffers(page, blocksize, 0);
3216 } else {
3217 /*
3218 * If there are no partial blocks,
3219 * there is nothing to update,
3220 * so we can return now
3221 */
3222 return 0;
3223 }
3224 }
3225
3226 /* Find the buffer that contains "offset" */
3227 bh = page_buffers(page);
3228 pos = blocksize;
3229 while (offset >= pos) {
3230 bh = bh->b_this_page;
3231 iblock++;
3232 pos += blocksize;
3233 }
3234
3235 pos = offset;
3236 while (pos < offset + length) {
3237 unsigned int end_of_block, range_to_discard;
3238
3239 err = 0;
3240
3241 /* The length of space left to zero and unmap */
3242 range_to_discard = offset + length - pos;
3243
3244 /* The length of space until the end of the block */
3245 end_of_block = blocksize - (pos & (blocksize-1));
3246
3247 /*
3248 * Do not unmap or zero past end of block
3249 * for this buffer head
3250 */
3251 if (range_to_discard > end_of_block)
3252 range_to_discard = end_of_block;
3253
3254
3255 /*
 3256 * Skip this buffer head if we are only zeroing unmapped
3257 * regions of the page
3258 */
3259 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3260 buffer_mapped(bh))
3261 goto next;
3262
3263 /* If the range is block aligned, unmap */
3264 if (range_to_discard == blocksize) {
3265 clear_buffer_dirty(bh);
3266 bh->b_bdev = NULL;
3267 clear_buffer_mapped(bh);
3268 clear_buffer_req(bh);
3269 clear_buffer_new(bh);
3270 clear_buffer_delay(bh);
3271 clear_buffer_unwritten(bh);
3272 clear_buffer_uptodate(bh);
3273 zero_user(page, pos, range_to_discard);
3274 BUFFER_TRACE(bh, "Buffer discarded");
3275 goto next;
3276 }
3277
3278 /*
3279 * If this block is not completely contained in the range
3280 * to be discarded, then it is not going to be released. Because
3281 * we need to keep this block, we need to make sure this part
 3282 * of the page is uptodate before we modify it by writing
3283 * partial zeros on it.
3284 */
3285 if (!buffer_mapped(bh)) {
3286 /*
3287 * Buffer head must be mapped before we can read
3288 * from the block
3289 */
3290 BUFFER_TRACE(bh, "unmapped");
3291 ext4_get_block(inode, iblock, bh, 0);
3292 /* unmapped? It's a hole - nothing to do */
3293 if (!buffer_mapped(bh)) {
3294 BUFFER_TRACE(bh, "still unmapped");
3295 goto next;
3296 }
3297 }
3298
3299 /* Ok, it's mapped. Make sure it's up-to-date */
3300 if (PageUptodate(page))
3301 set_buffer_uptodate(bh);
3302
3303 if (!buffer_uptodate(bh)) {
3304 err = -EIO;
3305 ll_rw_block(READ, 1, &bh);
3306 wait_on_buffer(bh);
3307 /* Uhhuh. Read error. Complain and punt.*/
3308 if (!buffer_uptodate(bh))
3309 goto next;
3310 }
3311
3312 if (ext4_should_journal_data(inode)) {
3313 BUFFER_TRACE(bh, "get write access");
3314 err = ext4_journal_get_write_access(handle, bh);
3315 if (err)
3316 goto next;
3317 }
3318
3319 zero_user(page, pos, range_to_discard);
3320
3321 err = 0;
3322 if (ext4_should_journal_data(inode)) {
3323 err = ext4_handle_dirty_metadata(handle, inode, bh);
3324 } else
3325 mark_buffer_dirty(bh);
3326
3327 BUFFER_TRACE(bh, "Partial buffer zeroed");
3328next:
3329 bh = bh->b_this_page;
3330 iblock++;
3331 pos += range_to_discard;
3332 }
3333
3334 return err;
3335}
3336
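
A caller-side sketch of the locked-page variant above (handle, inode, page and pos are hypothetical locals; the page must already be locked and contain the byte offset pos):

	/* Zero whatever unmapped buffers remain in the tail of the page. */
	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
			pos, PAGE_CACHE_SIZE - (pos & (PAGE_CACHE_SIZE - 1)),
			EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);

This mirrors in shape how ext4_da_write_begin() and ext4_da_write_end() use the helper earlier in the patch.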
2966/* 3337/*
2967 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3338 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
2968 * up to the end of the block which corresponds to `from'. 3339 * up to the end of the block which corresponds to `from'.
@@ -3005,7 +3376,7 @@ int ext4_block_zero_page_range(handle_t *handle,
3005 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3376 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3006 mapping_gfp_mask(mapping) & ~__GFP_FS); 3377 mapping_gfp_mask(mapping) & ~__GFP_FS);
3007 if (!page) 3378 if (!page)
3008 return -EINVAL; 3379 return -ENOMEM;
3009 3380
3010 blocksize = inode->i_sb->s_blocksize; 3381 blocksize = inode->i_sb->s_blocksize;
3011 max = blocksize - (offset & (blocksize - 1)); 3382 max = blocksize - (offset & (blocksize - 1));
@@ -3074,11 +3445,8 @@ int ext4_block_zero_page_range(handle_t *handle,
3074 err = 0; 3445 err = 0;
3075 if (ext4_should_journal_data(inode)) { 3446 if (ext4_should_journal_data(inode)) {
3076 err = ext4_handle_dirty_metadata(handle, inode, bh); 3447 err = ext4_handle_dirty_metadata(handle, inode, bh);
3077 } else { 3448 } else
3078 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
3079 err = ext4_jbd2_file_inode(handle, inode);
3080 mark_buffer_dirty(bh); 3449 mark_buffer_dirty(bh);
3081 }
3082 3450
3083unlock: 3451unlock:
3084 unlock_page(page); 3452 unlock_page(page);
@@ -3119,6 +3487,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3119 return -ENOTSUPP; 3487 return -ENOTSUPP;
3120 } 3488 }
3121 3489
3490 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3491 /* TODO: Add support for bigalloc file systems */
3492 return -ENOTSUPP;
3493 }
3494
3122 return ext4_ext_punch_hole(file, offset, length); 3495 return ext4_ext_punch_hole(file, offset, length);
3123} 3496}
3124 3497
@@ -4420,6 +4793,7 @@ retry_alloc:
4420 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4793 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4421 unlock_page(page); 4794 unlock_page(page);
4422 ret = VM_FAULT_SIGBUS; 4795 ret = VM_FAULT_SIGBUS;
4796 ext4_journal_stop(handle);
4423 goto out; 4797 goto out;
4424 } 4798 }
4425 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 4799 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
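
The added ext4_journal_stop() closes the handle that was started for the journalled-data walk; without it, the transaction handle would leak every time getting write access fails and the fault returns VM_FAULT_SIGBUS.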
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f18bfe37aff8..a56796814d6a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -21,6 +21,7 @@
21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22{ 22{
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct super_block *sb = inode->i_sb;
24 struct ext4_inode_info *ei = EXT4_I(inode); 25 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 26 unsigned int flags;
26 27
@@ -173,33 +174,8 @@ setversion_out:
173 mnt_drop_write(filp->f_path.mnt); 174 mnt_drop_write(filp->f_path.mnt);
174 return err; 175 return err;
175 } 176 }
176#ifdef CONFIG_JBD2_DEBUG
177 case EXT4_IOC_WAIT_FOR_READONLY:
178 /*
179 * This is racy - by the time we're woken up and running,
180 * the superblock could be released. And the module could
181 * have been unloaded. So sue me.
182 *
183 * Returns 1 if it slept, else zero.
184 */
185 {
186 struct super_block *sb = inode->i_sb;
187 DECLARE_WAITQUEUE(wait, current);
188 int ret = 0;
189
190 set_current_state(TASK_INTERRUPTIBLE);
191 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
192 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
193 schedule();
194 ret = 1;
195 }
196 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
197 return ret;
198 }
199#endif
200 case EXT4_IOC_GROUP_EXTEND: { 177 case EXT4_IOC_GROUP_EXTEND: {
201 ext4_fsblk_t n_blocks_count; 178 ext4_fsblk_t n_blocks_count;
202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 179 int err, err2=0;
204 180
205 err = ext4_resize_begin(sb); 181 err = ext4_resize_begin(sb);
@@ -209,6 +185,13 @@ setversion_out:
209 if (get_user(n_blocks_count, (__u32 __user *)arg)) 185 if (get_user(n_blocks_count, (__u32 __user *)arg))
210 return -EFAULT; 186 return -EFAULT;
211 187
188 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
189 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
190 ext4_msg(sb, KERN_ERR,
191 "Online resizing not supported with bigalloc");
192 return -EOPNOTSUPP;
193 }
194
212 err = mnt_want_write(filp->f_path.mnt); 195 err = mnt_want_write(filp->f_path.mnt);
213 if (err) 196 if (err)
214 return err; 197 return err;
@@ -250,6 +233,13 @@ setversion_out:
250 goto mext_out; 233 goto mext_out;
251 } 234 }
252 235
236 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
237 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
238 ext4_msg(sb, KERN_ERR,
239 "Online defrag not supported with bigalloc");
240 return -EOPNOTSUPP;
241 }
242
253 err = mnt_want_write(filp->f_path.mnt); 243 err = mnt_want_write(filp->f_path.mnt);
254 if (err) 244 if (err)
255 goto mext_out; 245 goto mext_out;
@@ -270,7 +260,6 @@ mext_out:
270 260
271 case EXT4_IOC_GROUP_ADD: { 261 case EXT4_IOC_GROUP_ADD: {
272 struct ext4_new_group_data input; 262 struct ext4_new_group_data input;
273 struct super_block *sb = inode->i_sb;
274 int err, err2=0; 263 int err, err2=0;
275 264
276 err = ext4_resize_begin(sb); 265 err = ext4_resize_begin(sb);
@@ -281,6 +270,13 @@ mext_out:
281 sizeof(input))) 270 sizeof(input)))
282 return -EFAULT; 271 return -EFAULT;
283 272
273 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
274 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
275 ext4_msg(sb, KERN_ERR,
276 "Online resizing not supported with bigalloc");
277 return -EOPNOTSUPP;
278 }
279
284 err = mnt_want_write(filp->f_path.mnt); 280 err = mnt_want_write(filp->f_path.mnt);
285 if (err) 281 if (err)
286 return err; 282 return err;
@@ -337,7 +333,6 @@ mext_out:
337 333
338 case FITRIM: 334 case FITRIM:
339 { 335 {
340 struct super_block *sb = inode->i_sb;
341 struct request_queue *q = bdev_get_queue(sb->s_bdev); 336 struct request_queue *q = bdev_get_queue(sb->s_bdev);
342 struct fstrim_range range; 337 struct fstrim_range range;
343 int ret = 0; 338 int ret = 0;
@@ -348,7 +343,14 @@ mext_out:
348 if (!blk_queue_discard(q)) 343 if (!blk_queue_discard(q))
349 return -EOPNOTSUPP; 344 return -EOPNOTSUPP;
350 345
351 if (copy_from_user(&range, (struct fstrim_range *)arg, 346 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
347 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
348 ext4_msg(sb, KERN_ERR,
349 "FITRIM not supported with bigalloc");
350 return -EOPNOTSUPP;
351 }
352
353 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
352 sizeof(range))) 354 sizeof(range)))
353 return -EFAULT; 355 return -EFAULT;
354 356
@@ -358,7 +360,7 @@ mext_out:
358 if (ret < 0) 360 if (ret < 0)
359 return ret; 361 return ret;
360 362
361 if (copy_to_user((struct fstrim_range *)arg, &range, 363 if (copy_to_user((struct fstrim_range __user *)arg, &range,
362 sizeof(range))) 364 sizeof(range)))
363 return -EFAULT; 365 return -EFAULT;
364 366
@@ -396,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
396 case EXT4_IOC32_SETVERSION_OLD: 398 case EXT4_IOC32_SETVERSION_OLD:
397 cmd = EXT4_IOC_SETVERSION_OLD; 399 cmd = EXT4_IOC_SETVERSION_OLD;
398 break; 400 break;
399#ifdef CONFIG_JBD2_DEBUG
400 case EXT4_IOC32_WAIT_FOR_READONLY:
401 cmd = EXT4_IOC_WAIT_FOR_READONLY;
402 break;
403#endif
404 case EXT4_IOC32_GETRSVSZ: 401 case EXT4_IOC32_GETRSVSZ:
405 cmd = EXT4_IOC_GETRSVSZ; 402 cmd = EXT4_IOC_GETRSVSZ;
406 break; 403 break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57c415a..e2d8be8f28bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -70,8 +70,8 @@
70 * 70 *
71 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
72 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
73 * pa_len -> length for this prealloc space 73 * pa_len -> length for this prealloc space (in clusters)
74 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space (in clusters)
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
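
Because pa_len is now counted in clusters, every later hunk that needs a preallocation's logical end first converts back to blocks; the conversion has this shape (a sketch using the patch's own macros):

	pa_end = pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);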
@@ -126,7 +126,8 @@
126 * list. In case of inode preallocation we follow a list of heuristics 126 * list. In case of inode preallocation we follow a list of heuristics
127 * based on file size. This can be found in ext4_mb_normalize_request. If 127 * based on file size. This can be found in ext4_mb_normalize_request. If
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
130 * dependent on the cluster size; for non-bigalloc file systems, it is
130 * 512 blocks. This can be tuned via 131 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in 132 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 133 * terms of number of blocks. If we have mounted the file system with -O
@@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
459 ext4_fsblk_t blocknr; 460 ext4_fsblk_t blocknr;
460 461
461 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 462 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
462 blocknr += first + i; 463 blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
463 ext4_grp_locked_error(sb, e4b->bd_group, 464 ext4_grp_locked_error(sb, e4b->bd_group,
464 inode ? inode->i_ino : 0, 465 inode ? inode->i_ino : 0,
465 blocknr, 466 blocknr,
@@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
580 continue; 581 continue;
581 } 582 }
582 583
583 /* both bits in buddy2 must be 0 */ 584 /* both bits in buddy2 must be 1 */
584 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 585 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
585 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 586 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
586 587
@@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
653 ext4_grpblk_t chunk; 654 ext4_grpblk_t chunk;
654 unsigned short border; 655 unsigned short border;
655 656
656 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 657 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
657 658
658 border = 2 << sb->s_blocksize_bits; 659 border = 2 << sb->s_blocksize_bits;
659 660
@@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
705 void *buddy, void *bitmap, ext4_group_t group) 706 void *buddy, void *bitmap, ext4_group_t group)
706{ 707{
707 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 708 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
708 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); 709 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
709 ext4_grpblk_t i = 0; 710 ext4_grpblk_t i = 0;
710 ext4_grpblk_t first; 711 ext4_grpblk_t first;
711 ext4_grpblk_t len; 712 ext4_grpblk_t len;
@@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
734 735
735 if (free != grp->bb_free) { 736 if (free != grp->bb_free) {
736 ext4_grp_locked_error(sb, group, 0, 0, 737 ext4_grp_locked_error(sb, group, 0, 0,
737 "%u blocks in bitmap, %u in gd", 738 "%u clusters in bitmap, %u in gd",
738 free, grp->bb_free); 739 free, grp->bb_free);
739 /* 740 /*
740 * If we intend to continue, we consider the group descriptor 741 * If we intend to continue, we consider the group descriptor
@@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1339 ext4_fsblk_t blocknr; 1340 ext4_fsblk_t blocknr;
1340 1341
1341 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1342 blocknr += block; 1343 blocknr += EXT4_C2B(EXT4_SB(sb), block);
1343 ext4_grp_locked_error(sb, e4b->bd_group, 1344 ext4_grp_locked_error(sb, e4b->bd_group,
1344 inode ? inode->i_ino : 0, 1345 inode ? inode->i_ino : 0,
1345 blocknr, 1346 blocknr,
@@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1390{ 1391{
1391 int next = block; 1392 int next = block;
1392 int max; 1393 int max;
1393 int ord;
1394 void *buddy; 1394 void *buddy;
1395 1395
1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1396 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1433 break; 1433 break;
1434 1434
1435 ord = mb_find_order_for_block(e4b, next); 1435 order = mb_find_order_for_block(e4b, next);
1436 1436
1437 order = ord;
1438 block = next >> order; 1437 block = next >> order;
1439 ex->fe_len += 1 << order; 1438 ex->fe_len += 1 << order;
1440 } 1439 }
@@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1624 struct ext4_free_extent *gex = &ac->ac_g_ex; 1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1625 1624
1626 BUG_ON(ex->fe_len <= 0); 1625 BUG_ON(ex->fe_len <= 0);
1627 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1626 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1627 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1629 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1630 1629
1631 ac->ac_found++; 1630 ac->ac_found++;
@@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1823 1822
1824 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1823 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1825 i = mb_find_next_zero_bit(bitmap, 1824 i = mb_find_next_zero_bit(bitmap,
1826 EXT4_BLOCKS_PER_GROUP(sb), i); 1825 EXT4_CLUSTERS_PER_GROUP(sb), i);
1827 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1826 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1828 /* 1827 /*
1829 * If we have a corrupt bitmap, we won't find any 1828 * If we have a corrupt bitmap, we won't find any
1830 * free blocks even though group info says 1829 * free blocks even though group info says
1831 * we have free blocks 1830 * we have free blocks
1832 */ 1831 */
1833 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1832 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1834 "%d free blocks as per " 1833 "%d free clusters as per "
1835 "group info. But bitmap says 0", 1834 "group info. But bitmap says 0",
1836 free); 1835 free);
1837 break; 1836 break;
@@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1841 BUG_ON(ex.fe_len <= 0); 1840 BUG_ON(ex.fe_len <= 0);
1842 if (free < ex.fe_len) { 1841 if (free < ex.fe_len) {
1843 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1842 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1844 "%d free blocks as per " 1843 "%d free clusters as per "
1845 "group info. But got %d blocks", 1844 "group info. But got %d blocks",
1846 free, ex.fe_len); 1845 free, ex.fe_len);
1847 /* 1846 /*
@@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1887 do_div(a, sbi->s_stripe); 1886 do_div(a, sbi->s_stripe);
1888 i = (a * sbi->s_stripe) - first_group_block; 1887 i = (a * sbi->s_stripe) - first_group_block;
1889 1888
1890 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1889 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1891 if (!mb_test_bit(i, bitmap)) { 1890 if (!mb_test_bit(i, bitmap)) {
1892 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1891 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1893 if (max >= sbi->s_stripe) { 1892 if (max >= sbi->s_stripe) {
@@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2252 */ 2251 */
2253 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2252 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2254 meta_group_info[i]->bb_free = 2253 meta_group_info[i]->bb_free =
2255 ext4_free_blocks_after_init(sb, group, desc); 2254 ext4_free_clusters_after_init(sb, group, desc);
2256 } else { 2255 } else {
2257 meta_group_info[i]->bb_free = 2256 meta_group_info[i]->bb_free =
2258 ext4_free_blks_count(sb, desc); 2257 ext4_free_group_clusters(sb, desc);
2259 } 2258 }
2260 2259
2261 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2260 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2473 sbi->s_mb_stats = MB_DEFAULT_STATS; 2472 sbi->s_mb_stats = MB_DEFAULT_STATS;
2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2473 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2474 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2475 /*
2476 * The default group preallocation is 512, which for 4k block
 2477 * sizes translates to 2 megabytes. However, for bigalloc file
 2478 * systems, this is probably too big (i.e., if the cluster size
 2479 * is 1 megabyte, then the group preallocation size becomes half a
 2480 * gigabyte!). As a default, we will keep a two megabyte
 2481 * group prealloc size for cluster sizes up to 64k, and after
2482 * that, we will force a minimum group preallocation size of
2483 * 32 clusters. This translates to 8 megs when the cluster
2484 * size is 256k, and 32 megs when the cluster size is 1 meg,
2485 * which seems reasonable as a default.
2486 */
2487 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2488 sbi->s_cluster_bits, 32);
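
Checking the formula against the comment's own numbers (4k blocks, MB_DEFAULT_GROUP_PREALLOC = 512): a 64k cluster has s_cluster_bits = 4, and 512 >> 4 = 32 clusters = 2 MB; a 256k cluster gives 512 >> 6 = 8, raised to the floor of 32 clusters = 8 MB; a 1 MB cluster gives 512 >> 8 = 2, raised to 32 clusters = 32 MB.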
2477 /* 2489 /*
2478 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 2490 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than 2491 * to the lowest multiple of s_stripe which is bigger than
@@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2502 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2491 if (sbi->s_locality_groups == NULL) { 2503 if (sbi->s_locality_groups == NULL) {
2492 ret = -ENOMEM; 2504 ret = -ENOMEM;
2493 goto out; 2505 goto out_free_groupinfo_slab;
2494 } 2506 }
2495 for_each_possible_cpu(i) { 2507 for_each_possible_cpu(i) {
2496 struct ext4_locality_group *lg; 2508 struct ext4_locality_group *lg;
@@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2503 2515
2504 /* init file for buddy data */ 2516 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) { 2518 if (ret != 0)
2507 goto out; 2519 goto out_free_locality_groups;
2508 }
2509 2520
2510 if (sbi->s_proc) 2521 if (sbi->s_proc)
2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
@@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2513 2524
2514 if (sbi->s_journal) 2525 if (sbi->s_journal)
2515 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0;
2529
2530out_free_locality_groups:
2531 free_percpu(sbi->s_locality_groups);
2532 sbi->s_locality_groups = NULL;
2533out_free_groupinfo_slab:
2534 ext4_groupinfo_destroy_slabs();
2516out: 2535out:
2517 if (ret) { 2536 kfree(sbi->s_mb_offsets);
2518 kfree(sbi->s_mb_offsets); 2537 sbi->s_mb_offsets = NULL;
2519 kfree(sbi->s_mb_maxs); 2538 kfree(sbi->s_mb_maxs);
2520 } 2539 sbi->s_mb_maxs = NULL;
2521 return ret; 2540 return ret;
2522} 2541}
2523 2542
@@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb)
2602} 2621}
2603 2622
2604static inline int ext4_issue_discard(struct super_block *sb, 2623static inline int ext4_issue_discard(struct super_block *sb,
2605 ext4_group_t block_group, ext4_grpblk_t block, int count) 2624 ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2606{ 2625{
2607 ext4_fsblk_t discard_block; 2626 ext4_fsblk_t discard_block;
2608 2627
2609 discard_block = block + ext4_group_first_block_no(sb, block_group); 2628 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2629 ext4_group_first_block_no(sb, block_group));
2630 count = EXT4_C2B(EXT4_SB(sb), count);
2610 trace_ext4_discard_blocks(sb, 2631 trace_ext4_discard_blocks(sb,
2611 (unsigned long long) discard_block, count); 2632 (unsigned long long) discard_block, count);
2612 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2633 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
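
Worked example with illustrative numbers: at a cluster ratio of 16, discarding count = 2 starting at cluster = 10 in a group whose first block is 32768 issues a discard at block 32768 + EXT4_C2B(sbi, 10) = 32928 covering EXT4_C2B(sbi, 2) = 32 blocks.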
@@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2633 2654
2634 if (test_opt(sb, DISCARD)) 2655 if (test_opt(sb, DISCARD))
2635 ext4_issue_discard(sb, entry->group, 2656 ext4_issue_discard(sb, entry->group,
2636 entry->start_blk, entry->count); 2657 entry->start_cluster, entry->count);
2637 2658
2638 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2639 /* we expect to find existing buddy because it's pinned */ 2660 /* we expect to find existing buddy because it's pinned */
@@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2646 ext4_lock_group(sb, entry->group); 2667 ext4_lock_group(sb, entry->group);
2647 /* Take it out of per group rb tree */ 2668 /* Take it out of per group rb tree */
2648 rb_erase(&entry->node, &(db->bb_free_root)); 2669 rb_erase(&entry->node, &(db->bb_free_root));
2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
2650 2671
2651 /* 2672 /*
2652 * Clear the trimmed flag for the group so that the next 2673 * Clear the trimmed flag for the group so that the next
@@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void)
2752 */ 2773 */
2753static noinline_for_stack int 2774static noinline_for_stack int
2754ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2775ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2755 handle_t *handle, unsigned int reserv_blks) 2776 handle_t *handle, unsigned int reserv_clstrs)
2756{ 2777{
2757 struct buffer_head *bitmap_bh = NULL; 2778 struct buffer_head *bitmap_bh = NULL;
2758 struct ext4_group_desc *gdp; 2779 struct ext4_group_desc *gdp;
@@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2783 goto out_err; 2804 goto out_err;
2784 2805
2785 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2806 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2786 ext4_free_blks_count(sb, gdp)); 2807 ext4_free_group_clusters(sb, gdp));
2787 2808
2788 err = ext4_journal_get_write_access(handle, gdp_bh); 2809 err = ext4_journal_get_write_access(handle, gdp_bh);
2789 if (err) 2810 if (err)
@@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2791 2812
2792 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2813 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2793 2814
2794 len = ac->ac_b_ex.fe_len; 2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2795 if (!ext4_data_block_valid(sbi, block, len)) { 2816 if (!ext4_data_block_valid(sbi, block, len)) {
2796 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2797 "fs metadata\n", block, block+len); 2818 "fs metadata\n", block, block+len);
@@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2823 ac->ac_b_ex.fe_len); 2844 ac->ac_b_ex.fe_len);
2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2845 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2846 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2826 ext4_free_blks_set(sb, gdp, 2847 ext4_free_group_clusters_set(sb, gdp,
2827 ext4_free_blocks_after_init(sb, 2848 ext4_free_clusters_after_init(sb,
2828 ac->ac_b_ex.fe_group, gdp)); 2849 ac->ac_b_ex.fe_group, gdp));
2829 } 2850 }
2830 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 2851 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2831 ext4_free_blks_set(sb, gdp, len); 2852 ext4_free_group_clusters_set(sb, gdp, len);
2832 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2853 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2833 2854
2834 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2855 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2835 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2856 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2836 /* 2857 /*
2837 * Now reduce the dirty block count also. Should not go negative 2858 * Now reduce the dirty block count also. Should not go negative
2838 */ 2859 */
2839 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2860 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2840 /* release all the reserved blocks if non delalloc */ 2861 /* release all the reserved blocks if non delalloc */
2841 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2862 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2863 reserv_clstrs);
2842 2864
2843 if (sbi->s_log_groups_per_flex) { 2865 if (sbi->s_log_groups_per_flex) {
2844 ext4_group_t flex_group = ext4_flex_group(sbi, 2866 ext4_group_t flex_group = ext4_flex_group(sbi,
2845 ac->ac_b_ex.fe_group); 2867 ac->ac_b_ex.fe_group);
2846 atomic_sub(ac->ac_b_ex.fe_len, 2868 atomic_sub(ac->ac_b_ex.fe_len,
2847 &sbi->s_flex_groups[flex_group].free_blocks); 2869 &sbi->s_flex_groups[flex_group].free_clusters);
2848 } 2870 }
2849 2871
2850 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2872 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2886,6 +2908,7 @@ static noinline_for_stack void
2886ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2908ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2887 struct ext4_allocation_request *ar) 2909 struct ext4_allocation_request *ar)
2888{ 2910{
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2889 int bsbits, max; 2912 int bsbits, max;
2890 ext4_lblk_t end; 2913 ext4_lblk_t end;
2891 loff_t size, orig_size, start_off; 2914 loff_t size, orig_size, start_off;
@@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2916 2939
2917 /* first, let's learn actual file size 2940 /* first, let's learn actual file size
2918 * given current request is allocated */ 2941 * given current request is allocated */
2919 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 2942 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
2920 size = size << bsbits; 2943 size = size << bsbits;
2921 if (size < i_size_read(ac->ac_inode)) 2944 if (size < i_size_read(ac->ac_inode))
2922 size = i_size_read(ac->ac_inode); 2945 size = i_size_read(ac->ac_inode);
@@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2988 continue; 3011 continue;
2989 } 3012 }
2990 3013
2991 pa_end = pa->pa_lstart + pa->pa_len; 3014 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3015 pa->pa_len);
2992 3016
2993 /* PA must not overlap original request */ 3017 /* PA must not overlap original request */
2994 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3018 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3018 rcu_read_lock(); 3042 rcu_read_lock();
3019 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3043 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3020 ext4_lblk_t pa_end; 3044 ext4_lblk_t pa_end;
3045
3021 spin_lock(&pa->pa_lock); 3046 spin_lock(&pa->pa_lock);
3022 if (pa->pa_deleted == 0) { 3047 if (pa->pa_deleted == 0) {
3023 pa_end = pa->pa_lstart + pa->pa_len; 3048 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3049 pa->pa_len);
3024 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3050 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3025 } 3051 }
3026 spin_unlock(&pa->pa_lock); 3052 spin_unlock(&pa->pa_lock);
@@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3036 } 3062 }
3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3063 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3038 start > ac->ac_o_ex.fe_logical); 3064 start > ac->ac_o_ex.fe_logical);
3039 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3065 BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
3040 3066
3041 /* now prepare goal request */ 3067 /* now prepare goal request */
3042 3068
3043 /* XXX: is it better to align blocks WRT to logical 3069 /* XXX: is it better to align blocks WRT to logical
3044 * placement or satisfy big request as is */ 3070 * placement or satisfy big request as is */
3045 ac->ac_g_ex.fe_logical = start; 3071 ac->ac_g_ex.fe_logical = start;
3046 ac->ac_g_ex.fe_len = size; 3072 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3047 3073
3048 /* define goal start in order to merge */ 3074 /* define goal start in order to merge */
3049 if (ar->pright && (ar->lright == (start + size))) { 3075 if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3112static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3138static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3113 struct ext4_prealloc_space *pa) 3139 struct ext4_prealloc_space *pa)
3114{ 3140{
3141 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3115 ext4_fsblk_t start; 3142 ext4_fsblk_t start;
3116 ext4_fsblk_t end; 3143 ext4_fsblk_t end;
3117 int len; 3144 int len;
3118 3145
3119 /* found preallocated blocks, use them */ 3146 /* found preallocated blocks, use them */
3120 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3147 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3121 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3148 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3122 len = end - start; 3149 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3150 len = EXT4_NUM_B2C(sbi, end - start);
3123 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3151 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3124 &ac->ac_b_ex.fe_start); 3152 &ac->ac_b_ex.fe_start);
3125 ac->ac_b_ex.fe_len = len; 3153 ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3127 ac->ac_pa = pa; 3155 ac->ac_pa = pa;
3128 3156
3129 BUG_ON(start < pa->pa_pstart); 3157 BUG_ON(start < pa->pa_pstart);
3130 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3158 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3131 BUG_ON(pa->pa_free < len); 3159 BUG_ON(pa->pa_free < len);
3132 pa->pa_free -= len; 3160 pa->pa_free -= len;
3133 3161
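
The mixed units above deserve a worked example (illustrative numbers, cluster ratio 4): with pa_pstart = 1000, pa_lstart = 200 and pa_len = 8 clusters (32 blocks), a request at fe_logical = 205 for fe_len = 3 clusters gives start = 1000 + (205 - 200) = 1005, end = min(1000 + 32, 1005 + 12) = 1017, and len = EXT4_NUM_B2C(sbi, 1017 - 1005) = 3 clusters.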
@@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3193static noinline_for_stack int 3221static noinline_for_stack int
3194ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3222ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3195{ 3223{
3224 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3196 int order, i; 3225 int order, i;
3197 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3226 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3198 struct ext4_locality_group *lg; 3227 struct ext4_locality_group *lg;
@@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3210 /* all fields in this condition don't change, 3239 /* all fields in this condition don't change,
3211 * so we can skip locking for them */ 3240 * so we can skip locking for them */
3212 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3241 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3213 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3242 ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3243 EXT4_C2B(sbi, pa->pa_len)))
3214 continue; 3244 continue;
3215 3245
3216 /* non-extent files can't have physical blocks past 2^32 */ 3246 /* non-extent files can't have physical blocks past 2^32 */
3217 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3247 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3218 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3248 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3249 EXT4_MAX_BLOCK_FILE_PHYS))
3219 continue; 3250 continue;
3220 3251
3221 /* found preallocated blocks, use them */ 3252 /* found preallocated blocks, use them */
@@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3291 3322
3292 while (n) { 3323 while (n) {
3293 entry = rb_entry(n, struct ext4_free_data, node); 3324 entry = rb_entry(n, struct ext4_free_data, node);
3294 ext4_set_bits(bitmap, entry->start_blk, entry->count); 3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count);
3295 n = rb_next(n); 3326 n = rb_next(n);
3296 } 3327 }
3297 return; 3328 return;
@@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3312 ext4_group_t groupnr; 3343 ext4_group_t groupnr;
3313 ext4_grpblk_t start; 3344 ext4_grpblk_t start;
3314 int preallocated = 0; 3345 int preallocated = 0;
3315 int count = 0;
3316 int len; 3346 int len;
3317 3347
3318 /* all form of preallocation discards first load group, 3348 /* all form of preallocation discards first load group,
@@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3335 BUG_ON(groupnr != group); 3365 BUG_ON(groupnr != group);
3336 ext4_set_bits(bitmap, start, len); 3366 ext4_set_bits(bitmap, start, len);
3337 preallocated += len; 3367 preallocated += len;
3338 count++;
3339 } 3368 }
3340 mb_debug(1, "prellocated %u for group %u\n", preallocated, group); 3369 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3341} 3370}
@@ -3412,6 +3441,7 @@ static noinline_for_stack int
3412ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3441ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3413{ 3442{
3414 struct super_block *sb = ac->ac_sb; 3443 struct super_block *sb = ac->ac_sb;
3444 struct ext4_sb_info *sbi = EXT4_SB(sb);
3415 struct ext4_prealloc_space *pa; 3445 struct ext4_prealloc_space *pa;
3416 struct ext4_group_info *grp; 3446 struct ext4_group_info *grp;
3417 struct ext4_inode_info *ei; 3447 struct ext4_inode_info *ei;
@@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3443 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3473 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3444 3474
3445 /* also, we should cover whole original request */ 3475 /* also, we should cover whole original request */
3446 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3476 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3447 3477
3448 /* the smallest one defines real window */ 3478 /* the smallest one defines real window */
3449 win = min(winl, wins); 3479 win = min(winl, wins);
3450 3480
3451 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3481 offs = ac->ac_o_ex.fe_logical %
3482 EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3452 if (offs && offs < win) 3483 if (offs && offs < win)
3453 win = offs; 3484 win = offs;
3454 3485
3455 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3487 EXT4_B2C(sbi, win);
3456 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3488 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3457 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3489 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3458 } 3490 }
@@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3477 trace_ext4_mb_new_inode_pa(ac, pa); 3509 trace_ext4_mb_new_inode_pa(ac, pa);
3478 3510
3479 ext4_mb_use_inode_pa(ac, pa); 3511 ext4_mb_use_inode_pa(ac, pa);
3480 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3512 atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3481 3513
3482 ei = EXT4_I(ac->ac_inode); 3514 ei = EXT4_I(ac->ac_inode);
3483 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3515 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3592 3624
3593 BUG_ON(pa->pa_deleted == 0); 3625 BUG_ON(pa->pa_deleted == 0);
3594 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3626 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3595 grp_blk_start = pa->pa_pstart - bit; 3627 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3596 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3628 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3597 end = bit + pa->pa_len; 3629 end = bit + pa->pa_len;
3598 3630
@@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3607 free += next - bit; 3639 free += next - bit;
3608 3640
3609 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3641 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3610 trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, 3642 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3643 EXT4_C2B(sbi, bit)),
3611 next - bit); 3644 next - bit);
3612 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3645 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3613 bit = next + 1; 3646 bit = next + 1;
@@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3690 } 3723 }
3691 3724
3692 if (needed == 0) 3725 if (needed == 0)
3693 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3726 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3694 3727
3695 INIT_LIST_HEAD(&list); 3728 INIT_LIST_HEAD(&list);
3696repeat: 3729repeat:
@@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3958 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3991 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3959 return; 3992 return;
3960 3993
3961 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3994 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3962 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3995 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3963 >> bsbits; 3996 >> bsbits;
3964 3997
@@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3969 return; 4002 return;
3970 } 4003 }
3971 4004
4005 if (sbi->s_mb_group_prealloc <= 0) {
4006 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4007 return;
4008 }
4009
3972 /* don't use group allocation for large files */ 4010 /* don't use group allocation for large files */
3973 size = max(size, isize); 4011 size = max(size, isize);
3974 if (size > sbi->s_mb_stream_request) { 4012 if (size > sbi->s_mb_stream_request) {
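The early return added here gives the s_mb_group_prealloc tunable a clean off switch: when it is zero, the allocation context is flagged EXT4_MB_STREAM_ALLOC and mballoc falls back to per-inode preallocation instead of feeding a zero-sized target into the locality-group sizing logic below. Assuming the usual ext4 sysfs layout and a hypothetical device name, the tunable can be cleared at runtime with echo 0 > /sys/fs/ext4/sda1/mb_group_prealloc.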
@@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4007 len = ar->len; 4045 len = ar->len;
4008 4046
4009 /* just a dirty hack to filter too big requests */ 4047 /* just a dirty hack to filter too big requests */
4010 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4048 if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
4011 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4049 len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
4012 4050
4013 /* start searching from the goal */ 4051 /* start searching from the goal */
4014 goal = ar->goal; 4052 goal = ar->goal;
@@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4019 4057
4020 /* set up allocation goals */ 4058 /* set up allocation goals */
4021 memset(ac, 0, sizeof(struct ext4_allocation_context)); 4059 memset(ac, 0, sizeof(struct ext4_allocation_context));
4022 ac->ac_b_ex.fe_logical = ar->logical; 4060 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
4023 ac->ac_status = AC_STATUS_CONTINUE; 4061 ac->ac_status = AC_STATUS_CONTINUE;
4024 ac->ac_sb = sb; 4062 ac->ac_sb = sb;
4025 ac->ac_inode = ar->inode; 4063 ac->ac_inode = ar->inode;
4026 ac->ac_o_ex.fe_logical = ar->logical; 4064 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4027 ac->ac_o_ex.fe_group = group; 4065 ac->ac_o_ex.fe_group = group;
4028 ac->ac_o_ex.fe_start = block; 4066 ac->ac_o_ex.fe_start = block;
4029 ac->ac_o_ex.fe_len = len; 4067 ac->ac_o_ex.fe_len = len;
4030 ac->ac_g_ex.fe_logical = ar->logical; 4068 ac->ac_g_ex = ac->ac_o_ex;
4031 ac->ac_g_ex.fe_group = group;
4032 ac->ac_g_ex.fe_start = block;
4033 ac->ac_g_ex.fe_len = len;
4034 ac->ac_flags = ar->flags; 4069 ac->ac_flags = ar->flags;
4035 4070
4036 /* we have to define context: whether we'll work with a file or 4071
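Rounding ar->logical down with the ~(s_cluster_ratio - 1) mask is valid because the cluster ratio is always a power of two (1 << s_cluster_bits), so the AND clears exactly the intra-cluster bits. A runnable sketch with illustrative values:

#include <stdio.h>

int main(void)
{
        unsigned int cluster_ratio = 16;        /* assumed: 64K clusters on 4K blocks */
        unsigned int logical = 100;             /* hypothetical request start */
        unsigned int aligned = logical & ~(cluster_ratio - 1);

        printf("logical block %u starts its cluster at block %u\n",
               logical, aligned);               /* 96 */
        return 0;
}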
@@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4182 */ 4217 */
4183static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4218static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4184{ 4219{
4220 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4185 struct ext4_prealloc_space *pa = ac->ac_pa; 4221 struct ext4_prealloc_space *pa = ac->ac_pa;
4186 if (pa) { 4222 if (pa) {
4187 if (pa->pa_type == MB_GROUP_PA) { 4223 if (pa->pa_type == MB_GROUP_PA) {
4188 /* see comment in ext4_mb_use_group_pa() */ 4224 /* see comment in ext4_mb_use_group_pa() */
4189 spin_lock(&pa->pa_lock); 4225 spin_lock(&pa->pa_lock);
4190 pa->pa_pstart += ac->ac_b_ex.fe_len; 4226 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4191 pa->pa_lstart += ac->ac_b_ex.fe_len; 4227 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4192 pa->pa_free -= ac->ac_b_ex.fe_len; 4228 pa->pa_free -= ac->ac_b_ex.fe_len;
4193 pa->pa_len -= ac->ac_b_ex.fe_len; 4229 pa->pa_len -= ac->ac_b_ex.fe_len;
4194 spin_unlock(&pa->pa_lock); 4230 spin_unlock(&pa->pa_lock);
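Note the mixed units in the group-PA bookkeeping above: pa_pstart and pa_lstart are block numbers, so they advance by EXT4_C2B(sbi, fe_len), while pa_free and pa_len remain cluster counts. As a worked example, with s_cluster_bits = 2 (four blocks per cluster) and a best-found extent of fe_len = 3 clusters, the start pointers move forward by 3 << 2 = 12 blocks while pa_free and pa_len each drop by only 3.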
@@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4249 struct super_block *sb; 4285 struct super_block *sb;
4250 ext4_fsblk_t block = 0; 4286 ext4_fsblk_t block = 0;
4251 unsigned int inquota = 0; 4287 unsigned int inquota = 0;
4252 unsigned int reserv_blks = 0; 4288 unsigned int reserv_clstrs = 0;
4253 4289
4254 sb = ar->inode->i_sb; 4290 sb = ar->inode->i_sb;
4255 sbi = EXT4_SB(sb); 4291 sbi = EXT4_SB(sb);
4256 4292
4257 trace_ext4_request_blocks(ar); 4293 trace_ext4_request_blocks(ar);
4258 4294
4295 /* Allow to use superuser reservation for quota file */
4296 if (IS_NOQUOTA(ar->inode))
4297 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4298
4259 /* 4299 /*
4260 * For delayed allocation, we could skip the ENOSPC and 4300 * For delayed allocation, we could skip the ENOSPC and
4261 * EDQUOT check, as blocks and quotas have been already 4301 * EDQUOT check, as blocks and quotas have been already
@@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4269 * and verify allocation doesn't exceed the quota limits. 4309 * and verify allocation doesn't exceed the quota limits.
4270 */ 4310 */
4271 while (ar->len && 4311 while (ar->len &&
4272 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { 4312 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4273 4313
4274 /* let others to free the space */ 4314 /* let others to free the space */
4275 yield(); 4315 yield();
@@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4279 *errp = -ENOSPC; 4319 *errp = -ENOSPC;
4280 return 0; 4320 return 0;
4281 } 4321 }
4282 reserv_blks = ar->len; 4322 reserv_clstrs = ar->len;
4283 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 4323 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4284 dquot_alloc_block_nofail(ar->inode, ar->len); 4324 dquot_alloc_block_nofail(ar->inode,
4325 EXT4_C2B(sbi, ar->len));
4285 } else { 4326 } else {
4286 while (ar->len && 4327 while (ar->len &&
4287 dquot_alloc_block(ar->inode, ar->len)) { 4328 dquot_alloc_block(ar->inode,
4329 EXT4_C2B(sbi, ar->len))) {
4288 4330
4289 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4331 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4290 ar->len--; 4332 ar->len--;
@@ -4328,7 +4370,7 @@ repeat:
4328 ext4_mb_new_preallocation(ac); 4370 ext4_mb_new_preallocation(ac);
4329 } 4371 }
4330 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4372 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4331 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4373 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4332 if (*errp == -EAGAIN) { 4374 if (*errp == -EAGAIN) {
4333 /* 4375 /*
4334 * drop the reference that we took 4376 * drop the reference that we took
@@ -4364,13 +4406,13 @@ out:
4364 if (ac) 4406 if (ac)
4365 kmem_cache_free(ext4_ac_cachep, ac); 4407 kmem_cache_free(ext4_ac_cachep, ac);
4366 if (inquota && ar->len < inquota) 4408 if (inquota && ar->len < inquota)
4367 dquot_free_block(ar->inode, inquota - ar->len); 4409 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4368 if (!ar->len) { 4410 if (!ar->len) {
4369 if (!ext4_test_inode_state(ar->inode, 4411 if (!ext4_test_inode_state(ar->inode,
4370 EXT4_STATE_DELALLOC_RESERVED)) 4412 EXT4_STATE_DELALLOC_RESERVED))
4371 /* release all the reserved blocks if non delalloc */ 4413 /* release all the reserved blocks if non delalloc */
4372 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4414 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4373 reserv_blks); 4415 reserv_clstrs);
4374 } 4416 }
4375 4417
4376 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4418 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1,
4388{ 4430{
4389 if ((entry1->t_tid == entry2->t_tid) && 4431 if ((entry1->t_tid == entry2->t_tid) &&
4390 (entry1->group == entry2->group) && 4432 (entry1->group == entry2->group) &&
4391 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
4392 return 1; 4434 return 1;
4393 return 0; 4435 return 0;
4394} 4436}
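can_merge() now coalesces pending free extents in cluster units: two entries merge only when they were freed in the same transaction (t_tid), belong to the same block group, and are physically adjacent. A self-contained restatement of the predicate; field names follow the patched struct ext4_free_data, but the program itself is illustrative:

#include <stdbool.h>
#include <stdio.h>

struct free_data {
        unsigned int t_tid;             /* freeing transaction */
        unsigned int group;             /* block group */
        unsigned int start_cluster;     /* first cluster of the extent */
        unsigned int count;             /* length in clusters */
};

static bool can_merge(const struct free_data *a, const struct free_data *b)
{
        return a->t_tid == b->t_tid &&
               a->group == b->group &&
               a->start_cluster + a->count == b->start_cluster;
}

int main(void)
{
        struct free_data a = { 7, 1, 100, 4 }, b = { 7, 1, 104, 2 };

        printf("mergeable: %d\n", can_merge(&a, &b));   /* 1: same tid, adjacent */
        return 0;
}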
@@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4398 struct ext4_free_data *new_entry) 4440 struct ext4_free_data *new_entry)
4399{ 4441{
4400 ext4_group_t group = e4b->bd_group; 4442 ext4_group_t group = e4b->bd_group;
4401 ext4_grpblk_t block; 4443 ext4_grpblk_t cluster;
4402 struct ext4_free_data *entry; 4444 struct ext4_free_data *entry;
4403 struct ext4_group_info *db = e4b->bd_info; 4445 struct ext4_group_info *db = e4b->bd_info;
4404 struct super_block *sb = e4b->bd_sb; 4446 struct super_block *sb = e4b->bd_sb;
@@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4411 BUG_ON(e4b->bd_buddy_page == NULL); 4453 BUG_ON(e4b->bd_buddy_page == NULL);
4412 4454
4413 new_node = &new_entry->node; 4455 new_node = &new_entry->node;
4414 block = new_entry->start_blk; 4456 cluster = new_entry->start_cluster;
4415 4457
4416 if (!*n) { 4458 if (!*n) {
4417 /* first free block extent. We need to 4459
@@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4425 while (*n) { 4467 while (*n) {
4426 parent = *n; 4468 parent = *n;
4427 entry = rb_entry(parent, struct ext4_free_data, node); 4469 entry = rb_entry(parent, struct ext4_free_data, node);
4428 if (block < entry->start_blk) 4470 if (cluster < entry->start_cluster)
4429 n = &(*n)->rb_left; 4471 n = &(*n)->rb_left;
4430 else if (block >= (entry->start_blk + entry->count)) 4472 else if (cluster >= (entry->start_cluster + entry->count))
4431 n = &(*n)->rb_right; 4473 n = &(*n)->rb_right;
4432 else { 4474 else {
4433 ext4_grp_locked_error(sb, group, 0, 4475 ext4_grp_locked_error(sb, group, 0,
4434 ext4_group_first_block_no(sb, group) + block, 4476 ext4_group_first_block_no(sb, group) +
4477 EXT4_C2B(sbi, cluster),
4435 "Block already on to-be-freed list"); 4478 "Block already on to-be-freed list");
4436 return 0; 4479 return 0;
4437 } 4480 }
@@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4445 if (node) { 4488 if (node) {
4446 entry = rb_entry(node, struct ext4_free_data, node); 4489 entry = rb_entry(node, struct ext4_free_data, node);
4447 if (can_merge(entry, new_entry)) { 4490 if (can_merge(entry, new_entry)) {
4448 new_entry->start_blk = entry->start_blk; 4491 new_entry->start_cluster = entry->start_cluster;
4449 new_entry->count += entry->count; 4492 new_entry->count += entry->count;
4450 rb_erase(node, &(db->bb_free_root)); 4493 rb_erase(node, &(db->bb_free_root));
4451 spin_lock(&sbi->s_md_lock); 4494 spin_lock(&sbi->s_md_lock);
@@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4496 ext4_group_t block_group; 4539 ext4_group_t block_group;
4497 struct ext4_sb_info *sbi; 4540 struct ext4_sb_info *sbi;
4498 struct ext4_buddy e4b; 4541 struct ext4_buddy e4b;
4542 unsigned int count_clusters;
4499 int err = 0; 4543 int err = 0;
4500 int ret; 4544 int ret;
4501 4545
@@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4544 if (!ext4_should_writeback_data(inode)) 4588 if (!ext4_should_writeback_data(inode))
4545 flags |= EXT4_FREE_BLOCKS_METADATA; 4589 flags |= EXT4_FREE_BLOCKS_METADATA;
4546 4590
4591 /*
4592 * If the extent to be freed does not begin on a cluster
4593 * boundary, we need to deal with partial clusters at the
4594 * beginning and end of the extent. Normally we will free
4595 * blocks at the beginning or the end unless we are explicitly
4596 * requested to avoid doing so.
4597 */
4598 overflow = block & (sbi->s_cluster_ratio - 1);
4599 if (overflow) {
4600 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4601 overflow = sbi->s_cluster_ratio - overflow;
4602 block += overflow;
4603 if (count > overflow)
4604 count -= overflow;
4605 else
4606 return;
4607 } else {
4608 block -= overflow;
4609 count += overflow;
4610 }
4611 }
4612 overflow = count & (sbi->s_cluster_ratio - 1);
4613 if (overflow) {
4614 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4615 if (count > overflow)
4616 count -= overflow;
4617 else
4618 return;
4619 } else
4620 count += sbi->s_cluster_ratio - overflow;
4621 }
4622
4547do_more: 4623do_more:
4548 overflow = 0; 4624 overflow = 0;
4549 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4625 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
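The rounding added above widens a partial free outward to full cluster boundaries unless the caller sets one of the NOFREE flags, in which case the partial cluster is trimmed off instead, since other blocks in it may still be in use. A userspace walk-through of the default case, with an assumed cluster_ratio of 4:

#include <stdio.h>

int main(void)
{
        unsigned long long block = 10, count = 7;       /* hypothetical extent */
        unsigned int cluster_ratio = 4;
        unsigned long long overflow;

        overflow = block & (cluster_ratio - 1);         /* head: 10 % 4 = 2 */
        if (overflow) {
                block -= overflow;                      /* 10 -> 8 */
                count += overflow;                      /*  7 -> 9 */
        }
        overflow = count & (cluster_ratio - 1);         /* tail: 9 % 4 = 1 */
        if (overflow)
                count += cluster_ratio - overflow;      /*  9 -> 12 */

        printf("freeing blocks %llu..%llu (clusters %llu..%llu)\n",
               block, block + count - 1,
               block / cluster_ratio, (block + count - 1) / cluster_ratio);
        return 0;                                       /* blocks 8..19, clusters 2..4 */
}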
@@ -4552,10 +4628,12 @@ do_more:
4552 * Check to see if we are freeing blocks across a group 4628 * Check to see if we are freeing blocks across a group
4553 * boundary. 4629 * boundary.
4554 */ 4630 */
4555 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4631 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4556 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4632 overflow = EXT4_C2B(sbi, bit) + count -
4633 EXT4_BLOCKS_PER_GROUP(sb);
4557 count -= overflow; 4634 count -= overflow;
4558 } 4635 }
4636 count_clusters = EXT4_B2C(sbi, count);
4559 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4637 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4560 if (!bitmap_bh) { 4638 if (!bitmap_bh) {
4561 err = -EIO; 4639 err = -EIO;
@@ -4570,9 +4648,9 @@ do_more:
4570 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4648 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4571 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4649 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4572 in_range(block, ext4_inode_table(sb, gdp), 4650 in_range(block, ext4_inode_table(sb, gdp),
4573 EXT4_SB(sb)->s_itb_per_group) || 4651 EXT4_SB(sb)->s_itb_per_group) ||
4574 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4652 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4575 EXT4_SB(sb)->s_itb_per_group)) { 4653 EXT4_SB(sb)->s_itb_per_group)) {
4576 4654
4577 ext4_error(sb, "Freeing blocks in system zone - " 4655 ext4_error(sb, "Freeing blocks in system zone - "
4578 "Block = %llu, count = %lu", block, count); 4656 "Block = %llu, count = %lu", block, count);
@@ -4597,11 +4675,11 @@ do_more:
4597#ifdef AGGRESSIVE_CHECK 4675#ifdef AGGRESSIVE_CHECK
4598 { 4676 {
4599 int i; 4677 int i;
4600 for (i = 0; i < count; i++) 4678 for (i = 0; i < count_clusters; i++)
4601 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4679 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4602 } 4680 }
4603#endif 4681#endif
4604 trace_ext4_mballoc_free(sb, inode, block_group, bit, count); 4682 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4605 4683
4606 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4684 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4607 if (err) 4685 if (err)
@@ -4618,13 +4696,13 @@ do_more:
4618 err = -ENOMEM; 4696 err = -ENOMEM;
4619 goto error_return; 4697 goto error_return;
4620 } 4698 }
4621 new_entry->start_blk = bit; 4699 new_entry->start_cluster = bit;
4622 new_entry->group = block_group; 4700 new_entry->group = block_group;
4623 new_entry->count = count; 4701 new_entry->count = count_clusters;
4624 new_entry->t_tid = handle->h_transaction->t_tid; 4702 new_entry->t_tid = handle->h_transaction->t_tid;
4625 4703
4626 ext4_lock_group(sb, block_group); 4704 ext4_lock_group(sb, block_group);
4627 mb_clear_bits(bitmap_bh->b_data, bit, count); 4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4628 ext4_mb_free_metadata(handle, &e4b, new_entry); 4706 ext4_mb_free_metadata(handle, &e4b, new_entry);
4629 } else { 4707 } else {
4630 /* need to update group_info->bb_free and bitmap 4708 /* need to update group_info->bb_free and bitmap
@@ -4632,25 +4710,29 @@ do_more:
4632 * them with group lock_held 4710 * them with group lock_held
4633 */ 4711 */
4634 ext4_lock_group(sb, block_group); 4712 ext4_lock_group(sb, block_group);
4635 mb_clear_bits(bitmap_bh->b_data, bit, count); 4713 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4636 mb_free_blocks(inode, &e4b, bit, count); 4714 mb_free_blocks(inode, &e4b, bit, count_clusters);
4637 } 4715 }
4638 4716
4639 ret = ext4_free_blks_count(sb, gdp) + count; 4717 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4640 ext4_free_blks_set(sb, gdp, ret); 4718 ext4_free_group_clusters_set(sb, gdp, ret);
4641 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4719 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4642 ext4_unlock_group(sb, block_group); 4720 ext4_unlock_group(sb, block_group);
4643 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4721 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4644 4722
4645 if (sbi->s_log_groups_per_flex) { 4723 if (sbi->s_log_groups_per_flex) {
4646 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4724 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4647 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4725 atomic_add(count_clusters,
4726 &sbi->s_flex_groups[flex_group].free_clusters);
4648 } 4727 }
4649 4728
4650 ext4_mb_unload_buddy(&e4b); 4729 ext4_mb_unload_buddy(&e4b);
4651 4730
4652 freed += count; 4731 freed += count;
4653 4732
4733 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4734 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4735
4654 /* We dirtied the bitmap block */ 4736 /* We dirtied the bitmap block */
4655 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4737 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4656 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4738 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4669,8 +4751,6 @@ do_more:
4669 } 4751 }
4670 ext4_mark_super_dirty(sb); 4752 ext4_mark_super_dirty(sb);
4671error_return: 4753error_return:
4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4673 dquot_free_block(inode, freed);
4674 brelse(bitmap_bh); 4754 brelse(bitmap_bh);
4675 ext4_std_error(sb, err); 4755 ext4_std_error(sb, err);
4676 return; 4756 return;
@@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4778 ext4_lock_group(sb, block_group); 4858 ext4_lock_group(sb, block_group);
4779 mb_clear_bits(bitmap_bh->b_data, bit, count); 4859 mb_clear_bits(bitmap_bh->b_data, bit, count);
4780 mb_free_blocks(NULL, &e4b, bit, count); 4860 mb_free_blocks(NULL, &e4b, bit, count);
4781 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); 4861 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4782 ext4_free_blks_set(sb, desc, blk_free_count); 4862 ext4_free_group_clusters_set(sb, desc, blk_free_count);
4783 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 4863 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4784 ext4_unlock_group(sb, block_group); 4864 ext4_unlock_group(sb, block_group);
4785 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); 4865 percpu_counter_add(&sbi->s_freeclusters_counter,
4866 EXT4_B2C(sbi, blocks_freed));
4786 4867
4787 if (sbi->s_log_groups_per_flex) { 4868 if (sbi->s_log_groups_per_flex) {
4788 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4869 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4789 atomic_add(blocks_freed, 4870 atomic_add(EXT4_B2C(sbi, blocks_freed),
4790 &sbi->s_flex_groups[flex_group].free_blocks); 4871 &sbi->s_flex_groups[flex_group].free_clusters);
4791 } 4872 }
4792 4873
4793 ext4_mb_unload_buddy(&e4b); 4874 ext4_mb_unload_buddy(&e4b);
@@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4948 struct ext4_group_info *grp; 5029 struct ext4_group_info *grp;
4949 ext4_group_t first_group, last_group; 5030 ext4_group_t first_group, last_group;
4950 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4951 ext4_grpblk_t cnt = 0, first_block, last_block; 5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
4952 uint64_t start, len, minlen, trimmed = 0; 5033 uint64_t start, len, minlen, trimmed = 0;
4953 ext4_fsblk_t first_data_blk = 5034 ext4_fsblk_t first_data_blk =
4954 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
@@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4958 len = range->len >> sb->s_blocksize_bits; 5039 len = range->len >> sb->s_blocksize_bits;
4959 minlen = range->minlen >> sb->s_blocksize_bits; 5040 minlen = range->minlen >> sb->s_blocksize_bits;
4960 5041
4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
4962 return -EINVAL; 5043 return -EINVAL;
4963 if (start + len <= first_data_blk) 5044 if (start + len <= first_data_blk)
4964 goto out; 5045 goto out;
@@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4969 5050
4970 /* Determine first and last group to examine based on start and len */ 5051 /* Determine first and last group to examine based on start and len */
4971 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4972 &first_group, &first_block); 5053 &first_group, &first_cluster);
4973 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4974 &last_group, &last_block); 5055 &last_group, &last_cluster);
4975 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; 5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4976 last_block = EXT4_BLOCKS_PER_GROUP(sb); 5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
4977 5058
4978 if (first_group > last_group) 5059 if (first_group > last_group)
4979 return -EINVAL; 5060 return -EINVAL;
@@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4993 * change it for the last group in which case start + 5074 * change it for the last group in which case start +
4994 * len < EXT4_BLOCKS_PER_GROUP(sb). 5075 * len < EXT4_BLOCKS_PER_GROUP(sb).
4995 */ 5076 */
4996 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) 5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
4997 last_block = first_block + len; 5078 last_cluster = first_cluster + len;
4998 len -= last_block - first_block; 5079 len -= last_cluster - first_cluster;
4999 5080
5000 if (grp->bb_free >= minlen) { 5081 if (grp->bb_free >= minlen) {
5001 cnt = ext4_trim_all_free(sb, group, first_block, 5082 cnt = ext4_trim_all_free(sb, group, first_cluster,
5002 last_block, minlen); 5083 last_cluster, minlen);
5003 if (cnt < 0) { 5084 if (cnt < 0) {
5004 ret = cnt; 5085 ret = cnt;
5005 break; 5086 break;
5006 } 5087 }
5007 } 5088 }
5008 trimmed += cnt; 5089 trimmed += cnt;
5009 first_block = 0; 5090 first_cluster = 0;
5010 } 5091 }
5011 range->len = trimmed * sb->s_blocksize; 5092 range->len = trimmed * sb->s_blocksize;
5012 5093
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9d4a636b546c..47705f3285e3 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -106,7 +106,7 @@ struct ext4_free_data {
106 ext4_group_t group; 106 ext4_group_t group;
107 107
108 /* free block extent */ 108 /* free block extent */
109 ext4_grpblk_t start_blk; 109 ext4_grpblk_t start_cluster;
110 ext4_grpblk_t count; 110 ext4_grpblk_t count;
111 111
112 /* transaction which freed this extent */ 112 /* transaction which freed this extent */
@@ -139,9 +139,9 @@ enum {
139 139
140struct ext4_free_extent { 140struct ext4_free_extent {
141 ext4_lblk_t fe_logical; 141 ext4_lblk_t fe_logical;
142 ext4_grpblk_t fe_start; 142 ext4_grpblk_t fe_start; /* In cluster units */
143 ext4_group_t fe_group; 143 ext4_group_t fe_group;
144 ext4_grpblk_t fe_len; 144 ext4_grpblk_t fe_len; /* In cluster units */
145}; 145};
146 146
147/* 147/*
@@ -175,7 +175,7 @@ struct ext4_allocation_context {
175 /* the best found extent */ 175 /* the best found extent */
176 struct ext4_free_extent ac_b_ex; 176 struct ext4_free_extent ac_b_ex;
177 177
178 /* copy of the bext found extent taken before preallocation efforts */ 178 /* copy of the best found extent taken before preallocation efforts */
179 struct ext4_free_extent ac_f_ex; 179 struct ext4_free_extent ac_f_ex;
180 180
181 /* number of iterations done. we have to track to limit searching */ 181 /* number of iterations done. we have to track to limit searching */
@@ -216,6 +216,7 @@ struct ext4_buddy {
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
218{ 218{
219 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; 219 return ext4_group_first_block_no(sb, fex->fe_group) +
220 (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
220} 221}
221#endif 222#endif
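With fe_start now in cluster units, ext4_grp_offs_to_block() must shift by s_cluster_bits before adding the group's first block. As a worked example, if a group starts at block 32768, s_cluster_bits = 4 (sixteen blocks per cluster) and fe_start = 5, the extent begins at physical block 32768 + (5 << 4) = 32848; the old unshifted sum would have pointed 75 blocks too low.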
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98fb44d1..f729377bf043 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -15,19 +15,18 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18#include "ext4_extents.h"
19 18
20/* 19/*
21 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
22 * represented by a single extent 21 * represented by a single extent
23 */ 22 */
24struct list_blocks_struct { 23struct migrate_struct {
25 ext4_lblk_t first_block, last_block; 24 ext4_lblk_t first_block, last_block, curr_block;
26 ext4_fsblk_t first_pblock, last_pblock; 25 ext4_fsblk_t first_pblock, last_pblock;
27}; 26};
28 27
29static int finish_range(handle_t *handle, struct inode *inode, 28static int finish_range(handle_t *handle, struct inode *inode,
30 struct list_blocks_struct *lb) 29 struct migrate_struct *lb)
31 30
32{ 31{
33 int retval = 0, needed; 32 int retval = 0, needed;
@@ -87,8 +86,7 @@ err_out:
87} 86}
88 87
89static int update_extent_range(handle_t *handle, struct inode *inode, 88static int update_extent_range(handle_t *handle, struct inode *inode,
90 ext4_fsblk_t pblock, ext4_lblk_t blk_num, 89 ext4_fsblk_t pblock, struct migrate_struct *lb)
91 struct list_blocks_struct *lb)
92{ 90{
93 int retval; 91 int retval;
94 /* 92 /*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
96 */ 94 */
97 if (lb->first_pblock && 95 if (lb->first_pblock &&
98 (lb->last_pblock+1 == pblock) && 96 (lb->last_pblock+1 == pblock) &&
99 (lb->last_block+1 == blk_num)) { 97 (lb->last_block+1 == lb->curr_block)) {
100 lb->last_pblock = pblock; 98 lb->last_pblock = pblock;
101 lb->last_block = blk_num; 99 lb->last_block = lb->curr_block;
100 lb->curr_block++;
102 return 0; 101 return 0;
103 } 102 }
104 /* 103 /*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
106 */ 105 */
107 retval = finish_range(handle, inode, lb); 106 retval = finish_range(handle, inode, lb);
108 lb->first_pblock = lb->last_pblock = pblock; 107 lb->first_pblock = lb->last_pblock = pblock;
109 lb->first_block = lb->last_block = blk_num; 108 lb->first_block = lb->last_block = lb->curr_block;
110 109 lb->curr_block++;
111 return retval; 110 return retval;
112} 111}
113 112
114static int update_ind_extent_range(handle_t *handle, struct inode *inode, 113static int update_ind_extent_range(handle_t *handle, struct inode *inode,
115 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 114 ext4_fsblk_t pblock,
116 struct list_blocks_struct *lb) 115 struct migrate_struct *lb)
117{ 116{
118 struct buffer_head *bh; 117 struct buffer_head *bh;
119 __le32 *i_data; 118 __le32 *i_data;
120 int i, retval = 0; 119 int i, retval = 0;
121 ext4_lblk_t blk_count = *blk_nump;
122 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 120 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
123 121
124 if (!pblock) {
125 /* Only update the file block number */
126 *blk_nump += max_entries;
127 return 0;
128 }
129
130 bh = sb_bread(inode->i_sb, pblock); 122 bh = sb_bread(inode->i_sb, pblock);
131 if (!bh) 123 if (!bh)
132 return -EIO; 124 return -EIO;
133 125
134 i_data = (__le32 *)bh->b_data; 126 i_data = (__le32 *)bh->b_data;
135 for (i = 0; i < max_entries; i++, blk_count++) { 127 for (i = 0; i < max_entries; i++) {
136 if (i_data[i]) { 128 if (i_data[i]) {
137 retval = update_extent_range(handle, inode, 129 retval = update_extent_range(handle, inode,
138 le32_to_cpu(i_data[i]), 130 le32_to_cpu(i_data[i]), lb);
139 blk_count, lb);
140 if (retval) 131 if (retval)
141 break; 132 break;
133 } else {
134 lb->curr_block++;
142 } 135 }
143 } 136 }
144
145 /* Update the file block number */
146 *blk_nump = blk_count;
147 put_bh(bh); 137 put_bh(bh);
148 return retval; 138 return retval;
149 139
150} 140}
151 141
152static int update_dind_extent_range(handle_t *handle, struct inode *inode, 142static int update_dind_extent_range(handle_t *handle, struct inode *inode,
153 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 143 ext4_fsblk_t pblock,
154 struct list_blocks_struct *lb) 144 struct migrate_struct *lb)
155{ 145{
156 struct buffer_head *bh; 146 struct buffer_head *bh;
157 __le32 *i_data; 147 __le32 *i_data;
158 int i, retval = 0; 148 int i, retval = 0;
159 ext4_lblk_t blk_count = *blk_nump;
160 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 149 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
161 150
162 if (!pblock) {
163 /* Only update the file block number */
164 *blk_nump += max_entries * max_entries;
165 return 0;
166 }
167 bh = sb_bread(inode->i_sb, pblock); 151 bh = sb_bread(inode->i_sb, pblock);
168 if (!bh) 152 if (!bh)
169 return -EIO; 153 return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
172 for (i = 0; i < max_entries; i++) { 156 for (i = 0; i < max_entries; i++) {
173 if (i_data[i]) { 157 if (i_data[i]) {
174 retval = update_ind_extent_range(handle, inode, 158 retval = update_ind_extent_range(handle, inode,
175 le32_to_cpu(i_data[i]), 159 le32_to_cpu(i_data[i]), lb);
176 &blk_count, lb);
177 if (retval) 160 if (retval)
178 break; 161 break;
179 } else { 162 } else {
180 /* Only update the file block number */ 163 /* Only update the file block number */
181 blk_count += max_entries; 164 lb->curr_block += max_entries;
182 } 165 }
183 } 166 }
184
185 /* Update the file block number */
186 *blk_nump = blk_count;
187 put_bh(bh); 167 put_bh(bh);
188 return retval; 168 return retval;
189 169
190} 170}
191 171
192static int update_tind_extent_range(handle_t *handle, struct inode *inode, 172static int update_tind_extent_range(handle_t *handle, struct inode *inode,
193 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, 173 ext4_fsblk_t pblock,
194 struct list_blocks_struct *lb) 174 struct migrate_struct *lb)
195{ 175{
196 struct buffer_head *bh; 176 struct buffer_head *bh;
197 __le32 *i_data; 177 __le32 *i_data;
198 int i, retval = 0; 178 int i, retval = 0;
199 ext4_lblk_t blk_count = *blk_nump;
200 unsigned long max_entries = inode->i_sb->s_blocksize >> 2; 179 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
201 180
202 if (!pblock) {
203 /* Only update the file block number */
204 *blk_nump += max_entries * max_entries * max_entries;
205 return 0;
206 }
207 bh = sb_bread(inode->i_sb, pblock); 181 bh = sb_bread(inode->i_sb, pblock);
208 if (!bh) 182 if (!bh)
209 return -EIO; 183 return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
212 for (i = 0; i < max_entries; i++) { 186 for (i = 0; i < max_entries; i++) {
213 if (i_data[i]) { 187 if (i_data[i]) {
214 retval = update_dind_extent_range(handle, inode, 188 retval = update_dind_extent_range(handle, inode,
215 le32_to_cpu(i_data[i]), 189 le32_to_cpu(i_data[i]), lb);
216 &blk_count, lb);
217 if (retval) 190 if (retval)
218 break; 191 break;
219 } else 192 } else {
220 /* Only update the file block number */ 193 /* Only update the file block number */
221 blk_count += max_entries * max_entries; 194 lb->curr_block += max_entries * max_entries;
195 }
222 } 196 }
223 /* Update the file block number */
224 *blk_nump = blk_count;
225 put_bh(bh); 197 put_bh(bh);
226 return retval; 198 return retval;
227 199
@@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode)
462 handle_t *handle; 434 handle_t *handle;
463 int retval = 0, i; 435 int retval = 0, i;
464 __le32 *i_data; 436 __le32 *i_data;
465 ext4_lblk_t blk_count = 0;
466 struct ext4_inode_info *ei; 437 struct ext4_inode_info *ei;
467 struct inode *tmp_inode = NULL; 438 struct inode *tmp_inode = NULL;
468 struct list_blocks_struct lb; 439 struct migrate_struct lb;
469 unsigned long max_entries; 440 unsigned long max_entries;
470 __u32 goal; 441 __u32 goal;
442 uid_t owner[2];
471 443
472 /* 444 /*
473 * If the filesystem does not support extents, or the inode 445 * If the filesystem does not support extents, or the inode
@@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode)
495 } 467 }
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 468 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 469 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
470 owner[0] = inode->i_uid;
471 owner[1] = inode->i_gid;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 472 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, NULL, goal); 473 S_IFREG, NULL, goal, owner);
500 if (IS_ERR(tmp_inode)) { 474 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 475 retval = PTR_ERR(tmp_inode);
502 ext4_journal_stop(handle); 476 ext4_journal_stop(handle);
503 return retval; 477 return retval;
504 } 478 }
@@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode)
551 525
552 /* 32 bit block address 4 bytes */ 526 /* 32 bit block address 4 bytes */
553 max_entries = inode->i_sb->s_blocksize >> 2; 527 max_entries = inode->i_sb->s_blocksize >> 2;
554 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { 528 for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
555 if (i_data[i]) { 529 if (i_data[i]) {
556 retval = update_extent_range(handle, tmp_inode, 530 retval = update_extent_range(handle, tmp_inode,
557 le32_to_cpu(i_data[i]), 531 le32_to_cpu(i_data[i]), &lb);
558 blk_count, &lb);
559 if (retval) 532 if (retval)
560 goto err_out; 533 goto err_out;
561 } 534 } else
535 lb.curr_block++;
562 } 536 }
563 if (i_data[EXT4_IND_BLOCK]) { 537 if (i_data[EXT4_IND_BLOCK]) {
564 retval = update_ind_extent_range(handle, tmp_inode, 538 retval = update_ind_extent_range(handle, tmp_inode,
565 le32_to_cpu(i_data[EXT4_IND_BLOCK]), 539 le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
566 &blk_count, &lb);
567 if (retval) 540 if (retval)
568 goto err_out; 541 goto err_out;
569 } else 542 } else
570 blk_count += max_entries; 543 lb.curr_block += max_entries;
571 if (i_data[EXT4_DIND_BLOCK]) { 544 if (i_data[EXT4_DIND_BLOCK]) {
572 retval = update_dind_extent_range(handle, tmp_inode, 545 retval = update_dind_extent_range(handle, tmp_inode,
573 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), 546 le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
574 &blk_count, &lb);
575 if (retval) 547 if (retval)
576 goto err_out; 548 goto err_out;
577 } else 549 } else
578 blk_count += max_entries * max_entries; 550 lb.curr_block += max_entries * max_entries;
579 if (i_data[EXT4_TIND_BLOCK]) { 551 if (i_data[EXT4_TIND_BLOCK]) {
580 retval = update_tind_extent_range(handle, tmp_inode, 552 retval = update_tind_extent_range(handle, tmp_inode,
581 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), 553 le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
582 &blk_count, &lb);
583 if (retval) 554 if (retval)
584 goto err_out; 555 goto err_out;
585 } 556 }
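The migrate rewrite replaces the blk_count pointer that was threaded through every indirection level with a single running counter, lb->curr_block: a hole advances it by one, a missing indirect block by the size of the whole subtree, and a mapped block either extends the pending contiguous range or starts a new one. A compact sketch of the per-slot logic; this is illustrative C, not the kernel code:

#include <stdio.h>

struct migrate_sketch {
        unsigned int first_block, last_block, curr_block;
        unsigned long long first_pblock, last_pblock;
};

static void visit_slot(struct migrate_sketch *lb, unsigned long long pblock)
{
        if (!pblock) {                  /* hole: only the file offset moves */
                lb->curr_block++;
                return;
        }
        if (lb->first_pblock &&
            lb->last_pblock + 1 == pblock &&
            lb->last_block + 1 == lb->curr_block) {
                lb->last_pblock = pblock;               /* extend pending range */
                lb->last_block = lb->curr_block++;
                return;
        }
        /* discontiguous: a real finish_range() would emit an extent here */
        lb->first_pblock = lb->last_pblock = pblock;
        lb->first_block = lb->last_block = lb->curr_block++;
}

int main(void)
{
        struct migrate_sketch lb = { 0 };
        unsigned long long slots[] = { 500, 501, 0, 900 };
        unsigned int i;

        for (i = 0; i < 4; i++)
                visit_slot(&lb, slots[i]);
        printf("last range: blocks %u-%u at pblock %llu\n",
               lb.first_block, lb.last_block, lb.first_pblock);  /* 3-3 at 900 */
        return 0;
}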
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9bdef3f537c5..7ea4ba4eff2a 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -109,7 +109,7 @@ static int kmmpd(void *data)
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname); 110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111 111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname, 112 memcpy(mmp->mmp_nodename, init_utsname()->nodename,
113 sizeof(mmp->mmp_nodename)); 113 sizeof(mmp->mmp_nodename));
114 114
115 while (!kthread_should_stop()) { 115 while (!kthread_should_stop()) {
@@ -125,8 +125,9 @@ static int kmmpd(void *data)
125 * Don't spew too many error messages. Print one every 125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds. 126 * (s_mmp_update_interval * 60) seconds.
127 */ 127 */
128 if (retval && (failed_writes % 60) == 0) { 128 if (retval) {
129 ext4_error(sb, "Error writing to MMP block"); 129 if ((failed_writes % 60) == 0)
130 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++; 131 failed_writes++;
131 } 132 }
132 133
@@ -295,7 +296,8 @@ skip:
295 /* 296 /*
296 * write a new random sequence number. 297 * write a new random sequence number.
297 */ 298 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); 299 seq = mmp_new_seq();
300 mmp->mmp_seq = cpu_to_le32(seq);
299 301
300 retval = write_mmp_block(bh); 302 retval = write_mmp_block(bh);
301 if (retval) 303 if (retval)
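Splitting the chained assignment matters on big-endian machines: the old form stored the little-endian on-disk representation of the sequence number into seq, which is later compared against le32_to_cpu() reads in host order. A runnable illustration with a stand-in byte swap; cpu_to_le32() itself is a no-op on little-endian kernels, and the swap below only exists to make the difference visible:

#include <stdio.h>
#include <stdint.h>

static uint32_t cpu_to_le32_sketch(uint32_t x)
{
        /* pretend we are big-endian so the conversion actually swaps */
        return ((x & 0xff) << 24) | ((x & 0xff00) << 8) |
               ((x >> 8) & 0xff00) | (x >> 24);
}

int main(void)
{
        uint32_t seq, disk;

        /* old: seq = disk = cpu_to_le32(...) -- seq ends up byte-swapped */
        seq = disk = cpu_to_le32_sketch(0x12345678);
        printf("chained: seq=%08x (wanted 12345678)\n", seq);

        /* new: seq stays in host order; only the disk copy is converted */
        seq = 0x12345678;
        disk = cpu_to_le32_sketch(seq);
        printf("split:   seq=%08x disk=%08x\n", seq, disk);
        return 0;
}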
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f57455a1b1b2..c5826c623e7a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,7 +17,6 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4_extents.h"
21#include "ext4.h" 20#include "ext4.h"
22 21
23/** 22/**
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1c924faeb6c8..2a75eed2ef06 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1586 dxtrace(dx_show_index("node", frames[1].entries)); 1586 dxtrace(dx_show_index("node", frames[1].entries));
1587 dxtrace(dx_show_index("node", 1587 dxtrace(dx_show_index("node",
1588 ((struct dx_node *) bh2->b_data)->entries)); 1588 ((struct dx_node *) bh2->b_data)->entries));
1589 err = ext4_handle_dirty_metadata(handle, inode, bh2); 1589 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1590 if (err) 1590 if (err)
1591 goto journal_error; 1591 goto journal_error;
1592 brelse (bh2); 1592 brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1612 if (err) 1612 if (err)
1613 goto journal_error; 1613 goto journal_error;
1614 } 1614 }
1615 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1615 err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
1616 if (err) { 1616 if (err) {
1617 ext4_std_error(inode->i_sb, err); 1617 ext4_std_error(inode->i_sb, err);
1618 goto cleanup; 1618 goto cleanup;
@@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
1707 */ 1707 */
1708static void ext4_dec_count(handle_t *handle, struct inode *inode) 1708static void ext4_dec_count(handle_t *handle, struct inode *inode)
1709{ 1709{
1710 drop_nlink(inode); 1710 if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
1711 if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) 1711 drop_nlink(inode);
1712 inc_nlink(inode);
1713} 1712}
1714 1713
1715 1714
@@ -1756,7 +1755,7 @@ retry:
1756 if (IS_DIRSYNC(dir)) 1755 if (IS_DIRSYNC(dir))
1757 ext4_handle_sync(handle); 1756 ext4_handle_sync(handle);
1758 1757
1759 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1758 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1760 err = PTR_ERR(inode); 1759 err = PTR_ERR(inode);
1761 if (!IS_ERR(inode)) { 1760 if (!IS_ERR(inode)) {
1762 inode->i_op = &ext4_file_inode_operations; 1761 inode->i_op = &ext4_file_inode_operations;
@@ -1792,7 +1791,7 @@ retry:
1792 if (IS_DIRSYNC(dir)) 1791 if (IS_DIRSYNC(dir))
1793 ext4_handle_sync(handle); 1792 ext4_handle_sync(handle);
1794 1793
1795 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); 1794 inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
1796 err = PTR_ERR(inode); 1795 err = PTR_ERR(inode);
1797 if (!IS_ERR(inode)) { 1796 if (!IS_ERR(inode)) {
1798 init_special_inode(inode, inode->i_mode, rdev); 1797 init_special_inode(inode, inode->i_mode, rdev);
@@ -1832,7 +1831,7 @@ retry:
1832 ext4_handle_sync(handle); 1831 ext4_handle_sync(handle);
1833 1832
1834 inode = ext4_new_inode(handle, dir, S_IFDIR | mode, 1833 inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
1835 &dentry->d_name, 0); 1834 &dentry->d_name, 0, NULL);
1836 err = PTR_ERR(inode); 1835 err = PTR_ERR(inode);
1837 if (IS_ERR(inode)) 1836 if (IS_ERR(inode))
1838 goto out_stop; 1837 goto out_stop;
@@ -1863,7 +1862,7 @@ retry:
1863 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1862 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1864 inode->i_nlink = 2; 1863 inode->i_nlink = 2;
1865 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1864 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1866 err = ext4_handle_dirty_metadata(handle, dir, dir_block); 1865 err = ext4_handle_dirty_metadata(handle, inode, dir_block);
1867 if (err) 1866 if (err)
1868 goto out_clear_inode; 1867 goto out_clear_inode;
1869 err = ext4_mark_inode_dirty(handle, inode); 1868 err = ext4_mark_inode_dirty(handle, inode);
@@ -2279,7 +2278,7 @@ retry:
2279 ext4_handle_sync(handle); 2278 ext4_handle_sync(handle);
2280 2279
2281 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, 2280 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
2282 &dentry->d_name, 0); 2281 &dentry->d_name, 0, NULL);
2283 err = PTR_ERR(inode); 2282 err = PTR_ERR(inode);
2284 if (IS_ERR(inode)) 2283 if (IS_ERR(inode))
2285 goto out_stop; 2284 goto out_stop;
@@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2530 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2529 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2531 cpu_to_le32(new_dir->i_ino); 2530 cpu_to_le32(new_dir->i_ino);
2532 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2531 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2533 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2532 retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
2534 if (retval) { 2533 if (retval) {
2535 ext4_std_error(old_dir->i_sb, retval); 2534 ext4_std_error(old_dir->i_sb, retval);
2536 goto end_rename; 2535 goto end_rename;
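The ext4_dec_count() change rests on the observation that a live directory never has fewer than two links (its own "." entry plus the link from its parent), so the helper can simply refuse to drop below that floor rather than letting the count bounce through zero and back as the old code did. The same rule, restated as a runnable sketch; this is illustrative, not the kernel helper:

#include <stdio.h>

static void dec_count_sketch(unsigned int *i_nlink, int is_dir)
{
        if (!is_dir || *i_nlink > 2)
                (*i_nlink)--;
}

int main(void)
{
        unsigned int nlink = 2;

        dec_count_sketch(&nlink, 1);            /* directory at the floor */
        printf("dir nlink stays at %u\n", nlink);       /* 2 */
        return 0;
}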
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 92f38ee13f8a..7ce1d0b19c94 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page)
70void ext4_free_io_end(ext4_io_end_t *io) 70void ext4_free_io_end(ext4_io_end_t *io)
71{ 71{
72 int i; 72 int i;
73 wait_queue_head_t *wq;
74 73
75 BUG_ON(!io); 74 BUG_ON(!io);
76 if (io->page) 75 if (io->page)
@@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io)
78 for (i = 0; i < io->num_io_pages; i++) 77 for (i = 0; i < io->num_io_pages; i++)
79 put_io_page(io->pages[i]); 78 put_io_page(io->pages[i]);
80 io->num_io_pages = 0; 79 io->num_io_pages = 0;
81 wq = ext4_ioend_wq(io->inode); 80 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 81 wake_up_all(ext4_ioend_wq(io->inode));
83 waitqueue_active(wq))
84 wake_up_all(wq);
85 kmem_cache_free(io_end_cachep, io); 82 kmem_cache_free(io_end_cachep, io);
86} 83}
87 84
88/* 85/*
89 * check a range of space and convert unwritten extents to written. 86 * check a range of space and convert unwritten extents to written.
87 *
88 * Called with inode->i_mutex; we depend on this when we manipulate
89 * io->flag, since we could otherwise race with ext4_flush_completed_IO()
90 */ 90 */
91int ext4_end_io_nolock(ext4_io_end_t *io) 91int ext4_end_io_nolock(ext4_io_end_t *io)
92{ 92{
93 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
94 loff_t offset = io->offset; 94 loff_t offset = io->offset;
95 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
97 int ret = 0; 96 int ret = 0;
98 97
99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 98 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
100 "list->prev 0x%p\n", 99 "list->prev 0x%p\n",
101 io, inode->i_ino, io->list.next, io->list.prev); 100 io, inode->i_ino, io->list.next, io->list.prev);
102 101
103 if (list_empty(&io->list))
104 return ret;
105
106 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
107 return ret;
108
109 ret = ext4_convert_unwritten_extents(inode, offset, size); 102 ret = ext4_convert_unwritten_extents(inode, offset, size);
110 if (ret < 0) { 103 if (ret < 0) {
111 printk(KERN_EMERG "%s: failed to convert unwritten " 104 ext4_msg(inode->i_sb, KERN_EMERG,
112 "extents to written extents, error is %d " 105 "failed to convert unwritten extents to written "
113 "io is still on inode %lu aio dio list\n", 106 "extents -- potential data loss! "
114 __func__, ret, inode->i_ino); 107 "(inode %lu, offset %llu, size %zd, error %d)",
115 return ret; 108 inode->i_ino, offset, size, ret);
116 } 109 }
117 110
118 if (io->iocb) 111 if (io->iocb)
119 aio_complete(io->iocb, io->result, 0); 112 aio_complete(io->iocb, io->result, 0);
120 /* clear the DIO AIO unwritten flag */
121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130 113
114 /* Wake up anyone waiting on unwritten extent conversion */
115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
116 wake_up_all(ext4_ioend_wq(io->inode));
131 return ret; 117 return ret;
132} 118}
133 119
@@ -140,9 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
140 struct inode *inode = io->inode; 126 struct inode *inode = io->inode;
141 struct ext4_inode_info *ei = EXT4_I(inode); 127 struct ext4_inode_info *ei = EXT4_I(inode);
142 unsigned long flags; 128 unsigned long flags;
143 int ret; 129
130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (list_empty(&io->list)) {
132 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
133 goto free;
134 }
144 135
145 if (!mutex_trylock(&inode->i_mutex)) { 136 if (!mutex_trylock(&inode->i_mutex)) {
137 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
146 /* 138 /*
147 * Requeue the work instead of waiting so that the work 139 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed. 140 * items queued after this can be processed.
@@ -159,17 +151,11 @@ static void ext4_end_io_work(struct work_struct *work)
159 io->flag |= EXT4_IO_END_QUEUED; 151 io->flag |= EXT4_IO_END_QUEUED;
160 return; 152 return;
161 } 153 }
162 ret = ext4_end_io_nolock(io); 154 list_del_init(&io->list);
163 if (ret < 0) {
164 mutex_unlock(&inode->i_mutex);
165 return;
166 }
167
168 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
169 if (!list_empty(&io->list))
170 list_del_init(&io->list);
171 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 155 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
156 (void) ext4_end_io_nolock(io);
172 mutex_unlock(&inode->i_mutex); 157 mutex_unlock(&inode->i_mutex);
158free:
173 ext4_free_io_end(io); 159 ext4_free_io_end(io);
174} 160}
175 161
@@ -350,10 +336,8 @@ submit_and_retry:
350 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 336 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
351 (io_end->pages[io_end->num_io_pages-1] != io_page)) 337 (io_end->pages[io_end->num_io_pages-1] != io_page))
352 goto submit_and_retry; 338 goto submit_and_retry;
353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 339 if (buffer_uninit(bh))
354 io_end->flag |= EXT4_IO_END_UNWRITTEN; 340 ext4_set_io_unwritten_flag(inode, io_end);
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
357 io->io_end->size += bh->b_size; 341 io->io_end->size += bh->b_size;
358 io->io_next_block++; 342 io->io_next_block++;
359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 343 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 707d3f16f7ce..996780ab4f4e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
878 ext4_free_blks_set(sb, gdp, input->free_blocks_count); 878 ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); 880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
@@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 input->reserved_blocks); 937 input->reserved_blocks);
938 938
939 /* Update the free space counts */ 939 /* Update the free space counts */
940 percpu_counter_add(&sbi->s_freeblocks_counter, 940 percpu_counter_add(&sbi->s_freeclusters_counter,
941 input->free_blocks_count); 941 EXT4_B2C(sbi, input->free_blocks_count));
942 percpu_counter_add(&sbi->s_freeinodes_counter, 942 percpu_counter_add(&sbi->s_freeinodes_counter,
943 EXT4_INODES_PER_GROUP(sb)); 943 EXT4_INODES_PER_GROUP(sb));
944 944
@@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
946 sbi->s_log_groups_per_flex) { 946 sbi->s_log_groups_per_flex) {
947 ext4_group_t flex_group; 947 ext4_group_t flex_group;
948 flex_group = ext4_flex_group(sbi, input->group); 948 flex_group = ext4_flex_group(sbi, input->group);
949 atomic_add(input->free_blocks_count, 949 atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
950 &sbi->s_flex_groups[flex_group].free_blocks); 950 &sbi->s_flex_groups[flex_group].free_clusters);
951 atomic_add(EXT4_INODES_PER_GROUP(sb), 951 atomic_add(EXT4_INODES_PER_GROUP(sb),
952 &sbi->s_flex_groups[flex_group].free_inodes); 952 &sbi->s_flex_groups[flex_group].free_inodes);
953 } 953 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8db2239..9953d80145ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h"
48#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
49#include "xattr.h" 50#include "xattr.h"
50#include "acl.h" 51#include "acl.h"
@@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
163 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 164 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
164} 165}
165 166
166__u32 ext4_free_blks_count(struct super_block *sb, 167__u32 ext4_free_group_clusters(struct super_block *sb,
167 struct ext4_group_desc *bg) 168 struct ext4_group_desc *bg)
168{ 169{
169 return le16_to_cpu(bg->bg_free_blocks_count_lo) | 170 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
170 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 171 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb,
219 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 220 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
220} 221}
221 222
222void ext4_free_blks_set(struct super_block *sb, 223void ext4_free_group_clusters_set(struct super_block *sb,
223 struct ext4_group_desc *bg, __u32 count) 224 struct ext4_group_desc *bg, __u32 count)
224{ 225{
225 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); 226 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
226 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 227 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func,
414 ext4_commit_super(sb, 1); 415 ext4_commit_super(sb, 1);
415} 416}
416 417
418/*
419 * The del_gendisk() function uninitializes the disk-specific data
420 * structures, including the bdi structure, without telling anyone
421 * else. Once this happens, any attempt to call mark_buffer_dirty()
422 * (for example, by ext4_commit_super), will cause a kernel OOPS.
423 * This is a kludge to prevent these oops until we can put in a proper
424 * hook in del_gendisk() to inform the VFS and file system layers.
425 */
426static int block_device_ejected(struct super_block *sb)
427{
428 struct inode *bd_inode = sb->s_bdev->bd_inode;
429 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
430
431 return bdi->dev == NULL;
432}
433
417 434
418/* Deal with the reporting of failure conditions on a filesystem such as 435/* Deal with the reporting of failure conditions on a filesystem such as
419 * inconsistencies detected or read IO failures. 436 * inconsistencies detected or read IO failures.
@@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb)
821 brelse(sbi->s_group_desc[i]); 838 brelse(sbi->s_group_desc[i]);
822 ext4_kvfree(sbi->s_group_desc); 839 ext4_kvfree(sbi->s_group_desc);
823 ext4_kvfree(sbi->s_flex_groups); 840 ext4_kvfree(sbi->s_flex_groups);
824 percpu_counter_destroy(&sbi->s_freeblocks_counter); 841 percpu_counter_destroy(&sbi->s_freeclusters_counter);
825 percpu_counter_destroy(&sbi->s_freeinodes_counter); 842 percpu_counter_destroy(&sbi->s_freeinodes_counter);
826 percpu_counter_destroy(&sbi->s_dirs_counter); 843 percpu_counter_destroy(&sbi->s_dirs_counter);
827 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 844 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
828 brelse(sbi->s_sbh); 845 brelse(sbi->s_sbh);
829#ifdef CONFIG_QUOTA 846#ifdef CONFIG_QUOTA
830 for (i = 0; i < MAXQUOTAS; i++) 847 for (i = 0; i < MAXQUOTAS; i++)
@@ -1057,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1057 seq_puts(seq, ",nouid32"); 1074 seq_puts(seq, ",nouid32");
1058 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) 1075 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1059 seq_puts(seq, ",debug"); 1076 seq_puts(seq, ",debug");
1060 if (test_opt(sb, OLDALLOC))
1061 seq_puts(seq, ",oldalloc");
1062#ifdef CONFIG_EXT4_FS_XATTR 1077#ifdef CONFIG_EXT4_FS_XATTR
1063 if (test_opt(sb, XATTR_USER)) 1078 if (test_opt(sb, XATTR_USER))
1064 seq_puts(seq, ",user_xattr"); 1079 seq_puts(seq, ",user_xattr");
@@ -1567,10 +1582,12 @@ static int parse_options(char *options, struct super_block *sb,
1567 set_opt(sb, DEBUG); 1582 set_opt(sb, DEBUG);
1568 break; 1583 break;
1569 case Opt_oldalloc: 1584 case Opt_oldalloc:
1570 set_opt(sb, OLDALLOC); 1585 ext4_msg(sb, KERN_WARNING,
1586 "Ignoring deprecated oldalloc option");
1571 break; 1587 break;
1572 case Opt_orlov: 1588 case Opt_orlov:
1573 clear_opt(sb, OLDALLOC); 1589 ext4_msg(sb, KERN_WARNING,
1590 "Ignoring deprecated orlov option");
1574 break; 1591 break;
1575#ifdef CONFIG_EXT4_FS_XATTR 1592#ifdef CONFIG_EXT4_FS_XATTR
1576 case Opt_user_xattr: 1593 case Opt_user_xattr:
@@ -1801,6 +1818,7 @@ set_qf_format:
1801 break; 1818 break;
1802 case Opt_nodelalloc: 1819 case Opt_nodelalloc:
1803 clear_opt(sb, DELALLOC); 1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1804 break; 1822 break;
1805 case Opt_mblk_io_submit: 1823 case Opt_mblk_io_submit:
1806 set_opt(sb, MBLK_IO_SUBMIT); 1824 set_opt(sb, MBLK_IO_SUBMIT);
@@ -1817,6 +1835,7 @@ set_qf_format:
1817 break; 1835 break;
1818 case Opt_delalloc: 1836 case Opt_delalloc:
1819 set_opt(sb, DELALLOC); 1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1820 break; 1839 break;
1821 case Opt_block_validity: 1840 case Opt_block_validity:
1822 set_opt(sb, BLOCK_VALIDITY); 1841 set_opt(sb, BLOCK_VALIDITY);
@@ -1935,7 +1954,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1935 res = MS_RDONLY; 1954 res = MS_RDONLY;
1936 } 1955 }
1937 if (read_only) 1956 if (read_only)
1938 return res; 1957 goto done;
1939 if (!(sbi->s_mount_state & EXT4_VALID_FS)) 1958 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1940 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " 1959 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1941 "running e2fsck is recommended"); 1960 "running e2fsck is recommended");
@@ -1966,6 +1985,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1966 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1985 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1967 1986
1968 ext4_commit_super(sb, 1); 1987 ext4_commit_super(sb, 1);
1988done:
1969 if (test_opt(sb, DEBUG)) 1989 if (test_opt(sb, DEBUG))
1970 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1990 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1971 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", 1991 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2015,8 +2035,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
2015 flex_group = ext4_flex_group(sbi, i); 2035 flex_group = ext4_flex_group(sbi, i);
2016 atomic_add(ext4_free_inodes_count(sb, gdp), 2036 atomic_add(ext4_free_inodes_count(sb, gdp),
2017 &sbi->s_flex_groups[flex_group].free_inodes); 2037 &sbi->s_flex_groups[flex_group].free_inodes);
2018 atomic_add(ext4_free_blks_count(sb, gdp), 2038 atomic_add(ext4_free_group_clusters(sb, gdp),
2019 &sbi->s_flex_groups[flex_group].free_blocks); 2039 &sbi->s_flex_groups[flex_group].free_clusters);
2020 atomic_add(ext4_used_dirs_count(sb, gdp), 2040 atomic_add(ext4_used_dirs_count(sb, gdp),
2021 &sbi->s_flex_groups[flex_group].used_dirs); 2041 &sbi->s_flex_groups[flex_group].used_dirs);
2022 } 2042 }
@@ -2134,7 +2154,8 @@ static int ext4_check_descriptors(struct super_block *sb,
2134 if (NULL != first_not_zeroed) 2154 if (NULL != first_not_zeroed)
2135 *first_not_zeroed = grp; 2155 *first_not_zeroed = grp;
2136 2156
2137 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2157 ext4_free_blocks_count_set(sbi->s_es,
2158 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
2138 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2159 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2139 return 1; 2160 return 1;
2140} 2161}
@@ -2454,7 +2475,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2454 char *buf) 2475 char *buf)
2455{ 2476{
2456 return snprintf(buf, PAGE_SIZE, "%llu\n", 2477 return snprintf(buf, PAGE_SIZE, "%llu\n",
2457 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2478 (s64) EXT4_C2B(sbi,
2479 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
2458} 2480}
2459 2481
2460static ssize_t session_write_kbytes_show(struct ext4_attr *a, 2482static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2682,6 +2704,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2682 return 0; 2704 return 0;
2683 } 2705 }
2684 } 2706 }
2707 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
2708 !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2709 ext4_msg(sb, KERN_ERR,
2710 "Can't support bigalloc feature without "
2711 "extents feature\n");
2712 return 0;
2713 }
2685 return 1; 2714 return 1;
2686} 2715}
2687 2716
@@ -3087,10 +3116,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3087 char *cp; 3116 char *cp;
3088 const char *descr; 3117 const char *descr;
3089 int ret = -ENOMEM; 3118 int ret = -ENOMEM;
3090 int blocksize; 3119 int blocksize, clustersize;
3091 unsigned int db_count; 3120 unsigned int db_count;
3092 unsigned int i; 3121 unsigned int i;
3093 int needs_recovery, has_huge_files; 3122 int needs_recovery, has_huge_files, has_bigalloc;
3094 __u64 blocks_count; 3123 __u64 blocks_count;
3095 int err; 3124 int err;
3096 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3125 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3224,6 +3253,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3224 &journal_ioprio, NULL, 0)) 3253 &journal_ioprio, NULL, 0))
3225 goto failed_mount; 3254 goto failed_mount;
3226 3255
3256 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3257 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3258 "with data=journal disables delayed "
3259 "allocation and O_DIRECT support!\n");
3260 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3261 ext4_msg(sb, KERN_ERR, "can't mount with "
3262 "both data=journal and delalloc");
3263 goto failed_mount;
3264 }
3265 if (test_opt(sb, DIOREAD_NOLOCK)) {
3266 ext4_msg(sb, KERN_ERR, "can't mount with "
3267 "both data=journal and delalloc");
3268 goto failed_mount;
3269 }
3270 if (test_opt(sb, DELALLOC))
3271 clear_opt(sb, DELALLOC);
3272 }
3273
3274 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3275 if (test_opt(sb, DIOREAD_NOLOCK)) {
3276 if (blocksize < PAGE_SIZE) {
3277 ext4_msg(sb, KERN_ERR, "can't mount with "
3278 "dioread_nolock if block size != PAGE_SIZE");
3279 goto failed_mount;
3280 }
3281 }
3282
3227 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3283 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3228 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3284 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3229 3285
@@ -3265,8 +3321,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3265 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) 3321 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3266 goto failed_mount; 3322 goto failed_mount;
3267 3323
3268 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3269
3270 if (blocksize < EXT4_MIN_BLOCK_SIZE || 3324 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3271 blocksize > EXT4_MAX_BLOCK_SIZE) { 3325 blocksize > EXT4_MAX_BLOCK_SIZE) {
3272 ext4_msg(sb, KERN_ERR, 3326 ext4_msg(sb, KERN_ERR,
@@ -3369,12 +3423,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 sb->s_dirt = 1; 3423 sb->s_dirt = 1;
3370 } 3424 }
3371 3425
3372 if (sbi->s_blocks_per_group > blocksize * 8) { 3426 /* Handle clustersize */
3373 ext4_msg(sb, KERN_ERR, 3427 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
3374 "#blocks per group too big: %lu", 3428 has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3375 sbi->s_blocks_per_group); 3429 EXT4_FEATURE_RO_COMPAT_BIGALLOC);
3376 goto failed_mount; 3430 if (has_bigalloc) {
3431 if (clustersize < blocksize) {
3432 ext4_msg(sb, KERN_ERR,
3433 "cluster size (%d) smaller than "
3434 "block size (%d)", clustersize, blocksize);
3435 goto failed_mount;
3436 }
3437 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
3438 le32_to_cpu(es->s_log_block_size);
3439 sbi->s_clusters_per_group =
3440 le32_to_cpu(es->s_clusters_per_group);
3441 if (sbi->s_clusters_per_group > blocksize * 8) {
3442 ext4_msg(sb, KERN_ERR,
3443 "#clusters per group too big: %lu",
3444 sbi->s_clusters_per_group);
3445 goto failed_mount;
3446 }
3447 if (sbi->s_blocks_per_group !=
3448 (sbi->s_clusters_per_group * (clustersize / blocksize))) {
3449 ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
3450 "clusters per group (%lu) inconsistent",
3451 sbi->s_blocks_per_group,
3452 sbi->s_clusters_per_group);
3453 goto failed_mount;
3454 }
3455 } else {
3456 if (clustersize != blocksize) {
3457 ext4_warning(sb, "fragment/cluster size (%d) != "
3458 "block size (%d)", clustersize,
3459 blocksize);
3460 clustersize = blocksize;
3461 }
3462 if (sbi->s_blocks_per_group > blocksize * 8) {
3463 ext4_msg(sb, KERN_ERR,
3464 "#blocks per group too big: %lu",
3465 sbi->s_blocks_per_group);
3466 goto failed_mount;
3467 }
3468 sbi->s_clusters_per_group = sbi->s_blocks_per_group;
3469 sbi->s_cluster_bits = 0;
3377 } 3470 }
3471 sbi->s_cluster_ratio = clustersize / blocksize;
3472
3378 if (sbi->s_inodes_per_group > blocksize * 8) { 3473 if (sbi->s_inodes_per_group > blocksize * 8) {
3379 ext4_msg(sb, KERN_ERR, 3474 ext4_msg(sb, KERN_ERR,
3380 "#inodes per group too big: %lu", 3475 "#inodes per group too big: %lu",
@@ -3446,10 +3541,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 goto failed_mount; 3541 goto failed_mount;
3447 } 3542 }
3448 3543
3449#ifdef CONFIG_PROC_FS
3450 if (ext4_proc_root) 3544 if (ext4_proc_root)
3451 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3545 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3452#endif
3453 3546
3454 bgl_lock_init(sbi->s_blockgroup_lock); 3547 bgl_lock_init(sbi->s_blockgroup_lock);
3455 3548
@@ -3483,8 +3576,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3483 sbi->s_err_report.function = print_daily_error_info; 3576 sbi->s_err_report.function = print_daily_error_info;
3484 sbi->s_err_report.data = (unsigned long) sb; 3577 sbi->s_err_report.data = (unsigned long) sb;
3485 3578
3486 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3579 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3487 ext4_count_free_blocks(sb)); 3580 ext4_count_free_clusters(sb));
3488 if (!err) { 3581 if (!err) {
3489 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3582 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3490 ext4_count_free_inodes(sb)); 3583 ext4_count_free_inodes(sb));
@@ -3494,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3494 ext4_count_dirs(sb)); 3587 ext4_count_dirs(sb));
3495 } 3588 }
3496 if (!err) { 3589 if (!err) {
3497 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3590 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
3498 } 3591 }
3499 if (err) { 3592 if (err) {
3500 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3593 ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3609,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3609 * The journal may have updated the bg summary counts, so we 3702 * The journal may have updated the bg summary counts, so we
3610 * need to update the global counters. 3703 * need to update the global counters.
3611 */ 3704 */
3612 percpu_counter_set(&sbi->s_freeblocks_counter, 3705 percpu_counter_set(&sbi->s_freeclusters_counter,
3613 ext4_count_free_blocks(sb)); 3706 ext4_count_free_clusters(sb));
3614 percpu_counter_set(&sbi->s_freeinodes_counter, 3707 percpu_counter_set(&sbi->s_freeinodes_counter,
3615 ext4_count_free_inodes(sb)); 3708 ext4_count_free_inodes(sb));
3616 percpu_counter_set(&sbi->s_dirs_counter, 3709 percpu_counter_set(&sbi->s_dirs_counter,
3617 ext4_count_dirs(sb)); 3710 ext4_count_dirs(sb));
3618 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3711 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
3619 3712
3620no_journal: 3713no_journal:
3621 /* 3714 /*
@@ -3679,25 +3772,6 @@ no_journal:
3679 "available"); 3772 "available");
3680 } 3773 }
3681 3774
3682 if (test_opt(sb, DELALLOC) &&
3683 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3684 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3685 "requested data journaling mode");
3686 clear_opt(sb, DELALLOC);
3687 }
3688 if (test_opt(sb, DIOREAD_NOLOCK)) {
3689 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3690 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3691 "option - requested data journaling mode");
3692 clear_opt(sb, DIOREAD_NOLOCK);
3693 }
3694 if (sb->s_blocksize < PAGE_SIZE) {
3695 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3696 "option - block size is too small");
3697 clear_opt(sb, DIOREAD_NOLOCK);
3698 }
3699 }
3700
3701 err = ext4_setup_system_zone(sb); 3775 err = ext4_setup_system_zone(sb);
3702 if (err) { 3776 if (err) {
3703 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3777 ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3710,22 +3784,19 @@ no_journal:
3710 if (err) { 3784 if (err) {
3711 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 3785 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3712 err); 3786 err);
3713 goto failed_mount4; 3787 goto failed_mount5;
3714 } 3788 }
3715 3789
3716 err = ext4_register_li_request(sb, first_not_zeroed); 3790 err = ext4_register_li_request(sb, first_not_zeroed);
3717 if (err) 3791 if (err)
3718 goto failed_mount4; 3792 goto failed_mount6;
3719 3793
3720 sbi->s_kobj.kset = ext4_kset; 3794 sbi->s_kobj.kset = ext4_kset;
3721 init_completion(&sbi->s_kobj_unregister); 3795 init_completion(&sbi->s_kobj_unregister);
3722 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3796 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
3723 "%s", sb->s_id); 3797 "%s", sb->s_id);
3724 if (err) { 3798 if (err)
3725 ext4_mb_release(sb); 3799 goto failed_mount7;
3726 ext4_ext_release(sb);
3727 goto failed_mount4;
3728 };
3729 3800
3730 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 3801 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
3731 ext4_orphan_cleanup(sb, es); 3802 ext4_orphan_cleanup(sb, es);
@@ -3759,13 +3830,19 @@ cantfind_ext4:
3759 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 3830 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
3760 goto failed_mount; 3831 goto failed_mount;
3761 3832
3833failed_mount7:
3834 ext4_unregister_li_request(sb);
3835failed_mount6:
3836 ext4_ext_release(sb);
3837failed_mount5:
3838 ext4_mb_release(sb);
3839 ext4_release_system_zone(sb);
3762failed_mount4: 3840failed_mount4:
3763 iput(root); 3841 iput(root);
3764 sb->s_root = NULL; 3842 sb->s_root = NULL;
3765 ext4_msg(sb, KERN_ERR, "mount failed"); 3843 ext4_msg(sb, KERN_ERR, "mount failed");
3766 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3844 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3767failed_mount_wq: 3845failed_mount_wq:
3768 ext4_release_system_zone(sb);
3769 if (sbi->s_journal) { 3846 if (sbi->s_journal) {
3770 jbd2_journal_destroy(sbi->s_journal); 3847 jbd2_journal_destroy(sbi->s_journal);
3771 sbi->s_journal = NULL; 3848 sbi->s_journal = NULL;
@@ -3774,10 +3851,10 @@ failed_mount3:
3774 del_timer(&sbi->s_err_report); 3851 del_timer(&sbi->s_err_report);
3775 if (sbi->s_flex_groups) 3852 if (sbi->s_flex_groups)
3776 ext4_kvfree(sbi->s_flex_groups); 3853 ext4_kvfree(sbi->s_flex_groups);
3777 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3854 percpu_counter_destroy(&sbi->s_freeclusters_counter);
3778 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3855 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3779 percpu_counter_destroy(&sbi->s_dirs_counter); 3856 percpu_counter_destroy(&sbi->s_dirs_counter);
3780 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3857 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
3781 if (sbi->s_mmp_tsk) 3858 if (sbi->s_mmp_tsk)
3782 kthread_stop(sbi->s_mmp_tsk); 3859 kthread_stop(sbi->s_mmp_tsk);
3783failed_mount2: 3860failed_mount2:
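[Editor's note] The relabelled failed_mount5..7 targets above follow the standard kernel goto-unwind idiom: each label undoes only the initialisation steps that had already succeeded, in reverse order of setup. A compact sketch of the pattern with illustrative names (not the patch's code):

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }	/* simulate the third step failing */
static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

static int setup(void)
{
	int err;

	err = step_a();
	if (err)
		goto out;
	err = step_b();
	if (err)
		goto failed_a;
	err = step_c();
	if (err)
		goto failed_b;
	return 0;	/* success: nothing to unwind */

failed_b:
	undo_b();
failed_a:
	undo_a();
out:
	return err;
}

int main(void)
{
	return setup() ? 1 : 0;
}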
@@ -4064,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4064 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 4141 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4065 int error = 0; 4142 int error = 0;
4066 4143
4067 if (!sbh) 4144 if (!sbh || block_device_ejected(sb))
4068 return error; 4145 return error;
4069 if (buffer_write_io_error(sbh)) { 4146 if (buffer_write_io_error(sbh)) {
4070 /* 4147 /*
@@ -4100,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4100 else 4177 else
4101 es->s_kbytes_written = 4178 es->s_kbytes_written =
4102 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4179 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4103 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 4180 ext4_free_blocks_count_set(es,
4104 &EXT4_SB(sb)->s_freeblocks_counter)); 4181 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4182 &EXT4_SB(sb)->s_freeclusters_counter)));
4105 es->s_free_inodes_count = 4183 es->s_free_inodes_count =
4106 cpu_to_le32(percpu_counter_sum_positive( 4184 cpu_to_le32(percpu_counter_sum_positive(
4107 &EXT4_SB(sb)->s_freeinodes_counter)); 4185 &EXT4_SB(sb)->s_freeinodes_counter));
@@ -4506,16 +4584,34 @@ restore_opts:
4506 return err; 4584 return err;
4507} 4585}
4508 4586
4587/*
4588 * Note: calculating the overhead so we can be compatible with
4589 * historical BSD practice is quite difficult in the face of
4590 * clusters/bigalloc. This is because multiple metadata blocks from
4591 * different block groups can end up in the same allocation cluster.
4592 * Calculating the exact overhead in the face of clustered allocation
4593 * requires either O(all block bitmaps) in memory or O(number of block
4594 * groups**2) in time. We will still calculate the overhead for
4595 * older file systems --- and if we come across a bigalloc file
4596 * system with zero in s_overhead_clusters, the estimate will be close to
4597 * correct especially for very large cluster sizes --- but for newer
4598 * file systems, it's better to calculate this figure once at mkfs
4599 * time, and store it in the superblock. If the superblock value is
4600 * present (even for non-bigalloc file systems), we will use it.
4601 */
4509static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) 4602static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4510{ 4603{
4511 struct super_block *sb = dentry->d_sb; 4604 struct super_block *sb = dentry->d_sb;
4512 struct ext4_sb_info *sbi = EXT4_SB(sb); 4605 struct ext4_sb_info *sbi = EXT4_SB(sb);
4513 struct ext4_super_block *es = sbi->s_es; 4606 struct ext4_super_block *es = sbi->s_es;
4607 struct ext4_group_desc *gdp;
4514 u64 fsid; 4608 u64 fsid;
4515 s64 bfree; 4609 s64 bfree;
4516 4610
4517 if (test_opt(sb, MINIX_DF)) { 4611 if (test_opt(sb, MINIX_DF)) {
4518 sbi->s_overhead_last = 0; 4612 sbi->s_overhead_last = 0;
4613 } else if (es->s_overhead_clusters) {
4614 sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
4519 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 4615 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
4520 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4616 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4521 ext4_fsblk_t overhead = 0; 4617 ext4_fsblk_t overhead = 0;
@@ -4530,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4530 * All of the blocks before first_data_block are 4626 * All of the blocks before first_data_block are
4531 * overhead 4627 * overhead
4532 */ 4628 */
4533 overhead = le32_to_cpu(es->s_first_data_block); 4629 overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4534 4630
4535 /* 4631 /*
4536 * Add the overhead attributed to the superblock and 4632 * Add the overhead found in each block group
4537 * block group descriptors. If the sparse superblocks
4538 * feature is turned on, then not all groups have this.
4539 */ 4633 */
4540 for (i = 0; i < ngroups; i++) { 4634 for (i = 0; i < ngroups; i++) {
4541 overhead += ext4_bg_has_super(sb, i) + 4635 gdp = ext4_get_group_desc(sb, i, NULL);
4542 ext4_bg_num_gdb(sb, i); 4636 overhead += ext4_num_overhead_clusters(sb, i, gdp);
4543 cond_resched(); 4637 cond_resched();
4544 } 4638 }
4545
4546 /*
4547 * Every block group has an inode bitmap, a block
4548 * bitmap, and an inode table.
4549 */
4550 overhead += ngroups * (2 + sbi->s_itb_per_group);
4551 sbi->s_overhead_last = overhead; 4639 sbi->s_overhead_last = overhead;
4552 smp_wmb(); 4640 smp_wmb();
4553 sbi->s_blocks_last = ext4_blocks_count(es); 4641 sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4555,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4555 4643
4556 buf->f_type = EXT4_SUPER_MAGIC; 4644 buf->f_type = EXT4_SUPER_MAGIC;
4557 buf->f_bsize = sb->s_blocksize; 4645 buf->f_bsize = sb->s_blocksize;
4558 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4646 buf->f_blocks = (ext4_blocks_count(es) -
4559 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4647 EXT4_C2B(sbi, sbi->s_overhead_last));
4560 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4648 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4649 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4561 /* prevent underflow in case that few free space is available */ 4650 /* prevent underflow in case that few free space is available */
4562 buf->f_bfree = max_t(s64, bfree, 0); 4651 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4563 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4652 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4564 if (buf->f_bfree < ext4_r_blocks_count(es)) 4653 if (buf->f_bfree < ext4_r_blocks_count(es))
4565 buf->f_bavail = 0; 4654 buf->f_bavail = 0;
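[Editor's note] With bigalloc, the free-space and delalloc counters are maintained in cluster units, so ext4_statfs() now converts to blocks only at the edge. A worked sketch of the arithmetic with hypothetical values, assuming EXT4_C2B() is a left shift by s_cluster_bits:

#include <stdio.h>

int main(void)
{
	unsigned cluster_bits = 4;			/* 16 blocks per cluster */
	unsigned long long blocks_count = 1000000;	/* ext4_blocks_count(es) */
	unsigned long long overhead_clusters = 2000;	/* s_overhead_last */
	long long free_clusters = 50000;		/* s_freeclusters_counter */
	long long dirty_clusters = 300;			/* s_dirtyclusters_counter */
	long long bfree;

	/* f_blocks: total blocks minus the overhead converted to blocks */
	unsigned long long f_blocks =
		blocks_count - (overhead_clusters << cluster_bits);

	/* bfree: free clusters minus those reserved by delayed allocation,
	 * clamped at zero before the cluster-to-block conversion */
	bfree = free_clusters - dirty_clusters;
	if (bfree < 0)
		bfree = 0;

	printf("f_blocks=%llu f_bfree=%llu\n",
	       f_blocks, (unsigned long long)bfree << cluster_bits);
	return 0;
}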
@@ -4980,13 +5069,11 @@ static int __init ext4_init_fs(void)
4980 return err; 5069 return err;
4981 err = ext4_init_system_zone(); 5070 err = ext4_init_system_zone();
4982 if (err) 5071 if (err)
4983 goto out7; 5072 goto out6;
4984 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5073 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4985 if (!ext4_kset) 5074 if (!ext4_kset)
4986 goto out6;
4987 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4988 if (!ext4_proc_root)
4989 goto out5; 5075 goto out5;
5076 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4990 5077
4991 err = ext4_init_feat_adverts(); 5078 err = ext4_init_feat_adverts();
4992 if (err) 5079 if (err)
@@ -5022,12 +5109,12 @@ out2:
5022out3: 5109out3:
5023 ext4_exit_feat_adverts(); 5110 ext4_exit_feat_adverts();
5024out4: 5111out4:
5025 remove_proc_entry("fs/ext4", NULL); 5112 if (ext4_proc_root)
5026out5: 5113 remove_proc_entry("fs/ext4", NULL);
5027 kset_unregister(ext4_kset); 5114 kset_unregister(ext4_kset);
5028out6: 5115out5:
5029 ext4_exit_system_zone(); 5116 ext4_exit_system_zone();
5030out7: 5117out6:
5031 ext4_exit_pageio(); 5118 ext4_exit_pageio();
5032 return err; 5119 return err;
5033} 5120}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc97250..93a00d89a220 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,14 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 /*
824 * take i_data_sem because we will test
825 * i_delalloc_reserved_flag in ext4_mb_new_blocks
826 */
827 down_read((&EXT4_I(inode)->i_data_sem));
823 block = ext4_new_meta_blocks(handle, inode, goal, 0, 828 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 NULL, &error); 829 NULL, &error);
830 up_read((&EXT4_I(inode)->i_data_sem));
825 if (error) 831 if (error)
826 goto cleanup; 832 goto cleanup;
827 833
@@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
985 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); 991 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
986 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); 992 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
987 993
988 error = ext4_get_inode_loc(inode, &is.iloc); 994 error = ext4_reserve_inode_write(handle, inode, &is.iloc);
989 if (error)
990 goto cleanup;
991
992 error = ext4_journal_get_write_access(handle, is.iloc.bh);
993 if (error) 995 if (error)
994 goto cleanup; 996 goto cleanup;
995 997
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9fe061fb8779..fea8dd661d2b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal)
1135 goto out; 1135 goto out;
1136 } 1136 }
1137 1137
1138 if (be32_to_cpu(sb->s_first) == 0 ||
1139 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1140 printk(KERN_WARNING
1141 "JBD: Invalid start block of journal: %u\n",
1142 be32_to_cpu(sb->s_first));
1143 goto out;
1144 }
1145
1138 return 0; 1146 return 0;
1139 1147
1140out: 1148out:
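[Editor's note] Log block 0 holds the journal superblock itself, so an s_first of 0 (or at/beyond j_maxlen) can never be a valid start of the log. The range the new check accepts, as a one-line predicate (editor's sketch; the same validation is added to the jbd2 copy of journal_get_superblock() below):

/* The log proper must start somewhere in 1..j_maxlen-1. */
static int s_first_valid(unsigned int s_first, unsigned int j_maxlen)
{
	return s_first != 0 && s_first < j_maxlen;
}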
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eef6979821a4..68d704db787f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
352 J_ASSERT(commit_transaction->t_state == T_RUNNING); 352 J_ASSERT(commit_transaction->t_state == T_RUNNING);
353 353
354 trace_jbd2_start_commit(journal, commit_transaction); 354 trace_jbd2_start_commit(journal, commit_transaction);
355 jbd_debug(1, "JBD: starting commit of transaction %d\n", 355 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
356 commit_transaction->t_tid); 356 commit_transaction->t_tid);
357 357
358 write_lock(&journal->j_state_lock); 358 write_lock(&journal->j_state_lock);
@@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
427 __jbd2_journal_clean_checkpoint_list(journal); 427 __jbd2_journal_clean_checkpoint_list(journal);
428 spin_unlock(&journal->j_list_lock); 428 spin_unlock(&journal->j_list_lock);
429 429
430 jbd_debug (3, "JBD: commit phase 1\n"); 430 jbd_debug(3, "JBD2: commit phase 1\n");
431 431
432 /* 432 /*
433 * Switch to a new revoke table. 433 * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
447 wake_up(&journal->j_wait_transaction_locked); 447 wake_up(&journal->j_wait_transaction_locked);
448 write_unlock(&journal->j_state_lock); 448 write_unlock(&journal->j_state_lock);
449 449
450 jbd_debug (3, "JBD: commit phase 2\n"); 450 jbd_debug(3, "JBD2: commit phase 2\n");
451 451
452 /* 452 /*
453 * Now start flushing things to disk, in the order they appear 453 * Now start flushing things to disk, in the order they appear
@@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
462 WRITE_SYNC); 462 WRITE_SYNC);
463 blk_finish_plug(&plug); 463 blk_finish_plug(&plug);
464 464
465 jbd_debug(3, "JBD: commit phase 2\n"); 465 jbd_debug(3, "JBD2: commit phase 2\n");
466 466
467 /* 467 /*
468 * Way to go: we have now written out all of the data for a 468 * Way to go: we have now written out all of the data for a
@@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
522 522
523 J_ASSERT (bufs == 0); 523 J_ASSERT (bufs == 0);
524 524
525 jbd_debug(4, "JBD: get descriptor\n"); 525 jbd_debug(4, "JBD2: get descriptor\n");
526 526
527 descriptor = jbd2_journal_get_descriptor_buffer(journal); 527 descriptor = jbd2_journal_get_descriptor_buffer(journal);
528 if (!descriptor) { 528 if (!descriptor) {
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
531 } 531 }
532 532
533 bh = jh2bh(descriptor); 533 bh = jh2bh(descriptor);
534 jbd_debug(4, "JBD: got buffer %llu (%p)\n", 534 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
535 (unsigned long long)bh->b_blocknr, bh->b_data); 535 (unsigned long long)bh->b_blocknr, bh->b_data);
536 header = (journal_header_t *)&bh->b_data[0]; 536 header = (journal_header_t *)&bh->b_data[0];
537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 537 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
@@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 commit_transaction->t_buffers == NULL || 625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) { 626 space_left < tag_bytes + 16) {
627 627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs); 628 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
629 629
630 /* Write an end-of-descriptor marker before 630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to 631 submitting the IOs. "tag" still points to
@@ -707,7 +707,7 @@ start_journal_io:
707 so we incur less scheduling load. 707 so we incur less scheduling load.
708 */ 708 */
709 709
710 jbd_debug(3, "JBD: commit phase 3\n"); 710 jbd_debug(3, "JBD2: commit phase 3\n");
711 711
712 /* 712 /*
713 * akpm: these are BJ_IO, and j_list_lock is not needed. 713 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -771,7 +771,7 @@ wait_for_iobuf:
771 771
772 J_ASSERT (commit_transaction->t_shadow_list == NULL); 772 J_ASSERT (commit_transaction->t_shadow_list == NULL);
773 773
774 jbd_debug(3, "JBD: commit phase 4\n"); 774 jbd_debug(3, "JBD2: commit phase 4\n");
775 775
776 /* Here we wait for the revoke record and descriptor record buffers */ 776 /* Here we wait for the revoke record and descriptor record buffers */
777 wait_for_ctlbuf: 777 wait_for_ctlbuf:
@@ -801,7 +801,7 @@ wait_for_iobuf:
801 if (err) 801 if (err)
802 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
803 803
804 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD2: commit phase 5\n");
805 write_lock(&journal->j_state_lock); 805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH; 807 commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -830,7 +830,7 @@ wait_for_iobuf:
830 transaction can be removed from any checkpoint list it was on 830 transaction can be removed from any checkpoint list it was on
831 before. */ 831 before. */
832 832
833 jbd_debug(3, "JBD: commit phase 6\n"); 833 jbd_debug(3, "JBD2: commit phase 6\n");
834 834
835 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 835 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
836 J_ASSERT(commit_transaction->t_buffers == NULL); 836 J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -964,7 +964,7 @@ restart_loop:
964 964
965 /* Done with this transaction! */ 965 /* Done with this transaction! */
966 966
967 jbd_debug(3, "JBD: commit phase 7\n"); 967 jbd_debug(3, "JBD2: commit phase 7\n");
968 968
969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 969 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
970 970
@@ -1039,7 +1039,7 @@ restart_loop:
1039 journal->j_commit_callback(journal, commit_transaction); 1039 journal->j_commit_callback(journal, commit_transaction);
1040 1040
1041 trace_jbd2_end_commit(journal, commit_transaction); 1041 trace_jbd2_end_commit(journal, commit_transaction);
1042 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1042 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1043 journal->j_commit_sequence, journal->j_tail_sequence); 1043 journal->j_commit_sequence, journal->j_tail_sequence);
1044 if (to_free) 1044 if (to_free)
1045 kfree(commit_transaction); 1045 kfree(commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f24df13adc4e..0fa0123151d3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
491 */ 491 */
492 492
493 journal->j_commit_request = target; 493 journal->j_commit_request = target;
494 jbd_debug(1, "JBD: requesting commit %d/%d\n", 494 jbd_debug(1, "JBD2: requesting commit %d/%d\n",
495 journal->j_commit_request, 495 journal->j_commit_request,
496 journal->j_commit_sequence); 496 journal->j_commit_sequence);
497 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
500 /* This should never happen, but if it does, preserve 500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and 501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */ 502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", 503 WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request, 504 journal->j_commit_request,
505 journal->j_commit_sequence, 505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ? 506 target, journal->j_running_transaction ?
@@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
645 } 645 }
646#endif 646#endif
647 while (tid_gt(tid, journal->j_commit_sequence)) { 647 while (tid_gt(tid, journal->j_commit_sequence)) {
648 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 648 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
649 tid, journal->j_commit_sequence); 649 tid, journal->j_commit_sequence);
650 wake_up(&journal->j_wait_commit); 650 wake_up(&journal->j_wait_commit);
651 read_unlock(&journal->j_state_lock); 651 read_unlock(&journal->j_state_lock);
@@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal)
1093 first = be32_to_cpu(sb->s_first); 1093 first = be32_to_cpu(sb->s_first);
1094 last = be32_to_cpu(sb->s_maxlen); 1094 last = be32_to_cpu(sb->s_maxlen);
1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { 1095 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1096 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", 1096 printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
1097 first, last); 1097 first, last);
1098 journal_fail_superblock(journal); 1098 journal_fail_superblock(journal);
1099 return -EINVAL; 1099 return -EINVAL;
@@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1139 */ 1139 */
1140 if (sb->s_start == 0 && journal->j_tail_sequence == 1140 if (sb->s_start == 0 && journal->j_tail_sequence ==
1141 journal->j_transaction_sequence) { 1141 journal->j_transaction_sequence) {
1142 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 1142 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1143 "(start %ld, seq %d, errno %d)\n", 1143 "(start %ld, seq %d, errno %d)\n",
1144 journal->j_tail, journal->j_tail_sequence, 1144 journal->j_tail, journal->j_tail_sequence,
1145 journal->j_errno); 1145 journal->j_errno);
@@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1163 } 1163 }
1164 1164
1165 read_lock(&journal->j_state_lock); 1165 read_lock(&journal->j_state_lock);
1166 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1166 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1167 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1168 1168
1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1169 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal)
1216 ll_rw_block(READ, 1, &bh); 1216 ll_rw_block(READ, 1, &bh);
1217 wait_on_buffer(bh); 1217 wait_on_buffer(bh);
1218 if (!buffer_uptodate(bh)) { 1218 if (!buffer_uptodate(bh)) {
1219 printk (KERN_ERR 1219 printk(KERN_ERR
1220 "JBD: IO error reading journal superblock\n"); 1220 "JBD2: IO error reading journal superblock\n");
1221 goto out; 1221 goto out;
1222 } 1222 }
1223 } 1223 }
@@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal)
1228 1228
1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || 1229 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1230 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1231 printk(KERN_WARNING "JBD: no valid journal superblock found\n"); 1231 printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
1232 goto out; 1232 goto out;
1233 } 1233 }
1234 1234
@@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal)
1240 journal->j_format_version = 2; 1240 journal->j_format_version = 2;
1241 break; 1241 break;
1242 default: 1242 default:
1243 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); 1243 printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
1244 goto out; 1244 goto out;
1245 } 1245 }
1246 1246
1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) 1247 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen); 1248 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { 1249 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1250 printk (KERN_WARNING "JBD: journal file too short\n"); 1250 printk(KERN_WARNING "JBD2: journal file too short\n");
1251 goto out;
1252 }
1253
1254 if (be32_to_cpu(sb->s_first) == 0 ||
1255 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1256 printk(KERN_WARNING
1257 "JBD2: Invalid start block of journal: %u\n",
1258 be32_to_cpu(sb->s_first));
1251 goto out; 1259 goto out;
1252 } 1260 }
1253 1261
@@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal)
1310 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || 1318 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1311 (sb->s_feature_incompat & 1319 (sb->s_feature_incompat &
1312 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { 1320 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1313 printk (KERN_WARNING 1321 printk(KERN_WARNING
1314 "JBD: Unrecognised features on journal\n"); 1322 "JBD2: Unrecognised features on journal\n");
1315 return -EINVAL; 1323 return -EINVAL;
1316 } 1324 }
1317 } 1325 }
@@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal)
1346 return 0; 1354 return 0;
1347 1355
1348recovery_error: 1356recovery_error:
1349 printk (KERN_WARNING "JBD: recovery failed\n"); 1357 printk(KERN_WARNING "JBD2: recovery failed\n");
1350 return -EIO; 1358 return -EIO;
1351} 1359}
1352 1360
@@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
1577 struct buffer_head *bh; 1585 struct buffer_head *bh;
1578 1586
1579 printk(KERN_WARNING 1587 printk(KERN_WARNING
1580 "JBD: Converting superblock from version 1 to 2.\n"); 1588 "JBD2: Converting superblock from version 1 to 2.\n");
1581 1589
1582 /* Pre-initialise new fields to zero */ 1590 /* Pre-initialise new fields to zero */
1583 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); 1591 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
@@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1694 if (!journal->j_tail) 1702 if (!journal->j_tail)
1695 goto no_recovery; 1703 goto no_recovery;
1696 1704
1697 printk (KERN_WARNING "JBD: %s recovery information on journal\n", 1705 printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
1698 write ? "Clearing" : "Ignoring"); 1706 write ? "Clearing" : "Ignoring");
1699 1707
1700 err = jbd2_journal_skip_recovery(journal); 1708 err = jbd2_journal_skip_recovery(journal);
@@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2020 retval = 0; 2028 retval = 0;
2021 if (!jbd2_journal_head_cache) { 2029 if (!jbd2_journal_head_cache) {
2022 retval = -ENOMEM; 2030 retval = -ENOMEM;
2023 printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); 2031 printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
2024 } 2032 }
2025 return retval; 2033 return retval;
2026} 2034}
@@ -2383,7 +2391,7 @@ static void __exit journal_exit(void)
2383#ifdef CONFIG_JBD2_DEBUG 2391#ifdef CONFIG_JBD2_DEBUG
2384 int n = atomic_read(&nr_journal_heads); 2392 int n = atomic_read(&nr_journal_heads);
2385 if (n) 2393 if (n)
2386 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2394 printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
2387#endif 2395#endif
2388 jbd2_remove_debugfs_entry(); 2396 jbd2_remove_debugfs_entry();
2389 jbd2_remove_jbd_stats_proc_entry(); 2397 jbd2_remove_jbd_stats_proc_entry();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1cad869494f0..da6d7baf1390 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
89 err = jbd2_journal_bmap(journal, next, &blocknr); 89 err = jbd2_journal_bmap(journal, next, &blocknr);
90 90
91 if (err) { 91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n", 92 printk(KERN_ERR "JBD2: bad block at offset %u\n",
93 next); 93 next);
94 goto failed; 94 goto failed;
95 } 95 }
@@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
138 *bhp = NULL; 138 *bhp = NULL;
139 139
140 if (offset >= journal->j_maxlen) { 140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n"); 141 printk(KERN_ERR "JBD2: corrupted journal superblock\n");
142 return -EIO; 142 return -EIO;
143 } 143 }
144 144
145 err = jbd2_journal_bmap(journal, offset, &blocknr); 145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146 146
147 if (err) { 147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n", 148 printk(KERN_ERR "JBD2: bad block at offset %u\n",
149 offset); 149 offset);
150 return err; 150 return err;
151 } 151 }
@@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
163 } 163 }
164 164
165 if (!buffer_uptodate(bh)) { 165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n", 166 printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
167 offset); 167 offset);
168 brelse(bh); 168 brelse(bh);
169 return -EIO; 169 return -EIO;
@@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal)
251 if (!err) 251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY); 252 err = do_one_pass(journal, &info, PASS_REPLAY);
253 253
254 jbd_debug(1, "JBD: recovery, exit status %d, " 254 jbd_debug(1, "JBD2: recovery, exit status %d, "
255 "recovered transactions %u to %u\n", 255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction); 256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", 257 jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259 259
260 /* Restart the log at the next transaction ID, thus invalidating 260 /* Restart the log at the next transaction ID, thus invalidating
@@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal)
293 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
294 294
295 if (err) { 295 if (err) {
296 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 296 printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
297 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
298 } else { 298 } else {
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302 jbd_debug(1, 302 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD2: ignoring %d transaction%s from the journal.\n",
304 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif 305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
@@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
338 wrap(journal, *next_log_block); 338 wrap(journal, *next_log_block);
339 err = jread(&obh, journal, io_block); 339 err = jread(&obh, journal, io_block);
340 if (err) { 340 if (err) {
341 printk(KERN_ERR "JBD: IO error %d recovering block " 341 printk(KERN_ERR "JBD2: IO error %d recovering block "
342 "%lu in log\n", err, io_block); 342 "%lu in log\n", err, io_block);
343 return 1; 343 return 1;
344 } else { 344 } else {
@@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal,
411 * either the next descriptor block or the final commit 411 * either the next descriptor block or the final commit
412 * record. */ 412 * record. */
413 413
414 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 414 jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
415 err = jread(&bh, journal, next_log_block); 415 err = jread(&bh, journal, next_log_block);
416 if (err) 416 if (err)
417 goto failed; 417 goto failed;
@@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal,
491 /* Recover what we can, but 491 /* Recover what we can, but
492 * report failure at the end. */ 492 * report failure at the end. */
493 success = err; 493 success = err;
494 printk (KERN_ERR 494 printk(KERN_ERR
495 "JBD: IO error %d recovering " 495 "JBD2: IO error %d recovering "
496 "block %ld in log\n", 496 "block %ld in log\n",
497 err, io_block); 497 err, io_block);
498 } else { 498 } else {
@@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal,
520 journal->j_blocksize); 520 journal->j_blocksize);
521 if (nbh == NULL) { 521 if (nbh == NULL) {
522 printk(KERN_ERR 522 printk(KERN_ERR
523 "JBD: Out of memory " 523 "JBD2: Out of memory "
524 "during recovery.\n"); 524 "during recovery.\n");
525 err = -ENOMEM; 525 err = -ENOMEM;
526 brelse(bh); 526 brelse(bh);
@@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal,
689 /* It's really bad news if different passes end up at 689 /* It's really bad news if different passes end up at
690 * different places (but possible due to IO errors). */ 690 * different places (but possible due to IO errors). */
691 if (info->end_transaction != next_commit_ID) { 691 if (info->end_transaction != next_commit_ID) {
692 printk (KERN_ERR "JBD: recovery pass %d ended at " 692 printk(KERN_ERR "JBD2: recovery pass %d ended at "
693 "transaction %u, expected %u\n", 693 "transaction %u, expected %u\n",
694 pass, next_commit_ID, info->end_transaction); 694 pass, next_commit_ID, info->end_transaction);
695 if (!success) 695 if (!success)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2d7109414cdd..a0e41a4c080e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/bug.h>
30#include <linux/module.h> 31#include <linux/module.h>
31 32
32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction,
115 */ 116 */
116 117
117static int start_this_handle(journal_t *journal, handle_t *handle, 118static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 119 gfp_t gfp_mask)
119{ 120{
120 transaction_t *transaction, *new_transaction = NULL; 121 transaction_t *transaction, *new_transaction = NULL;
121 tid_t tid; 122 tid_t tid;
@@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
124 unsigned long ts = jiffies; 125 unsigned long ts = jiffies;
125 126
126 if (nblocks > journal->j_max_transaction_buffers) { 127 if (nblocks > journal->j_max_transaction_buffers) {
127 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 128 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
128 current->comm, nblocks, 129 current->comm, nblocks,
129 journal->j_max_transaction_buffers); 130 journal->j_max_transaction_buffers);
130 return -ENOSPC; 131 return -ENOSPC;
@@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks)
320 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 321 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
321 * on failure. 322 * on failure.
322 */ 323 */
323handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 324handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
324{ 325{
325 handle_t *handle = journal_current_handle(); 326 handle_t *handle = journal_current_handle();
326 int err; 327 int err;
@@ -443,7 +444,7 @@ out:
443 * transaction capable of guaranteeing the requested number of 444 * credits.
444 * credits. 445 * credits.
445 */ 446 */
446int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) 447int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
447{ 448{
448 transaction_t *transaction = handle->h_transaction; 449 transaction_t *transaction = handle->h_transaction;
449 journal_t *journal = transaction->t_journal; 450 journal_t *journal = transaction->t_journal;
@@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh)
563 char b[BDEVNAME_SIZE]; 564 char b[BDEVNAME_SIZE];
564 565
565 printk(KERN_WARNING 566 printk(KERN_WARNING
566 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 567 "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
567 "There's a risk of filesystem corruption in case of system " 568 "There's a risk of filesystem corruption in case of system "
568 "crash.\n", 569 "crash.\n",
569 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 570 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
@@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1049 * mark dirty metadata which needs to be journaled as part of the current 1050 * mark dirty metadata which needs to be journaled as part of the current
1050 * transaction. 1051 * transaction.
1051 * 1052 *
1053 * The buffer must have previously had jbd2_journal_get_write_access()
1054 * called so that it has a valid journal_head attached to the buffer
1055 * head.
1056 *
1052 * The buffer is placed on the transaction's metadata list and is marked 1057 * The buffer is placed on the transaction's metadata list and is marked
1053 * as belonging to the transaction. 1058 * as belonging to the transaction.
1054 * 1059 *
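[Editor's note] The requirement documented above implies a fixed call sequence around any metadata update. A hedged sketch of the canonical pattern (error handling elided; the buffer and the modification are hypothetical):

#include <linux/jbd2.h>

static void update_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle = jbd2_journal_start(journal, 1);

	/* attaches the journal_head that dirty_metadata relies on */
	jbd2_journal_get_write_access(handle, bh);
	/* ... modify bh->b_data under the handle ... */
	jbd2_journal_dirty_metadata(handle, bh);	/* now returns an error code */
	jbd2_journal_stop(handle);
}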
@@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1065 transaction_t *transaction = handle->h_transaction; 1070 transaction_t *transaction = handle->h_transaction;
1066 journal_t *journal = transaction->t_journal; 1071 journal_t *journal = transaction->t_journal;
1067 struct journal_head *jh = bh2jh(bh); 1072 struct journal_head *jh = bh2jh(bh);
1073 int ret = 0;
1068 1074
1069 jbd_debug(5, "journal_head %p\n", jh); 1075 jbd_debug(5, "journal_head %p\n", jh);
1070 JBUFFER_TRACE(jh, "entry"); 1076 JBUFFER_TRACE(jh, "entry");
1071 if (is_handle_aborted(handle)) 1077 if (is_handle_aborted(handle))
1072 goto out; 1078 goto out;
1079 if (!buffer_jbd(bh)) {
1080 ret = -EUCLEAN;
1081 goto out;
1082 }
1073 1083
1074 jbd_lock_bh_state(bh); 1084 jbd_lock_bh_state(bh);
1075 1085
@@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1093 */ 1103 */
1094 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { 1104 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1095 JBUFFER_TRACE(jh, "fastpath"); 1105 JBUFFER_TRACE(jh, "fastpath");
1096 J_ASSERT_JH(jh, jh->b_transaction == 1106 if (unlikely(jh->b_transaction !=
1097 journal->j_running_transaction); 1107 journal->j_running_transaction)) {
1108 printk(KERN_EMERG "JBD2: %s: "
1109 "jh->b_transaction (%llu, %p, %u) != "
1110 "journal->j_running_transaction (%p, %u)\n",
1111 journal->j_devname,
1112 (unsigned long long) bh->b_blocknr,
1113 jh->b_transaction,
1114 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1115 journal->j_running_transaction,
1116 journal->j_running_transaction ?
1117 journal->j_running_transaction->t_tid : 0);
1118 ret = -EINVAL;
1119 }
1098 goto out_unlock_bh; 1120 goto out_unlock_bh;
1099 } 1121 }
1100 1122
@@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1108 */ 1130 */
1109 if (jh->b_transaction != transaction) { 1131 if (jh->b_transaction != transaction) {
1110 JBUFFER_TRACE(jh, "already on other transaction"); 1132 JBUFFER_TRACE(jh, "already on other transaction");
1111 J_ASSERT_JH(jh, jh->b_transaction == 1133 if (unlikely(jh->b_transaction !=
1112 journal->j_committing_transaction); 1134 journal->j_committing_transaction)) {
1113 J_ASSERT_JH(jh, jh->b_next_transaction == transaction); 1135 printk(KERN_EMERG "JBD2: %s: "
1136 "jh->b_transaction (%llu, %p, %u) != "
1137 "journal->j_committing_transaction (%p, %u)\n",
1138 journal->j_devname,
1139 (unsigned long long) bh->b_blocknr,
1140 jh->b_transaction,
1141 jh->b_transaction ? jh->b_transaction->t_tid : 0,
1142 journal->j_committing_transaction,
1143 journal->j_committing_transaction ?
1144 journal->j_committing_transaction->t_tid : 0);
1145 ret = -EINVAL;
1146 }
1147 if (unlikely(jh->b_next_transaction != transaction)) {
1148 printk(KERN_EMERG "JBD2: %s: "
1149 "jh->b_next_transaction (%llu, %p, %u) != "
1150 "transaction (%p, %u)\n",
1151 journal->j_devname,
1152 (unsigned long long) bh->b_blocknr,
1153 jh->b_next_transaction,
1154 jh->b_next_transaction ?
1155 jh->b_next_transaction->t_tid : 0,
1156 transaction, transaction->t_tid);
1157 ret = -EINVAL;
1158 }
1114 /* And this case is illegal: we can't reuse another 1159 /* And this case is illegal: we can't reuse another
1115 * transaction's data buffer, ever. */ 1160 * transaction's data buffer, ever. */
1116 goto out_unlock_bh; 1161 goto out_unlock_bh;
@@ -1127,7 +1172,8 @@ out_unlock_bh:
1127 jbd_unlock_bh_state(bh); 1172 jbd_unlock_bh_state(bh);
1128out: 1173out:
1129 JBUFFER_TRACE(jh, "exit"); 1174 JBUFFER_TRACE(jh, "exit");
1130 return 0; 1175 WARN_ON(ret); /* All errors are bugs, so dump the stack */
1176 return ret;
1131} 1177}
1132 1178
1133/* 1179/*
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index 53792bf36c71..ce1b719e8bd4 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -197,8 +197,8 @@ struct ext2_group_desc
197 197
198/* Flags that should be inherited by new inodes from their parent. */ 198/* Flags that should be inherited by new inodes from their parent. */
199#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ 199#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\
200 EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\ 200 EXT2_SYNC_FL | EXT2_NODUMP_FL |\
201 EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\ 201 EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\
202 EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ 202 EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\
203 EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) 203 EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL)
204 204
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index f5fceffd4cfe..dec99116a0e4 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -180,8 +180,8 @@ struct ext3_group_desc
180 180
181/* Flags that should be inherited by new inodes from their parent. */ 181/* Flags that should be inherited by new inodes from their parent. */
182#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ 182#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
183 EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ 183 EXT3_SYNC_FL | EXT3_NODUMP_FL |\
184 EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ 184 EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
185 EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ 185 EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
186 EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) 186 EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
187 187
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a049fd2aa4c..78af9385f415 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -770,12 +770,13 @@ struct inode {
770 unsigned long i_ino; 770 unsigned long i_ino;
771 unsigned int i_nlink; 771 unsigned int i_nlink;
772 dev_t i_rdev; 772 dev_t i_rdev;
773 loff_t i_size;
774 struct timespec i_atime; 773 struct timespec i_atime;
775 struct timespec i_mtime; 774 struct timespec i_mtime;
776 struct timespec i_ctime; 775 struct timespec i_ctime;
777 unsigned int i_blkbits; 776 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
777 unsigned short i_bytes;
778 blkcnt_t i_blocks; 778 blkcnt_t i_blocks;
779 loff_t i_size;
779 780
780#ifdef __NEED_I_SIZE_ORDERED 781#ifdef __NEED_I_SIZE_ORDERED
781 seqcount_t i_size_seqcount; 782 seqcount_t i_size_seqcount;
@@ -783,7 +784,6 @@ struct inode {
783 784
784 /* Misc */ 785 /* Misc */
785 unsigned long i_state; 786 unsigned long i_state;
786 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
787 struct mutex i_mutex; 787 struct mutex i_mutex;
788 788
789 unsigned long dirtied_when; /* jiffies of first dirtying */ 789 unsigned long dirtied_when; /* jiffies of first dirtying */
@@ -797,9 +797,10 @@ struct inode {
797 struct rcu_head i_rcu; 797 struct rcu_head i_rcu;
798 }; 798 };
799 atomic_t i_count; 799 atomic_t i_count;
800 unsigned int i_blkbits;
800 u64 i_version; 801 u64 i_version;
801 unsigned short i_bytes;
802 atomic_t i_dio_count; 802 atomic_t i_dio_count;
803 atomic_t i_writecount;
803 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 804 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
804 struct file_lock *i_flock; 805 struct file_lock *i_flock;
805 struct address_space i_data; 806 struct address_space i_data;
@@ -823,7 +824,6 @@ struct inode {
823#ifdef CONFIG_IMA 824#ifdef CONFIG_IMA
824 atomic_t i_readcount; /* struct files open RO */ 825 atomic_t i_readcount; /* struct files open RO */
825#endif 826#endif
826 atomic_t i_writecount;
827 void *i_private; /* fs or device private pointer */ 827 void *i_private; /* fs or device private pointer */
828}; 828};
829 829
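The struct inode reshuffle above moves i_lock next to the i_bytes/i_blocks/i_size fields it protects and groups members of similar size to shrink padding holes. A runnable userspace toy (not kernel code) showing how such packing can be inspected with offsetof()/sizeof(); kernel developers would normally run pahole on the compiled vmlinux instead:

	#include <stddef.h>
	#include <stdio.h>

	/* Toy stand-in for the repacked region of struct inode. */
	struct toy_inode {
		int lock;			/* stands in for spinlock_t i_lock */
		unsigned short bytes;		/* i_bytes, guarded by the lock */
		unsigned long long blocks;	/* i_blocks */
		long long size;			/* i_size */
	};

	int main(void)
	{
		printf("lock@%zu bytes@%zu blocks@%zu size@%zu total %zu\n",
		       offsetof(struct toy_inode, lock),
		       offsetof(struct toy_inode, bytes),
		       offsetof(struct toy_inode, blocks),
		       offsetof(struct toy_inode, size),
		       sizeof(struct toy_inode));
		return 0;
	}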
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index e6a5e34bed4f..c7acdde3243d 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -244,6 +244,7 @@ typedef struct journal_superblock_s
244 244
245#include <linux/fs.h> 245#include <linux/fs.h>
246#include <linux/sched.h> 246#include <linux/sched.h>
247#include <linux/jbd_common.h>
247 248
248#define J_ASSERT(assert) BUG_ON(!(assert)) 249#define J_ASSERT(assert) BUG_ON(!(assert))
249 250
@@ -270,69 +271,6 @@ typedef struct journal_superblock_s
270#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) 271#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why)
271#endif 272#endif
272 273
273enum jbd_state_bits {
274 BH_JBD /* Has an attached ext3 journal_head */
275 = BH_PrivateStart,
276 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
277 BH_Freed, /* Has been freed (truncated) */
278 BH_Revoked, /* Has been revoked from the log */
279 BH_RevokeValid, /* Revoked flag is valid */
280 BH_JBDDirty, /* Is dirty but journaled */
281 BH_State, /* Pins most journal_head state */
282 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
283 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
284};
285
286BUFFER_FNS(JBD, jbd)
287BUFFER_FNS(JWrite, jwrite)
288BUFFER_FNS(JBDDirty, jbddirty)
289TAS_BUFFER_FNS(JBDDirty, jbddirty)
290BUFFER_FNS(Revoked, revoked)
291TAS_BUFFER_FNS(Revoked, revoked)
292BUFFER_FNS(RevokeValid, revokevalid)
293TAS_BUFFER_FNS(RevokeValid, revokevalid)
294BUFFER_FNS(Freed, freed)
295
296static inline struct buffer_head *jh2bh(struct journal_head *jh)
297{
298 return jh->b_bh;
299}
300
301static inline struct journal_head *bh2jh(struct buffer_head *bh)
302{
303 return bh->b_private;
304}
305
306static inline void jbd_lock_bh_state(struct buffer_head *bh)
307{
308 bit_spin_lock(BH_State, &bh->b_state);
309}
310
311static inline int jbd_trylock_bh_state(struct buffer_head *bh)
312{
313 return bit_spin_trylock(BH_State, &bh->b_state);
314}
315
316static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
317{
318 return bit_spin_is_locked(BH_State, &bh->b_state);
319}
320
321static inline void jbd_unlock_bh_state(struct buffer_head *bh)
322{
323 bit_spin_unlock(BH_State, &bh->b_state);
324}
325
326static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
327{
328 bit_spin_lock(BH_JournalHead, &bh->b_state);
329}
330
331static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
332{
333 bit_spin_unlock(BH_JournalHead, &bh->b_state);
334}
335
336struct jbd_revoke_table_s; 274struct jbd_revoke_table_s;
337 275
338/** 276/**
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 38f307b8c334..2092ea21e469 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -275,6 +275,7 @@ typedef struct journal_superblock_s
275 275
276#include <linux/fs.h> 276#include <linux/fs.h>
277#include <linux/sched.h> 277#include <linux/sched.h>
278#include <linux/jbd_common.h>
278 279
279#define J_ASSERT(assert) BUG_ON(!(assert)) 280#define J_ASSERT(assert) BUG_ON(!(assert))
280 281
@@ -302,70 +303,6 @@ typedef struct journal_superblock_s
302#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) 303#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why)
303#endif 304#endif
304 305
305enum jbd_state_bits {
306 BH_JBD /* Has an attached ext3 journal_head */
307 = BH_PrivateStart,
308 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
309 BH_Freed, /* Has been freed (truncated) */
310 BH_Revoked, /* Has been revoked from the log */
311 BH_RevokeValid, /* Revoked flag is valid */
312 BH_JBDDirty, /* Is dirty but journaled */
313 BH_State, /* Pins most journal_head state */
314 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
315 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
316 BH_JBDPrivateStart, /* First bit available for private use by FS */
317};
318
319BUFFER_FNS(JBD, jbd)
320BUFFER_FNS(JWrite, jwrite)
321BUFFER_FNS(JBDDirty, jbddirty)
322TAS_BUFFER_FNS(JBDDirty, jbddirty)
323BUFFER_FNS(Revoked, revoked)
324TAS_BUFFER_FNS(Revoked, revoked)
325BUFFER_FNS(RevokeValid, revokevalid)
326TAS_BUFFER_FNS(RevokeValid, revokevalid)
327BUFFER_FNS(Freed, freed)
328
329static inline struct buffer_head *jh2bh(struct journal_head *jh)
330{
331 return jh->b_bh;
332}
333
334static inline struct journal_head *bh2jh(struct buffer_head *bh)
335{
336 return bh->b_private;
337}
338
339static inline void jbd_lock_bh_state(struct buffer_head *bh)
340{
341 bit_spin_lock(BH_State, &bh->b_state);
342}
343
344static inline int jbd_trylock_bh_state(struct buffer_head *bh)
345{
346 return bit_spin_trylock(BH_State, &bh->b_state);
347}
348
349static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
350{
351 return bit_spin_is_locked(BH_State, &bh->b_state);
352}
353
354static inline void jbd_unlock_bh_state(struct buffer_head *bh)
355{
356 bit_spin_unlock(BH_State, &bh->b_state);
357}
358
359static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
360{
361 bit_spin_lock(BH_JournalHead, &bh->b_state);
362}
363
364static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
365{
366 bit_spin_unlock(BH_JournalHead, &bh->b_state);
367}
368
369/* Flags in jbd_inode->i_flags */ 306/* Flags in jbd_inode->i_flags */
370#define __JI_COMMIT_RUNNING 0 307#define __JI_COMMIT_RUNNING 0
371/* Commit of the inode data in progress. We use this flag to protect us from 308/* Commit of the inode data in progress. We use this flag to protect us from
@@ -1106,9 +1043,9 @@ static inline handle_t *journal_current_handle(void)
1106 */ 1043 */
1107 1044
1108extern handle_t *jbd2_journal_start(journal_t *, int nblocks); 1045extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
1109extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask); 1046extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask);
1110extern int jbd2_journal_restart(handle_t *, int nblocks); 1047extern int jbd2_journal_restart(handle_t *, int nblocks);
1111extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask); 1048extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
1112extern int jbd2_journal_extend (handle_t *, int nblocks); 1049extern int jbd2_journal_extend (handle_t *, int nblocks);
1113extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); 1050extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
1114extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); 1051extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
new file mode 100644
index 000000000000..6230f8556a4e
--- /dev/null
+++ b/include/linux/jbd_common.h
@@ -0,0 +1,68 @@
1#ifndef _LINUX_JBD_STATE_H
2#define _LINUX_JBD_STATE_H
3
4enum jbd_state_bits {
5 BH_JBD /* Has an attached ext3 journal_head */
6 = BH_PrivateStart,
7 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
8 BH_Freed, /* Has been freed (truncated) */
9 BH_Revoked, /* Has been revoked from the log */
10 BH_RevokeValid, /* Revoked flag is valid */
11 BH_JBDDirty, /* Is dirty but journaled */
12 BH_State, /* Pins most journal_head state */
13 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
14 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
15 BH_JBDPrivateStart, /* First bit available for private use by FS */
16};
17
18BUFFER_FNS(JBD, jbd)
19BUFFER_FNS(JWrite, jwrite)
20BUFFER_FNS(JBDDirty, jbddirty)
21TAS_BUFFER_FNS(JBDDirty, jbddirty)
22BUFFER_FNS(Revoked, revoked)
23TAS_BUFFER_FNS(Revoked, revoked)
24BUFFER_FNS(RevokeValid, revokevalid)
25TAS_BUFFER_FNS(RevokeValid, revokevalid)
26BUFFER_FNS(Freed, freed)
27
28static inline struct buffer_head *jh2bh(struct journal_head *jh)
29{
30 return jh->b_bh;
31}
32
33static inline struct journal_head *bh2jh(struct buffer_head *bh)
34{
35 return bh->b_private;
36}
37
38static inline void jbd_lock_bh_state(struct buffer_head *bh)
39{
40 bit_spin_lock(BH_State, &bh->b_state);
41}
42
43static inline int jbd_trylock_bh_state(struct buffer_head *bh)
44{
45 return bit_spin_trylock(BH_State, &bh->b_state);
46}
47
48static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
49{
50 return bit_spin_is_locked(BH_State, &bh->b_state);
51}
52
53static inline void jbd_unlock_bh_state(struct buffer_head *bh)
54{
55 bit_spin_unlock(BH_State, &bh->b_state);
56}
57
58static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
59{
60 bit_spin_lock(BH_JournalHead, &bh->b_state);
61}
62
63static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
64{
65 bit_spin_unlock(BH_JournalHead, &bh->b_state);
66}
67
68#endif
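The new jbd_common.h only centralizes definitions that jbd and jbd2 previously carried as duplicate copies; there is no behavioural change. For reference, a hedged usage sketch of the bit-spinlock helpers it now hosts (kernel context assumed):

	/* bh is a struct buffer_head with an attached journal_head. */
	jbd_lock_bh_state(bh);		/* take the BH_State bit lock */
	if (buffer_jbddirty(bh)) {
		struct journal_head *jh = bh2jh(bh);
		/* ... inspect or modify jh under the lock ... */
	}
	jbd_unlock_bh_state(bh);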
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index b50a54736242..748ff7cbe555 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -9,9 +9,12 @@
9 9
10struct ext4_allocation_context; 10struct ext4_allocation_context;
11struct ext4_allocation_request; 11struct ext4_allocation_request;
12struct ext4_extent;
12struct ext4_prealloc_space; 13struct ext4_prealloc_space;
13struct ext4_inode_info; 14struct ext4_inode_info;
14struct mpage_da_data; 15struct mpage_da_data;
16struct ext4_map_blocks;
17struct ext4_extent;
15 18
16#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 19#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
17 20
@@ -1032,9 +1035,9 @@ TRACE_EVENT(ext4_forget,
1032); 1035);
1033 1036
1034TRACE_EVENT(ext4_da_update_reserve_space, 1037TRACE_EVENT(ext4_da_update_reserve_space,
1035 TP_PROTO(struct inode *inode, int used_blocks), 1038 TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),
1036 1039
1037 TP_ARGS(inode, used_blocks), 1040 TP_ARGS(inode, used_blocks, quota_claim),
1038 1041
1039 TP_STRUCT__entry( 1042 TP_STRUCT__entry(
1040 __field( dev_t, dev ) 1043 __field( dev_t, dev )
@@ -1045,6 +1048,7 @@ TRACE_EVENT(ext4_da_update_reserve_space,
1045 __field( int, reserved_data_blocks ) 1048 __field( int, reserved_data_blocks )
1046 __field( int, reserved_meta_blocks ) 1049 __field( int, reserved_meta_blocks )
1047 __field( int, allocated_meta_blocks ) 1050 __field( int, allocated_meta_blocks )
1051 __field( int, quota_claim )
1048 ), 1052 ),
1049 1053
1050 TP_fast_assign( 1054 TP_fast_assign(
@@ -1053,19 +1057,24 @@ TRACE_EVENT(ext4_da_update_reserve_space,
1053 __entry->mode = inode->i_mode; 1057 __entry->mode = inode->i_mode;
1054 __entry->i_blocks = inode->i_blocks; 1058 __entry->i_blocks = inode->i_blocks;
1055 __entry->used_blocks = used_blocks; 1059 __entry->used_blocks = used_blocks;
1056 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1060 __entry->reserved_data_blocks =
1057 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1061 EXT4_I(inode)->i_reserved_data_blocks;
1058 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1062 __entry->reserved_meta_blocks =
1063 EXT4_I(inode)->i_reserved_meta_blocks;
1064 __entry->allocated_meta_blocks =
1065 EXT4_I(inode)->i_allocated_meta_blocks;
1066 __entry->quota_claim = quota_claim;
1059 ), 1067 ),
1060 1068
1061 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " 1069 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
1062 "reserved_data_blocks %d reserved_meta_blocks %d " 1070 "reserved_data_blocks %d reserved_meta_blocks %d "
1063 "allocated_meta_blocks %d", 1071 "allocated_meta_blocks %d quota_claim %d",
1064 MAJOR(__entry->dev), MINOR(__entry->dev), 1072 MAJOR(__entry->dev), MINOR(__entry->dev),
1065 (unsigned long) __entry->ino, 1073 (unsigned long) __entry->ino,
1066 __entry->mode, __entry->i_blocks, 1074 __entry->mode, __entry->i_blocks,
1067 __entry->used_blocks, __entry->reserved_data_blocks, 1075 __entry->used_blocks, __entry->reserved_data_blocks,
1068 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) 1076 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks,
1077 __entry->quota_claim)
1069); 1078);
1070 1079
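With the extra quota_claim field, the generated tracepoint now takes a third argument, per the TP_PROTO/TP_ARGS above. A hedged sketch of the updated call site in ext4_da_update_reserve_space() (local variable names assumed):

	trace_ext4_da_update_reserve_space(inode, used, quota_claim);

quota_claim records whether this call also claims the previously reserved quota for the blocks, rather than merely releasing the reservation.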
1071TRACE_EVENT(ext4_da_reserve_space, 1080TRACE_EVENT(ext4_da_reserve_space,
@@ -1386,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
1386 TP_ARGS(inode) 1395 TP_ARGS(inode)
1387); 1396);
1388 1397
1398/* 'ux' is the uninitialized extent. */
1399TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
1400 TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
1401 struct ext4_extent *ux),
1402
1403 TP_ARGS(inode, map, ux),
1404
1405 TP_STRUCT__entry(
1406 __field( ino_t, ino )
1407 __field( dev_t, dev )
1408 __field( ext4_lblk_t, m_lblk )
1409 __field( unsigned, m_len )
1410 __field( ext4_lblk_t, u_lblk )
1411 __field( unsigned, u_len )
1412 __field( ext4_fsblk_t, u_pblk )
1413 ),
1414
1415 TP_fast_assign(
1416 __entry->ino = inode->i_ino;
1417 __entry->dev = inode->i_sb->s_dev;
1418 __entry->m_lblk = map->m_lblk;
1419 __entry->m_len = map->m_len;
1420 __entry->u_lblk = le32_to_cpu(ux->ee_block);
1421 __entry->u_len = ext4_ext_get_actual_len(ux);
1422 __entry->u_pblk = ext4_ext_pblock(ux);
1423 ),
1424
1425 TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
1426 "u_pblk %llu",
1427 MAJOR(__entry->dev), MINOR(__entry->dev),
1428 (unsigned long) __entry->ino,
1429 __entry->m_lblk, __entry->m_len,
1430 __entry->u_lblk, __entry->u_len, __entry->u_pblk)
1431);
1432
1433/*
1434 * 'ux' is the uninitialized extent.
1435 * 'ix' is the initialized extent to which blocks are transferred.
1436 */
1437TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
1438 TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
1439 struct ext4_extent *ux, struct ext4_extent *ix),
1440
1441 TP_ARGS(inode, map, ux, ix),
1442
1443 TP_STRUCT__entry(
1444 __field( ino_t, ino )
1445 __field( dev_t, dev )
1446 __field( ext4_lblk_t, m_lblk )
1447 __field( unsigned, m_len )
1448 __field( ext4_lblk_t, u_lblk )
1449 __field( unsigned, u_len )
1450 __field( ext4_fsblk_t, u_pblk )
1451 __field( ext4_lblk_t, i_lblk )
1452 __field( unsigned, i_len )
1453 __field( ext4_fsblk_t, i_pblk )
1454 ),
1455
1456 TP_fast_assign(
1457 __entry->ino = inode->i_ino;
1458 __entry->dev = inode->i_sb->s_dev;
1459 __entry->m_lblk = map->m_lblk;
1460 __entry->m_len = map->m_len;
1461 __entry->u_lblk = le32_to_cpu(ux->ee_block);
1462 __entry->u_len = ext4_ext_get_actual_len(ux);
1463 __entry->u_pblk = ext4_ext_pblock(ux);
1464 __entry->i_lblk = le32_to_cpu(ix->ee_block);
1465 __entry->i_len = ext4_ext_get_actual_len(ix);
1466 __entry->i_pblk = ext4_ext_pblock(ix);
1467 ),
1468
1469 TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
1470 "u_lblk %u u_len %u u_pblk %llu "
1471 "i_lblk %u i_len %u i_pblk %llu ",
1472 MAJOR(__entry->dev), MINOR(__entry->dev),
1473 (unsigned long) __entry->ino,
1474 __entry->m_lblk, __entry->m_len,
1475 __entry->u_lblk, __entry->u_len, __entry->u_pblk,
1476 __entry->i_lblk, __entry->i_len, __entry->i_pblk)
1477);
1478
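A hedged sketch of how the two events above would be emitted from ext4_ext_convert_to_initialized(); kernel context, and the fast-path extent variable name is assumed rather than taken from this patch:

	/* On entry, ex is the uninitialized extent covering the map. */
	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);

	/* Fast path: the mapped blocks are transferred to the
	 * neighbouring initialized extent instead of splitting ex. */
	trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex,
						       prev_ex);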
1389DECLARE_EVENT_CLASS(ext4__map_blocks_enter, 1479DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
1390 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1480 TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
1391 unsigned int len, unsigned int flags), 1481 unsigned int len, unsigned int flags),
@@ -1589,6 +1679,382 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
1589 TP_ARGS(sb, group, start, len) 1679 TP_ARGS(sb, group, start, len)
1590); 1680);
1591 1681
1682TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
1683 TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
1684 unsigned int allocated, ext4_fsblk_t newblock),
1685
1686 TP_ARGS(inode, map, allocated, newblock),
1687
1688 TP_STRUCT__entry(
1689 __field( ino_t, ino )
1690 __field( dev_t, dev )
1691 __field( ext4_lblk_t, lblk )
1692 __field( ext4_fsblk_t, pblk )
1693 __field( unsigned int, len )
1694 __field( int, flags )
1695 __field( unsigned int, allocated )
1696 __field( ext4_fsblk_t, newblk )
1697 ),
1698
1699 TP_fast_assign(
1700 __entry->ino = inode->i_ino;
1701 __entry->dev = inode->i_sb->s_dev;
1702 __entry->lblk = map->m_lblk;
1703 __entry->pblk = map->m_pblk;
1704 __entry->len = map->m_len;
1705 __entry->flags = map->m_flags;
1706 __entry->allocated = allocated;
1707 __entry->newblk = newblock;
1708 ),
1709
1710 TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d"
1711 "allocated %d newblock %llu",
1712 MAJOR(__entry->dev), MINOR(__entry->dev),
1713 (unsigned long) __entry->ino,
1714 (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
1715 __entry->len, __entry->flags,
1716 (unsigned int) __entry->allocated,
1717 (unsigned long long) __entry->newblk)
1718);
1719
1720TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
1721 TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),
1722
1723 TP_ARGS(sb, map, ret),
1724
1725 TP_STRUCT__entry(
1726 __field( dev_t, dev )
1727 __field( ext4_lblk_t, lblk )
1728 __field( ext4_fsblk_t, pblk )
1729 __field( unsigned int, len )
1730 __field( unsigned int, flags )
1731 __field( int, ret )
1732 ),
1733
1734 TP_fast_assign(
1735 __entry->dev = sb->s_dev;
1736 __entry->lblk = map->m_lblk;
1737 __entry->pblk = map->m_pblk;
1738 __entry->len = map->m_len;
1739 __entry->flags = map->m_flags;
1740 __entry->ret = ret;
1741 ),
1742
1743 TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
1744 MAJOR(__entry->dev), MINOR(__entry->dev),
1745 __entry->lblk, (unsigned long long) __entry->pblk,
1746 __entry->len, __entry->flags, __entry->ret)
1747);
1748
1749TRACE_EVENT(ext4_ext_put_in_cache,
1750 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
1751 ext4_fsblk_t start),
1752
1753 TP_ARGS(inode, lblk, len, start),
1754
1755 TP_STRUCT__entry(
1756 __field( ino_t, ino )
1757 __field( dev_t, dev )
1758 __field( ext4_lblk_t, lblk )
1759 __field( unsigned int, len )
1760 __field( ext4_fsblk_t, start )
1761 ),
1762
1763 TP_fast_assign(
1764 __entry->ino = inode->i_ino;
1765 __entry->dev = inode->i_sb->s_dev;
1766 __entry->lblk = lblk;
1767 __entry->len = len;
1768 __entry->start = start;
1769 ),
1770
1771 TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu",
1772 MAJOR(__entry->dev), MINOR(__entry->dev),
1773 (unsigned long) __entry->ino,
1774 (unsigned) __entry->lblk,
1775 __entry->len,
1776 (unsigned long long) __entry->start)
1777);
1778
1779TRACE_EVENT(ext4_ext_in_cache,
1780 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret),
1781
1782 TP_ARGS(inode, lblk, ret),
1783
1784 TP_STRUCT__entry(
1785 __field( ino_t, ino )
1786 __field( dev_t, dev )
1787 __field( ext4_lblk_t, lblk )
1788 __field( int, ret )
1789 ),
1790
1791 TP_fast_assign(
1792 __entry->ino = inode->i_ino;
1793 __entry->dev = inode->i_sb->s_dev;
1794 __entry->lblk = lblk;
1795 __entry->ret = ret;
1796 ),
1797
1798 TP_printk("dev %d,%d ino %lu lblk %u ret %d",
1799 MAJOR(__entry->dev), MINOR(__entry->dev),
1800 (unsigned long) __entry->ino,
1801 (unsigned) __entry->lblk,
1802 __entry->ret)
1803
1804);
1805
1806TRACE_EVENT(ext4_find_delalloc_range,
1807 TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to,
1808 int reverse, int found, ext4_lblk_t found_blk),
1809
1810 TP_ARGS(inode, from, to, reverse, found, found_blk),
1811
1812 TP_STRUCT__entry(
1813 __field( ino_t, ino )
1814 __field( dev_t, dev )
1815 __field( ext4_lblk_t, from )
1816 __field( ext4_lblk_t, to )
1817 __field( int, reverse )
1818 __field( int, found )
1819 __field( ext4_lblk_t, found_blk )
1820 ),
1821
1822 TP_fast_assign(
1823 __entry->ino = inode->i_ino;
1824 __entry->dev = inode->i_sb->s_dev;
1825 __entry->from = from;
1826 __entry->to = to;
1827 __entry->reverse = reverse;
1828 __entry->found = found;
1829 __entry->found_blk = found_blk;
1830 ),
1831
1832 TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d "
1833 "(blk = %u)",
1834 MAJOR(__entry->dev), MINOR(__entry->dev),
1835 (unsigned long) __entry->ino,
1836 (unsigned) __entry->from, (unsigned) __entry->to,
1837 __entry->reverse, __entry->found,
1838 (unsigned) __entry->found_blk)
1839);
1840
1841TRACE_EVENT(ext4_get_reserved_cluster_alloc,
1842 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len),
1843
1844 TP_ARGS(inode, lblk, len),
1845
1846 TP_STRUCT__entry(
1847 __field( ino_t, ino )
1848 __field( dev_t, dev )
1849 __field( ext4_lblk_t, lblk )
1850 __field( unsigned int, len )
1851 ),
1852
1853 TP_fast_assign(
1854 __entry->ino = inode->i_ino;
1855 __entry->dev = inode->i_sb->s_dev;
1856 __entry->lblk = lblk;
1857 __entry->len = len;
1858 ),
1859
1860 TP_printk("dev %d,%d ino %lu lblk %u len %u",
1861 MAJOR(__entry->dev), MINOR(__entry->dev),
1862 (unsigned long) __entry->ino,
1863 (unsigned) __entry->lblk,
1864 __entry->len)
1865);
1866
1867TRACE_EVENT(ext4_ext_show_extent,
1868 TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
1869 unsigned short len),
1870
1871 TP_ARGS(inode, lblk, pblk, len),
1872
1873 TP_STRUCT__entry(
1874 __field( ino_t, ino )
1875 __field( dev_t, dev )
1876 __field( ext4_lblk_t, lblk )
1877 __field( ext4_fsblk_t, pblk )
1878 __field( unsigned short, len )
1879 ),
1880
1881 TP_fast_assign(
1882 __entry->ino = inode->i_ino;
1883 __entry->dev = inode->i_sb->s_dev;
1884 __entry->lblk = lblk;
1885 __entry->pblk = pblk;
1886 __entry->len = len;
1887 ),
1888
1889 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
1890 MAJOR(__entry->dev), MINOR(__entry->dev),
1891 (unsigned long) __entry->ino,
1892 (unsigned) __entry->lblk,
1893 (unsigned long long) __entry->pblk,
1894 (unsigned short) __entry->len)
1895);
1896
1897TRACE_EVENT(ext4_remove_blocks,
1898 TP_PROTO(struct inode *inode, struct ext4_extent *ex,
1899 ext4_lblk_t from, ext4_fsblk_t to,
1900 ext4_fsblk_t partial_cluster),
1901
1902 TP_ARGS(inode, ex, from, to, partial_cluster),
1903
1904 TP_STRUCT__entry(
1905 __field( ino_t, ino )
1906 __field( dev_t, dev )
1907 __field( ext4_lblk_t, ee_lblk )
1908 __field( ext4_fsblk_t, ee_pblk )
1909 __field( unsigned short, ee_len )
1910 __field( ext4_lblk_t, from )
1911 __field( ext4_lblk_t, to )
1912 __field( ext4_fsblk_t, partial )
1913 ),
1914
1915 TP_fast_assign(
1916 __entry->ino = inode->i_ino;
1917 __entry->dev = inode->i_sb->s_dev;
1918 __entry->ee_lblk = cpu_to_le32(ex->ee_block);
1919 __entry->ee_pblk = ext4_ext_pblock(ex);
1920 __entry->ee_len = ext4_ext_get_actual_len(ex);
1921 __entry->from = from;
1922 __entry->to = to;
1923 __entry->partial = partial_cluster;
1924 ),
1925
1926 TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
1927 "from %u to %u partial_cluster %u",
1928 MAJOR(__entry->dev), MINOR(__entry->dev),
1929 (unsigned long) __entry->ino,
1930 (unsigned) __entry->ee_lblk,
1931 (unsigned long long) __entry->ee_pblk,
1932 (unsigned short) __entry->ee_len,
1933 (unsigned) __entry->from,
1934 (unsigned) __entry->to,
1935 (unsigned) __entry->partial)
1936);
1937
1938TRACE_EVENT(ext4_ext_rm_leaf,
1939 TP_PROTO(struct inode *inode, ext4_lblk_t start,
1940 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
1941
1942 TP_ARGS(inode, start, ex, partial_cluster),
1943
1944 TP_STRUCT__entry(
1945 __field( ino_t, ino )
1946 __field( dev_t, dev )
1947 __field( ext4_lblk_t, start )
1948 __field( ext4_lblk_t, ee_lblk )
1949 __field( ext4_fsblk_t, ee_pblk )
1950 __field( short, ee_len )
1951 __field( ext4_fsblk_t, partial )
1952 ),
1953
1954 TP_fast_assign(
1955 __entry->ino = inode->i_ino;
1956 __entry->dev = inode->i_sb->s_dev;
1957 __entry->start = start;
1958 __entry->ee_lblk = le32_to_cpu(ex->ee_block);
1959 __entry->ee_pblk = ext4_ext_pblock(ex);
1960 __entry->ee_len = ext4_ext_get_actual_len(ex);
1961 __entry->partial = partial_cluster;
1962 ),
1963
1964 TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
1965 "partial_cluster %u",
1966 MAJOR(__entry->dev), MINOR(__entry->dev),
1967 (unsigned long) __entry->ino,
1968 (unsigned) __entry->start,
1969 (unsigned) __entry->ee_lblk,
1970 (unsigned long long) __entry->ee_pblk,
1971 (unsigned short) __entry->ee_len,
1972 (unsigned) __entry->partial)
1973);
1974
1975TRACE_EVENT(ext4_ext_rm_idx,
1976 TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),
1977
1978 TP_ARGS(inode, pblk),
1979
1980 TP_STRUCT__entry(
1981 __field( ino_t, ino )
1982 __field( dev_t, dev )
1983 __field( ext4_fsblk_t, pblk )
1984 ),
1985
1986 TP_fast_assign(
1987 __entry->ino = inode->i_ino;
1988 __entry->dev = inode->i_sb->s_dev;
1989 __entry->pblk = pblk;
1990 ),
1991
1992 TP_printk("dev %d,%d ino %lu index_pblk %llu",
1993 MAJOR(__entry->dev), MINOR(__entry->dev),
1994 (unsigned long) __entry->ino,
1995 (unsigned long long) __entry->pblk)
1996);
1997
1998TRACE_EVENT(ext4_ext_remove_space,
1999 TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
2000
2001 TP_ARGS(inode, start, depth),
2002
2003 TP_STRUCT__entry(
2004 __field( ino_t, ino )
2005 __field( dev_t, dev )
2006 __field( ext4_lblk_t, start )
2007 __field( int, depth )
2008 ),
2009
2010 TP_fast_assign(
2011 __entry->ino = inode->i_ino;
2012 __entry->dev = inode->i_sb->s_dev;
2013 __entry->start = start;
2014 __entry->depth = depth;
2015 ),
2016
2017 TP_printk("dev %d,%d ino %lu since %u depth %d",
2018 MAJOR(__entry->dev), MINOR(__entry->dev),
2019 (unsigned long) __entry->ino,
2020 (unsigned) __entry->start,
2021 __entry->depth)
2022);
2023
2024TRACE_EVENT(ext4_ext_remove_space_done,
2025 TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
2026 ext4_lblk_t partial, unsigned short eh_entries),
2027
2028 TP_ARGS(inode, start, depth, partial, eh_entries),
2029
2030 TP_STRUCT__entry(
2031 __field( ino_t, ino )
2032 __field( dev_t, dev )
2033 __field( ext4_lblk_t, start )
2034 __field( int, depth )
2035 __field( ext4_lblk_t, partial )
2036 __field( unsigned short, eh_entries )
2037 ),
2038
2039 TP_fast_assign(
2040 __entry->ino = inode->i_ino;
2041 __entry->dev = inode->i_sb->s_dev;
2042 __entry->start = start;
2043 __entry->depth = depth;
2044 __entry->partial = partial;
2045 __entry->eh_entries = eh_entries;
2046 ),
2047
2048 TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
2049 "remaining_entries %u",
2050 MAJOR(__entry->dev), MINOR(__entry->dev),
2051 (unsigned long) __entry->ino,
2052 (unsigned) __entry->start,
2053 __entry->depth,
2054 (unsigned) __entry->partial,
2055 (unsigned short) __entry->eh_entries)
2056);
2057
1592#endif /* _TRACE_EXT4_H */ 2058#endif /* _TRACE_EXT4_H */
1593 2059
1594/* This part must be outside protection */ 2060/* This part must be outside protection */