aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ext4.txt85
-rw-r--r--block/Kconfig6
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/ext3/hash.c77
-rw-r--r--fs/ext3/namei.c11
-rw-r--r--fs/ext3/super.c33
-rw-r--r--fs/ext4/balloc.c293
-rw-r--r--fs/ext4/bitmap.c5
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h152
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_i.h16
-rw-r--r--fs/ext4/ext4_jbd2.c83
-rw-r--r--fs/ext4/ext4_jbd2.h87
-rw-r--r--fs/ext4/ext4_sb.h6
-rw-r--r--fs/ext4/extents.c60
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/hash.c77
-rw-r--r--fs/ext4/ialloc.c324
-rw-r--r--fs/ext4/inode.c309
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c629
-rw-r--r--fs/ext4/mballoc.h71
-rw-r--r--fs/ext4/migrate.c19
-rw-r--r--fs/ext4/namei.c92
-rw-r--r--fs/ext4/resize.c113
-rw-r--r--fs/ext4/super.c622
-rw-r--r--fs/ext4/xattr.c25
-rw-r--r--fs/ioprio.c3
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c58
-rw-r--r--fs/jbd2/journal.c124
-rw-r--r--fs/jbd2/transaction.c60
-rw-r--r--fs/super.c2
-rw-r--r--include/linux/ext3_fs.h28
-rw-r--r--include/linux/ext3_fs_sb.h1
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/ioprio.h2
-rw-r--r--include/linux/jbd2.h38
39 files changed, 2271 insertions, 1301 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 174eaff7ded9..cec829bc7291 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be
58 58
59 # mount -t ext4 /dev/hda1 /wherever 59 # mount -t ext4 /dev/hda1 /wherever
60 60
61 - When comparing performance with other filesystems, remember that 61 - When comparing performance with other filesystems, it's always
62 ext3/4 by default offers higher data integrity guarantees than most. 62 important to try multiple workloads; very often a subtle change in a
63 So when comparing with a metadata-only journalling filesystem, such 63 workload parameter can completely change the ranking of which
64 as ext3, use `mount -o data=writeback'. And you might as well use 64 filesystems do well compared to others. When comparing versus ext3,
65 `mount -o nobh' too along with it. Making the journal larger than 65 note that ext4 enables write barriers by default, while ext3 does
66 the mke2fs default often helps performance with metadata-intensive 66 not enable write barriers by default. So it is useful to
67 workloads. 67 explicitly specify whether barriers are enabled or not via the
68 '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
69 for a fair comparison. When tuning ext3 for best benchmark numbers,
70 it is often worthwhile to try changing the data journaling mode; '-o
71 data=writeback,nobh' can be faster for some workloads. (Note
72 however that running mounted with data=writeback can potentially
73 leave stale data exposed in recently written files in case of an
74 unclean shutdown, which could be a security exposure in some
75 situations.) Configuring the filesystem with a large journal can
76 also be helpful for metadata-intensive workloads.
68 77
692. Features 782. Features
70=========== 79===========
@@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be
74* ability to use filesystems > 16TB (e2fsprogs support not available yet) 83* ability to use filesystems > 16TB (e2fsprogs support not available yet)
75* extent format reduces metadata overhead (RAM, IO for access, transactions) 84* extent format reduces metadata overhead (RAM, IO for access, transactions)
76* extent format more robust in face of on-disk corruption due to magics, 85* extent format more robust in face of on-disk corruption due to magics,
77* internal redunancy in tree 86* internal redundancy in tree
78* improved file allocation (multi-block alloc) 87* improved file allocation (multi-block alloc)
79* fix 32000 subdirectory limit 88* fix 32000 subdirectory limit
80* nsec timestamps for mtime, atime, ctime, create time 89* nsec timestamps for mtime, atime, ctime, create time
@@ -116,10 +125,11 @@ grouping of bitmaps and inode tables. Some test results available here:
116When mounting an ext4 filesystem, the following option are accepted: 125When mounting an ext4 filesystem, the following option are accepted:
117(*) == default 126(*) == default
118 127
119extents (*) ext4 will use extents to address file data. The 128ro Mount filesystem read only. Note that ext4 will
120 file system will no longer be mountable by ext3. 129 replay the journal (and thus write to the
121 130 partition) even when mounted "read only". The
122noextents ext4 will not use extents for newly created files 131 mount options "ro,noload" can be used to prevent
132 writes to the filesystem.
123 133
124journal_checksum Enable checksumming of the journal transactions. 134journal_checksum Enable checksumming of the journal transactions.
125 This will allow the recovery code in e2fsck and the 135 This will allow the recovery code in e2fsck and the
@@ -134,17 +144,17 @@ journal_async_commit Commit block can be written to disk without waiting
134journal=update Update the ext4 file system's journal to the current 144journal=update Update the ext4 file system's journal to the current
135 format. 145 format.
136 146
137journal=inum When a journal already exists, this option is ignored.
138 Otherwise, it specifies the number of the inode which
139 will represent the ext4 file system's journal file.
140
141journal_dev=devnum When the external journal device's major/minor numbers 147journal_dev=devnum When the external journal device's major/minor numbers
142 have changed, this option allows the user to specify 148 have changed, this option allows the user to specify
143 the new journal location. The journal device is 149 the new journal location. The journal device is
144 identified through its new major/minor numbers encoded 150 identified through its new major/minor numbers encoded
145 in devnum. 151 in devnum.
146 152
147noload Don't load the journal on mounting. 153noload Don't load the journal on mounting. Note that
154 if the filesystem was not unmounted cleanly,
155 skipping the journal replay will lead to the
156 filesystem containing inconsistencies that can
157 lead to any number of problems.
148 158
149data=journal All data are committed into the journal prior to being 159data=journal All data are committed into the journal prior to being
150 written into the main file system. 160 written into the main file system.
@@ -219,9 +229,12 @@ minixdf Make 'df' act like Minix.
219 229
220debug Extra debugging information is sent to syslog. 230debug Extra debugging information is sent to syslog.
221 231
222errors=remount-ro(*) Remount the filesystem read-only on an error. 232errors=remount-ro Remount the filesystem read-only on an error.
223errors=continue Keep going on a filesystem error. 233errors=continue Keep going on a filesystem error.
224errors=panic Panic and halt the machine if an error occurs. 234errors=panic Panic and halt the machine if an error occurs.
235 (These mount options override the errors behavior
236 specified in the superblock, which can be configured
237 using tune2fs)
225 238
226data_err=ignore(*) Just print an error message if an error occurs 239data_err=ignore(*) Just print an error message if an error occurs
227 in a file data buffer in ordered mode. 240 in a file data buffer in ordered mode.
@@ -261,6 +274,42 @@ delalloc (*) Deferring block allocation until write-out time.
261nodelalloc Disable delayed allocation. Blocks are allocation 274nodelalloc Disable delayed allocation. Blocks are allocation
262 when data is copied from user to page cache. 275 when data is copied from user to page cache.
263 276
277max_batch_time=usec Maximum amount of time ext4 should wait for
278 additional filesystem operations to be batched
279 together with a synchronous write operation.
280 Since a synchronous write operation is going to
281 force a commit and then wait for the I/O to
282 complete, it doesn't cost much, and can be a
283 huge throughput win, we wait for a small amount
284 of time to see if any other transactions can
285 piggyback on the synchronous write. The
286 algorithm used is designed to automatically tune
287 for the speed of the disk, by measuring the
288 amount of time (on average) that it takes to
289 finish committing a transaction. Call this time
290 the "commit time". If the time that the
291 transaction has been running is less than the
292 commit time, ext4 will try sleeping for the
293 commit time to see if other operations will join
294 the transaction. The commit time is capped by
295 the max_batch_time, which defaults to 15000us
296 (15ms). This optimization can be turned off
297 entirely by setting max_batch_time to 0.
298
299min_batch_time=usec This parameter sets the commit time (as
300 described above) to be at least min_batch_time.
301 It defaults to zero microseconds. Increasing
302 this parameter may improve the throughput of
303 multi-threaded, synchronous workloads on very
304 fast disks, at the cost of increasing latency.
305
306journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
307 highest priority) which should be used for I/O
308 operations submitted by kjournald2 during a
309 commit operation. This defaults to 3, which is
310 a slightly higher priority than the default I/O
311 priority.
312
264Data Mode 313Data Mode
265========= 314=========
266There are 3 different data modes: 315There are 3 different data modes:
diff --git a/block/Kconfig b/block/Kconfig
index ac0956f77785..0cbb3b88b59a 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -36,6 +36,12 @@ config LBD
36 This option also enables support for single files larger than 36 This option also enables support for single files larger than
37 2TB. 37 2TB.
38 38
39 The ext4 filesystem requires that this feature be enabled in
40 order to support filesystems that have the huge_file feature
41 enabled. Otherwise, it will refuse to mount any filesystems
42 that use the huge_file feature, which is enabled by default
43 by mke2fs.ext4. The GFS2 filesystem also requires this feature.
44
39 If unsure, say N. 45 If unsure, say N.
40 46
41config BLK_DEV_IO_TRACE 47config BLK_DEV_IO_TRACE
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ebbfdf708c2..ac7031f12ea5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1234,6 +1234,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1234 return blkdev_ioctl(bdev, mode, cmd, arg); 1234 return blkdev_ioctl(bdev, mode, cmd, arg);
1235} 1235}
1236 1236
1237/*
1238 * Try to release a page associated with block device when the system
1239 * is under memory pressure.
1240 */
1241static int blkdev_releasepage(struct page *page, gfp_t wait)
1242{
1243 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1244
1245 if (super && super->s_op->bdev_try_to_free_page)
1246 return super->s_op->bdev_try_to_free_page(super, page, wait);
1247
1248 return try_to_free_buffers(page);
1249}
1250
1237static const struct address_space_operations def_blk_aops = { 1251static const struct address_space_operations def_blk_aops = {
1238 .readpage = blkdev_readpage, 1252 .readpage = blkdev_readpage,
1239 .writepage = blkdev_writepage, 1253 .writepage = blkdev_writepage,
@@ -1241,6 +1255,7 @@ static const struct address_space_operations def_blk_aops = {
1241 .write_begin = blkdev_write_begin, 1255 .write_begin = blkdev_write_begin,
1242 .write_end = blkdev_write_end, 1256 .write_end = blkdev_write_end,
1243 .writepages = generic_writepages, 1257 .writepages = generic_writepages,
1258 .releasepage = blkdev_releasepage,
1244 .direct_IO = blkdev_direct_IO, 1259 .direct_IO = blkdev_direct_IO,
1245}; 1260};
1246 1261
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
49} 68}
50 69
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
52{ 71{
53 __u32 pad, val; 72 __u32 pad, val;
54 int i; 73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
96}
97
98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
99{
100 __u32 pad, val;
101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i=0; i < len; i++) { 110 for (i=0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8d6f965e502c..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -364,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
364 goto fail; 364 goto fail;
365 } 365 }
366 hinfo->hash_version = root->info.hash_version; 366 hinfo->hash_version = root->info.hash_version;
367 if (hinfo->hash_version <= DX_HASH_TEA)
368 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
367 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; 369 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
368 if (entry) 370 if (entry)
369 ext3fs_dirhash(entry->name, entry->len, hinfo); 371 ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -632,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
632 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
633 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { 635 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
634 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
637 if (hinfo.hash_version <= DX_HASH_TEA)
638 hinfo.hash_version +=
639 EXT3_SB(dir->i_sb)->s_hash_unsigned;
635 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 640 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
636 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
637 start_hash, start_minor_hash); 642 start_hash, start_minor_hash);
@@ -1152,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1152 u32 hash2; 1157 u32 hash2;
1153 struct dx_map_entry *map; 1158 struct dx_map_entry *map;
1154 char *data1 = (*bh)->b_data, *data2; 1159 char *data1 = (*bh)->b_data, *data2;
1155 unsigned split, move, size, i; 1160 unsigned split, move, size;
1156 struct ext3_dir_entry_2 *de = NULL, *de2; 1161 struct ext3_dir_entry_2 *de = NULL, *de2;
1157 int err = 0; 1162 int err = 0, i;
1158 1163
1159 bh2 = ext3_append (handle, dir, &newblock, &err); 1164 bh2 = ext3_append (handle, dir, &newblock, &err);
1160 if (!(bh2)) { 1165 if (!(bh2)) {
@@ -1394,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1394 1399
1395 /* Initialize as for dx_probe */ 1400 /* Initialize as for dx_probe */
1396 hinfo.hash_version = root->info.hash_version; 1401 hinfo.hash_version = root->info.hash_version;
1402 if (hinfo.hash_version <= DX_HASH_TEA)
1403 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1397 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 1404 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1398 ext3fs_dirhash(name, namelen, &hinfo); 1405 ext3fs_dirhash(name, namelen, &hinfo);
1399 frame = frames; 1406 frame = frames;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 01c235bc2054..5d047a030a73 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -683,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
683 ext3_nfs_get_inode); 683 ext3_nfs_get_inode);
684} 684}
685 685
686/*
687 * Try to release metadata pages (indirect blocks, directories) which are
688 * mapped via the block device. Since these pages could have journal heads
689 * which would prevent try_to_free_buffers() from freeing them, we must use
690 * jbd layer's try_to_free_buffers() function to release them.
691 */
692static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
693 gfp_t wait)
694{
695 journal_t *journal = EXT3_SB(sb)->s_journal;
696
697 WARN_ON(PageChecked(page));
698 if (!page_has_buffers(page))
699 return 0;
700 if (journal)
701 return journal_try_to_free_buffers(journal, page,
702 wait & ~__GFP_WAIT);
703 return try_to_free_buffers(page);
704}
705
686#ifdef CONFIG_QUOTA 706#ifdef CONFIG_QUOTA
687#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 707#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
688#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 708#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -749,6 +769,7 @@ static const struct super_operations ext3_sops = {
749 .quota_read = ext3_quota_read, 769 .quota_read = ext3_quota_read,
750 .quota_write = ext3_quota_write, 770 .quota_write = ext3_quota_write,
751#endif 771#endif
772 .bdev_try_to_free_page = bdev_try_to_free_page,
752}; 773};
753 774
754static const struct export_operations ext3_export_ops = { 775static const struct export_operations ext3_export_ops = {
@@ -1750,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1750 for (i=0; i < 4; i++) 1771 for (i=0; i < 4; i++)
1751 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 1772 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1752 sbi->s_def_hash_version = es->s_def_hash_version; 1773 sbi->s_def_hash_version = es->s_def_hash_version;
1774 i = le32_to_cpu(es->s_flags);
1775 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1776 sbi->s_hash_unsigned = 3;
1777 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1778#ifdef __CHAR_UNSIGNED__
1779 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1780 sbi->s_hash_unsigned = 3;
1781#else
1782 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1783#endif
1784 sb->s_dirt = 1;
1785 }
1753 1786
1754 if (sbi->s_blocks_per_group > blocksize * 8) { 1787 if (sbi->s_blocks_per_group > blocksize * 8) {
1755 printk (KERN_ERR 1788 printk (KERN_ERR
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..6bba06b09dd1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h" 22#include "group.h"
23#include "mballoc.h"
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
100 * essentially implementing a per-group read-only flag. */ 101 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 102 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, __func__, 103 ext4_error(sb, __func__,
103 "Checksum bad for group %lu\n", block_group); 104 "Checksum bad for group %u", block_group);
104 gdp->bg_free_blocks_count = 0; 105 ext4_free_blks_set(sb, gdp, 0);
105 gdp->bg_free_inodes_count = 0; 106 ext4_free_inodes_set(sb, gdp, 0);
106 gdp->bg_itable_unused = 0; 107 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize); 108 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0; 109 return 0;
109 } 110 }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205 ext4_group_t block_group, 206 ext4_group_t block_group,
206 struct buffer_head **bh) 207 struct buffer_head **bh)
207{ 208{
208 unsigned long group_desc; 209 unsigned int group_desc;
209 unsigned long offset; 210 unsigned int offset;
210 struct ext4_group_desc *desc; 211 struct ext4_group_desc *desc;
211 struct ext4_sb_info *sbi = EXT4_SB(sb); 212 struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213
213 if (block_group >= sbi->s_groups_count) { 214 if (block_group >= sbi->s_groups_count) {
214 ext4_error(sb, "ext4_get_group_desc", 215 ext4_error(sb, "ext4_get_group_desc",
215 "block_group >= groups_count - " 216 "block_group >= groups_count - "
216 "block_group = %lu, groups_count = %lu", 217 "block_group = %u, groups_count = %u",
217 block_group, sbi->s_groups_count); 218 block_group, sbi->s_groups_count);
218 219
219 return NULL; 220 return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
225 if (!sbi->s_group_desc[group_desc]) { 226 if (!sbi->s_group_desc[group_desc]) {
226 ext4_error(sb, "ext4_get_group_desc", 227 ext4_error(sb, "ext4_get_group_desc",
227 "Group descriptor not loaded - " 228 "Group descriptor not loaded - "
228 "block_group = %lu, group_desc = %lu, desc = %lu", 229 "block_group = %u, group_desc = %u, desc = %u",
229 block_group, group_desc, offset); 230 block_group, group_desc, offset);
230 return NULL; 231 return NULL;
231 } 232 }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
315 if (unlikely(!bh)) { 316 if (unlikely(!bh)) {
316 ext4_error(sb, __func__, 317 ext4_error(sb, __func__,
317 "Cannot read block bitmap - " 318 "Cannot read block bitmap - "
318 "block_group = %lu, block_bitmap = %llu", 319 "block_group = %u, block_bitmap = %llu",
319 block_group, bitmap_blk); 320 block_group, bitmap_blk);
320 return NULL; 321 return NULL;
321 } 322 }
322 if (buffer_uptodate(bh) && 323
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) 324 if (bitmap_uptodate(bh))
324 return bh; 325 return bh;
325 326
326 lock_buffer(bh); 327 lock_buffer(bh);
328 if (bitmap_uptodate(bh)) {
329 unlock_buffer(bh);
330 return bh;
331 }
327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 332 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 333 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
329 ext4_init_block_bitmap(sb, bh, block_group, desc); 334 ext4_init_block_bitmap(sb, bh, block_group, desc);
335 set_bitmap_uptodate(bh);
330 set_buffer_uptodate(bh); 336 set_buffer_uptodate(bh);
331 unlock_buffer(bh);
332 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 337 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338 unlock_buffer(bh);
333 return bh; 339 return bh;
334 } 340 }
335 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 341 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
342 if (buffer_uptodate(bh)) {
343 /*
344 * if not uninit if bh is uptodate,
345 * bitmap is also uptodate
346 */
347 set_bitmap_uptodate(bh);
348 unlock_buffer(bh);
349 return bh;
350 }
351 /*
352 * submit the buffer_head for read. We can
353 * safely mark the bitmap as uptodate now.
354 * We do it here so the bitmap uptodate bit
355 * get set with buffer lock held.
356 */
357 set_bitmap_uptodate(bh);
336 if (bh_submit_read(bh) < 0) { 358 if (bh_submit_read(bh) < 0) {
337 put_bh(bh); 359 put_bh(bh);
338 ext4_error(sb, __func__, 360 ext4_error(sb, __func__,
339 "Cannot read block bitmap - " 361 "Cannot read block bitmap - "
340 "block_group = %lu, block_bitmap = %llu", 362 "block_group = %u, block_bitmap = %llu",
341 block_group, bitmap_blk); 363 block_group, bitmap_blk);
342 return NULL; 364 return NULL;
343 } 365 }
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
350} 372}
351 373
352/** 374/**
353 * ext4_free_blocks_sb() -- Free given blocks and update quota 375 * ext4_add_groupblocks() -- Add given blocks to an existing group
354 * @handle: handle to this transaction 376 * @handle: handle to this transaction
355 * @sb: super block 377 * @sb: super block
356 * @block: start physcial block to free 378 * @block: start physcial block to add to the block group
357 * @count: number of blocks to free 379 * @count: number of blocks to free
358 * @pdquot_freed_blocks: pointer to quota
359 * 380 *
360 * XXX This function is only used by the on-line resizing code, which 381 * This marks the blocks as free in the bitmap. We ask the
361 * should probably be fixed up to call the mballoc variant. There 382 * mballoc to reload the buddy after this by setting group
362 * this needs to be cleaned up later; in fact, I'm not convinced this 383 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
366 */ 384 */
367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 385void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
368 ext4_fsblk_t block, unsigned long count, 386 ext4_fsblk_t block, unsigned long count)
369 unsigned long *pdquot_freed_blocks)
370{ 387{
371 struct buffer_head *bitmap_bh = NULL; 388 struct buffer_head *bitmap_bh = NULL;
372 struct buffer_head *gd_bh; 389 struct buffer_head *gd_bh;
373 ext4_group_t block_group; 390 ext4_group_t block_group;
374 ext4_grpblk_t bit; 391 ext4_grpblk_t bit;
375 unsigned long i; 392 unsigned int i;
376 unsigned long overflow;
377 struct ext4_group_desc *desc; 393 struct ext4_group_desc *desc;
378 struct ext4_super_block *es; 394 struct ext4_super_block *es;
379 struct ext4_sb_info *sbi; 395 struct ext4_sb_info *sbi;
380 int err = 0, ret; 396 int err = 0, ret, blk_free_count;
381 ext4_grpblk_t group_freed; 397 ext4_grpblk_t blocks_freed;
398 struct ext4_group_info *grp;
382 399
383 *pdquot_freed_blocks = 0;
384 sbi = EXT4_SB(sb); 400 sbi = EXT4_SB(sb);
385 es = sbi->s_es; 401 es = sbi->s_es;
386 if (block < le32_to_cpu(es->s_first_data_block) || 402 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
387 block + count < block ||
388 block + count > ext4_blocks_count(es)) {
389 ext4_error(sb, "ext4_free_blocks",
390 "Freeing blocks not in datazone - "
391 "block = %llu, count = %lu", block, count);
392 goto error_return;
393 }
394
395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
396 403
397do_more:
398 overflow = 0;
399 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 404 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
405 grp = ext4_get_group_info(sb, block_group);
400 /* 406 /*
401 * Check to see if we are freeing blocks across a group 407 * Check to see if we are freeing blocks across a group
402 * boundary. 408 * boundary.
403 */ 409 */
404 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 410 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
405 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 411 goto error_return;
406 count -= overflow;
407 } 412 }
408 brelse(bitmap_bh);
409 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 413 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
410 if (!bitmap_bh) 414 if (!bitmap_bh)
411 goto error_return; 415 goto error_return;
@@ -418,18 +422,17 @@ do_more:
418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 422 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
419 in_range(block + count - 1, ext4_inode_table(sb, desc), 423 in_range(block + count - 1, ext4_inode_table(sb, desc),
420 sbi->s_itb_per_group)) { 424 sbi->s_itb_per_group)) {
421 ext4_error(sb, "ext4_free_blocks", 425 ext4_error(sb, __func__,
422 "Freeing blocks in system zones - " 426 "Adding blocks in system zones - "
423 "Block = %llu, count = %lu", 427 "Block = %llu, count = %lu",
424 block, count); 428 block, count);
425 goto error_return; 429 goto error_return;
426 } 430 }
427 431
428 /* 432 /*
429 * We are about to start releasing blocks in the bitmap, 433 * We are about to add blocks to the bitmap,
430 * so we need undo access. 434 * so we need undo access.
431 */ 435 */
432 /* @@@ check errors */
433 BUFFER_TRACE(bitmap_bh, "getting undo access"); 436 BUFFER_TRACE(bitmap_bh, "getting undo access");
434 err = ext4_journal_get_undo_access(handle, bitmap_bh); 437 err = ext4_journal_get_undo_access(handle, bitmap_bh);
435 if (err) 438 if (err)
@@ -444,107 +447,55 @@ do_more:
444 err = ext4_journal_get_write_access(handle, gd_bh); 447 err = ext4_journal_get_write_access(handle, gd_bh);
445 if (err) 448 if (err)
446 goto error_return; 449 goto error_return;
447 450 /*
448 jbd_lock_bh_state(bitmap_bh); 451 * make sure we don't allow a parallel init on other groups in the
449 452 * same buddy cache
450 for (i = 0, group_freed = 0; i < count; i++) { 453 */
451 /* 454 down_write(&grp->alloc_sem);
452 * An HJ special. This is expensive... 455 for (i = 0, blocks_freed = 0; i < count; i++) {
453 */
454#ifdef CONFIG_JBD2_DEBUG
455 jbd_unlock_bh_state(bitmap_bh);
456 {
457 struct buffer_head *debug_bh;
458 debug_bh = sb_find_get_block(sb, block + i);
459 if (debug_bh) {
460 BUFFER_TRACE(debug_bh, "Deleted!");
461 if (!bh2jh(bitmap_bh)->b_committed_data)
462 BUFFER_TRACE(debug_bh,
463 "No commited data in bitmap");
464 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
465 __brelse(debug_bh);
466 }
467 }
468 jbd_lock_bh_state(bitmap_bh);
469#endif
470 if (need_resched()) {
471 jbd_unlock_bh_state(bitmap_bh);
472 cond_resched();
473 jbd_lock_bh_state(bitmap_bh);
474 }
475 /* @@@ This prevents newly-allocated data from being
476 * freed and then reallocated within the same
477 * transaction.
478 *
479 * Ideally we would want to allow that to happen, but to
480 * do so requires making jbd2_journal_forget() capable of
481 * revoking the queued write of a data block, which
482 * implies blocking on the journal lock. *forget()
483 * cannot block due to truncate races.
484 *
485 * Eventually we can fix this by making jbd2_journal_forget()
486 * return a status indicating whether or not it was able
487 * to revoke the buffer. On successful revoke, it is
488 * safe not to set the allocation bit in the committed
489 * bitmap, because we know that there is no outstanding
490 * activity on the buffer any more and so it is safe to
491 * reallocate it.
492 */
493 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
494 J_ASSERT_BH(bitmap_bh,
495 bh2jh(bitmap_bh)->b_committed_data != NULL);
496 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
497 bh2jh(bitmap_bh)->b_committed_data);
498
499 /*
500 * We clear the bit in the bitmap after setting the committed
501 * data bit, because this is the reverse order to that which
502 * the allocator uses.
503 */
504 BUFFER_TRACE(bitmap_bh, "clear bit"); 456 BUFFER_TRACE(bitmap_bh, "clear bit");
505 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 457 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
506 bit + i, bitmap_bh->b_data)) { 458 bit + i, bitmap_bh->b_data)) {
507 jbd_unlock_bh_state(bitmap_bh);
508 ext4_error(sb, __func__, 459 ext4_error(sb, __func__,
509 "bit already cleared for block %llu", 460 "bit already cleared for block %llu",
510 (ext4_fsblk_t)(block + i)); 461 (ext4_fsblk_t)(block + i));
511 jbd_lock_bh_state(bitmap_bh);
512 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 462 BUFFER_TRACE(bitmap_bh, "bit already cleared");
513 } else { 463 } else {
514 group_freed++; 464 blocks_freed++;
515 } 465 }
516 } 466 }
517 jbd_unlock_bh_state(bitmap_bh);
518
519 spin_lock(sb_bgl_lock(sbi, block_group)); 467 spin_lock(sb_bgl_lock(sbi, block_group));
520 le16_add_cpu(&desc->bg_free_blocks_count, group_freed); 468 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
469 ext4_free_blks_set(sb, desc, blk_free_count);
521 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 470 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
522 spin_unlock(sb_bgl_lock(sbi, block_group)); 471 spin_unlock(sb_bgl_lock(sbi, block_group));
523 percpu_counter_add(&sbi->s_freeblocks_counter, count); 472 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
524 473
525 if (sbi->s_log_groups_per_flex) { 474 if (sbi->s_log_groups_per_flex) {
526 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
527 spin_lock(sb_bgl_lock(sbi, flex_group)); 476 spin_lock(sb_bgl_lock(sbi, flex_group));
528 sbi->s_flex_groups[flex_group].free_blocks += count; 477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
529 spin_unlock(sb_bgl_lock(sbi, flex_group)); 478 spin_unlock(sb_bgl_lock(sbi, flex_group));
530 } 479 }
480 /*
481 * request to reload the buddy with the
482 * new bitmap information
483 */
484 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
485 ext4_mb_update_group_info(grp, blocks_freed);
486 up_write(&grp->alloc_sem);
531 487
532 /* We dirtied the bitmap block */ 488 /* We dirtied the bitmap block */
533 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 489 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
534 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 490 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
535 491
536 /* And the group descriptor block */ 492 /* And the group descriptor block */
537 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 493 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
538 ret = ext4_journal_dirty_metadata(handle, gd_bh); 494 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
539 if (!err) err = ret; 495 if (!err)
540 *pdquot_freed_blocks += group_freed; 496 err = ret;
541
542 if (overflow && !err) {
543 block += count;
544 count = overflow;
545 goto do_more;
546 }
547 sb->s_dirt = 1; 497 sb->s_dirt = 1;
498
548error_return: 499error_return:
549 brelse(bitmap_bh); 500 brelse(bitmap_bh);
550 ext4_std_error(sb, err); 501 ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
614 if (dirty_blocks < 0) { 565 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 566 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 567 "went wrong %lld\n",
617 dirty_blocks); 568 (long long)dirty_blocks);
618 } 569 }
619 } 570 }
620 /* Check whether we have space after 571 /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
666 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 617 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
667} 618}
668 619
669#define EXT4_META_BLOCK 0x1
670
671static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
672 ext4_lblk_t iblock, ext4_fsblk_t goal,
673 unsigned long *count, int *errp, int flags)
674{
675 struct ext4_allocation_request ar;
676 ext4_fsblk_t ret;
677
678 memset(&ar, 0, sizeof(ar));
679 /* Fill with neighbour allocated blocks */
680
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = *count;
684 ar.logical = iblock;
685
686 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
687 /* enable in-core preallocation for data block allocation */
688 ar.flags = EXT4_MB_HINT_DATA;
689 else
690 /* disable in-core preallocation for non-regular files */
691 ar.flags = 0;
692
693 ret = ext4_mb_new_blocks(handle, &ar, errp);
694 *count = ar.len;
695 return ret;
696}
697
698/* 620/*
699 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks 621 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
700 * 622 *
701 * @handle: handle to this transaction 623 * @handle: handle to this transaction
702 * @inode: file inode 624 * @inode: file inode
703 * @goal: given target block(filesystem wide) 625 * @goal: given target block(filesystem wide)
704 * @count: total number of blocks need 626 * @count: pointer to total number of blocks needed
705 * @errp: error code 627 * @errp: error code
706 * 628 *
707 * Return 1st allocated block numberon success, *count stores total account 629 * Return 1st allocated block number on success, *count stores total account
708 * error stores in errp pointer 630 * error stores in errp pointer
709 */ 631 */
710ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 632ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
711 ext4_fsblk_t goal, unsigned long *count, int *errp) 633 ext4_fsblk_t goal, unsigned long *count, int *errp)
712{ 634{
635 struct ext4_allocation_request ar;
713 ext4_fsblk_t ret; 636 ext4_fsblk_t ret;
714 ret = do_blk_alloc(handle, inode, 0, goal, 637
715 count, errp, EXT4_META_BLOCK); 638 memset(&ar, 0, sizeof(ar));
639 /* Fill with neighbour allocated blocks */
640 ar.inode = inode;
641 ar.goal = goal;
642 ar.len = count ? *count : 1;
643
644 ret = ext4_mb_new_blocks(handle, &ar, errp);
645 if (count)
646 *count = ar.len;
647
716 /* 648 /*
717 * Account for the allocated meta blocks 649 * Account for the allocated meta blocks
718 */ 650 */
719 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 651 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
720 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 652 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
721 EXT4_I(inode)->i_allocated_meta_blocks += *count; 653 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 654 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
723 } 655 }
724 return ret; 656 return ret;
725} 657}
726 658
727/*
728 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
729 *
730 * @handle: handle to this transaction
731 * @inode: file inode
732 * @goal: given target block(filesystem wide)
733 * @errp: error code
734 *
735 * Return allocated block number on success
736 */
737ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
738 ext4_fsblk_t goal, int *errp)
739{
740 unsigned long count = 1;
741 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
742}
743
744/*
745 * ext4_new_blocks() -- allocate data blocks
746 *
747 * @handle: handle to this transaction
748 * @inode: file inode
749 * @goal: given target block(filesystem wide)
750 * @count: total number of blocks need
751 * @errp: error code
752 *
753 * Return 1st allocated block numberon success, *count stores total account
754 * error stores in errp pointer
755 */
756
757ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
758 ext4_lblk_t iblock, ext4_fsblk_t goal,
759 unsigned long *count, int *errp)
760{
761 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
762}
763
764/** 659/**
765 * ext4_count_free_blocks() -- count filesystem free blocks 660 * ext4_count_free_blocks() -- count filesystem free blocks
766 * @sb: superblock 661 * @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
776#ifdef EXT4FS_DEBUG 671#ifdef EXT4FS_DEBUG
777 struct ext4_super_block *es; 672 struct ext4_super_block *es;
778 ext4_fsblk_t bitmap_count; 673 ext4_fsblk_t bitmap_count;
779 unsigned long x; 674 unsigned int x;
780 struct buffer_head *bitmap_bh = NULL; 675 struct buffer_head *bitmap_bh = NULL;
781 676
782 es = EXT4_SB(sb)->s_es; 677 es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
796 continue; 691 continue;
797 692
798 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
799 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
800 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
801 bitmap_count += x; 696 bitmap_count += x;
802 } 697 }
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
812 gdp = ext4_get_group_desc(sb, i, NULL); 707 gdp = ext4_get_group_desc(sb, i, NULL);
813 if (!gdp) 708 if (!gdp)
814 continue; 709 continue;
815 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 710 desc_count += ext4_free_blks_count(sb, gdp);
816 } 711 }
817 712
818 return desc_count; 713 return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) 18unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i, sum = 0;
21 unsigned long sum = 0;
22 21
23 if (!map) 22 if (!map)
24 return 0; 23 return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error(dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%u, inode=%u, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
95 void *dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned int offset;
99 int i, stored; 99 int i, stored;
100 struct ext4_dir_entry_2 *de; 100 struct ext4_dir_entry_2 *de;
101 struct super_block *sb; 101 struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
405 sb = inode->i_sb; 405 sb = inode->i_sb;
406 406
407 if (!fname) { 407 if (!fname) {
408 printk(KERN_ERR "ext4: call_filldir: called with " 408 printk(KERN_ERR "EXT4-fs: call_filldir: called with "
409 "null fname?!?\n"); 409 "null fname?!?\n");
410 return 0; 410 return 0;
411 } 411 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c46c648430d..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h>
22#include "ext4_i.h" 23#include "ext4_i.h"
23 24
24/* 25/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
94 /* phys. block for ^^^ */ 95 /* phys. block for ^^^ */
95 ext4_fsblk_t pright; 96 ext4_fsblk_t pright;
96 /* how many blocks we want to allocate */ 97 /* how many blocks we want to allocate */
97 unsigned long len; 98 unsigned int len;
98 /* flags. see above EXT4_MB_HINT_* */ 99 /* flags. see above EXT4_MB_HINT_* */
99 unsigned long flags; 100 unsigned int flags;
100}; 101};
101 102
102/* 103/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
156 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ 157 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
157 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ 158 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
158 __le32 bg_inode_table_lo; /* Inodes table block */ 159 __le32 bg_inode_table_lo; /* Inodes table block */
159 __le16 bg_free_blocks_count; /* Free blocks count */ 160 __le16 bg_free_blocks_count_lo;/* Free blocks count */
160 __le16 bg_free_inodes_count; /* Free inodes count */ 161 __le16 bg_free_inodes_count_lo;/* Free inodes count */
161 __le16 bg_used_dirs_count; /* Directories count */ 162 __le16 bg_used_dirs_count_lo; /* Directories count */
162 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 163 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
163 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 164 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
164 __le16 bg_itable_unused; /* Unused inodes count */ 165 __le16 bg_itable_unused_lo; /* Unused inodes count */
165 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 166 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
166 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 167 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
167 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 168 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
169 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ 170 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
170 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 171 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
171 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 172 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
172 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 173 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
173 __u32 bg_reserved2[3]; 174 __u32 bg_reserved2[3];
174}; 175};
175 176
@@ -328,6 +329,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 329 uid_t s_resuid;
329 gid_t s_resgid; 330 gid_t s_resgid;
330 unsigned long s_commit_interval; 331 unsigned long s_commit_interval;
332 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 333#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 334 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 335 char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
534#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
535#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 537#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
536#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 538#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
537#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
726 */ 727 */
727 728
728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) 730 ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) 732 ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) 734 ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
891#define DX_HASH_LEGACY 0 898#define DX_HASH_LEGACY 0
892#define DX_HASH_HALF_MD4 1 899#define DX_HASH_HALF_MD4 1
893#define DX_HASH_TEA 2 900#define DX_HASH_TEA 2
901#define DX_HASH_LEGACY_UNSIGNED 3
902#define DX_HASH_HALF_MD4_UNSIGNED 4
903#define DX_HASH_TEA_UNSIGNED 5
894 904
895#ifdef __KERNEL__ 905#ifdef __KERNEL__
896 906
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
955#define ERR_BAD_DX_DIR -75000 965#define ERR_BAD_DX_DIR -75000
956 966
957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 967void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 968 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
959 969
960extern struct proc_dir_entry *ext4_proc_root; 970extern struct proc_dir_entry *ext4_proc_root;
961 971
@@ -987,6 +997,9 @@ do { \
987# define ATTRIB_NORET __attribute__((noreturn)) 997# define ATTRIB_NORET __attribute__((noreturn))
988# define NORET_AND noreturn, 998# define NORET_AND noreturn,
989 999
1000/* bitmap.c */
1001extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1002
990/* balloc.c */ 1003/* balloc.c */
991extern unsigned int ext4_block_group(struct super_block *sb, 1004extern unsigned int ext4_block_group(struct super_block *sb,
992 ext4_fsblk_t blocknr); 1005 ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
995extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 1008extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
996extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1009extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
997 ext4_group_t group); 1010 ext4_group_t group);
998extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
999 ext4_fsblk_t goal, int *errp);
1000extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1011extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1001 ext4_fsblk_t goal, unsigned long *count, int *errp); 1012 ext4_fsblk_t goal, unsigned long *count, int *errp);
1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
1004 unsigned long *count, int *errp);
1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1013extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1006extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1014extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1015extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1008 ext4_fsblk_t block, unsigned long count, int metadata); 1016 ext4_fsblk_t block, unsigned long count, int metadata);
1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 1017extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1010 ext4_fsblk_t block, unsigned long count, 1018 ext4_fsblk_t block, unsigned long count);
1011 unsigned long *pdquot_freed_blocks);
1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1019extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1013extern void ext4_check_blocks_bitmap(struct super_block *); 1020extern void ext4_check_blocks_bitmap(struct super_block *);
1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1021extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1019/* dir.c */ 1026/* dir.c */
1020extern int ext4_check_dir_entry(const char *, struct inode *, 1027extern int ext4_check_dir_entry(const char *, struct inode *,
1021 struct ext4_dir_entry_2 *, 1028 struct ext4_dir_entry_2 *,
1022 struct buffer_head *, unsigned long); 1029 struct buffer_head *, unsigned int);
1023extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1030extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1024 __u32 minor_hash, 1031 __u32 minor_hash,
1025 struct ext4_dir_entry_2 *dirent); 1032 struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1039extern unsigned long ext4_count_free_inodes(struct super_block *); 1046extern unsigned long ext4_count_free_inodes(struct super_block *);
1040extern unsigned long ext4_count_dirs(struct super_block *); 1047extern unsigned long ext4_count_dirs(struct super_block *);
1041extern void ext4_check_inodes_bitmap(struct super_block *); 1048extern void ext4_check_inodes_bitmap(struct super_block *);
1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1043 1049
1044/* mballoc.c */ 1050/* mballoc.c */
1045extern long ext4_mb_stats; 1051extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
1054extern void exit_ext4_mballoc(void); 1060extern void exit_ext4_mballoc(void);
1055extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1061extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1056 unsigned long, unsigned long, int, unsigned long *); 1062 unsigned long, unsigned long, int, unsigned long *);
1057extern int ext4_mb_add_more_groupinfo(struct super_block *sb, 1063extern int ext4_mb_add_groupinfo(struct super_block *sb,
1058 ext4_group_t i, struct ext4_group_desc *desc); 1064 ext4_group_t i, struct ext4_group_desc *desc);
1059extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1065extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1060 ext4_grpblk_t add); 1066 ext4_grpblk_t add);
1061 1067extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1062 1068extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1069 ext4_group_t, int);
1063/* inode.c */ 1070/* inode.c */
1064int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1071int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1065 struct buffer_head *bh, ext4_fsblk_t blocknr); 1072 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
1069 ext4_lblk_t, int, int *); 1076 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock, 1077int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create); 1078 struct buffer_head *bh_result, int create);
1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1073 ext4_lblk_t iblock, unsigned long maxblocks,
1074 struct buffer_head *bh_result,
1075 int create, int extend_disksize);
1076 1079
1077extern struct inode *ext4_iget(struct super_block *, unsigned long); 1080extern struct inode *ext4_iget(struct super_block *, unsigned long);
1078extern int ext4_write_inode(struct inode *, int); 1081extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1127extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1128 __attribute__ ((format (printf, 3, 4)));
1129extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1130 const char *, const char *, ...)
1131 __attribute__ ((format (printf, 4, 5)));
1126extern void ext4_update_dynamic_rev(struct super_block *sb); 1132extern void ext4_update_dynamic_rev(struct super_block *sb);
1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1133extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1128 __u32 compat); 1134 __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1136 struct ext4_group_desc *bg); 1142 struct ext4_group_desc *bg);
1137extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1143extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1138 struct ext4_group_desc *bg); 1144 struct ext4_group_desc *bg);
1145extern __u32 ext4_free_blks_count(struct super_block *sb,
1146 struct ext4_group_desc *bg);
1147extern __u32 ext4_free_inodes_count(struct super_block *sb,
1148 struct ext4_group_desc *bg);
1149extern __u32 ext4_used_dirs_count(struct super_block *sb,
1150 struct ext4_group_desc *bg);
1151extern __u32 ext4_itable_unused_count(struct super_block *sb,
1152 struct ext4_group_desc *bg);
1139extern void ext4_block_bitmap_set(struct super_block *sb, 1153extern void ext4_block_bitmap_set(struct super_block *sb,
1140 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1154 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1141extern void ext4_inode_bitmap_set(struct super_block *sb, 1155extern void ext4_inode_bitmap_set(struct super_block *sb,
1142 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1156 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1143extern void ext4_inode_table_set(struct super_block *sb, 1157extern void ext4_inode_table_set(struct super_block *sb,
1144 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1158 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1159extern void ext4_free_blks_set(struct super_block *sb,
1160 struct ext4_group_desc *bg, __u32 count);
1161extern void ext4_free_inodes_set(struct super_block *sb,
1162 struct ext4_group_desc *bg, __u32 count);
1163extern void ext4_used_dirs_set(struct super_block *sb,
1164 struct ext4_group_desc *bg, __u32 count);
1165extern void ext4_itable_unused_set(struct super_block *sb,
1166 struct ext4_group_desc *bg, __u32 count);
1145 1167
1146static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1168static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1147{ 1169{
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1246 return ; 1268 return ;
1247} 1269}
1248 1270
1271struct ext4_group_info {
1272 unsigned long bb_state;
1273 struct rb_root bb_free_root;
1274 unsigned short bb_first_free;
1275 unsigned short bb_free;
1276 unsigned short bb_fragments;
1277 struct list_head bb_prealloc_list;
1278#ifdef DOUBLE_CHECK
1279 void *bb_bitmap;
1280#endif
1281 struct rw_semaphore alloc_sem;
1282 unsigned short bb_counters[];
1283};
1284
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287
1288#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1292{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296}
1297
1298static inline void ext4_unlock_group(struct super_block *sb,
1299 ext4_group_t group)
1300{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304}
1305
1306static inline int ext4_is_group_locked(struct super_block *sb,
1307 ext4_group_t group)
1308{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313}
1314
1249/* 1315/*
1250 * Inodes and files operations 1316 * Inodes and files operations
1251 */ 1317 */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1271extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1337extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1272 int chunk); 1338 int chunk);
1273extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1274 ext4_lblk_t iblock, 1340 ext4_lblk_t iblock, unsigned int max_blocks,
1275 unsigned long max_blocks, struct buffer_head *bh_result, 1341 struct buffer_head *bh_result,
1276 int create, int extend_disksize); 1342 int create, int extend_disksize);
1277extern void ext4_ext_truncate(struct inode *); 1343extern void ext4_ext_truncate(struct inode *);
1278extern void ext4_ext_init(struct super_block *); 1344extern void ext4_ext_init(struct super_block *);
1279extern void ext4_ext_release(struct super_block *); 1345extern void ext4_ext_release(struct super_block *);
1280extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1281 loff_t len); 1347 loff_t len);
1282extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1283 sector_t block, unsigned long max_blocks, 1349 sector_t block, unsigned int max_blocks,
1284 struct buffer_head *bh, int create, 1350 struct buffer_head *bh, int create,
1285 int extend_disksize, int flag); 1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len);
1354
1355/*
1356 * Add new method to test whether block and inode bitmaps are properly
1357 * initialized. With uninit_bg reading the block from disk is not enough
1358 * to mark the bitmap uptodate. We need to also zero-out the bitmap
1359 */
1360#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
1361
1362static inline int bitmap_uptodate(struct buffer_head *bh)
1363{
1364 return (buffer_uptodate(bh) &&
1365 test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
1366}
1367static inline void set_bitmap_uptodate(struct buffer_head *bh)
1368{
1369 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1370}
1371
1286#endif /* __KERNEL__ */ 1372#endif /* __KERNEL__ */
1287 1373
1288#endif /* _EXT4_H */ 1374#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); 194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
195} 195}
196 196
197static inline void ext4_ext_tree_changed(struct inode *inode)
198{
199 EXT4_I(inode)->i_ext_generation++;
200}
201
202static inline void 197static inline void
203ext4_ext_invalidate_cache(struct inode *inode) 198ext4_ext_invalidate_cache(struct inode *inode)
204{ 199{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
31typedef __u32 ext4_lblk_t; 31typedef __u32 ext4_lblk_t;
32 32
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
100 */ 100 */
101 loff_t i_disksize; 101 loff_t i_disksize;
102 102
103 /* on-disk additional length */
104 __u16 i_extra_isize;
105
106 /* 103 /*
107 * i_data_sem is for serialising ext4_truncate() against 104 * i_data_sem is for serialising ext4_truncate() against
108 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's 105 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
117 struct inode vfs_inode; 114 struct inode vfs_inode;
118 struct jbd2_inode jinode; 115 struct jbd2_inode jinode;
119 116
120 unsigned long i_ext_generation;
121 struct ext4_ext_cache i_cached_extent; 117 struct ext4_ext_cache i_cached_extent;
122 /* 118 /*
123 * File creation time. Its function is same as that of 119 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
130 spinlock_t i_prealloc_lock; 126 spinlock_t i_prealloc_lock;
131 127
132 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
133 unsigned long i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
134 unsigned long i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
135 unsigned long i_allocated_meta_blocks; 131 unsigned int i_allocated_meta_blocks;
136 unsigned short i_delalloc_reserved_flag; 132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock; 137 spinlock_t i_block_reservation_lock;
138}; 138};
139 139
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 7int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 8 struct buffer_head *bh)
9{ 9{
10 int err = jbd2_journal_get_undo_access(handle, bh); 10 int err = 0;
11 if (err) 11
12 ext4_journal_abort_handle(where, __func__, bh, handle, err); 12 if (ext4_handle_valid(handle)) {
13 err = jbd2_journal_get_undo_access(handle, bh);
14 if (err)
15 ext4_journal_abort_handle(where, __func__, bh,
16 handle, err);
17 }
13 return err; 18 return err;
14} 19}
15 20
16int __ext4_journal_get_write_access(const char *where, handle_t *handle, 21int __ext4_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh) 22 struct buffer_head *bh)
18{ 23{
19 int err = jbd2_journal_get_write_access(handle, bh); 24 int err = 0;
20 if (err) 25
21 ext4_journal_abort_handle(where, __func__, bh, handle, err); 26 if (ext4_handle_valid(handle)) {
27 err = jbd2_journal_get_write_access(handle, bh);
28 if (err)
29 ext4_journal_abort_handle(where, __func__, bh,
30 handle, err);
31 }
22 return err; 32 return err;
23} 33}
24 34
25int __ext4_journal_forget(const char *where, handle_t *handle, 35int __ext4_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh) 36 struct buffer_head *bh)
27{ 37{
28 int err = jbd2_journal_forget(handle, bh); 38 int err = 0;
29 if (err) 39
30 ext4_journal_abort_handle(where, __func__, bh, handle, err); 40 if (ext4_handle_valid(handle)) {
41 err = jbd2_journal_forget(handle, bh);
42 if (err)
43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err);
45 }
31 return err; 46 return err;
32} 47}
33 48
34int __ext4_journal_revoke(const char *where, handle_t *handle, 49int __ext4_journal_revoke(const char *where, handle_t *handle,
35 ext4_fsblk_t blocknr, struct buffer_head *bh) 50 ext4_fsblk_t blocknr, struct buffer_head *bh)
36{ 51{
37 int err = jbd2_journal_revoke(handle, blocknr, bh); 52 int err = 0;
38 if (err) 53
39 ext4_journal_abort_handle(where, __func__, bh, handle, err); 54 if (ext4_handle_valid(handle)) {
55 err = jbd2_journal_revoke(handle, blocknr, bh);
56 if (err)
57 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err);
59 }
40 return err; 60 return err;
41} 61}
42 62
43int __ext4_journal_get_create_access(const char *where, 63int __ext4_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh) 64 handle_t *handle, struct buffer_head *bh)
45{ 65{
46 int err = jbd2_journal_get_create_access(handle, bh); 66 int err = 0;
47 if (err) 67
48 ext4_journal_abort_handle(where, __func__, bh, handle, err); 68 if (ext4_handle_valid(handle)) {
69 err = jbd2_journal_get_create_access(handle, bh);
70 if (err)
71 ext4_journal_abort_handle(where, __func__, bh,
72 handle, err);
73 }
49 return err; 74 return err;
50} 75}
51 76
52int __ext4_journal_dirty_metadata(const char *where, 77int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
53 handle_t *handle, struct buffer_head *bh) 78 struct inode *inode, struct buffer_head *bh)
54{ 79{
55 int err = jbd2_journal_dirty_metadata(handle, bh); 80 int err = 0;
56 if (err) 81
57 ext4_journal_abort_handle(where, __func__, bh, handle, err); 82 if (ext4_handle_valid(handle)) {
83 err = jbd2_journal_dirty_metadata(handle, bh);
84 if (err)
85 ext4_journal_abort_handle(where, __func__, bh,
86 handle, err);
87 } else {
88 mark_buffer_dirty(bh);
89 if (inode && inode_needs_sync(inode)) {
90 sync_dirty_buffer(bh);
91 if (buffer_req(bh) && !buffer_uptodate(bh)) {
92 ext4_error(inode->i_sb, __func__,
93 "IO error syncing inode, "
94 "inode=%lu, block=%llu",
95 inode->i_ino,
96 (unsigned long long) bh->b_blocknr);
97 err = -EIO;
98 }
99 }
100 }
58 return err; 101 return err;
59} 102}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U) 36 ? 27U : 8U)
37 37
38/* Extended attribute operations touch at most two data buffers, 38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122 * been done yet. 122 * been done yet.
123 */ 123 */
124 124
125static inline void ext4_journal_release_buffer(handle_t *handle,
126 struct buffer_head *bh)
127{
128 jbd2_journal_release_buffer(handle, bh);
129}
130
131void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
132 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
133 127
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
146int __ext4_journal_get_create_access(const char *where, 140int __ext4_journal_get_create_access(const char *where,
147 handle_t *handle, struct buffer_head *bh); 141 handle_t *handle, struct buffer_head *bh);
148 142
149int __ext4_journal_dirty_metadata(const char *where, 143int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
150 handle_t *handle, struct buffer_head *bh); 144 struct inode *inode, struct buffer_head *bh);
151 145
152#define ext4_journal_get_undo_access(handle, bh) \ 146#define ext4_journal_get_undo_access(handle, bh) \
153 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 147 __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
157 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 151 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
158#define ext4_journal_get_create_access(handle, bh) \ 152#define ext4_journal_get_create_access(handle, bh) \
159 __ext4_journal_get_create_access(__func__, (handle), (bh)) 153 __ext4_journal_get_create_access(__func__, (handle), (bh))
160#define ext4_journal_dirty_metadata(handle, bh) \
161 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
162#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
163 __ext4_journal_forget(__func__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156#define ext4_handle_dirty_metadata(handle, inode, bh) \
157 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
164 158
165handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
166int __ext4_journal_stop(const char *where, handle_t *handle); 160int __ext4_journal_stop(const char *where, handle_t *handle);
167 161
162#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
163
164static inline int ext4_handle_valid(handle_t *handle)
165{
166 if (handle == EXT4_NOJOURNAL_HANDLE)
167 return 0;
168 return 1;
169}
170
171static inline void ext4_handle_sync(handle_t *handle)
172{
173 if (ext4_handle_valid(handle))
174 handle->h_sync = 1;
175}
176
177static inline void ext4_handle_release_buffer(handle_t *handle,
178 struct buffer_head *bh)
179{
180 if (ext4_handle_valid(handle))
181 jbd2_journal_release_buffer(handle, bh);
182}
183
184static inline int ext4_handle_is_aborted(handle_t *handle)
185{
186 if (ext4_handle_valid(handle))
187 return is_handle_aborted(handle);
188 return 0;
189}
190
191static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
192{
193 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
194 return 0;
195 return 1;
196}
197
198static inline void ext4_journal_release_buffer(handle_t *handle,
199 struct buffer_head *bh)
200{
201 if (ext4_handle_valid(handle))
202 jbd2_journal_release_buffer(handle, bh);
203}
204
168static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
169{ 206{
170 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
180 217
181static inline int ext4_journal_extend(handle_t *handle, int nblocks) 218static inline int ext4_journal_extend(handle_t *handle, int nblocks)
182{ 219{
183 return jbd2_journal_extend(handle, nblocks); 220 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks);
222 return 0;
184} 223}
185 224
186static inline int ext4_journal_restart(handle_t *handle, int nblocks) 225static inline int ext4_journal_restart(handle_t *handle, int nblocks)
187{ 226{
188 return jbd2_journal_restart(handle, nblocks); 227 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks);
229 return 0;
189} 230}
190 231
191static inline int ext4_journal_blocks_per_page(struct inode *inode) 232static inline int ext4_journal_blocks_per_page(struct inode *inode)
192{ 233{
193 return jbd2_journal_blocks_per_page(inode); 234 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode);
236 return 0;
194} 237}
195 238
196static inline int ext4_journal_force_commit(journal_t *journal) 239static inline int ext4_journal_force_commit(journal_t *journal)
197{ 240{
198 return jbd2_journal_force_commit(journal); 241 if (journal)
242 return jbd2_journal_force_commit(journal);
243 return 0;
199} 244}
200 245
201static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 246static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
202{ 247{
203 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 248 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
250 return 0;
204} 251}
205 252
206/* super.c */ 253/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
208 255
209static inline int ext4_should_journal_data(struct inode *inode) 256static inline int ext4_should_journal_data(struct inode *inode)
210{ 257{
258 if (EXT4_JOURNAL(inode) == NULL)
259 return 0;
211 if (!S_ISREG(inode->i_mode)) 260 if (!S_ISREG(inode->i_mode))
212 return 1; 261 return 1;
213 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 262 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
219 268
220static inline int ext4_should_order_data(struct inode *inode) 269static inline int ext4_should_order_data(struct inode *inode)
221{ 270{
271 if (EXT4_JOURNAL(inode) == NULL)
272 return 0;
222 if (!S_ISREG(inode->i_mode)) 273 if (!S_ISREG(inode->i_mode))
223 return 0; 274 return 0;
224 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 275 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
230 281
231static inline int ext4_should_writeback_data(struct inode *inode) 282static inline int ext4_should_writeback_data(struct inode *inode)
232{ 283{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
233 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
234 return 0; 287 return 0;
235 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index b21f16713db0..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
73 struct journal_s *s_journal; 74 struct journal_s *s_journal;
74 struct list_head s_orphan; 75 struct list_head s_orphan;
75 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
76 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
77#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
78 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
101 spinlock_t s_reserve_lock; 104 spinlock_t s_reserve_lock;
102 spinlock_t s_md_lock; 105 spinlock_t s_md_lock;
103 tid_t s_last_transaction; 106 tid_t s_last_transaction;
104 unsigned short *s_mb_offsets, *s_mb_maxs; 107 unsigned short *s_mb_offsets;
108 unsigned int *s_mb_maxs;
105 109
106 /* tunables */ 110 /* tunables */
107 unsigned long s_stripe; 111 unsigned long s_stripe;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3f54db31cdc2..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
97{ 97{
98 int err; 98 int err;
99 99
100 if (!ext4_handle_valid(handle))
101 return 0;
100 if (handle->h_buffer_credits > needed) 102 if (handle->h_buffer_credits > needed)
101 return 0; 103 return 0;
102 err = ext4_journal_extend(handle, needed); 104 err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134 int err; 136 int err;
135 if (path->p_bh) { 137 if (path->p_bh) {
136 /* path points to block */ 138 /* path points to block */
137 err = ext4_journal_dirty_metadata(handle, path->p_bh); 139 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
138 } else { 140 } else {
139 /* path points to leaf/index in inode body */ 141 /* path points to leaf/index in inode body */
140 err = ext4_mark_inode_dirty(handle, inode); 142 err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
191 ext4_fsblk_t goal, newblock; 193 ext4_fsblk_t goal, newblock;
192 194
193 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 195 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
194 newblock = ext4_new_meta_block(handle, inode, goal, err); 196 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
195 return newblock; 197 return newblock;
196} 198}
197 199
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
780 set_buffer_uptodate(bh); 782 set_buffer_uptodate(bh);
781 unlock_buffer(bh); 783 unlock_buffer(bh);
782 784
783 err = ext4_journal_dirty_metadata(handle, bh); 785 err = ext4_handle_dirty_metadata(handle, inode, bh);
784 if (err) 786 if (err)
785 goto cleanup; 787 goto cleanup;
786 brelse(bh); 788 brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
859 set_buffer_uptodate(bh); 861 set_buffer_uptodate(bh);
860 unlock_buffer(bh); 862 unlock_buffer(bh);
861 863
862 err = ext4_journal_dirty_metadata(handle, bh); 864 err = ext4_handle_dirty_metadata(handle, inode, bh);
863 if (err) 865 if (err)
864 goto cleanup; 866 goto cleanup;
865 brelse(bh); 867 brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
955 set_buffer_uptodate(bh); 957 set_buffer_uptodate(bh);
956 unlock_buffer(bh); 958 unlock_buffer(bh);
957 959
958 err = ext4_journal_dirty_metadata(handle, bh); 960 err = ext4_handle_dirty_metadata(handle, inode, bh);
959 if (err) 961 if (err)
960 goto out; 962 goto out;
961 963
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1160 while (--depth >= 0) { 1162 while (--depth >= 0) {
1161 ix = path[depth].p_idx; 1163 ix = path[depth].p_idx;
1162 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1164 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1163 break; 1165 goto got_index;
1164 } 1166 }
1165 1167
1166 if (depth < 0) { 1168 /* we've gone up to the root and found no index to the right */
1167 /* we've gone up to the root and 1169 return 0;
1168 * found no index to the right */
1169 return 0;
1170 }
1171 1170
1171got_index:
1172 /* we've found index to the right, let's 1172 /* we've found index to the right, let's
1173 * follow it and find the closest allocated 1173 * follow it and find the closest allocated
1174 * block to the right */ 1174 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1201 *phys = ext_pblock(ex); 1201 *phys = ext_pblock(ex);
1202 put_bh(bh); 1202 put_bh(bh);
1203 return 0; 1203 return 0;
1204
1205} 1204}
1206 1205
1207/* 1206/*
@@ -1622,7 +1621,6 @@ cleanup:
1622 ext4_ext_drop_refs(npath); 1621 ext4_ext_drop_refs(npath);
1623 kfree(npath); 1622 kfree(npath);
1624 } 1623 }
1625 ext4_ext_tree_changed(inode);
1626 ext4_ext_invalidate_cache(inode); 1624 ext4_ext_invalidate_cache(inode);
1627 return err; 1625 return err;
1628} 1626}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2233 } 2231 }
2234 } 2232 }
2235out: 2233out:
2236 ext4_ext_tree_changed(inode);
2237 ext4_ext_drop_refs(path); 2234 ext4_ext_drop_refs(path);
2238 kfree(path); 2235 kfree(path);
2239 ext4_journal_stop(handle); 2236 ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
2250 * possible initialization would be here 2247 * possible initialization would be here
2251 */ 2248 */
2252 2249
2253 if (test_opt(sb, EXTENTS)) { 2250 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2254 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2251 printk(KERN_INFO "EXT4-fs: file extents enabled");
2255#ifdef AGGRESSIVE_TEST 2252#ifdef AGGRESSIVE_TEST
2256 printk(", aggressive tests"); 2253 printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
2275 */ 2272 */
2276void ext4_ext_release(struct super_block *sb) 2273void ext4_ext_release(struct super_block *sb)
2277{ 2274{
2278 if (!test_opt(sb, EXTENTS)) 2275 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2279 return; 2276 return;
2280 2277
2281#ifdef EXTENTS_STATS 2278#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2380 struct inode *inode, 2377 struct inode *inode,
2381 struct ext4_ext_path *path, 2378 struct ext4_ext_path *path,
2382 ext4_lblk_t iblock, 2379 ext4_lblk_t iblock,
2383 unsigned long max_blocks) 2380 unsigned int max_blocks)
2384{ 2381{
2385 struct ext4_extent *ex, newex, orig_ex; 2382 struct ext4_extent *ex, newex, orig_ex;
2386 struct ext4_extent *ex1 = NULL; 2383 struct ext4_extent *ex1 = NULL;
@@ -2678,26 +2675,26 @@ fix_extent_len:
2678 */ 2675 */
2679int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2676int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2680 ext4_lblk_t iblock, 2677 ext4_lblk_t iblock,
2681 unsigned long max_blocks, struct buffer_head *bh_result, 2678 unsigned int max_blocks, struct buffer_head *bh_result,
2682 int create, int extend_disksize) 2679 int create, int extend_disksize)
2683{ 2680{
2684 struct ext4_ext_path *path = NULL; 2681 struct ext4_ext_path *path = NULL;
2685 struct ext4_extent_header *eh; 2682 struct ext4_extent_header *eh;
2686 struct ext4_extent newex, *ex; 2683 struct ext4_extent newex, *ex;
2687 ext4_fsblk_t goal, newblock; 2684 ext4_fsblk_t newblock;
2688 int err = 0, depth, ret; 2685 int err = 0, depth, ret, cache_type;
2689 unsigned long allocated = 0; 2686 unsigned int allocated = 0;
2690 struct ext4_allocation_request ar; 2687 struct ext4_allocation_request ar;
2691 loff_t disksize; 2688 loff_t disksize;
2692 2689
2693 __clear_bit(BH_New, &bh_result->b_state); 2690 __clear_bit(BH_New, &bh_result->b_state);
2694 ext_debug("blocks %u/%lu requested for inode %u\n", 2691 ext_debug("blocks %u/%u requested for inode %u\n",
2695 iblock, max_blocks, inode->i_ino); 2692 iblock, max_blocks, inode->i_ino);
2696 2693
2697 /* check in cache */ 2694 /* check in cache */
2698 goal = ext4_ext_in_cache(inode, iblock, &newex); 2695 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2699 if (goal) { 2696 if (cache_type) {
2700 if (goal == EXT4_EXT_CACHE_GAP) { 2697 if (cache_type == EXT4_EXT_CACHE_GAP) {
2701 if (!create) { 2698 if (!create) {
2702 /* 2699 /*
2703 * block isn't allocated yet and 2700 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2706 goto out2; 2703 goto out2;
2707 } 2704 }
2708 /* we should allocate requested block */ 2705 /* we should allocate requested block */
2709 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2706 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2710 /* block is already allocated */ 2707 /* block is already allocated */
2711 newblock = iblock 2708 newblock = iblock
2712 - le32_to_cpu(newex.ee_block) 2709 - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2854 if (!newblock) 2851 if (!newblock)
2855 goto out2; 2852 goto out2;
2856 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2853 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2857 goal, newblock, allocated); 2854 ar.goal, newblock, allocated);
2858 2855
2859 /* try to insert new extent into found leaf and return */ 2856 /* try to insert new extent into found leaf and return */
2860 ext4_ext_store_pblock(&newex, newblock); 2857 ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
2950 * transaction synchronous. 2947 * transaction synchronous.
2951 */ 2948 */
2952 if (IS_SYNC(inode)) 2949 if (IS_SYNC(inode))
2953 handle->h_sync = 1; 2950 ext4_handle_sync(handle);
2954 2951
2955out_stop: 2952out_stop:
2956 up_write(&EXT4_I(inode)->i_data_sem); 2953 up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3004 handle_t *handle; 3001 handle_t *handle;
3005 ext4_lblk_t block; 3002 ext4_lblk_t block;
3006 loff_t new_size; 3003 loff_t new_size;
3007 unsigned long max_blocks; 3004 unsigned int max_blocks;
3008 int ret = 0; 3005 int ret = 0;
3009 int ret2 = 0; 3006 int ret2 = 0;
3010 int retries = 0; 3007 int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
3083/* 3080/*
3084 * Callback function called for each extent to gather FIEMAP information. 3081 * Callback function called for each extent to gather FIEMAP information.
3085 */ 3082 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3083static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3084 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data) 3085 void *data)
3089{ 3086{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3152/* fiemap flags we can handle specified here */ 3149/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3150#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154 3151
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) 3152static int ext4_xattr_fiemap(struct inode *inode,
3153 struct fiemap_extent_info *fieinfo)
3156{ 3154{
3157 __u64 physical = 0; 3155 __u64 physical = 0;
3158 __u64 length; 3156 __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
146const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
147 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
148 .read = do_sync_read, 145 .read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash(const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
68}
69
70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
71{
72 __u32 pad, val;
73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
49} 96}
50 97
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
52{ 99{
53 __u32 pad, val; 100 __u32 pad, val;
54 int i; 101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i = 0; i < len; i++) { 110 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6e6052879aa2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
74 /* If checksum is bad mark all blocks and inodes use to prevent 74 /* If checksum is bad mark all blocks and inodes use to prevent
75 * allocation, essentially implementing a per-group read-only flag. */ 75 * allocation, essentially implementing a per-group read-only flag. */
76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
77 ext4_error(sb, __func__, "Checksum bad for group %lu\n", 77 ext4_error(sb, __func__, "Checksum bad for group %u",
78 block_group); 78 block_group);
79 gdp->bg_free_blocks_count = 0; 79 ext4_free_blks_set(sb, gdp, 0);
80 gdp->bg_free_inodes_count = 0; 80 ext4_free_inodes_set(sb, gdp, 0);
81 gdp->bg_itable_unused = 0; 81 ext4_itable_unused_set(sb, gdp, 0);
82 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 return 0; 83 return 0;
84 } 84 }
85 85
86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
88 bh->b_data); 88 bh->b_data);
89 89
90 return EXT4_INODES_PER_GROUP(sb); 90 return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 if (unlikely(!bh)) { 111 if (unlikely(!bh)) {
112 ext4_error(sb, __func__, 112 ext4_error(sb, __func__,
113 "Cannot read inode bitmap - " 113 "Cannot read inode bitmap - "
114 "block_group = %lu, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (buffer_uptodate(bh) && 118 if (bitmap_uptodate(bh))
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
120 return bh; 119 return bh;
121 120
122 lock_buffer(bh); 121 lock_buffer(bh);
122 if (bitmap_uptodate(bh)) {
123 unlock_buffer(bh);
124 return bh;
125 }
123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
125 ext4_init_inode_bitmap(sb, bh, block_group, desc); 128 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh);
126 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
127 unlock_buffer(bh);
128 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
132 unlock_buffer(bh);
129 return bh; 133 return bh;
130 } 134 }
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
136 if (buffer_uptodate(bh)) {
137 /*
138 * if not uninit if bh is uptodate,
139 * bitmap is also uptodate
140 */
141 set_bitmap_uptodate(bh);
142 unlock_buffer(bh);
143 return bh;
144 }
145 /*
146 * submit the buffer_head for read. We can
147 * safely mark the bitmap as uptodate now.
148 * We do it here so the bitmap uptodate bit
149 * get set with buffer lock held.
150 */
151 set_bitmap_uptodate(bh);
132 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
133 put_bh(bh); 153 put_bh(bh);
134 ext4_error(sb, __func__, 154 ext4_error(sb, __func__,
135 "Cannot read inode bitmap - " 155 "Cannot read inode bitmap - "
136 "block_group = %lu, inode_bitmap = %llu", 156 "block_group = %u, inode_bitmap = %llu",
137 block_group, bitmap_blk); 157 block_group, bitmap_blk);
138 return NULL; 158 return NULL;
139 } 159 }
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
168 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
169 struct ext4_super_block *es; 189 struct ext4_super_block *es;
170 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
171 int fatal = 0, err; 191 int fatal = 0, err, count;
172 ext4_group_t flex_group; 192 ext4_group_t flex_group;
173 193
174 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
190 210
191 ino = inode->i_ino; 211 ino = inode->i_ino;
192 ext4_debug("freeing inode %lu\n", ino); 212 ext4_debug("freeing inode %lu\n", ino);
213 trace_mark(ext4_free_inode,
214 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
215 sb->s_id, inode->i_ino, inode->i_mode,
216 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
217 (unsigned long long) inode->i_blocks);
193 218
194 /* 219 /*
195 * Note: we must free any quota before locking the superblock, 220 * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
236 261
237 if (gdp) { 262 if (gdp) {
238 spin_lock(sb_bgl_lock(sbi, block_group)); 263 spin_lock(sb_bgl_lock(sbi, block_group));
239 le16_add_cpu(&gdp->bg_free_inodes_count, 1); 264 count = ext4_free_inodes_count(sb, gdp) + 1;
240 if (is_directory) 265 ext4_free_inodes_set(sb, gdp, count);
241 le16_add_cpu(&gdp->bg_used_dirs_count, -1); 266 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count);
269 }
242 gdp->bg_checksum = ext4_group_desc_csum(sbi, 270 gdp->bg_checksum = ext4_group_desc_csum(sbi,
243 block_group, gdp); 271 block_group, gdp);
244 spin_unlock(sb_bgl_lock(sbi, block_group)); 272 spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
253 spin_unlock(sb_bgl_lock(sbi, flex_group)); 281 spin_unlock(sb_bgl_lock(sbi, flex_group));
254 } 282 }
255 } 283 }
256 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
257 err = ext4_journal_dirty_metadata(handle, bh2); 285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
258 if (!fatal) fatal = err; 286 if (!fatal) fatal = err;
259 } 287 }
260 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); 288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
261 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
262 if (!fatal) 290 if (!fatal)
263 fatal = err; 291 fatal = err;
264 sb->s_dirt = 1; 292 sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
291 319
292 for (group = 0; group < ngroups; group++) { 320 for (group = 0; group < ngroups; group++) {
293 desc = ext4_get_group_desc(sb, group, NULL); 321 desc = ext4_get_group_desc(sb, group, NULL);
294 if (!desc || !desc->bg_free_inodes_count) 322 if (!desc || !ext4_free_inodes_count(sb, desc))
295 continue; 323 continue;
296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 324 if (ext4_free_inodes_count(sb, desc) < avefreei)
297 continue; 325 continue;
298 if (!best_desc || 326 if (!best_desc ||
299 (le16_to_cpu(desc->bg_free_blocks_count) > 327 (ext4_free_blks_count(sb, desc) >
300 le16_to_cpu(best_desc->bg_free_blocks_count))) { 328 ext4_free_blks_count(sb, best_desc))) {
301 *best_group = group; 329 *best_group = group;
302 best_desc = desc; 330 best_desc = desc;
303 ret = 0; 331 ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
369 for (i = best_flex * flex_size; i < ngroups && 397 for (i = best_flex * flex_size; i < ngroups &&
370 i < (best_flex + 1) * flex_size; i++) { 398 i < (best_flex + 1) * flex_size; i++) {
371 desc = ext4_get_group_desc(sb, i, &bh); 399 desc = ext4_get_group_desc(sb, i, &bh);
372 if (le16_to_cpu(desc->bg_free_inodes_count)) { 400 if (ext4_free_inodes_count(sb, desc)) {
373 *best_group = i; 401 *best_group = i;
374 goto out; 402 goto out;
375 } 403 }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
443 for (i = 0; i < ngroups; i++) { 471 for (i = 0; i < ngroups; i++) {
444 grp = (parent_group + i) % ngroups; 472 grp = (parent_group + i) % ngroups;
445 desc = ext4_get_group_desc(sb, grp, NULL); 473 desc = ext4_get_group_desc(sb, grp, NULL);
446 if (!desc || !desc->bg_free_inodes_count) 474 if (!desc || !ext4_free_inodes_count(sb, desc))
447 continue; 475 continue;
448 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 476 if (ext4_used_dirs_count(sb, desc) >= best_ndir)
449 continue; 477 continue;
450 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 478 if (ext4_free_inodes_count(sb, desc) < avefreei)
451 continue; 479 continue;
452 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 480 if (ext4_free_blks_count(sb, desc) < avefreeb)
453 continue; 481 continue;
454 *group = grp; 482 *group = grp;
455 ret = 0; 483 ret = 0;
456 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 484 best_ndir = ext4_used_dirs_count(sb, desc);
457 } 485 }
458 if (ret == 0) 486 if (ret == 0)
459 return ret; 487 return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
479 for (i = 0; i < ngroups; i++) { 507 for (i = 0; i < ngroups; i++) {
480 *group = (parent_group + i) % ngroups; 508 *group = (parent_group + i) % ngroups;
481 desc = ext4_get_group_desc(sb, *group, NULL); 509 desc = ext4_get_group_desc(sb, *group, NULL);
482 if (!desc || !desc->bg_free_inodes_count) 510 if (!desc || !ext4_free_inodes_count(sb, desc))
483 continue; 511 continue;
484 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
485 continue; 513 continue;
486 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) 514 if (ext4_free_inodes_count(sb, desc) < min_inodes)
487 continue; 515 continue;
488 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 516 if (ext4_free_blks_count(sb, desc) < min_blocks)
489 continue; 517 continue;
490 return 0; 518 return 0;
491 } 519 }
@@ -494,8 +522,8 @@ fallback:
494 for (i = 0; i < ngroups; i++) { 522 for (i = 0; i < ngroups; i++) {
495 *group = (parent_group + i) % ngroups; 523 *group = (parent_group + i) % ngroups;
496 desc = ext4_get_group_desc(sb, *group, NULL); 524 desc = ext4_get_group_desc(sb, *group, NULL);
497 if (desc && desc->bg_free_inodes_count && 525 if (desc && ext4_free_inodes_count(sb, desc) &&
498 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 526 ext4_free_inodes_count(sb, desc) >= avefreei)
499 return 0; 527 return 0;
500 } 528 }
501 529
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
524 */ 552 */
525 *group = parent_group; 553 *group = parent_group;
526 desc = ext4_get_group_desc(sb, *group, NULL); 554 desc = ext4_get_group_desc(sb, *group, NULL);
527 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 555 if (desc && ext4_free_inodes_count(sb, desc) &&
528 le16_to_cpu(desc->bg_free_blocks_count)) 556 ext4_free_blks_count(sb, desc))
529 return 0; 557 return 0;
530 558
531 /* 559 /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
548 if (*group >= ngroups) 576 if (*group >= ngroups)
549 *group -= ngroups; 577 *group -= ngroups;
550 desc = ext4_get_group_desc(sb, *group, NULL); 578 desc = ext4_get_group_desc(sb, *group, NULL);
551 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 579 if (desc && ext4_free_inodes_count(sb, desc) &&
552 le16_to_cpu(desc->bg_free_blocks_count)) 580 ext4_free_blks_count(sb, desc))
553 return 0; 581 return 0;
554 } 582 }
555 583
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
562 if (++*group >= ngroups) 590 if (++*group >= ngroups)
563 *group = 0; 591 *group = 0;
564 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, *group, NULL);
565 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 593 if (desc && ext4_free_inodes_count(sb, desc))
566 return 0; 594 return 0;
567 } 595 }
568 596
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
570} 598}
571 599
572/* 600/*
601 * claim the inode from the inode bitmap. If the group
602 * is uninit we need to take the groups's sb_bgl_lock
603 * and clear the uninit flag. The inode bitmap update
604 * and group desc uninit flag clear should be done
605 * after holding sb_bgl_lock so that ext4_read_inode_bitmap
606 * doesn't race with the ext4_claim_inode
607 */
608static int ext4_claim_inode(struct super_block *sb,
609 struct buffer_head *inode_bitmap_bh,
610 unsigned long ino, ext4_group_t group, int mode)
611{
612 int free = 0, retval = 0, count;
613 struct ext4_sb_info *sbi = EXT4_SB(sb);
614 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
615
616 spin_lock(sb_bgl_lock(sbi, group));
617 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
618 /* not a free inode */
619 retval = 1;
620 goto err_ret;
621 }
622 ino++;
623 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
624 ino > EXT4_INODES_PER_GROUP(sb)) {
625 spin_unlock(sb_bgl_lock(sbi, group));
626 ext4_error(sb, __func__,
627 "reserved inode or inode > inodes count - "
628 "block_group = %u, inode=%lu", group,
629 ino + group * EXT4_INODES_PER_GROUP(sb));
630 return 1;
631 }
632 /* If we didn't allocate from within the initialized part of the inode
633 * table then we need to initialize up to this inode. */
634 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
635
636 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
637 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
638 /* When marking the block group with
639 * ~EXT4_BG_INODE_UNINIT we don't want to depend
640 * on the value of bg_itable_unused even though
641 * mke2fs could have initialized the same for us.
642 * Instead we calculated the value below
643 */
644
645 free = 0;
646 } else {
647 free = EXT4_INODES_PER_GROUP(sb) -
648 ext4_itable_unused_count(sb, gdp);
649 }
650
651 /*
652 * Check the relative inode number against the last used
653 * relative inode number in this group. if it is greater
654 * we need to update the bg_itable_unused count
655 *
656 */
657 if (ino > free)
658 ext4_itable_unused_set(sb, gdp,
659 (EXT4_INODES_PER_GROUP(sb) - ino));
660 }
661 count = ext4_free_inodes_count(sb, gdp) - 1;
662 ext4_free_inodes_set(sb, gdp, count);
663 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count);
666 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret:
669 spin_unlock(sb_bgl_lock(sbi, group));
670 return retval;
671}
672
673/*
573 * There are two policies for allocating an inode. If the new inode is 674 * There are two policies for allocating an inode. If the new inode is
574 * a directory, then a forward search is made for a block group with both 675 * a directory, then a forward search is made for a block group with both
575 * free space and a low directory-to-inode ratio; if that fails, then of 676 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 683struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
583{ 684{
584 struct super_block *sb; 685 struct super_block *sb;
585 struct buffer_head *bitmap_bh = NULL; 686 struct buffer_head *inode_bitmap_bh = NULL;
586 struct buffer_head *bh2; 687 struct buffer_head *group_desc_bh;
587 ext4_group_t group = 0; 688 ext4_group_t group = 0;
588 unsigned long ino = 0; 689 unsigned long ino = 0;
589 struct inode *inode; 690 struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
602 return ERR_PTR(-EPERM); 703 return ERR_PTR(-EPERM);
603 704
604 sb = dir->i_sb; 705 sb = dir->i_sb;
706 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
707 dir->i_ino, mode);
605 inode = new_inode(sb); 708 inode = new_inode(sb);
606 if (!inode) 709 if (!inode)
607 return ERR_PTR(-ENOMEM); 710 return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
631 for (i = 0; i < sbi->s_groups_count; i++) { 734 for (i = 0; i < sbi->s_groups_count; i++) {
632 err = -EIO; 735 err = -EIO;
633 736
634 gdp = ext4_get_group_desc(sb, group, &bh2); 737 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
635 if (!gdp) 738 if (!gdp)
636 goto fail; 739 goto fail;
637 740
638 brelse(bitmap_bh); 741 brelse(inode_bitmap_bh);
639 bitmap_bh = ext4_read_inode_bitmap(sb, group); 742 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
640 if (!bitmap_bh) 743 if (!inode_bitmap_bh)
641 goto fail; 744 goto fail;
642 745
643 ino = 0; 746 ino = 0;
644 747
645repeat_in_this_group: 748repeat_in_this_group:
646 ino = ext4_find_next_zero_bit((unsigned long *) 749 ino = ext4_find_next_zero_bit((unsigned long *)
647 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); 750 inode_bitmap_bh->b_data,
751 EXT4_INODES_PER_GROUP(sb), ino);
752
648 if (ino < EXT4_INODES_PER_GROUP(sb)) { 753 if (ino < EXT4_INODES_PER_GROUP(sb)) {
649 754
650 BUFFER_TRACE(bitmap_bh, "get_write_access"); 755 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
651 err = ext4_journal_get_write_access(handle, bitmap_bh); 756 err = ext4_journal_get_write_access(handle,
757 inode_bitmap_bh);
652 if (err) 758 if (err)
653 goto fail; 759 goto fail;
654 760
655 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 761 BUFFER_TRACE(group_desc_bh, "get_write_access");
656 ino, bitmap_bh->b_data)) { 762 err = ext4_journal_get_write_access(handle,
763 group_desc_bh);
764 if (err)
765 goto fail;
766 if (!ext4_claim_inode(sb, inode_bitmap_bh,
767 ino, group, mode)) {
657 /* we won it */ 768 /* we won it */
658 BUFFER_TRACE(bitmap_bh, 769 BUFFER_TRACE(inode_bitmap_bh,
659 "call ext4_journal_dirty_metadata"); 770 "call ext4_handle_dirty_metadata");
660 err = ext4_journal_dirty_metadata(handle, 771 err = ext4_handle_dirty_metadata(handle,
661 bitmap_bh); 772 inode,
773 inode_bitmap_bh);
662 if (err) 774 if (err)
663 goto fail; 775 goto fail;
776 /* zero bit is inode number 1*/
777 ino++;
664 goto got; 778 goto got;
665 } 779 }
666 /* we lost it */ 780 /* we lost it */
667 jbd2_journal_release_buffer(handle, bitmap_bh); 781 ext4_handle_release_buffer(handle, inode_bitmap_bh);
782 ext4_handle_release_buffer(handle, group_desc_bh);
668 783
669 if (++ino < EXT4_INODES_PER_GROUP(sb)) 784 if (++ino < EXT4_INODES_PER_GROUP(sb))
670 goto repeat_in_this_group; 785 goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
684 goto out; 799 goto out;
685 800
686got: 801got:
687 ino++;
688 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
689 ino > EXT4_INODES_PER_GROUP(sb)) {
690 ext4_error(sb, __func__,
691 "reserved inode or inode > inodes count - "
692 "block_group = %lu, inode=%lu", group,
693 ino + group * EXT4_INODES_PER_GROUP(sb));
694 err = -EIO;
695 goto fail;
696 }
697
698 BUFFER_TRACE(bh2, "get_write_access");
699 err = ext4_journal_get_write_access(handle, bh2);
700 if (err) goto fail;
701
702 /* We may have to initialize the block bitmap if it isn't already */ 802 /* We may have to initialize the block bitmap if it isn't already */
703 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 803 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
704 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 804 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
705 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); 805 struct buffer_head *block_bitmap_bh;
706 806
707 BUFFER_TRACE(block_bh, "get block bitmap access"); 807 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
708 err = ext4_journal_get_write_access(handle, block_bh); 808 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
809 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
709 if (err) { 810 if (err) {
710 brelse(block_bh); 811 brelse(block_bitmap_bh);
711 goto fail; 812 goto fail;
712 } 813 }
713 814
@@ -715,9 +816,9 @@ got:
715 spin_lock(sb_bgl_lock(sbi, group)); 816 spin_lock(sb_bgl_lock(sbi, group));
716 /* recheck and clear flag under lock if we still need to */ 817 /* recheck and clear flag under lock if we still need to */
717 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 818 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
718 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
719 free = ext4_free_blocks_after_init(sb, group, gdp); 819 free = ext4_free_blocks_after_init(sb, group, gdp);
720 gdp->bg_free_blocks_count = cpu_to_le16(free); 820 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
821 ext4_free_blks_set(sb, gdp, free);
721 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 822 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
722 gdp); 823 gdp);
723 } 824 }
@@ -725,55 +826,19 @@ got:
725 826
726 /* Don't need to dirty bitmap block if we didn't change it */ 827 /* Don't need to dirty bitmap block if we didn't change it */
727 if (free) { 828 if (free) {
728 BUFFER_TRACE(block_bh, "dirty block bitmap"); 829 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
729 err = ext4_journal_dirty_metadata(handle, block_bh); 830 err = ext4_handle_dirty_metadata(handle,
831 NULL, block_bitmap_bh);
730 } 832 }
731 833
732 brelse(block_bh); 834 brelse(block_bitmap_bh);
733 if (err) 835 if (err)
734 goto fail; 836 goto fail;
735 } 837 }
736 838 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
737 spin_lock(sb_bgl_lock(sbi, group)); 839 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
738 /* If we didn't allocate from within the initialized part of the inode 840 if (err)
739 * table then we need to initialize up to this inode. */ 841 goto fail;
740 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
741 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
742 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
743
744 /* When marking the block group with
745 * ~EXT4_BG_INODE_UNINIT we don't want to depend
746 * on the value of bg_itable_unused even though
747 * mke2fs could have initialized the same for us.
748 * Instead we calculated the value below
749 */
750
751 free = 0;
752 } else {
753 free = EXT4_INODES_PER_GROUP(sb) -
754 le16_to_cpu(gdp->bg_itable_unused);
755 }
756
757 /*
758 * Check the relative inode number against the last used
759 * relative inode number in this group. if it is greater
760 * we need to update the bg_itable_unused count
761 *
762 */
763 if (ino > free)
764 gdp->bg_itable_unused =
765 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
766 }
767
768 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
769 if (S_ISDIR(mode)) {
770 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
771 }
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773 spin_unlock(sb_bgl_lock(sbi, group));
774 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
775 err = ext4_journal_dirty_metadata(handle, bh2);
776 if (err) goto fail;
777 842
778 percpu_counter_dec(&sbi->s_freeinodes_counter); 843 percpu_counter_dec(&sbi->s_freeinodes_counter);
779 if (S_ISDIR(mode)) 844 if (S_ISDIR(mode))
@@ -825,7 +890,7 @@ got:
825 890
826 ext4_set_inode_flags(inode); 891 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 892 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 893 ext4_handle_sync(handle);
829 if (insert_inode_locked(inode) < 0) { 894 if (insert_inode_locked(inode) < 0) {
830 err = -EINVAL; 895 err = -EINVAL;
831 goto fail_drop; 896 goto fail_drop;
@@ -852,7 +917,7 @@ got:
852 if (err) 917 if (err)
853 goto fail_free_drop; 918 goto fail_free_drop;
854 919
855 if (test_opt(sb, EXTENTS)) { 920 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
856 /* set extent flag only for directory, file and normal symlink*/ 921 /* set extent flag only for directory, file and normal symlink*/
857 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 922 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
858 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 923 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -867,6 +932,8 @@ got:
867 } 932 }
868 933
869 ext4_debug("allocating inode %lu\n", inode->i_ino); 934 ext4_debug("allocating inode %lu\n", inode->i_ino);
935 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
936 sb->s_id, inode->i_ino, dir->i_ino, mode);
870 goto really_out; 937 goto really_out;
871fail: 938fail:
872 ext4_std_error(sb, err); 939 ext4_std_error(sb, err);
@@ -874,7 +941,7 @@ out:
874 iput(inode); 941 iput(inode);
875 ret = ERR_PTR(err); 942 ret = ERR_PTR(err);
876really_out: 943really_out:
877 brelse(bitmap_bh); 944 brelse(inode_bitmap_bh);
878 return ret; 945 return ret;
879 946
880fail_free_drop: 947fail_free_drop:
@@ -886,7 +953,7 @@ fail_drop:
886 inode->i_nlink = 0; 953 inode->i_nlink = 0;
887 unlock_new_inode(inode); 954 unlock_new_inode(inode);
888 iput(inode); 955 iput(inode);
889 brelse(bitmap_bh); 956 brelse(inode_bitmap_bh);
890 return ERR_PTR(err); 957 return ERR_PTR(err);
891} 958}
892 959
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
985 gdp = ext4_get_group_desc(sb, i, NULL); 1052 gdp = ext4_get_group_desc(sb, i, NULL);
986 if (!gdp) 1053 if (!gdp)
987 continue; 1054 continue;
988 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1055 desc_count += ext4_free_inodes_count(sb, gdp);
989 brelse(bitmap_bh); 1056 brelse(bitmap_bh);
990 bitmap_bh = ext4_read_inode_bitmap(sb, i); 1057 bitmap_bh = ext4_read_inode_bitmap(sb, i);
991 if (!bitmap_bh) 1058 if (!bitmap_bh)
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
993 1060
994 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1061 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
995 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1062 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
996 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 1063 i, ext4_free_inodes_count(sb, gdp), x);
997 bitmap_count += x; 1064 bitmap_count += x;
998 } 1065 }
999 brelse(bitmap_bh); 1066 brelse(bitmap_bh);
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1007 gdp = ext4_get_group_desc(sb, i, NULL); 1074 gdp = ext4_get_group_desc(sb, i, NULL);
1008 if (!gdp) 1075 if (!gdp)
1009 continue; 1076 continue;
1010 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1077 desc_count += ext4_free_inodes_count(sb, gdp);
1011 cond_resched(); 1078 cond_resched();
1012 } 1079 }
1013 return desc_count; 1080 return desc_count;
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1024 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1091 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1025 if (!gdp) 1092 if (!gdp)
1026 continue; 1093 continue;
1027 count += le16_to_cpu(gdp->bg_used_dirs_count); 1094 count += ext4_used_dirs_count(sb, gdp);
1028 } 1095 }
1029 return count; 1096 return count;
1030} 1097}
1031
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 98d3fe7057ef..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
72 * "bh" may be NULL: a metadata block may have been freed from memory 72 * "bh" may be NULL: a metadata block may have been freed from memory
73 * but there may still be a record of it in the journal, and that record 73 * but there may still be a record of it in the journal, and that record
74 * still needs to be revoked. 74 * still needs to be revoked.
75 *
76 * If the handle isn't valid we're not journaling so there's nothing to do.
75 */ 77 */
76int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 78int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
77 struct buffer_head *bh, ext4_fsblk_t blocknr) 79 struct buffer_head *bh, ext4_fsblk_t blocknr)
78{ 80{
79 int err; 81 int err;
80 82
83 if (!ext4_handle_valid(handle))
84 return 0;
85
81 might_sleep(); 86 might_sleep();
82 87
83 BUFFER_TRACE(bh, "enter"); 88 BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
170 */ 175 */
171static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 176static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
172{ 177{
173 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 178 if (!ext4_handle_valid(handle))
179 return 0;
180 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
174 return 0; 181 return 0;
175 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 182 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
176 return 0; 183 return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
184 */ 191 */
185static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 192static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
186{ 193{
194 BUG_ON(EXT4_JOURNAL(inode) == NULL);
187 jbd_debug(2, "restarting handle %p\n", handle); 195 jbd_debug(2, "restarting handle %p\n", handle);
188 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 196 return ext4_journal_restart(handle, blocks_for_truncate(inode));
189} 197}
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
216 } 224 }
217 225
218 if (IS_SYNC(inode)) 226 if (IS_SYNC(inode))
219 handle->h_sync = 1; 227 ext4_handle_sync(handle);
220 inode->i_size = 0; 228 inode->i_size = 0;
221 err = ext4_mark_inode_dirty(handle, inode); 229 err = ext4_mark_inode_dirty(handle, inode);
222 if (err) { 230 if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
233 * enough credits left in the handle to remove the inode from 241 * enough credits left in the handle to remove the inode from
234 * the orphan list and set the dtime field. 242 * the orphan list and set the dtime field.
235 */ 243 */
236 if (handle->h_buffer_credits < 3) { 244 if (!ext4_handle_has_enough_credits(handle, 3)) {
237 err = ext4_journal_extend(handle, 3); 245 err = ext4_journal_extend(handle, 3);
238 if (err > 0) 246 if (err > 0)
239 err = ext4_journal_restart(handle, 3); 247 err = ext4_journal_restart(handle, 3);
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
506 * return the total number of blocks to be allocate, including the 514 * return the total number of blocks to be allocate, including the
507 * direct and indirect blocks. 515 * direct and indirect blocks.
508 */ 516 */
509static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 517static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
510 int blocks_to_boundary) 518 int blocks_to_boundary)
511{ 519{
512 unsigned long count = 0; 520 unsigned int count = 0;
513 521
514 /* 522 /*
515 * Simple case, [t,d]Indirect block(s) has not allocated yet 523 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
547 int indirect_blks, int blks, 555 int indirect_blks, int blks,
548 ext4_fsblk_t new_blocks[4], int *err) 556 ext4_fsblk_t new_blocks[4], int *err)
549{ 557{
558 struct ext4_allocation_request ar;
550 int target, i; 559 int target, i;
551 unsigned long count = 0, blk_allocated = 0; 560 unsigned long count = 0, blk_allocated = 0;
552 int index = 0; 561 int index = 0;
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 if (!target) 604 if (!target)
596 goto allocated; 605 goto allocated;
597 /* Now allocate data blocks */ 606 /* Now allocate data blocks */
598 count = target; 607 memset(&ar, 0, sizeof(ar));
599 /* allocating blocks for data blocks */ 608 ar.inode = inode;
600 current_block = ext4_new_blocks(handle, inode, iblock, 609 ar.goal = goal;
601 goal, &count, err); 610 ar.len = target;
611 ar.logical = iblock;
612 if (S_ISREG(inode->i_mode))
613 /* enable in-core preallocation only for regular files */
614 ar.flags = EXT4_MB_HINT_DATA;
615
616 current_block = ext4_mb_new_blocks(handle, &ar, err);
617
602 if (*err && (target == blks)) { 618 if (*err && (target == blks)) {
603 /* 619 /*
604 * if the allocation failed and we didn't allocate 620 * if the allocation failed and we didn't allocate
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
614 */ 630 */
615 new_blocks[index] = current_block; 631 new_blocks[index] = current_block;
616 } 632 }
617 blk_allocated += count; 633 blk_allocated += ar.len;
618 } 634 }
619allocated: 635allocated:
620 /* total number of blocks allocated for direct blocks */ 636 /* total number of blocks allocated for direct blocks */
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
709 set_buffer_uptodate(bh); 725 set_buffer_uptodate(bh);
710 unlock_buffer(bh); 726 unlock_buffer(bh);
711 727
712 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 728 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
713 err = ext4_journal_dirty_metadata(handle, bh); 729 err = ext4_handle_dirty_metadata(handle, inode, bh);
714 if (err) 730 if (err)
715 goto failed; 731 goto failed;
716 } 732 }
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
792 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 808 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
793 */ 809 */
794 jbd_debug(5, "splicing indirect only\n"); 810 jbd_debug(5, "splicing indirect only\n");
795 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 811 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
796 err = ext4_journal_dirty_metadata(handle, where->bh); 812 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
797 if (err) 813 if (err)
798 goto err_out; 814 goto err_out;
799 } else { 815 } else {
@@ -840,10 +856,10 @@ err_out:
840 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 856 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
841 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 857 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
842 */ 858 */
843int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 859static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
844 ext4_lblk_t iblock, unsigned long maxblocks, 860 ext4_lblk_t iblock, unsigned int maxblocks,
845 struct buffer_head *bh_result, 861 struct buffer_head *bh_result,
846 int create, int extend_disksize) 862 int create, int extend_disksize)
847{ 863{
848 int err = -EIO; 864 int err = -EIO;
849 ext4_lblk_t offsets[4]; 865 ext4_lblk_t offsets[4];
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1045 * It returns the error in case of allocation failure. 1061 * It returns the error in case of allocation failure.
1046 */ 1062 */
1047int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1063int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1048 unsigned long max_blocks, struct buffer_head *bh, 1064 unsigned int max_blocks, struct buffer_head *bh,
1049 int create, int extend_disksize, int flag) 1065 int create, int extend_disksize, int flag)
1050{ 1066{
1051 int retval; 1067 int retval;
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1221 set_buffer_uptodate(bh); 1237 set_buffer_uptodate(bh);
1222 } 1238 }
1223 unlock_buffer(bh); 1239 unlock_buffer(bh);
1224 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1240 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1225 err = ext4_journal_dirty_metadata(handle, bh); 1241 err = ext4_handle_dirty_metadata(handle, inode, bh);
1226 if (!fatal) 1242 if (!fatal)
1227 fatal = err; 1243 fatal = err;
1228 } else { 1244 } else {
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1335 pgoff_t index; 1351 pgoff_t index;
1336 unsigned from, to; 1352 unsigned from, to;
1337 1353
1354 trace_mark(ext4_write_begin,
1355 "dev %s ino %lu pos %llu len %u flags %u",
1356 inode->i_sb->s_id, inode->i_ino,
1357 (unsigned long long) pos, len, flags);
1338 index = pos >> PAGE_CACHE_SHIFT; 1358 index = pos >> PAGE_CACHE_SHIFT;
1339 from = pos & (PAGE_CACHE_SIZE - 1); 1359 from = pos & (PAGE_CACHE_SIZE - 1);
1340 to = from + len; 1360 to = from + len;
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1387 if (!buffer_mapped(bh) || buffer_freed(bh)) 1407 if (!buffer_mapped(bh) || buffer_freed(bh))
1388 return 0; 1408 return 0;
1389 set_buffer_uptodate(bh); 1409 set_buffer_uptodate(bh);
1390 return ext4_journal_dirty_metadata(handle, bh); 1410 return ext4_handle_dirty_metadata(handle, NULL, bh);
1391} 1411}
1392 1412
1393/* 1413/*
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
1406 struct inode *inode = mapping->host; 1426 struct inode *inode = mapping->host;
1407 int ret = 0, ret2; 1427 int ret = 0, ret2;
1408 1428
1429 trace_mark(ext4_ordered_write_end,
1430 "dev %s ino %lu pos %llu len %u copied %u",
1431 inode->i_sb->s_id, inode->i_ino,
1432 (unsigned long long) pos, len, copied);
1409 ret = ext4_jbd2_file_inode(handle, inode); 1433 ret = ext4_jbd2_file_inode(handle, inode);
1410 1434
1411 if (ret == 0) { 1435 if (ret == 0) {
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
1444 int ret = 0, ret2; 1468 int ret = 0, ret2;
1445 loff_t new_i_size; 1469 loff_t new_i_size;
1446 1470
1471 trace_mark(ext4_writeback_write_end,
1472 "dev %s ino %lu pos %llu len %u copied %u",
1473 inode->i_sb->s_id, inode->i_ino,
1474 (unsigned long long) pos, len, copied);
1447 new_i_size = pos + copied; 1475 new_i_size = pos + copied;
1448 if (new_i_size > EXT4_I(inode)->i_disksize) { 1476 if (new_i_size > EXT4_I(inode)->i_disksize) {
1449 ext4_update_i_disksize(inode, new_i_size); 1477 ext4_update_i_disksize(inode, new_i_size);
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
1479 unsigned from, to; 1507 unsigned from, to;
1480 loff_t new_i_size; 1508 loff_t new_i_size;
1481 1509
1510 trace_mark(ext4_journalled_write_end,
1511 "dev %s ino %lu pos %llu len %u copied %u",
1512 inode->i_sb->s_id, inode->i_ino,
1513 (unsigned long long) pos, len, copied);
1482 from = pos & (PAGE_CACHE_SIZE - 1); 1514 from = pos & (PAGE_CACHE_SIZE - 1);
1483 to = from + len; 1515 to = from + len;
1484 1516
@@ -1625,7 +1657,7 @@ struct mpage_da_data {
1625 get_block_t *get_block; 1657 get_block_t *get_block;
1626 struct writeback_control *wbc; 1658 struct writeback_control *wbc;
1627 int io_done; 1659 int io_done;
1628 long pages_written; 1660 int pages_written;
1629 int retval; 1661 int retval;
1630}; 1662};
1631 1663
@@ -1645,35 +1677,39 @@ struct mpage_da_data {
1645 */ 1677 */
1646static int mpage_da_submit_io(struct mpage_da_data *mpd) 1678static int mpage_da_submit_io(struct mpage_da_data *mpd)
1647{ 1679{
1648 struct address_space *mapping = mpd->inode->i_mapping;
1649 int ret = 0, err, nr_pages, i;
1650 unsigned long index, end;
1651 struct pagevec pvec;
1652 long pages_skipped; 1680 long pages_skipped;
1681 struct pagevec pvec;
1682 unsigned long index, end;
1683 int ret = 0, err, nr_pages, i;
1684 struct inode *inode = mpd->inode;
1685 struct address_space *mapping = inode->i_mapping;
1653 1686
1654 BUG_ON(mpd->next_page <= mpd->first_page); 1687 BUG_ON(mpd->next_page <= mpd->first_page);
1655 pagevec_init(&pvec, 0); 1688 /*
1689 * We need to start from the first_page to the next_page - 1
1690 * to make sure we also write the mapped dirty buffer_heads.
1691 * If we look at mpd->lbh.b_blocknr we would only be looking
1692 * at the currently mapped buffer_heads.
1693 */
1656 index = mpd->first_page; 1694 index = mpd->first_page;
1657 end = mpd->next_page - 1; 1695 end = mpd->next_page - 1;
1658 1696
1697 pagevec_init(&pvec, 0);
1659 while (index <= end) { 1698 while (index <= end) {
1660 /* 1699 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1661 * We can use PAGECACHE_TAG_DIRTY lookup here because
1662 * even though we have cleared the dirty flag on the page
1663 * We still keep the page in the radix tree with tag
1664 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1665 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1666 * which is called via the below writepage callback.
1667 */
1668 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1669 PAGECACHE_TAG_DIRTY,
1670 min(end - index,
1671 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1672 if (nr_pages == 0) 1700 if (nr_pages == 0)
1673 break; 1701 break;
1674 for (i = 0; i < nr_pages; i++) { 1702 for (i = 0; i < nr_pages; i++) {
1675 struct page *page = pvec.pages[i]; 1703 struct page *page = pvec.pages[i];
1676 1704
1705 index = page->index;
1706 if (index > end)
1707 break;
1708 index++;
1709
1710 BUG_ON(!PageLocked(page));
1711 BUG_ON(PageWriteback(page));
1712
1677 pages_skipped = mpd->wbc->pages_skipped; 1713 pages_skipped = mpd->wbc->pages_skipped;
1678 err = mapping->a_ops->writepage(page, mpd->wbc); 1714 err = mapping->a_ops->writepage(page, mpd->wbc);
1679 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1715 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
1831 ext4_count_free_blocks(inode->i_sb)); 1867 ext4_count_free_blocks(inode->i_sb));
1832 printk(KERN_EMERG "Free/Dirty block details\n"); 1868 printk(KERN_EMERG "Free/Dirty block details\n");
1833 printk(KERN_EMERG "free_blocks=%lld\n", 1869 printk(KERN_EMERG "free_blocks=%lld\n",
1834 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1870 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
1835 printk(KERN_EMERG "dirty_blocks=%lld\n", 1871 printk(KERN_EMERG "dirty_blocks=%lld\n",
1836 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1872 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1837 printk(KERN_EMERG "Block reservation details\n"); 1873 printk(KERN_EMERG "Block reservation details\n");
1838 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1874 printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
1839 EXT4_I(inode)->i_reserved_data_blocks); 1875 EXT4_I(inode)->i_reserved_data_blocks);
1840 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1876 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
1841 EXT4_I(inode)->i_reserved_meta_blocks); 1877 EXT4_I(inode)->i_reserved_meta_blocks);
1842 return; 1878 return;
1843} 1879}
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
2087 bh = head; 2123 bh = head;
2088 do { 2124 do {
2089 BUG_ON(buffer_locked(bh)); 2125 BUG_ON(buffer_locked(bh));
2126 /*
2127 * We need to try to allocate
2128 * unmapped blocks in the same page.
2129 * Otherwise we won't make progress
2130 * with the page in ext4_da_writepage
2131 */
2090 if (buffer_dirty(bh) && 2132 if (buffer_dirty(bh) &&
2091 (!buffer_mapped(bh) || buffer_delay(bh))) { 2133 (!buffer_mapped(bh) || buffer_delay(bh))) {
2092 mpage_add_bh_to_extent(mpd, logical, bh); 2134 mpage_add_bh_to_extent(mpd, logical, bh);
2093 if (mpd->io_done) 2135 if (mpd->io_done)
2094 return MPAGE_DA_EXTENT_TAIL; 2136 return MPAGE_DA_EXTENT_TAIL;
2137 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2138 /*
2139 * mapped dirty buffer. We need to update
2140 * the b_state because we look at
2141 * b_state in mpage_da_map_blocks. We don't
2142 * update b_size because if we find an
2143 * unmapped buffer_head later we need to
2144 * use the b_state flag of that buffer_head.
2145 */
2146 if (mpd->lbh.b_size == 0)
2147 mpd->lbh.b_state =
2148 bh->b_state & BH_FLAGS;
2095 } 2149 }
2096 logical++; 2150 logical++;
2097 } while ((bh = bh->b_this_page) != head); 2151 } while ((bh = bh->b_this_page) != head);
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
2269{ 2323{
2270 int ret = 0; 2324 int ret = 0;
2271 loff_t size; 2325 loff_t size;
2272 unsigned long len; 2326 unsigned int len;
2273 struct buffer_head *page_bufs; 2327 struct buffer_head *page_bufs;
2274 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
2275 2329
2330 trace_mark(ext4_da_writepage,
2331 "dev %s ino %lu page_index %lu",
2332 inode->i_sb->s_id, inode->i_ino, page->index);
2276 size = i_size_read(inode); 2333 size = i_size_read(inode);
2277 if (page->index == size >> PAGE_CACHE_SHIFT) 2334 if (page->index == size >> PAGE_CACHE_SHIFT)
2278 len = size & ~PAGE_CACHE_MASK; 2335 len = size & ~PAGE_CACHE_MASK;
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
2378 struct mpage_da_data mpd; 2435 struct mpage_da_data mpd;
2379 struct inode *inode = mapping->host; 2436 struct inode *inode = mapping->host;
2380 int no_nrwrite_index_update; 2437 int no_nrwrite_index_update;
2381 long pages_written = 0, pages_skipped; 2438 int pages_written = 0;
2439 long pages_skipped;
2382 int needed_blocks, ret = 0, nr_to_writebump = 0; 2440 int needed_blocks, ret = 0, nr_to_writebump = 0;
2383 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2384 2442
2443 trace_mark(ext4_da_writepages,
2444 "dev %s ino %lu nr_t_write %ld "
2445 "pages_skipped %ld range_start %llu "
2446 "range_end %llu nonblocking %d "
2447 "for_kupdate %d for_reclaim %d "
2448 "for_writepages %d range_cyclic %d",
2449 inode->i_sb->s_id, inode->i_ino,
2450 wbc->nr_to_write, wbc->pages_skipped,
2451 (unsigned long long) wbc->range_start,
2452 (unsigned long long) wbc->range_end,
2453 wbc->nonblocking, wbc->for_kupdate,
2454 wbc->for_reclaim, wbc->for_writepages,
2455 wbc->range_cyclic);
2456
2385 /* 2457 /*
2386 * No pages to write? This is mainly a kludge to avoid starting 2458 * No pages to write? This is mainly a kludge to avoid starting
2387 * a transaction for special inodes like journal inode on last iput() 2459 * a transaction for special inodes like journal inode on last iput()
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
2389 */ 2461 */
2390 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2462 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2391 return 0; 2463 return 0;
2464
2465 /*
2466 * If the filesystem has aborted, it is read-only, so return
2467 * right away instead of dumping stack traces later on that
2468 * will obscure the real source of the problem. We test
2469 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
2470 * the latter could be true if the filesystem is mounted
2471 * read-only, and in that case, ext4_da_writepages should
2472 * *never* be called, so if that ever happens, we would want
2473 * the stack trace.
2474 */
2475 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2476 return -EROFS;
2477
2392 /* 2478 /*
2393 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2479 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2394 * This make sure small files blocks are allocated in 2480 * This make sure small files blocks are allocated in
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2433 handle = ext4_journal_start(inode, needed_blocks); 2519 handle = ext4_journal_start(inode, needed_blocks);
2434 if (IS_ERR(handle)) { 2520 if (IS_ERR(handle)) {
2435 ret = PTR_ERR(handle); 2521 ret = PTR_ERR(handle);
2436 printk(KERN_EMERG "%s: jbd2_start: " 2522 printk(KERN_CRIT "%s: jbd2_start: "
2437 "%ld pages, ino %lu; err %d\n", __func__, 2523 "%ld pages, ino %lu; err %d\n", __func__,
2438 wbc->nr_to_write, inode->i_ino, ret); 2524 wbc->nr_to_write, inode->i_ino, ret);
2439 dump_stack(); 2525 dump_stack();
@@ -2486,6 +2572,14 @@ out_writepages:
2486 if (!no_nrwrite_index_update) 2572 if (!no_nrwrite_index_update)
2487 wbc->no_nrwrite_index_update = 0; 2573 wbc->no_nrwrite_index_update = 0;
2488 wbc->nr_to_write -= nr_to_writebump; 2574 wbc->nr_to_write -= nr_to_writebump;
2575 trace_mark(ext4_da_writepage_result,
2576 "dev %s ino %lu ret %d pages_written %d "
2577 "pages_skipped %ld congestion %d "
2578 "more_io %d no_nrwrite_index_update %d",
2579 inode->i_sb->s_id, inode->i_ino, ret,
2580 pages_written, wbc->pages_skipped,
2581 wbc->encountered_congestion, wbc->more_io,
2582 wbc->no_nrwrite_index_update);
2489 return ret; 2583 return ret;
2490} 2584}
2491 2585
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2537 len, flags, pagep, fsdata); 2631 len, flags, pagep, fsdata);
2538 } 2632 }
2539 *fsdata = (void *)0; 2633 *fsdata = (void *)0;
2634
2635 trace_mark(ext4_da_write_begin,
2636 "dev %s ino %lu pos %llu len %u flags %u",
2637 inode->i_sb->s_id, inode->i_ino,
2638 (unsigned long long) pos, len, flags);
2540retry: 2639retry:
2541 /* 2640 /*
2542 * With delayed allocation, we don't log the i_disksize update 2641 * With delayed allocation, we don't log the i_disksize update
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
2626 } 2725 }
2627 } 2726 }
2628 2727
2728 trace_mark(ext4_da_write_end,
2729 "dev %s ino %lu pos %llu len %u copied %u",
2730 inode->i_sb->s_id, inode->i_ino,
2731 (unsigned long long) pos, len, copied);
2629 start = pos & (PAGE_CACHE_SIZE - 1); 2732 start = pos & (PAGE_CACHE_SIZE - 1);
2630 end = start + copied - 1; 2733 end = start + copied - 1;
2631 2734
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2718 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2719 } 2822 }
2720 2823
2721 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2722 /* 2828 /*
2723 * This is a REALLY heavyweight approach, but the use of 2829 * This is a REALLY heavyweight approach, but the use of
2724 * bmap on dirty files is expected to be extremely rare: 2830 * bmap on dirty files is expected to be extremely rare:
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
2836 loff_t size = i_size_read(inode); 2942 loff_t size = i_size_read(inode);
2837 loff_t len; 2943 loff_t len;
2838 2944
2945 trace_mark(ext4_normal_writepage,
2946 "dev %s ino %lu page_index %lu",
2947 inode->i_sb->s_id, inode->i_ino, page->index);
2839 J_ASSERT(PageLocked(page)); 2948 J_ASSERT(PageLocked(page));
2840 if (page->index == size >> PAGE_CACHE_SHIFT) 2949 if (page->index == size >> PAGE_CACHE_SHIFT)
2841 len = size & ~PAGE_CACHE_MASK; 2950 len = size & ~PAGE_CACHE_MASK;
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
2921 loff_t size = i_size_read(inode); 3030 loff_t size = i_size_read(inode);
2922 loff_t len; 3031 loff_t len;
2923 3032
3033 trace_mark(ext4_journalled_writepage,
3034 "dev %s ino %lu page_index %lu",
3035 inode->i_sb->s_id, inode->i_ino, page->index);
2924 J_ASSERT(PageLocked(page)); 3036 J_ASSERT(PageLocked(page));
2925 if (page->index == size >> PAGE_CACHE_SHIFT) 3037 if (page->index == size >> PAGE_CACHE_SHIFT)
2926 len = size & ~PAGE_CACHE_MASK; 3038 len = size & ~PAGE_CACHE_MASK;
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2989 if (offset == 0) 3101 if (offset == 0)
2990 ClearPageChecked(page); 3102 ClearPageChecked(page);
2991 3103
2992 jbd2_journal_invalidatepage(journal, page, offset); 3104 if (journal)
3105 jbd2_journal_invalidatepage(journal, page, offset);
3106 else
3107 block_invalidatepage(page, offset);
2993} 3108}
2994 3109
2995static int ext4_releasepage(struct page *page, gfp_t wait) 3110static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2999 WARN_ON(PageChecked(page)); 3114 WARN_ON(PageChecked(page));
3000 if (!page_has_buffers(page)) 3115 if (!page_has_buffers(page))
3001 return 0; 3116 return 0;
3002 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3117 if (journal)
3118 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3119 else
3120 return try_to_free_buffers(page);
3003} 3121}
3004 3122
3005/* 3123/*
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
3271 3389
3272 err = 0; 3390 err = 0;
3273 if (ext4_should_journal_data(inode)) { 3391 if (ext4_should_journal_data(inode)) {
3274 err = ext4_journal_dirty_metadata(handle, bh); 3392 err = ext4_handle_dirty_metadata(handle, inode, bh);
3275 } else { 3393 } else {
3276 if (ext4_should_order_data(inode)) 3394 if (ext4_should_order_data(inode))
3277 err = ext4_jbd2_file_inode(handle, inode); 3395 err = ext4_jbd2_file_inode(handle, inode);
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3395 __le32 *p; 3513 __le32 *p;
3396 if (try_to_extend_transaction(handle, inode)) { 3514 if (try_to_extend_transaction(handle, inode)) {
3397 if (bh) { 3515 if (bh) {
3398 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3516 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3399 ext4_journal_dirty_metadata(handle, bh); 3517 ext4_handle_dirty_metadata(handle, inode, bh);
3400 } 3518 }
3401 ext4_mark_inode_dirty(handle, inode); 3519 ext4_mark_inode_dirty(handle, inode);
3402 ext4_journal_test_restart(handle, inode); 3520 ext4_journal_test_restart(handle, inode);
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3496 count, block_to_free_p, p); 3614 count, block_to_free_p, p);
3497 3615
3498 if (this_bh) { 3616 if (this_bh) {
3499 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3617 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3500 3618
3501 /* 3619 /*
3502 * The buffer head should have an attached journal head at this 3620 * The buffer head should have an attached journal head at this
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3505 * the block was cleared. Check for this instead of OOPSing. 3623 * the block was cleared. Check for this instead of OOPSing.
3506 */ 3624 */
3507 if (bh2jh(this_bh)) 3625 if (bh2jh(this_bh))
3508 ext4_journal_dirty_metadata(handle, this_bh); 3626 ext4_handle_dirty_metadata(handle, inode, this_bh);
3509 else 3627 else
3510 ext4_error(inode->i_sb, __func__, 3628 ext4_error(inode->i_sb, __func__,
3511 "circular indirect block detected, " 3629 "circular indirect block detected, "
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3535 ext4_fsblk_t nr; 3653 ext4_fsblk_t nr;
3536 __le32 *p; 3654 __le32 *p;
3537 3655
3538 if (is_handle_aborted(handle)) 3656 if (ext4_handle_is_aborted(handle))
3539 return; 3657 return;
3540 3658
3541 if (depth--) { 3659 if (depth--) {
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3605 * will merely complain about releasing a free block, 3723 * will merely complain about releasing a free block,
3606 * rather than leaking blocks. 3724 * rather than leaking blocks.
3607 */ 3725 */
3608 if (is_handle_aborted(handle)) 3726 if (ext4_handle_is_aborted(handle))
3609 return; 3727 return;
3610 if (try_to_extend_transaction(handle, inode)) { 3728 if (try_to_extend_transaction(handle, inode)) {
3611 ext4_mark_inode_dirty(handle, inode); 3729 ext4_mark_inode_dirty(handle, inode);
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3624 parent_bh)){ 3742 parent_bh)){
3625 *p = 0; 3743 *p = 0;
3626 BUFFER_TRACE(parent_bh, 3744 BUFFER_TRACE(parent_bh,
3627 "call ext4_journal_dirty_metadata"); 3745 "call ext4_handle_dirty_metadata");
3628 ext4_journal_dirty_metadata(handle, 3746 ext4_handle_dirty_metadata(handle,
3629 parent_bh); 3747 inode,
3748 parent_bh);
3630 } 3749 }
3631 } 3750 }
3632 } 3751 }
@@ -3814,7 +3933,7 @@ do_indirects:
3814 * synchronous 3933 * synchronous
3815 */ 3934 */
3816 if (IS_SYNC(inode)) 3935 if (IS_SYNC(inode))
3817 handle->h_sync = 1; 3936 ext4_handle_sync(handle);
3818out_stop: 3937out_stop:
3819 /* 3938 /*
3820 * If this was a simple ftruncate(), and the file will remain alive 3939 * If this was a simple ftruncate(), and the file will remain alive
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
3844 ext4_fsblk_t block; 3963 ext4_fsblk_t block;
3845 int inodes_per_block, inode_offset; 3964 int inodes_per_block, inode_offset;
3846 3965
3847 iloc->bh = 0; 3966 iloc->bh = NULL;
3848 if (!ext4_valid_inum(sb, inode->i_ino)) 3967 if (!ext4_valid_inum(sb, inode->i_ino))
3849 return -EIO; 3968 return -EIO;
3850 3969
@@ -3951,7 +4070,7 @@ make_io:
3951 num = EXT4_INODES_PER_GROUP(sb); 4070 num = EXT4_INODES_PER_GROUP(sb);
3952 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4071 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3953 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4072 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3954 num -= le16_to_cpu(gdp->bg_itable_unused); 4073 num -= ext4_itable_unused_count(sb, gdp);
3955 table += num / inodes_per_block; 4074 table += num / inodes_per_block;
3956 if (end > table) 4075 if (end > table)
3957 end = table; 4076 end = table;
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
4313 EXT4_SET_RO_COMPAT_FEATURE(sb, 4432 EXT4_SET_RO_COMPAT_FEATURE(sb,
4314 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4433 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4315 sb->s_dirt = 1; 4434 sb->s_dirt = 1;
4316 handle->h_sync = 1; 4435 ext4_handle_sync(handle);
4317 err = ext4_journal_dirty_metadata(handle, 4436 err = ext4_handle_dirty_metadata(handle, inode,
4318 EXT4_SB(sb)->s_sbh); 4437 EXT4_SB(sb)->s_sbh);
4319 } 4438 }
4320 } 4439 }
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
4341 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4460 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4342 } 4461 }
4343 4462
4344 4463 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4345 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4464 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4346 rc = ext4_journal_dirty_metadata(handle, bh);
4347 if (!err) 4465 if (!err)
4348 err = rc; 4466 err = rc;
4349 ei->i_state &= ~EXT4_STATE_NEW; 4467 ei->i_state &= ~EXT4_STATE_NEW;
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
4406 return ext4_force_commit(inode->i_sb); 4524 return ext4_force_commit(inode->i_sb);
4407} 4525}
4408 4526
/*
 * __ext4_write_dirty_metadata() - dirty a metadata buffer directly, and
 * write it out synchronously when the inode asks for that.
 *
 * @inode: inode the buffer belongs to; may be NULL (the code checks for
 *         it), in which case the buffer is only marked dirty.
 * @bh:    buffer head holding the modified metadata.
 *
 * The buffer is always marked dirty.  If @inode is non-NULL and
 * inode_needs_sync(inode) is true, the buffer is additionally pushed out
 * with sync_dirty_buffer() and then inspected for an I/O failure.
 *
 * Returns 0 on success, or -EIO if the synchronous write was submitted but
 * the buffer is not uptodate afterwards.
 *
 * NOTE(review): presumably this is the path taken when there is no valid
 * journal handle (the non-journalled counterpart of dirtying metadata
 * through jbd2) -- confirm against the caller in ext4_jbd2.h.
 */
4527int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4528{
4529 int err = 0;
4530
4531 mark_buffer_dirty(bh);
4532 if (inode && inode_needs_sync(inode)) {
 /*
  * The return value of sync_dirty_buffer() is not used here; failure
  * is detected from the buffer state flags below instead.
  */
4533 sync_dirty_buffer(bh);
 /* I/O was requested for this buffer but it did not end up uptodate. */
4534 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4535 ext4_error(inode->i_sb, __func__,
4536 "IO error syncing inode, "
4537 "inode=%lu, block=%llu",
4538 inode->i_ino,
4539 (unsigned long long)bh->b_blocknr);
4540 err = -EIO;
4541 }
4542 }
4543 return err;
4544}
4545
4409/* 4546/*
4410 * ext4_setattr() 4547 * ext4_setattr()
4411 * 4548 *
@@ -4710,16 +4847,15 @@ int
4710ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4847ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4711 struct ext4_iloc *iloc) 4848 struct ext4_iloc *iloc)
4712{ 4849{
4713 int err = 0; 4850 int err;
4714 if (handle) { 4851
4715 err = ext4_get_inode_loc(inode, iloc); 4852 err = ext4_get_inode_loc(inode, iloc);
4716 if (!err) { 4853 if (!err) {
4717 BUFFER_TRACE(iloc->bh, "get_write_access"); 4854 BUFFER_TRACE(iloc->bh, "get_write_access");
4718 err = ext4_journal_get_write_access(handle, iloc->bh); 4855 err = ext4_journal_get_write_access(handle, iloc->bh);
4719 if (err) { 4856 if (err) {
4720 brelse(iloc->bh); 4857 brelse(iloc->bh);
4721 iloc->bh = NULL; 4858 iloc->bh = NULL;
4722 }
4723 } 4859 }
4724 } 4860 }
4725 ext4_std_error(inode->i_sb, err); 4861 ext4_std_error(inode->i_sb, err);
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4791 4927
4792 might_sleep(); 4928 might_sleep();
4793 err = ext4_reserve_inode_write(handle, inode, &iloc); 4929 err = ext4_reserve_inode_write(handle, inode, &iloc);
4794 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4930 if (ext4_handle_valid(handle) &&
4931 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4795 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4932 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4796 /* 4933 /*
4797 * We need extra buffer credits since we may write into EA block 4934 * We need extra buffer credits since we may write into EA block
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
4843 handle_t *current_handle = ext4_journal_current_handle(); 4980 handle_t *current_handle = ext4_journal_current_handle();
4844 handle_t *handle; 4981 handle_t *handle;
4845 4982
4983 if (!ext4_handle_valid(current_handle)) {
4984 ext4_mark_inode_dirty(current_handle, inode);
4985 return;
4986 }
4987
4846 handle = ext4_journal_start(inode, 2); 4988 handle = ext4_journal_start(inode, 2);
4847 if (IS_ERR(handle)) 4989 if (IS_ERR(handle))
4848 goto out; 4990 goto out;
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4880 BUFFER_TRACE(iloc.bh, "get_write_access"); 5022 BUFFER_TRACE(iloc.bh, "get_write_access");
4881 err = jbd2_journal_get_write_access(handle, iloc.bh); 5023 err = jbd2_journal_get_write_access(handle, iloc.bh);
4882 if (!err) 5024 if (!err)
4883 err = ext4_journal_dirty_metadata(handle, 5025 err = ext4_handle_dirty_metadata(handle,
4884 iloc.bh); 5026 inode,
5027 iloc.bh);
4885 brelse(iloc.bh); 5028 brelse(iloc.bh);
4886 } 5029 }
4887 } 5030 }
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4907 */ 5050 */
4908 5051
4909 journal = EXT4_JOURNAL(inode); 5052 journal = EXT4_JOURNAL(inode);
5053 if (!journal)
5054 return 0;
4910 if (is_journal_aborted(journal)) 5055 if (is_journal_aborted(journal))
4911 return -EROFS; 5056 return -EROFS;
4912 5057
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4936 return PTR_ERR(handle); 5081 return PTR_ERR(handle);
4937 5082
4938 err = ext4_mark_inode_dirty(handle, inode); 5083 err = ext4_mark_inode_dirty(handle, inode);
4939 handle->h_sync = 1; 5084 ext4_handle_sync(handle);
4940 ext4_journal_stop(handle); 5085 ext4_journal_stop(handle);
4941 ext4_std_error(inode->i_sb, err); 5086 ext4_std_error(inode->i_sb, err);
4942 5087
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
99 goto flags_out; 99 goto flags_out;
100 } 100 }
101 if (IS_SYNC(inode)) 101 if (IS_SYNC(inode))
102 handle->h_sync = 1; 102 ext4_handle_sync(handle);
103 err = ext4_reserve_inode_write(handle, inode, &iloc); 103 err = ext4_reserve_inode_write(handle, inode, &iloc);
104 if (err) 104 if (err)
105 goto flags_err; 105 goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..918aec0c8a11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
100 * inode as: 100 * inode as:
101 * 101 *
102 * { page } 102 * { page }
103 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 103 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
104 * 104 *
105 * 105 *
106 * one block each for bitmap and buddy information. So for each group we 106 * one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
330 * object 330 * object
331 * 331 *
332 */ 332 */
333static struct kmem_cache *ext4_pspace_cachep;
334static struct kmem_cache *ext4_ac_cachep;
335static struct kmem_cache *ext4_free_ext_cachep;
336static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343
344
333 345
334static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 346static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
335{ 347{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
445 blocknr += first + i; 457 blocknr += first + i;
446 blocknr += 458 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 460 ext4_grp_locked_error(sb, e4b->bd_group,
449 ext4_error(sb, __func__, "double-free of inode" 461 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %lu)\n", 462 " %lu's block %llu(bit %u in group %u)",
451 inode ? inode->i_ino : 0, blocknr, 463 inode ? inode->i_ino : 0, blocknr,
452 first + i, e4b->bd_group); 464 first + i, e4b->bd_group);
453 } 465 }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 489 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 491 if (b1[i] != b2[i]) {
480 printk(KERN_ERR "corruption in group %lu " 492 printk(KERN_ERR "corruption in group %u "
481 "at byte %u(%u): %x in copy != %x " 493 "at byte %u(%u): %x in copy != %x "
482 "on disk/prealloc\n", 494 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]); 495 e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
690 grp->bb_fragments = fragments; 702 grp->bb_fragments = fragments;
691 703
692 if (free != grp->bb_free) { 704 if (free != grp->bb_free) {
693 ext4_error(sb, __func__, 705 ext4_grp_locked_error(sb, group, __func__,
694 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
695 group, free, grp->bb_free); 707 group, free, grp->bb_free);
696 /* 708 /*
697 * If we intent to continue, we consider group descritor 709 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
716 * stored in the inode as 728 * stored in the inode as
717 * 729 *
718 * { page } 730 * { page }
719 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
720 * 732 *
721 * 733 *
722 * one block each for bitmap and buddy information. 734 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 if (bh[i] == NULL) 794 if (bh[i] == NULL)
783 goto out; 795 goto out;
784 796
785 if (buffer_uptodate(bh[i]) && 797 if (bitmap_uptodate(bh[i]))
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
787 continue; 798 continue;
788 799
789 lock_buffer(bh[i]); 800 lock_buffer(bh[i]);
801 if (bitmap_uptodate(bh[i])) {
802 unlock_buffer(bh[i]);
803 continue;
804 }
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 807 ext4_init_block_bitmap(sb, bh[i],
793 first_group + i, desc); 808 first_group + i, desc);
809 set_bitmap_uptodate(bh[i]);
794 set_buffer_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]);
795 unlock_buffer(bh[i]);
796 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
812 unlock_buffer(bh[i]);
797 continue; 813 continue;
798 } 814 }
799 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
816 if (buffer_uptodate(bh[i])) {
817 /*
818 * if the group is not uninit and bh is
819 * uptodate, the bitmap is also uptodate
820 */
821 set_bitmap_uptodate(bh[i]);
822 unlock_buffer(bh[i]);
823 continue;
824 }
800 get_bh(bh[i]); 825 get_bh(bh[i]);
826 /*
827 * submit the buffer_head for read. We can
828 * safely mark the bitmap as uptodate now.
829 * We do it here so the bitmap uptodate bit
830 * gets set with the buffer lock held.
831 */
832 set_bitmap_uptodate(bh[i]);
801 bh[i]->b_end_io = end_buffer_read_sync; 833 bh[i]->b_end_io = end_buffer_read_sync;
802 submit_bh(READ, bh[i]); 834 submit_bh(READ, bh[i]);
803 mb_debug("read bitmap for group %lu\n", first_group + i); 835 mb_debug("read bitmap for group %u\n", first_group + i);
804 } 836 }
805 837
806 /* wait for I/O completion */ 838 /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
814 846
815 err = 0; 847 err = 0;
816 first_block = page->index * blocks_per_page; 848 first_block = page->index * blocks_per_page;
849 /* init the page */
850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
817 for (i = 0; i < blocks_per_page; i++) { 851 for (i = 0; i < blocks_per_page; i++) {
818 int group; 852 int group;
819 struct ext4_group_info *grinfo; 853 struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
840 BUG_ON(incore == NULL); 874 BUG_ON(incore == NULL);
841 mb_debug("put buddy for group %u in page %lu/%x\n", 875 mb_debug("put buddy for group %u in page %lu/%x\n",
842 group, page->index, i * blocksize); 876 group, page->index, i * blocksize);
843 memset(data, 0xff, blocksize);
844 grinfo = ext4_get_group_info(sb, group); 877 grinfo = ext4_get_group_info(sb, group);
845 grinfo->bb_fragments = 0; 878 grinfo->bb_fragments = 0;
846 memset(grinfo->bb_counters, 0, 879 memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* 881 /*
849 * incore got set to the group block bitmap below 882 * incore got set to the group block bitmap below
850 */ 883 */
884 ext4_lock_group(sb, group);
851 ext4_mb_generate_buddy(sb, data, incore, group); 885 ext4_mb_generate_buddy(sb, data, incore, group);
886 ext4_unlock_group(sb, group);
852 incore = NULL; 887 incore = NULL;
853 } else { 888 } else {
854 /* this is block of bitmap */ 889 /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 897
863 /* mark all preallocated blks used in in-core bitmap */ 898 /* mark all preallocated blks used in in-core bitmap */
864 ext4_mb_generate_from_pa(sb, data, group); 899 ext4_mb_generate_from_pa(sb, data, group);
900 ext4_mb_generate_from_freelist(sb, data, group);
865 ext4_unlock_group(sb, group); 901 ext4_unlock_group(sb, group);
866 902
867 /* set incore so that the buddy information can be 903 /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
886ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 922ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
887 struct ext4_buddy *e4b) 923 struct ext4_buddy *e4b)
888{ 924{
889 struct ext4_sb_info *sbi = EXT4_SB(sb);
890 struct inode *inode = sbi->s_buddy_cache;
891 int blocks_per_page; 925 int blocks_per_page;
892 int block; 926 int block;
893 int pnum; 927 int pnum;
894 int poff; 928 int poff;
895 struct page *page; 929 struct page *page;
896 int ret; 930 int ret;
931 struct ext4_group_info *grp;
932 struct ext4_sb_info *sbi = EXT4_SB(sb);
933 struct inode *inode = sbi->s_buddy_cache;
897 934
898 mb_debug("load group %lu\n", group); 935 mb_debug("load group %u\n", group);
899 936
900 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
938 grp = ext4_get_group_info(sb, group);
901 939
902 e4b->bd_blkbits = sb->s_blocksize_bits; 940 e4b->bd_blkbits = sb->s_blocksize_bits;
903 e4b->bd_info = ext4_get_group_info(sb, group); 941 e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
905 e4b->bd_group = group; 943 e4b->bd_group = group;
906 e4b->bd_buddy_page = NULL; 944 e4b->bd_buddy_page = NULL;
907 e4b->bd_bitmap_page = NULL; 945 e4b->bd_bitmap_page = NULL;
946 e4b->alloc_semp = &grp->alloc_sem;
947
948 /* Take the read lock on the group alloc
949 * sem. This would make sure a parallel
950 * ext4_mb_init_group happening on other
951 * groups mapped by the page is blocked
952 * till we are done with allocation
953 */
954 down_read(e4b->alloc_semp);
908 955
909 /* 956 /*
910 * the buddy cache inode stores the block bitmap 957 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
920 page = find_get_page(inode->i_mapping, pnum); 967 page = find_get_page(inode->i_mapping, pnum);
921 if (page == NULL || !PageUptodate(page)) { 968 if (page == NULL || !PageUptodate(page)) {
922 if (page) 969 if (page)
970 /*
971 * drop the page reference and try
972 * to get the page with lock. If we
973 * are not uptodate that implies
974 * somebody just created the page but
975 * is yet to initialize the same. So
976 * wait for it to initialize.
977 */
923 page_cache_release(page); 978 page_cache_release(page);
924 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
925 if (page) { 980 if (page) {
@@ -985,6 +1040,9 @@ err:
985 page_cache_release(e4b->bd_buddy_page); 1040 page_cache_release(e4b->bd_buddy_page);
986 e4b->bd_buddy = NULL; 1041 e4b->bd_buddy = NULL;
987 e4b->bd_bitmap = NULL; 1042 e4b->bd_bitmap = NULL;
1043
1044 /* Done with the buddy cache */
1045 up_read(e4b->alloc_semp);
988 return ret; 1046 return ret;
989} 1047}
990 1048
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
994 page_cache_release(e4b->bd_bitmap_page); 1052 page_cache_release(e4b->bd_bitmap_page);
995 if (e4b->bd_buddy_page) 1053 if (e4b->bd_buddy_page)
996 page_cache_release(e4b->bd_buddy_page); 1054 page_cache_release(e4b->bd_buddy_page);
1055 /* Done with the buddy cache */
1056 if (e4b->alloc_semp)
1057 up_read(e4b->alloc_semp);
997} 1058}
998 1059
999 1060
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 cur += 32; 1092 cur += 32;
1032 continue; 1093 continue;
1033 } 1094 }
1034 mb_clear_bit_atomic(lock, cur, bm); 1095 if (lock)
1096 mb_clear_bit_atomic(lock, cur, bm);
1097 else
1098 mb_clear_bit(cur, bm);
1035 cur++; 1099 cur++;
1036 } 1100 }
1037} 1101}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1049 cur += 32; 1113 cur += 32;
1050 continue; 1114 continue;
1051 } 1115 }
1052 mb_set_bit_atomic(lock, cur, bm); 1116 if (lock)
1117 mb_set_bit_atomic(lock, cur, bm);
1118 else
1119 mb_set_bit(cur, bm);
1053 cur++; 1120 cur++;
1054 } 1121 }
1055} 1122}
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1094 blocknr += block; 1161 blocknr += block;
1095 blocknr += 1162 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group); 1164 ext4_grp_locked_error(sb, e4b->bd_group,
1098 ext4_error(sb, __func__, "double-free of inode" 1165 __func__, "double-free of inode"
1099 " %lu's block %llu(bit %u in group %lu)\n", 1166 " %lu's block %llu(bit %u in group %u)",
1100 inode ? inode->i_ino : 0, blocknr, block, 1167 inode ? inode->i_ino : 0, blocknr, block,
1101 e4b->bd_group); 1168 e4b->bd_group);
1102 ext4_lock_group(sb, e4b->bd_group);
1103 } 1169 }
1104 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1105 e4b->bd_info->bb_counters[order]++; 1171 e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1296 ac->ac_tail = ret & 0xffff; 1362 ac->ac_tail = ret & 0xffff;
1297 ac->ac_buddy = ret >> 16; 1363 ac->ac_buddy = ret >> 16;
1298 1364
1299 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1365 /*
1300 /*FIXME!! Why ? */ 1366 * take the page reference. We want the page to be pinned
1367 * so that we don't get an ext4_mb_init_cache call for this
1368 * group until we update the bitmap. That would mean we
1369 * double allocate blocks. The reference is dropped
1370 * in ext4_mb_release_context
1371 */
1301 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1302 get_page(ac->ac_bitmap_page); 1373 get_page(ac->ac_bitmap_page);
1303 ac->ac_buddy_page = e4b->bd_buddy_page; 1374 ac->ac_buddy_page = e4b->bd_buddy_page;
1304 get_page(ac->ac_buddy_page); 1375 get_page(ac->ac_buddy_page);
1305 1376 /* on allocation we use ac to track the held semaphore */
1377 ac->alloc_semp = e4b->alloc_semp;
1378 e4b->alloc_semp = NULL;
1306 /* store last allocated for subsequent stream allocation */ 1379 /* store last allocated for subsequent stream allocation */
1307 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1308 spin_lock(&sbi->s_md_lock); 1381 spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1326 struct ext4_free_extent ex; 1399 struct ext4_free_extent ex;
1327 int max; 1400 int max;
1328 1401
1402 if (ac->ac_status == AC_STATUS_FOUND)
1403 return;
1329 /* 1404 /*
1330 * We don't want to scan for a whole year 1405 * We don't want to scan for a whole year
1331 */ 1406 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1575 * free blocks even though group info says we 1650 * free blocks even though group info says we
1576 * we have free blocks 1651 * we have free blocks
1577 */ 1652 */
1578 ext4_error(sb, __func__, "%d free blocks as per " 1653 ext4_grp_locked_error(sb, e4b->bd_group,
1579 "group info. But bitmap says 0\n", 1654 __func__, "%d free blocks as per "
1655 "group info. But bitmap says 0",
1580 free); 1656 free);
1581 break; 1657 break;
1582 } 1658 }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1584 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1585 BUG_ON(ex.fe_len <= 0); 1661 BUG_ON(ex.fe_len <= 0);
1586 if (free < ex.fe_len) { 1662 if (free < ex.fe_len) {
1587 ext4_error(sb, __func__, "%d free blocks as per " 1663 ext4_grp_locked_error(sb, e4b->bd_group,
1588 "group info. But got %d blocks\n", 1664 __func__, "%d free blocks as per "
1665 "group info. But got %d blocks",
1589 free, ex.fe_len); 1666 free, ex.fe_len);
1590 /* 1667 /*
1591 * The number of free blocks differs. This mostly 1668 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1692 return 0; 1769 return 0;
1693} 1770}
1694 1771
1772/*
1773 * lock the group_info alloc_sem of all the groups
1774 * belonging to the same buddy cache page. This
1775 * makes sure other parallel operations on the buddy
1776 * cache don't happen while holding the buddy cache
1777 * lock
1778 */
1779int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1780{
1781 int i;
1782 int block, pnum;
1783 int blocks_per_page;
1784 int groups_per_page;
1785 ext4_group_t first_group;
1786 struct ext4_group_info *grp;
1787
1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1789 /*
1790 * the buddy cache inode stores the block bitmap
1791 * and buddy information in consecutive blocks.
1792 * So for each group we need two blocks.
1793 */
1794 block = group * 2;
1795 pnum = block / blocks_per_page;
1796 first_group = pnum * blocks_per_page / 2;
1797
1798 groups_per_page = blocks_per_page >> 1;
1799 if (groups_per_page == 0)
1800 groups_per_page = 1;
1801 /* write-lock the alloc_sem of every group the page covers */
1802 for (i = 0; i < groups_per_page; i++) {
1803
1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1805 break;
1806 grp = ext4_get_group_info(sb, first_group + i);
1807 /* take all groups write allocation
1808 * semaphore. This makes sure there is
1809 * no block allocation going on in any
1810 * of those groups
1811 */
1812 down_write_nested(&grp->alloc_sem, i);
1813 }
1814 return i;
1815}
1816
1817void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1818 ext4_group_t group, int locked_group)
1819{
1820 int i;
1821 int block, pnum;
1822 int blocks_per_page;
1823 ext4_group_t first_group;
1824 struct ext4_group_info *grp;
1825
1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1827 /*
1828 * the buddy cache inode stores the block bitmap
1829 * and buddy information in consecutive blocks.
1830 * So for each group we need two blocks.
1831 */
1832 block = group * 2;
1833 pnum = block / blocks_per_page;
1834 first_group = pnum * blocks_per_page / 2;
1835 /* release locks on all the groups */
1836 for (i = 0; i < locked_group; i++) {
1837
1838 grp = ext4_get_group_info(sb, first_group + i);
1839 /* release each group's write allocation
1840 * semaphore. This lets block
1841 * allocation proceed again in
1842 * those groups
1843 */
1844 up_write(&grp->alloc_sem);
1845 }
1846
1847}
1848
1849static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1850{
1851
1852 int ret;
1853 void *bitmap;
1854 int blocks_per_page;
1855 int block, pnum, poff;
1856 int num_grp_locked = 0;
1857 struct ext4_group_info *this_grp;
1858 struct ext4_sb_info *sbi = EXT4_SB(sb);
1859 struct inode *inode = sbi->s_buddy_cache;
1860 struct page *page = NULL, *bitmap_page = NULL;
1861
1862 mb_debug("init group %lu\n", group);
1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1864 this_grp = ext4_get_group_info(sb, group);
1865 /*
1866 * This ensures we don't add group
1867 * to this buddy cache via resize
1868 */
1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1871 /*
1872 * somebody initialized the group
1873 * return without doing anything
1874 */
1875 ret = 0;
1876 goto err;
1877 }
1878 /*
1879 * the buddy cache inode stores the block bitmap
1880 * and buddy information in consecutive blocks.
1881 * So for each group we need two blocks.
1882 */
1883 block = group * 2;
1884 pnum = block / blocks_per_page;
1885 poff = block % blocks_per_page;
1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1887 if (page) {
1888 BUG_ON(page->mapping != inode->i_mapping);
1889 ret = ext4_mb_init_cache(page, NULL);
1890 if (ret) {
1891 unlock_page(page);
1892 goto err;
1893 }
1894 unlock_page(page);
1895 }
1896 if (page == NULL || !PageUptodate(page)) {
1897 ret = -EIO;
1898 goto err;
1899 }
1900 mark_page_accessed(page);
1901 bitmap_page = page;
1902 bitmap = page_address(page) + (poff * sb->s_blocksize);
1903
1904 /* init buddy cache */
1905 block++;
1906 pnum = block / blocks_per_page;
1907 poff = block % blocks_per_page;
1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1909 if (page == bitmap_page) {
1910 /*
1911 * If both the bitmap and buddy are in
1912 * the same page we don't need to force
1913 * init the buddy
1914 */
1915 unlock_page(page);
1916 } else if (page) {
1917 BUG_ON(page->mapping != inode->i_mapping);
1918 ret = ext4_mb_init_cache(page, bitmap);
1919 if (ret) {
1920 unlock_page(page);
1921 goto err;
1922 }
1923 unlock_page(page);
1924 }
1925 if (page == NULL || !PageUptodate(page)) {
1926 ret = -EIO;
1927 goto err;
1928 }
1929 mark_page_accessed(page);
1930err:
1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1932 if (bitmap_page)
1933 page_cache_release(bitmap_page);
1934 if (page)
1935 page_cache_release(page);
1936 return ret;
1937}
1938
1695static noinline_for_stack int 1939static noinline_for_stack int
1696ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1940ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1697{ 1941{
@@ -1775,7 +2019,7 @@ repeat:
1775 group = 0; 2019 group = 0;
1776 2020
1777 /* quick check to skip empty groups */ 2021 /* quick check to skip empty groups */
1778 grp = ext4_get_group_info(ac->ac_sb, group); 2022 grp = ext4_get_group_info(sb, group);
1779 if (grp->bb_free == 0) 2023 if (grp->bb_free == 0)
1780 continue; 2024 continue;
1781 2025
@@ -1788,10 +2032,9 @@ repeat:
1788 * we need full data about the group 2032 * we need full data about the group
1789 * to make a good selection 2033 * to make a good selection
1790 */ 2034 */
1791 err = ext4_mb_load_buddy(sb, group, &e4b); 2035 err = ext4_mb_init_group(sb, group);
1792 if (err) 2036 if (err)
1793 goto out; 2037 goto out;
1794 ext4_mb_release_desc(&e4b);
1795 } 2038 }
1796 2039
1797 /* 2040 /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1932 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
1933 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
1934 "%-5u %-5s %-5u %-6u\n"; 2177 "%-5u %-5s %-5u %-6u\n";
1935 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1936 hs->result.fe_start, hs->result.fe_len, 2179 hs->result.fe_start, hs->result.fe_len,
1937 hs->result.fe_logical); 2180 hs->result.fe_logical);
1938 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1939 hs->orig.fe_start, hs->orig.fe_len, 2182 hs->orig.fe_start, hs->orig.fe_len,
1940 hs->orig.fe_logical); 2183 hs->orig.fe_logical);
1941 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
1942 hs->goal.fe_start, hs->goal.fe_len, 2185 hs->goal.fe_start, hs->goal.fe_len,
1943 hs->goal.fe_logical); 2186 hs->goal.fe_logical);
1944 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1947 hs->buddy ? 1 << hs->buddy : 0); 2190 hs->buddy ? 1 << hs->buddy : 0);
1948 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
1949 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
1950 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1951 hs->result.fe_start, hs->result.fe_len, 2194 hs->result.fe_start, hs->result.fe_len,
1952 hs->result.fe_logical); 2195 hs->result.fe_logical);
1953 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1954 hs->orig.fe_start, hs->orig.fe_len, 2197 hs->orig.fe_start, hs->orig.fe_len,
1955 hs->orig.fe_logical); 2198 hs->orig.fe_logical);
1956 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
1957 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
1958 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1959 hs->result.fe_start, hs->result.fe_len); 2202 hs->result.fe_start, hs->result.fe_len);
1960 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n",
1961 hs->pid, hs->ino, buf2); 2204 hs->pid, hs->ino, buf2);
1962 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
1963 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1964 hs->result.fe_start, hs->result.fe_len); 2207 hs->result.fe_start, hs->result.fe_len);
1965 seq_printf(seq, "%-5u %-8u %-23s free\n", 2208 seq_printf(seq, "%-5u %-8u %-23s free\n",
1966 hs->pid, hs->ino, buf2); 2209 hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2073 return NULL; 2316 return NULL;
2074 2317
2075 group = *pos + 1; 2318 group = *pos + 1;
2076 return (void *) group; 2319 return (void *) ((unsigned long) group);
2077} 2320}
2078 2321
2079static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2322static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2086 if (*pos < 0 || *pos >= sbi->s_groups_count) 2329 if (*pos < 0 || *pos >= sbi->s_groups_count)
2087 return NULL; 2330 return NULL;
2088 group = *pos + 1; 2331 group = *pos + 1;
2089 return (void *) group;; 2332 return (void *) ((unsigned long) group);
2090} 2333}
2091 2334
2092static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2335static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2093{ 2336{
2094 struct super_block *sb = seq->private; 2337 struct super_block *sb = seq->private;
2095 long group = (long) v; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2096 int i; 2339 int i;
2097 int err; 2340 int err;
2098 struct ext4_buddy e4b; 2341 struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2114 sizeof(struct ext4_group_info); 2357 sizeof(struct ext4_group_info);
2115 err = ext4_mb_load_buddy(sb, group, &e4b); 2358 err = ext4_mb_load_buddy(sb, group, &e4b);
2116 if (err) { 2359 if (err) {
2117 seq_printf(seq, "#%-5lu: I/O error\n", group); 2360 seq_printf(seq, "#%-5u: I/O error\n", group);
2118 return 0; 2361 return 0;
2119 } 2362 }
2120 ext4_lock_group(sb, group); 2363 ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122 ext4_unlock_group(sb, group); 2365 ext4_unlock_group(sb, group);
2123 ext4_mb_release_desc(&e4b); 2366 ext4_mb_release_desc(&e4b);
2124 2367
2125 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2126 sg.info.bb_fragments, sg.info.bb_first_free); 2369 sg.info.bb_fragments, sg.info.bb_first_free);
2127 for (i = 0; i <= 13; i++) 2370 for (i = 0; i <= 13; i++)
2128 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2296 ext4_free_blocks_after_init(sb, group, desc); 2539 ext4_free_blocks_after_init(sb, group, desc);
2297 } else { 2540 } else {
2298 meta_group_info[i]->bb_free = 2541 meta_group_info[i]->bb_free =
2299 le16_to_cpu(desc->bg_free_blocks_count); 2542 ext4_free_blks_count(sb, desc);
2300 } 2543 }
2301 2544
2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2546 init_rwsem(&meta_group_info[i]->alloc_sem);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;;
2304 2548
2305#ifdef DOUBLE_CHECK 2549#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
2327} /* ext4_mb_add_groupinfo */ 2571} /* ext4_mb_add_groupinfo */
2328 2572
2329/* 2573/*
2330 * Add a group to the existing groups.
2331 * This function is used for online resize
2332 */
2333int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2334 struct ext4_group_desc *desc)
2335{
2336 struct ext4_sb_info *sbi = EXT4_SB(sb);
2337 struct inode *inode = sbi->s_buddy_cache;
2338 int blocks_per_page;
2339 int block;
2340 int pnum;
2341 struct page *page;
2342 int err;
2343
2344 /* Add group based on group descriptor*/
2345 err = ext4_mb_add_groupinfo(sb, group, desc);
2346 if (err)
2347 return err;
2348
2349 /*
2350 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
2351 * datas) are set not up to date so that they will be re-initilaized
2352 * during the next call to ext4_mb_load_buddy
2353 */
2354
2355 /* Set buddy page as not up to date */
2356 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2357 block = group * 2;
2358 pnum = block / blocks_per_page;
2359 page = find_get_page(inode->i_mapping, pnum);
2360 if (page != NULL) {
2361 ClearPageUptodate(page);
2362 page_cache_release(page);
2363 }
2364
2365 /* Set bitmap page as not up to date */
2366 block++;
2367 pnum = block / blocks_per_page;
2368 page = find_get_page(inode->i_mapping, pnum);
2369 if (page != NULL) {
2370 ClearPageUptodate(page);
2371 page_cache_release(page);
2372 }
2373
2374 return 0;
2375}
2376
2377/*
2378 * Update an existing group. 2574 * Update an existing group.
2379 * This function is used for online resize 2575 * This function is used for online resize
2380 */ 2576 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2457 desc = ext4_get_group_desc(sb, i, NULL); 2653 desc = ext4_get_group_desc(sb, i, NULL);
2458 if (desc == NULL) { 2654 if (desc == NULL) {
2459 printk(KERN_ERR 2655 printk(KERN_ERR
2460 "EXT4-fs: can't read descriptor %lu\n", i); 2656 "EXT4-fs: can't read descriptor %u\n", i);
2461 goto err_freebuddy; 2657 goto err_freebuddy;
2462 } 2658 }
2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2493 if (sbi->s_mb_offsets == NULL) { 2689 if (sbi->s_mb_offsets == NULL) {
2494 return -ENOMEM; 2690 return -ENOMEM;
2495 } 2691 }
2692
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2497 if (sbi->s_mb_maxs == NULL) { 2695 if (sbi->s_mb_maxs == NULL) {
2498 kfree(sbi->s_mb_maxs); 2696 kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2551 ext4_mb_init_per_dev_proc(sb); 2749 ext4_mb_init_per_dev_proc(sb);
2552 ext4_mb_history_init(sb); 2750 ext4_mb_history_init(sb);
2553 2751
2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2752 if (sbi->s_journal)
2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555 2754
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2557 return 0; 2756 return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2652 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2653 entry = list_entry(l, struct ext4_free_data, list); 2852 entry = list_entry(l, struct ext4_free_data, list);
2654 2853
2655 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2854 mb_debug("gonna free %u blocks in group %u (0x%p):",
2656 entry->count, entry->group, entry); 2855 entry->count, entry->group, entry);
2657 2856
2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2680 + entry->start_blk 2879 + entry->start_blk
2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
2683 (unsigned long long) discard_block, entry->count); 2882 sb->s_id, (unsigned long long) discard_block,
2883 entry->count);
2684 sb_issue_discard(sb, discard_block, entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count);
2685 2885
2686 kmem_cache_free(ext4_free_ext_cachep, entry); 2886 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
2791 */ 2991 */
2792static noinline_for_stack int 2992static noinline_for_stack int
2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2993ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2794 handle_t *handle, unsigned long reserv_blks) 2994 handle_t *handle, unsigned int reserv_blks)
2795{ 2995{
2796 struct buffer_head *bitmap_bh = NULL; 2996 struct buffer_head *bitmap_bh = NULL;
2797 struct ext4_super_block *es; 2997 struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2824 if (!gdp) 3024 if (!gdp)
2825 goto out_err; 3025 goto out_err;
2826 3026
2827 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2828 gdp->bg_free_blocks_count); 3028 gdp->bg_free_blocks_count);
2829 3029
2830 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2843 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2844 EXT4_SB(sb)->s_itb_per_group)) { 3044 EXT4_SB(sb)->s_itb_per_group)) {
2845 ext4_error(sb, __func__, 3045 ext4_error(sb, __func__,
2846 "Allocating block in system zone - block = %llu", 3046 "Allocating block %llu in system zone of %d group\n",
2847 block); 3047 block, ac->ac_b_ex.fe_group);
2848 /* File system mounted not to panic on error 3048 /* File system mounted not to panic on error
2849 * Fix the bitmap and repeat the block allocation 3049 * Fix the bitmap and repeat the block allocation
2850 * We leak some of the blocks here. 3050 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2852 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 3052 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
2853 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3053 bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2854 ac->ac_b_ex.fe_len); 3054 ac->ac_b_ex.fe_len);
2855 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3055 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2856 if (!err) 3056 if (!err)
2857 err = -EAGAIN; 3057 err = -EAGAIN;
2858 goto out_err; 3058 goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2866 } 3066 }
2867 } 3067 }
2868#endif 3068#endif
2869 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2870 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2871
2872 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3069 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 mb_set_bits(NULL, bitmap_bh->b_data,
3071 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2873 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3072 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2874 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3073 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2875 gdp->bg_free_blocks_count = 3074 ext4_free_blks_set(sb, gdp,
2876 cpu_to_le16(ext4_free_blocks_after_init(sb, 3075 ext4_free_blocks_after_init(sb,
2877 ac->ac_b_ex.fe_group, 3076 ac->ac_b_ex.fe_group, gdp));
2878 gdp));
2879 } 3077 }
2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 3078 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 ext4_free_blks_set(sb, gdp, len);
2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3080 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3081 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3082 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2899 spin_unlock(sb_bgl_lock(sbi, flex_group)); 3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
2900 } 3099 }
2901 3100
2902 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2903 if (err) 3102 if (err)
2904 goto out_err; 3103 goto out_err;
2905 err = ext4_journal_dirty_metadata(handle, gdp_bh); 3104 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2906 3105
2907out_err: 3106out_err:
2908 sb->s_dirt = 1; 3107 sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3031 /* check we don't cross already preallocated blocks */ 3230 /* check we don't cross already preallocated blocks */
3032 rcu_read_lock(); 3231 rcu_read_lock();
3033 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3034 unsigned long pa_end; 3233 ext4_lblk_t pa_end;
3035 3234
3036 if (pa->pa_deleted) 3235 if (pa->pa_deleted)
3037 continue; 3236 continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 /* XXX: extra loop to check we really don't overlap preallocations */ 3274 /* XXX: extra loop to check we really don't overlap preallocations */
3076 rcu_read_lock(); 3275 rcu_read_lock();
3077 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3078 unsigned long pa_end; 3277 ext4_lblk_t pa_end;
3079 spin_lock(&pa->pa_lock); 3278 spin_lock(&pa->pa_lock);
3080 if (pa->pa_deleted == 0) { 3279 if (pa->pa_deleted == 0) {
3081 pa_end = pa->pa_lstart + pa->pa_len; 3280 pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3307} 3506}
3308 3507
3309/* 3508/*
3509 * the function goes through all block freed in the group
3510 * but not yet committed and marks them used in in-core bitmap.
3511 * buddy must be generated from this bitmap
3512 * Need to be called with ext4 group lock (ext4_lock_group)
3513 */
3514static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3515 ext4_group_t group)
3516{
3517 struct rb_node *n;
3518 struct ext4_group_info *grp;
3519 struct ext4_free_data *entry;
3520
3521 grp = ext4_get_group_info(sb, group);
3522 n = rb_first(&(grp->bb_free_root));
3523
3524 while (n) {
3525 entry = rb_entry(n, struct ext4_free_data, node);
3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3527 bitmap, entry->start_blk,
3528 entry->count);
3529 n = rb_next(n);
3530 }
3531 return;
3532}
3533
3534/*
3310 * the function goes through all preallocation in this group and marks them 3535 * the function goes through all preallocation in this group and marks them
3311 * used in in-core bitmap. buddy must be generated from this bitmap 3536 * used in in-core bitmap. buddy must be generated from this bitmap
3312 * Need to be called with ext4 group lock (ext4_lock_group) 3537 * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3346 preallocated += len; 3571 preallocated += len;
3347 count++; 3572 count++;
3348 } 3573 }
3349 mb_debug("prellocated %u for group %lu\n", preallocated, group); 3574 mb_debug("prellocated %u for group %u\n", preallocated, group);
3350} 3575}
3351 3576
3352static void ext4_mb_pa_callback(struct rcu_head *head) 3577static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
3363static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3588static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3364 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3365{ 3590{
3366 unsigned long grp; 3591 ext4_group_t grp;
3367 3592
3368 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3369 return; 3594 return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3473 3698
3474 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3699 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3475 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3700 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3701 trace_mark(ext4_mb_new_inode_pa,
3702 "dev %s ino %lu pstart %llu len %u lstart %u",
3703 sb->s_id, ac->ac_inode->i_ino,
3704 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3476 3705
3477 ext4_mb_use_inode_pa(ac, pa); 3706 ext4_mb_use_inode_pa(ac, pa);
3478 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3530 pa->pa_linear = 1; 3759 pa->pa_linear = 1;
3531 3760
3532 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3761 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3533 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3762 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3763 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
3764 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3534 3765
3535 ext4_mb_use_group_pa(ac, pa); 3766 ext4_mb_use_group_pa(ac, pa);
3536 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3767 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3579{ 3810{
3580 struct super_block *sb = e4b->bd_sb; 3811 struct super_block *sb = e4b->bd_sb;
3581 struct ext4_sb_info *sbi = EXT4_SB(sb); 3812 struct ext4_sb_info *sbi = EXT4_SB(sb);
3582 unsigned long end; 3813 unsigned int end;
3583 unsigned long next; 3814 unsigned int next;
3584 ext4_group_t group; 3815 ext4_group_t group;
3585 ext4_grpblk_t bit; 3816 ext4_grpblk_t bit;
3817 unsigned long long grp_blk_start;
3586 sector_t start; 3818 sector_t start;
3587 int err = 0; 3819 int err = 0;
3588 int free = 0; 3820 int free = 0;
3589 3821
3590 BUG_ON(pa->pa_deleted == 0); 3822 BUG_ON(pa->pa_deleted == 0);
3591 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3823 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3824 grp_blk_start = pa->pa_pstart - bit;
3592 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3825 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3593 end = bit + pa->pa_len; 3826 end = bit + pa->pa_len;
3594 3827
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3618 ext4_mb_store_history(ac); 3851 ext4_mb_store_history(ac);
3619 } 3852 }
3620 3853
3854 trace_mark(ext4_mb_release_inode_pa,
3855 "dev %s ino %lu block %llu count %u",
3856 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3857 next - bit);
3621 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3858 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3622 bit = next + 1; 3859 bit = next + 1;
3623 } 3860 }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3626 pa, (unsigned long) pa->pa_lstart, 3863 pa, (unsigned long) pa->pa_lstart,
3627 (unsigned long) pa->pa_pstart, 3864 (unsigned long) pa->pa_pstart,
3628 (unsigned long) pa->pa_len); 3865 (unsigned long) pa->pa_len);
3629 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3866 ext4_grp_locked_error(sb, group,
3630 free, pa->pa_free); 3867 __func__, "free %u, pa_free %u",
3868 free, pa->pa_free);
3631 /* 3869 /*
3632 * pa is already deleted so we use the value obtained 3870 * pa is already deleted so we use the value obtained
3633 * from the bitmap and continue. 3871 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3650 if (ac) 3888 if (ac)
3651 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3889 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3652 3890
3891 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
3892 sb->s_id, pa->pa_pstart, pa->pa_len);
3653 BUG_ON(pa->pa_deleted == 0); 3893 BUG_ON(pa->pa_deleted == 0);
3654 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3894 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3655 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3895 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3692 int busy = 0; 3932 int busy = 0;
3693 int free = 0; 3933 int free = 0;
3694 3934
3695 mb_debug("discard preallocation for group %lu\n", group); 3935 mb_debug("discard preallocation for group %u\n", group);
3696 3936
3697 if (list_empty(&grp->bb_prealloc_list)) 3937 if (list_empty(&grp->bb_prealloc_list))
3698 return 0; 3938 return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3700 bitmap_bh = ext4_read_block_bitmap(sb, group); 3940 bitmap_bh = ext4_read_block_bitmap(sb, group);
3701 if (bitmap_bh == NULL) { 3941 if (bitmap_bh == NULL) {
3702 ext4_error(sb, __func__, "Error in reading block " 3942 ext4_error(sb, __func__, "Error in reading block "
3703 "bitmap for %lu\n", group); 3943 "bitmap for %u", group);
3704 return 0; 3944 return 0;
3705 } 3945 }
3706 3946
3707 err = ext4_mb_load_buddy(sb, group, &e4b); 3947 err = ext4_mb_load_buddy(sb, group, &e4b);
3708 if (err) { 3948 if (err) {
3709 ext4_error(sb, __func__, "Error in loading buddy " 3949 ext4_error(sb, __func__, "Error in loading buddy "
3710 "information for %lu\n", group); 3950 "information for %u", group);
3711 put_bh(bitmap_bh); 3951 put_bh(bitmap_bh);
3712 return 0; 3952 return 0;
3713 } 3953 }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
3815 } 4055 }
3816 4056
3817 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4057 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4058 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
4059 inode->i_ino);
3818 4060
3819 INIT_LIST_HEAD(&list); 4061 INIT_LIST_HEAD(&list);
3820 4062
@@ -3874,14 +4116,14 @@ repeat:
3874 err = ext4_mb_load_buddy(sb, group, &e4b); 4116 err = ext4_mb_load_buddy(sb, group, &e4b);
3875 if (err) { 4117 if (err) {
3876 ext4_error(sb, __func__, "Error in loading buddy " 4118 ext4_error(sb, __func__, "Error in loading buddy "
3877 "information for %lu\n", group); 4119 "information for %u", group);
3878 continue; 4120 continue;
3879 } 4121 }
3880 4122
3881 bitmap_bh = ext4_read_block_bitmap(sb, group); 4123 bitmap_bh = ext4_read_block_bitmap(sb, group);
3882 if (bitmap_bh == NULL) { 4124 if (bitmap_bh == NULL) {
3883 ext4_error(sb, __func__, "Error in reading block " 4125 ext4_error(sb, __func__, "Error in reading block "
3884 "bitmap for %lu\n", group); 4126 "bitmap for %u", group);
3885 ext4_mb_release_desc(&e4b); 4127 ext4_mb_release_desc(&e4b);
3886 continue; 4128 continue;
3887 } 4129 }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_sb_info *sbi = EXT4_SB(sb); 4266 struct ext4_sb_info *sbi = EXT4_SB(sb);
4025 struct ext4_super_block *es = sbi->s_es; 4267 struct ext4_super_block *es = sbi->s_es;
4026 ext4_group_t group; 4268 ext4_group_t group;
4027 unsigned long len; 4269 unsigned int len;
4028 unsigned long goal; 4270 ext4_fsblk_t goal;
4029 ext4_grpblk_t block; 4271 ext4_grpblk_t block;
4030 4272
4031 /* we can't allocate > group size */ 4273 /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4068 ac->ac_pa = NULL; 4310 ac->ac_pa = NULL;
4069 ac->ac_bitmap_page = NULL; 4311 ac->ac_bitmap_page = NULL;
4070 ac->ac_buddy_page = NULL; 4312 ac->ac_buddy_page = NULL;
4313 ac->alloc_semp = NULL;
4071 ac->ac_lg = NULL; 4314 ac->ac_lg = NULL;
4072 4315
4073 /* we have to define context: we'll we work with a file or 4316 /* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4146 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4389 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4147 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4390 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4148 ext4_error(sb, __func__, "Error in loading buddy " 4391 ext4_error(sb, __func__, "Error in loading buddy "
4149 "information for %lu\n", group); 4392 "information for %u", group);
4150 continue; 4393 continue;
4151 } 4394 }
4152 ext4_lock_group(sb, group); 4395 ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4248 } 4491 }
4249 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4250 } 4493 }
4494 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp);
4251 if (ac->ac_bitmap_page) 4496 if (ac->ac_bitmap_page)
4252 page_cache_release(ac->ac_bitmap_page); 4497 page_cache_release(ac->ac_bitmap_page);
4253 if (ac->ac_buddy_page) 4498 if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4264 int ret; 4509 int ret;
4265 int freed = 0; 4510 int freed = 0;
4266 4511
4512 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4513 sb->s_id, needed);
4267 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4514 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4268 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4515 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4269 freed += ret; 4516 freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4286 struct ext4_sb_info *sbi; 4533 struct ext4_sb_info *sbi;
4287 struct super_block *sb; 4534 struct super_block *sb;
4288 ext4_fsblk_t block = 0; 4535 ext4_fsblk_t block = 0;
4289 unsigned long inquota; 4536 unsigned int inquota;
4290 unsigned long reserv_blks = 0; 4537 unsigned int reserv_blks = 0;
4291 4538
4292 sb = ar->inode->i_sb; 4539 sb = ar->inode->i_sb;
4293 sbi = EXT4_SB(sb); 4540 sbi = EXT4_SB(sb);
4294 4541
4542 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
4543 "lblk %llu goal %llu lleft %llu lright %llu "
4544 "pleft %llu pright %llu ",
4545 sb->s_id, ar->flags, ar->len,
4546 ar->inode ? ar->inode->i_ino : 0,
4547 (unsigned long long) ar->logical,
4548 (unsigned long long) ar->goal,
4549 (unsigned long long) ar->lleft,
4550 (unsigned long long) ar->lright,
4551 (unsigned long long) ar->pleft,
4552 (unsigned long long) ar->pright);
4553
4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4554 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4296 /* 4555 /*
4297 * With delalloc we already reserved the blocks 4556 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 } 4572 }
4314 if (ar->len == 0) { 4573 if (ar->len == 0) {
4315 *errp = -EDQUOT; 4574 *errp = -EDQUOT;
4316 return 0; 4575 goto out3;
4317 } 4576 }
4318 inquota = ar->len; 4577 inquota = ar->len;
4319 4578
@@ -4348,10 +4607,14 @@ repeat:
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4607 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac); 4608 ext4_mb_new_preallocation(ac);
4350 } 4609 }
4351
4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4610 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4611 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4354 if (*errp == -EAGAIN) { 4612 if (*errp == -EAGAIN) {
4613 /*
4614 * drop the reference that we took
4615 * in ext4_mb_use_best_found
4616 */
4617 ext4_mb_release_context(ac);
4355 ac->ac_b_ex.fe_group = 0; 4618 ac->ac_b_ex.fe_group = 0;
4356 ac->ac_b_ex.fe_start = 0; 4619 ac->ac_b_ex.fe_start = 0;
4357 ac->ac_b_ex.fe_len = 0; 4620 ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
4382out1: 4645out1:
4383 if (ar->len < inquota) 4646 if (ar->len < inquota)
4384 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4647 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4648out3:
4649 if (!ar->len) {
4650 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4651 /* release all the reserved blocks if non delalloc */
4652 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4653 reserv_blks);
4654 }
4655
4656 trace_mark(ext4_allocate_blocks,
4657 "dev %s block %llu flags %u len %u ino %lu "
4658 "logical %llu goal %llu lleft %llu lright %llu "
4659 "pleft %llu pright %llu ",
4660 sb->s_id, (unsigned long long) block,
4661 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4662 (unsigned long long) ar->logical,
4663 (unsigned long long) ar->goal,
4664 (unsigned long long) ar->lleft,
4665 (unsigned long long) ar->lright,
4666 (unsigned long long) ar->pleft,
4667 (unsigned long long) ar->pright);
4385 4668
4386 return block; 4669 return block;
4387} 4670}
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
4403 4686
4404static noinline_for_stack int 4687static noinline_for_stack int
4405ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4688ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4406 ext4_group_t group, ext4_grpblk_t block, int count) 4689 struct ext4_free_data *new_entry)
4407{ 4690{
4691 ext4_grpblk_t block;
4692 struct ext4_free_data *entry;
4408 struct ext4_group_info *db = e4b->bd_info; 4693 struct ext4_group_info *db = e4b->bd_info;
4409 struct super_block *sb = e4b->bd_sb; 4694 struct super_block *sb = e4b->bd_sb;
4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4695 struct ext4_sb_info *sbi = EXT4_SB(sb);
4411 struct ext4_free_data *entry, *new_entry;
4412 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4696 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node; 4697 struct rb_node *parent = NULL, *new_node;
4414 4698
4415 4699 BUG_ON(!ext4_handle_valid(handle));
4416 BUG_ON(e4b->bd_bitmap_page == NULL); 4700 BUG_ON(e4b->bd_bitmap_page == NULL);
4417 BUG_ON(e4b->bd_buddy_page == NULL); 4701 BUG_ON(e4b->bd_buddy_page == NULL);
4418 4702
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node; 4703 new_node = &new_entry->node;
4704 block = new_entry->start_blk;
4425 4705
4426 ext4_lock_group(sb, group);
4427 if (!*n) { 4706 if (!*n) {
4428 /* first free block exent. We need to 4707 /* first free block exent. We need to
4429 protect buddy cache from being freed, 4708 protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4441 else if (block >= (entry->start_blk + entry->count)) 4720 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right; 4721 n = &(*n)->rb_right;
4443 else { 4722 else {
4444 ext4_unlock_group(sb, group); 4723 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
4445 ext4_error(sb, __func__, 4724 "Double free of blocks %d (%d %d)",
4446 "Double free of blocks %d (%d %d)\n", 4725 block, entry->start_blk, entry->count);
4447 block, entry->start_blk, entry->count);
4448 return 0; 4726 return 0;
4449 } 4727 }
4450 } 4728 }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 spin_lock(&sbi->s_md_lock); 4761 spin_lock(&sbi->s_md_lock);
4484 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4762 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4485 spin_unlock(&sbi->s_md_lock); 4763 spin_unlock(&sbi->s_md_lock);
4486 ext4_unlock_group(sb, group);
4487 return 0; 4764 return 0;
4488} 4765}
4489 4766
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4499 struct ext4_allocation_context *ac = NULL; 4776 struct ext4_allocation_context *ac = NULL;
4500 struct ext4_group_desc *gdp; 4777 struct ext4_group_desc *gdp;
4501 struct ext4_super_block *es; 4778 struct ext4_super_block *es;
4502 unsigned long overflow; 4779 unsigned int overflow;
4503 ext4_grpblk_t bit; 4780 ext4_grpblk_t bit;
4504 struct buffer_head *gd_bh; 4781 struct buffer_head *gd_bh;
4505 ext4_group_t block_group; 4782 ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4522 } 4799 }
4523 4800
4524 ext4_debug("freeing block %lu\n", block); 4801 ext4_debug("freeing block %lu\n", block);
4802 trace_mark(ext4_free_blocks,
4803 "dev %s block %llu count %lu metadata %d ino %lu",
4804 sb->s_id, (unsigned long long) block, count, metadata,
4805 inode ? inode->i_ino : 0);
4525 4806
4526 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4807 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4527 if (ac) { 4808 if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
4581 err = ext4_journal_get_write_access(handle, gd_bh); 4862 err = ext4_journal_get_write_access(handle, gd_bh);
4582 if (err) 4863 if (err)
4583 goto error_return; 4864 goto error_return;
4584
4585 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4586 if (err)
4587 goto error_return;
4588
4589#ifdef AGGRESSIVE_CHECK 4865#ifdef AGGRESSIVE_CHECK
4590 { 4866 {
4591 int i; 4867 int i;
@@ -4593,13 +4869,6 @@ do_more:
4593 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4869 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4594 } 4870 }
4595#endif 4871#endif
4596 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4597 bit, count);
4598
4599 /* We dirtied the bitmap block */
4600 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4601 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4602
4603 if (ac) { 4872 if (ac) {
4604 ac->ac_b_ex.fe_group = block_group; 4873 ac->ac_b_ex.fe_group = block_group;
4605 ac->ac_b_ex.fe_start = bit; 4874 ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
4607 ext4_mb_store_history(ac); 4876 ext4_mb_store_history(ac);
4608 } 4877 }
4609 4878
4610 if (metadata) { 4879 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4611 /* blocks being freed are metadata. these blocks shouldn't 4880 if (err)
4612 * be used until this transaction is committed */ 4881 goto error_return;
4613 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4882 if (metadata && ext4_handle_valid(handle)) {
4883 struct ext4_free_data *new_entry;
4884 /*
4885 * blocks being freed are metadata. these blocks shouldn't
4886 * be used until this transaction is committed
4887 */
4888 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4889 new_entry->start_blk = bit;
4890 new_entry->group = block_group;
4891 new_entry->count = count;
4892 new_entry->t_tid = handle->h_transaction->t_tid;
4893 ext4_lock_group(sb, block_group);
4894 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4895 bit, count);
4896 ext4_mb_free_metadata(handle, &e4b, new_entry);
4897 ext4_unlock_group(sb, block_group);
4614 } else { 4898 } else {
4615 ext4_lock_group(sb, block_group); 4899 ext4_lock_group(sb, block_group);
4900 /* need to update group_info->bb_free and bitmap
4901 * with group lock held. generate_buddy look at
4902 * them with group lock_held
4903 */
4904 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4905 bit, count);
4616 mb_free_blocks(inode, &e4b, bit, count); 4906 mb_free_blocks(inode, &e4b, bit, count);
4617 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4907 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4618 ext4_unlock_group(sb, block_group); 4908 ext4_unlock_group(sb, block_group);
4619 } 4909 }
4620 4910
4621 spin_lock(sb_bgl_lock(sbi, block_group)); 4911 spin_lock(sb_bgl_lock(sbi, block_group));
4622 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4912 ret = ext4_free_blks_count(sb, gdp) + count;
4913 ext4_free_blks_set(sb, gdp, ret);
4623 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4914 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4624 spin_unlock(sb_bgl_lock(sbi, block_group)); 4915 spin_unlock(sb_bgl_lock(sbi, block_group));
4625 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4916 percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
4635 4926
4636 *freed += count; 4927 *freed += count;
4637 4928
4929 /* We dirtied the bitmap block */
4930 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4931 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4932
4638 /* And the group descriptor block */ 4933 /* And the group descriptor block */
4639 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4934 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4640 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4935 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4641 if (!err) 4936 if (!err)
4642 err = ret; 4937 err = ret;
4643 4938
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h> 22#include <linux/marker.h>
23#include <linux/mutex.h>
23#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
24#include "ext4.h" 25#include "ext4.h"
25#include "group.h" 26#include "group.h"
@@ -98,9 +99,6 @@
98 */ 99 */
99#define MB_DEFAULT_GROUP_PREALLOC 512 100#define MB_DEFAULT_GROUP_PREALLOC 512
100 101
101static struct kmem_cache *ext4_pspace_cachep;
102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
104 102
105struct ext4_free_data { 103struct ext4_free_data {
106 /* this links the free block information from group_info */ 104 /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
120 tid_t t_tid; 118 tid_t t_tid;
121}; 119};
122 120
123struct ext4_group_info {
124 unsigned long bb_state;
125 struct rb_root bb_free_root;
126 unsigned short bb_first_free;
127 unsigned short bb_free;
128 unsigned short bb_fragments;
129 struct list_head bb_prealloc_list;
130#ifdef DOUBLE_CHECK
131 void *bb_bitmap;
132#endif
133 unsigned short bb_counters[];
134};
135
136#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
137#define EXT4_GROUP_INFO_LOCKED_BIT 1
138
139#define EXT4_MB_GRP_NEED_INIT(grp) \
140 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
141
142
143struct ext4_prealloc_space { 121struct ext4_prealloc_space {
144 struct list_head pa_inode_list; 122 struct list_head pa_inode_list;
145 struct list_head pa_group_list; 123 struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
217 __u8 ac_op; /* operation, for history only */ 195 __u8 ac_op; /* operation, for history only */
218 struct page *ac_bitmap_page; 196 struct page *ac_bitmap_page;
219 struct page *ac_buddy_page; 197 struct page *ac_buddy_page;
198 /*
199 * pointer to the held semaphore upon successful
200 * block allocation
201 */
202 struct rw_semaphore *alloc_semp;
220 struct ext4_prealloc_space *ac_pa; 203 struct ext4_prealloc_space *ac_pa;
221 struct ext4_locality_group *ac_lg; 204 struct ext4_locality_group *ac_lg;
222}; 205};
@@ -250,6 +233,7 @@ struct ext4_buddy {
250 struct super_block *bd_sb; 233 struct super_block *bd_sb;
251 __u16 bd_blkbits; 234 __u16 bd_blkbits;
252 ext4_group_t bd_group; 235 ext4_group_t bd_group;
236 struct rw_semaphore *alloc_semp;
253}; 237};
254#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 238#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
255#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 239#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
259{ 243{
260 return; 244 return;
261} 245}
262#else
263static void ext4_mb_store_history(struct ext4_allocation_context *ac);
264#endif 246#endif
265 247
266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
267 249
268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
269 251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
271 ext4_group_t group);
272static void ext4_mb_return_to_preallocation(struct inode *inode,
273 struct ext4_buddy *e4b, sector_t block,
274 int count);
275static void ext4_mb_put_pa(struct ext4_allocation_context *,
276 struct super_block *, struct ext4_prealloc_space *pa);
277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
280
281
282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
283{
284 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
285
286 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
287}
288
289static inline void ext4_unlock_group(struct super_block *sb,
290 ext4_group_t group)
291{
292 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
293
294 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
295}
296
297static inline int ext4_is_group_locked(struct super_block *sb,
298 ext4_group_t group)
299{
300 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
301
302 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
303 &(grinfo->bb_state));
304}
305
306static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
307 struct ext4_free_extent *fex) 252 struct ext4_free_extent *fex)
308{ 253{
309 ext4_fsblk_t block; 254 ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
59 /* 59 /*
60 * Make sure the credit we accumalated is not really high 60 * Make sure the credit we accumalated is not really high
61 */ 61 */
62 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { 62 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) {
63 retval = ext4_journal_restart(handle, needed); 64 retval = ext4_journal_restart(handle, needed);
64 if (retval) 65 if (retval)
65 goto err_out; 66 goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
229{ 230{
230 int retval = 0, needed; 231 int retval = 0, needed;
231 232
232 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 233 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
233 return 0; 234 return 0;
234 /* 235 /*
235 * We are freeing a blocks. During this we touch 236 * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
458 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
459 unsigned long max_entries; 460 unsigned long max_entries;
460 461
461 if (!test_opt(inode->i_sb, EXTENTS)) 462 /*
462 /* 463 * If the filesystem does not support extents, or the inode
463 * if mounted with noextents we don't allow the migrate 464 * already is extent-based, error out.
464 */ 465 */
465 return -EINVAL; 466 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
466 467 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
467 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 468 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
468 return -EINVAL; 469 return -EINVAL;
469 470
470 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 471 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4b8d431d7dff..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -368,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
368 goto fail; 368 goto fail;
369 } 369 }
370 hinfo->hash_version = root->info.hash_version; 370 hinfo->hash_version = root->info.hash_version;
371 if (hinfo->hash_version <= DX_HASH_TEA)
372 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
371 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 373 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
372 if (d_name) 374 if (d_name)
373 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 375 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -637,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
637 dir = dir_file->f_path.dentry->d_inode; 639 dir = dir_file->f_path.dentry->d_inode;
638 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 640 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
639 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 641 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
642 if (hinfo.hash_version <= DX_HASH_TEA)
643 hinfo.hash_version +=
644 EXT4_SB(dir->i_sb)->s_hash_unsigned;
640 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 645 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 646 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
642 start_hash, start_minor_hash); 647 start_hash, start_minor_hash);
@@ -802,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
802static inline int search_dirblock(struct buffer_head *bh, 807static inline int search_dirblock(struct buffer_head *bh,
803 struct inode *dir, 808 struct inode *dir,
804 const struct qstr *d_name, 809 const struct qstr *d_name,
805 unsigned long offset, 810 unsigned int offset,
806 struct ext4_dir_entry_2 ** res_dir) 811 struct ext4_dir_entry_2 ** res_dir)
807{ 812{
808 struct ext4_dir_entry_2 * de; 813 struct ext4_dir_entry_2 * de;
@@ -1039,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1039 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1044 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1040 inode = NULL; 1045 inode = NULL;
1041 if (bh) { 1046 if (bh) {
1042 unsigned long ino = le32_to_cpu(de->inode); 1047 __u32 ino = le32_to_cpu(de->inode);
1043 brelse(bh); 1048 brelse(bh);
1044 if (!ext4_valid_inum(dir->i_sb, ino)) { 1049 if (!ext4_valid_inum(dir->i_sb, ino)) {
1045 ext4_error(dir->i_sb, "ext4_lookup", 1050 ext4_error(dir->i_sb, "ext4_lookup",
1046 "bad inode number: %lu", ino); 1051 "bad inode number: %u", ino);
1047 return ERR_PTR(-EIO); 1052 return ERR_PTR(-EIO);
1048 } 1053 }
1049 inode = ext4_iget(dir->i_sb, ino); 1054 inode = ext4_iget(dir->i_sb, ino);
@@ -1056,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1056 1061
1057struct dentry *ext4_get_parent(struct dentry *child) 1062struct dentry *ext4_get_parent(struct dentry *child)
1058{ 1063{
1059 unsigned long ino; 1064 __u32 ino;
1060 struct inode *inode; 1065 struct inode *inode;
1061 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1062 .name = "..", 1067 .name = "..",
@@ -1074,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1074 1079
1075 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1080 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1076 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1081 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1077 "bad inode number: %lu", ino); 1082 "bad inode number: %u", ino);
1078 return ERR_PTR(-EIO); 1083 return ERR_PTR(-EIO);
1079 } 1084 }
1080 1085
@@ -1162,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1162 u32 hash2; 1167 u32 hash2;
1163 struct dx_map_entry *map; 1168 struct dx_map_entry *map;
1164 char *data1 = (*bh)->b_data, *data2; 1169 char *data1 = (*bh)->b_data, *data2;
1165 unsigned split, move, size, i; 1170 unsigned split, move, size;
1166 struct ext4_dir_entry_2 *de = NULL, *de2; 1171 struct ext4_dir_entry_2 *de = NULL, *de2;
1167 int err = 0; 1172 int err = 0, i;
1168 1173
1169 bh2 = ext4_append (handle, dir, &newblock, &err); 1174 bh2 = ext4_append (handle, dir, &newblock, &err);
1170 if (!(bh2)) { 1175 if (!(bh2)) {
@@ -1224,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1224 de = de2; 1229 de = de2;
1225 } 1230 }
1226 dx_insert_block(frame, hash2 + continued, newblock); 1231 dx_insert_block(frame, hash2 + continued, newblock);
1227 err = ext4_journal_dirty_metadata(handle, bh2); 1232 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1228 if (err) 1233 if (err)
1229 goto journal_error; 1234 goto journal_error;
1230 err = ext4_journal_dirty_metadata(handle, frame->bh); 1235 err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
1231 if (err) 1236 if (err)
1232 goto journal_error; 1237 goto journal_error;
1233 brelse(bh2); 1238 brelse(bh2);
@@ -1262,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1262 struct inode *dir = dentry->d_parent->d_inode; 1267 struct inode *dir = dentry->d_parent->d_inode;
1263 const char *name = dentry->d_name.name; 1268 const char *name = dentry->d_name.name;
1264 int namelen = dentry->d_name.len; 1269 int namelen = dentry->d_name.len;
1265 unsigned long offset = 0; 1270 unsigned int offset = 0;
1266 unsigned short reclen; 1271 unsigned short reclen;
1267 int nlen, rlen, err; 1272 int nlen, rlen, err;
1268 char *top; 1273 char *top;
@@ -1331,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1331 ext4_update_dx_flag(dir); 1336 ext4_update_dx_flag(dir);
1332 dir->i_version++; 1337 dir->i_version++;
1333 ext4_mark_inode_dirty(handle, dir); 1338 ext4_mark_inode_dirty(handle, dir);
1334 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1339 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1335 err = ext4_journal_dirty_metadata(handle, bh); 1340 err = ext4_handle_dirty_metadata(handle, dir, bh);
1336 if (err) 1341 if (err)
1337 ext4_std_error(dir->i_sb, err); 1342 ext4_std_error(dir->i_sb, err);
1338 brelse(bh); 1343 brelse(bh);
@@ -1404,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 1409
1405 /* Initialize as for dx_probe */ 1410 /* Initialize as for dx_probe */
1406 hinfo.hash_version = root->info.hash_version; 1411 hinfo.hash_version = root->info.hash_version;
1412 if (hinfo.hash_version <= DX_HASH_TEA)
1413 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1407 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1414 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1408 ext4fs_dirhash(name, namelen, &hinfo); 1415 ext4fs_dirhash(name, namelen, &hinfo);
1409 frame = frames; 1416 frame = frames;
@@ -1433,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1433 struct inode *inode) 1440 struct inode *inode)
1434{ 1441{
1435 struct inode *dir = dentry->d_parent->d_inode; 1442 struct inode *dir = dentry->d_parent->d_inode;
1436 unsigned long offset;
1437 struct buffer_head *bh; 1443 struct buffer_head *bh;
1438 struct ext4_dir_entry_2 *de; 1444 struct ext4_dir_entry_2 *de;
1439 struct super_block *sb; 1445 struct super_block *sb;
@@ -1455,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1455 ext4_mark_inode_dirty(handle, dir); 1461 ext4_mark_inode_dirty(handle, dir);
1456 } 1462 }
1457 blocks = dir->i_size >> sb->s_blocksize_bits; 1463 blocks = dir->i_size >> sb->s_blocksize_bits;
1458 for (block = 0, offset = 0; block < blocks; block++) { 1464 for (block = 0; block < blocks; block++) {
1459 bh = ext4_bread(handle, dir, block, 0, &retval); 1465 bh = ext4_bread(handle, dir, block, 0, &retval);
1460 if(!bh) 1466 if(!bh)
1461 return retval; 1467 return retval;
@@ -1570,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1570 dxtrace(dx_show_index("node", frames[1].entries)); 1576 dxtrace(dx_show_index("node", frames[1].entries));
1571 dxtrace(dx_show_index("node", 1577 dxtrace(dx_show_index("node",
1572 ((struct dx_node *) bh2->b_data)->entries)); 1578 ((struct dx_node *) bh2->b_data)->entries));
1573 err = ext4_journal_dirty_metadata(handle, bh2); 1579 err = ext4_handle_dirty_metadata(handle, inode, bh2);
1574 if (err) 1580 if (err)
1575 goto journal_error; 1581 goto journal_error;
1576 brelse (bh2); 1582 brelse (bh2);
@@ -1596,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1596 if (err) 1602 if (err)
1597 goto journal_error; 1603 goto journal_error;
1598 } 1604 }
1599 ext4_journal_dirty_metadata(handle, frames[0].bh); 1605 ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1600 } 1606 }
1601 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1607 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1602 if (!de) 1608 if (!de)
@@ -1642,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
1642 else 1648 else
1643 de->inode = 0; 1649 de->inode = 0;
1644 dir->i_version++; 1650 dir->i_version++;
1645 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1651 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1646 ext4_journal_dirty_metadata(handle, bh); 1652 ext4_handle_dirty_metadata(handle, dir, bh);
1647 return 0; 1653 return 0;
1648 } 1654 }
1649 i += ext4_rec_len_from_disk(de->rec_len); 1655 i += ext4_rec_len_from_disk(de->rec_len);
@@ -1721,7 +1727,7 @@ retry:
1721 return PTR_ERR(handle); 1727 return PTR_ERR(handle);
1722 1728
1723 if (IS_DIRSYNC(dir)) 1729 if (IS_DIRSYNC(dir))
1724 handle->h_sync = 1; 1730 ext4_handle_sync(handle);
1725 1731
1726 inode = ext4_new_inode (handle, dir, mode); 1732 inode = ext4_new_inode (handle, dir, mode);
1727 err = PTR_ERR(inode); 1733 err = PTR_ERR(inode);
@@ -1755,7 +1761,7 @@ retry:
1755 return PTR_ERR(handle); 1761 return PTR_ERR(handle);
1756 1762
1757 if (IS_DIRSYNC(dir)) 1763 if (IS_DIRSYNC(dir))
1758 handle->h_sync = 1; 1764 ext4_handle_sync(handle);
1759 1765
1760 inode = ext4_new_inode(handle, dir, mode); 1766 inode = ext4_new_inode(handle, dir, mode);
1761 err = PTR_ERR(inode); 1767 err = PTR_ERR(inode);
@@ -1791,7 +1797,7 @@ retry:
1791 return PTR_ERR(handle); 1797 return PTR_ERR(handle);
1792 1798
1793 if (IS_DIRSYNC(dir)) 1799 if (IS_DIRSYNC(dir))
1794 handle->h_sync = 1; 1800 ext4_handle_sync(handle);
1795 1801
1796 inode = ext4_new_inode(handle, dir, S_IFDIR | mode); 1802 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1797 err = PTR_ERR(inode); 1803 err = PTR_ERR(inode);
@@ -1820,8 +1826,8 @@ retry:
1820 strcpy(de->name, ".."); 1826 strcpy(de->name, "..");
1821 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1827 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1822 inode->i_nlink = 2; 1828 inode->i_nlink = 2;
1823 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1829 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1824 ext4_journal_dirty_metadata(handle, dir_block); 1830 ext4_handle_dirty_metadata(handle, dir, dir_block);
1825 brelse(dir_block); 1831 brelse(dir_block);
1826 ext4_mark_inode_dirty(handle, inode); 1832 ext4_mark_inode_dirty(handle, inode);
1827 err = ext4_add_entry(handle, dentry, inode); 1833 err = ext4_add_entry(handle, dentry, inode);
@@ -1850,7 +1856,7 @@ out_stop:
1850 */ 1856 */
1851static int empty_dir(struct inode *inode) 1857static int empty_dir(struct inode *inode)
1852{ 1858{
1853 unsigned long offset; 1859 unsigned int offset;
1854 struct buffer_head *bh; 1860 struct buffer_head *bh;
1855 struct ext4_dir_entry_2 *de, *de1; 1861 struct ext4_dir_entry_2 *de, *de1;
1856 struct super_block *sb; 1862 struct super_block *sb;
@@ -1895,7 +1901,7 @@ static int empty_dir(struct inode *inode)
1895 if (err) 1901 if (err)
1896 ext4_error(sb, __func__, 1902 ext4_error(sb, __func__,
1897 "error %d reading directory" 1903 "error %d reading directory"
1898 " #%lu offset %lu", 1904 " #%lu offset %u",
1899 err, inode->i_ino, offset); 1905 err, inode->i_ino, offset);
1900 offset += sb->s_blocksize; 1906 offset += sb->s_blocksize;
1901 continue; 1907 continue;
@@ -1933,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1933 struct ext4_iloc iloc; 1939 struct ext4_iloc iloc;
1934 int err = 0, rc; 1940 int err = 0, rc;
1935 1941
1942 if (!ext4_handle_valid(handle))
1943 return 0;
1944
1936 lock_super(sb); 1945 lock_super(sb);
1937 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1946 if (!list_empty(&EXT4_I(inode)->i_orphan))
1938 goto out_unlock; 1947 goto out_unlock;
@@ -1961,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1961 /* Insert this inode at the head of the on-disk orphan list... */ 1970 /* Insert this inode at the head of the on-disk orphan list... */
1962 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 1971 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1963 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 1972 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1964 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1973 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
1965 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 1974 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1966 if (!err) 1975 if (!err)
1967 err = rc; 1976 err = rc;
@@ -1995,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
1995 struct list_head *prev; 2004 struct list_head *prev;
1996 struct ext4_inode_info *ei = EXT4_I(inode); 2005 struct ext4_inode_info *ei = EXT4_I(inode);
1997 struct ext4_sb_info *sbi; 2006 struct ext4_sb_info *sbi;
1998 unsigned long ino_next; 2007 __u32 ino_next;
1999 struct ext4_iloc iloc; 2008 struct ext4_iloc iloc;
2000 int err = 0; 2009 int err = 0;
2001 2010
2011 if (!ext4_handle_valid(handle))
2012 return 0;
2013
2002 lock_super(inode->i_sb); 2014 lock_super(inode->i_sb);
2003 if (list_empty(&ei->i_orphan)) { 2015 if (list_empty(&ei->i_orphan)) {
2004 unlock_super(inode->i_sb); 2016 unlock_super(inode->i_sb);
@@ -2017,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2017 * transaction handle with which to update the orphan list on 2029 * transaction handle with which to update the orphan list on
2018 * disk, but we still need to remove the inode from the linked 2030 * disk, but we still need to remove the inode from the linked
2019 * list in memory. */ 2031 * list in memory. */
2020 if (!handle) 2032 if (sbi->s_journal && !handle)
2021 goto out; 2033 goto out;
2022 2034
2023 err = ext4_reserve_inode_write(handle, inode, &iloc); 2035 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2025 goto out_err; 2037 goto out_err;
2026 2038
2027 if (prev == &sbi->s_orphan) { 2039 if (prev == &sbi->s_orphan) {
2028 jbd_debug(4, "superblock will point to %lu\n", ino_next); 2040 jbd_debug(4, "superblock will point to %u\n", ino_next);
2029 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 2041 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2030 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 2042 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2031 if (err) 2043 if (err)
2032 goto out_brelse; 2044 goto out_brelse;
2033 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2045 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2034 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); 2046 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
2035 } else { 2047 } else {
2036 struct ext4_iloc iloc2; 2048 struct ext4_iloc iloc2;
2037 struct inode *i_prev = 2049 struct inode *i_prev =
2038 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; 2050 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2039 2051
2040 jbd_debug(4, "orphan inode %lu will point to %lu\n", 2052 jbd_debug(4, "orphan inode %lu will point to %u\n",
2041 i_prev->i_ino, ino_next); 2053 i_prev->i_ino, ino_next);
2042 err = ext4_reserve_inode_write(handle, i_prev, &iloc2); 2054 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2043 if (err) 2055 if (err)
@@ -2082,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2082 goto end_rmdir; 2094 goto end_rmdir;
2083 2095
2084 if (IS_DIRSYNC(dir)) 2096 if (IS_DIRSYNC(dir))
2085 handle->h_sync = 1; 2097 ext4_handle_sync(handle);
2086 2098
2087 inode = dentry->d_inode; 2099 inode = dentry->d_inode;
2088 2100
@@ -2136,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2136 return PTR_ERR(handle); 2148 return PTR_ERR(handle);
2137 2149
2138 if (IS_DIRSYNC(dir)) 2150 if (IS_DIRSYNC(dir))
2139 handle->h_sync = 1; 2151 ext4_handle_sync(handle);
2140 2152
2141 retval = -ENOENT; 2153 retval = -ENOENT;
2142 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2154 bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2205,7 @@ retry:
2193 return PTR_ERR(handle); 2205 return PTR_ERR(handle);
2194 2206
2195 if (IS_DIRSYNC(dir)) 2207 if (IS_DIRSYNC(dir))
2196 handle->h_sync = 1; 2208 ext4_handle_sync(handle);
2197 2209
2198 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); 2210 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2199 err = PTR_ERR(inode); 2211 err = PTR_ERR(inode);
@@ -2256,7 +2268,7 @@ retry:
2256 return PTR_ERR(handle); 2268 return PTR_ERR(handle);
2257 2269
2258 if (IS_DIRSYNC(dir)) 2270 if (IS_DIRSYNC(dir))
2259 handle->h_sync = 1; 2271 ext4_handle_sync(handle);
2260 2272
2261 inode->i_ctime = ext4_current_time(inode); 2273 inode->i_ctime = ext4_current_time(inode);
2262 ext4_inc_count(handle, inode); 2274 ext4_inc_count(handle, inode);
@@ -2305,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2305 return PTR_ERR(handle); 2317 return PTR_ERR(handle);
2306 2318
2307 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2319 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2308 handle->h_sync = 1; 2320 ext4_handle_sync(handle);
2309 2321
2310 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2322 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2311 /* 2323 /*
@@ -2359,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2359 new_dir->i_ctime = new_dir->i_mtime = 2371 new_dir->i_ctime = new_dir->i_mtime =
2360 ext4_current_time(new_dir); 2372 ext4_current_time(new_dir);
2361 ext4_mark_inode_dirty(handle, new_dir); 2373 ext4_mark_inode_dirty(handle, new_dir);
2362 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); 2374 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2363 ext4_journal_dirty_metadata(handle, new_bh); 2375 ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2364 brelse(new_bh); 2376 brelse(new_bh);
2365 new_bh = NULL; 2377 new_bh = NULL;
2366 } 2378 }
@@ -2410,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2410 BUFFER_TRACE(dir_bh, "get_write_access"); 2422 BUFFER_TRACE(dir_bh, "get_write_access");
2411 ext4_journal_get_write_access(handle, dir_bh); 2423 ext4_journal_get_write_access(handle, dir_bh);
2412 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2424 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2413 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2425 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2414 ext4_journal_dirty_metadata(handle, dir_bh); 2426 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2415 ext4_dec_count(handle, old_dir); 2427 ext4_dec_count(handle, old_dir);
2416 if (new_inode) { 2428 if (new_inode) {
2417 /* checked empty_dir above, can't have another parent, 2429 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c328be5d6885 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
50 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 50 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
51 if (group != sbi->s_groups_count) 51 if (group != sbi->s_groups_count)
52 ext4_warning(sb, __func__, 52 ext4_warning(sb, __func__,
53 "Cannot add at group %u (only %lu groups)", 53 "Cannot add at group %u (only %u groups)",
54 input->group, sbi->s_groups_count); 54 input->group, sbi->s_groups_count);
55 else if (offset != 0) 55 else if (offset != 0)
56 ext4_warning(sb, __func__, "Last group not full"); 56 ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
149{ 149{
150 int err; 150 int err;
151 151
152 if (handle->h_buffer_credits >= thresh) 152 if (ext4_handle_has_enough_credits(handle, thresh))
153 return 0; 153 return 0;
154 154
155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); 155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
233 set_buffer_uptodate(gdb); 233 set_buffer_uptodate(gdb);
234 unlock_buffer(gdb); 234 unlock_buffer(gdb);
235 ext4_journal_dirty_metadata(handle, gdb); 235 ext4_handle_dirty_metadata(handle, NULL, gdb);
236 ext4_set_bit(bit, bh->b_data); 236 ext4_set_bit(bit, bh->b_data);
237 brelse(gdb); 237 brelse(gdb);
238 } 238 }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
251 err = PTR_ERR(bh); 251 err = PTR_ERR(bh);
252 goto exit_bh; 252 goto exit_bh;
253 } 253 }
254 ext4_journal_dirty_metadata(handle, gdb); 254 ext4_handle_dirty_metadata(handle, NULL, gdb);
255 ext4_set_bit(bit, bh->b_data); 255 ext4_set_bit(bit, bh->b_data);
256 brelse(gdb); 256 brelse(gdb);
257 } 257 }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
276 err = PTR_ERR(it); 276 err = PTR_ERR(it);
277 goto exit_bh; 277 goto exit_bh;
278 } 278 }
279 ext4_journal_dirty_metadata(handle, it); 279 ext4_handle_dirty_metadata(handle, NULL, it);
280 brelse(it); 280 brelse(it);
281 ext4_set_bit(bit, bh->b_data); 281 ext4_set_bit(bit, bh->b_data);
282 } 282 }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
284 if ((err = extend_or_restart_transaction(handle, 2, bh))) 284 if ((err = extend_or_restart_transaction(handle, 2, bh)))
285 goto exit_bh; 285 goto exit_bh;
286 286
287 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), 287 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
288 bh->b_data); 288 ext4_handle_dirty_metadata(handle, NULL, bh);
289 ext4_journal_dirty_metadata(handle, bh);
290 brelse(bh); 289 brelse(bh);
291
292 /* Mark unused entries in inode bitmap used */ 290 /* Mark unused entries in inode bitmap used */
293 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 291 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
294 input->inode_bitmap, input->inode_bitmap - start); 292 input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
297 goto exit_journal; 295 goto exit_journal;
298 } 296 }
299 297
300 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 298 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
301 bh->b_data); 299 bh->b_data);
302 ext4_journal_dirty_metadata(handle, bh); 300 ext4_handle_dirty_metadata(handle, NULL, bh);
303exit_bh: 301exit_bh:
304 brelse(bh); 302 brelse(bh);
305 303
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 * reserved inode, and will become GDT blocks (primary and backup). 484 * reserved inode, and will become GDT blocks (primary and backup).
487 */ 485 */
488 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 486 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
489 ext4_journal_dirty_metadata(handle, dind); 487 ext4_handle_dirty_metadata(handle, NULL, dind);
490 brelse(dind); 488 brelse(dind);
491 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 489 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
492 ext4_mark_iloc_dirty(handle, inode, &iloc); 490 ext4_mark_iloc_dirty(handle, inode, &iloc);
493 memset((*primary)->b_data, 0, sb->s_blocksize); 491 memset((*primary)->b_data, 0, sb->s_blocksize);
494 ext4_journal_dirty_metadata(handle, *primary); 492 ext4_handle_dirty_metadata(handle, NULL, *primary);
495 493
496 o_group_desc = EXT4_SB(sb)->s_group_desc; 494 o_group_desc = EXT4_SB(sb)->s_group_desc;
497 memcpy(n_group_desc, o_group_desc, 495 memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
502 kfree(o_group_desc); 500 kfree(o_group_desc);
503 501
504 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 502 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
505 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 503 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
506 504
507 return 0; 505 return 0;
508 506
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
618 primary[i]->b_blocknr, gdbackups, 616 primary[i]->b_blocknr, gdbackups,
619 blk + primary[i]->b_blocknr); */ 617 blk + primary[i]->b_blocknr); */
620 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 618 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
621 err2 = ext4_journal_dirty_metadata(handle, primary[i]); 619 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
622 if (!err) 620 if (!err)
623 err = err2; 621 err = err2;
624 } 622 }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
676 struct buffer_head *bh; 674 struct buffer_head *bh;
677 675
678 /* Out of journal space, and can't get more - abort - so sad */ 676 /* Out of journal space, and can't get more - abort - so sad */
679 if (handle->h_buffer_credits == 0 && 677 if (ext4_handle_valid(handle) &&
678 handle->h_buffer_credits == 0 &&
680 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && 679 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
681 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 680 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
682 break; 681 break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
696 memset(bh->b_data + size, 0, rest); 695 memset(bh->b_data + size, 0, rest);
697 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
698 unlock_buffer(bh); 697 unlock_buffer(bh);
699 ext4_journal_dirty_metadata(handle, bh); 698 ext4_handle_dirty_metadata(handle, NULL, bh);
700 brelse(bh); 699 brelse(bh);
701 } 700 }
702 if ((err2 = ext4_journal_stop(handle)) && !err) 701 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
715exit_err: 714exit_err:
716 if (err) { 715 if (err) {
717 ext4_warning(sb, __func__, 716 ext4_warning(sb, __func__,
718 "can't update backup for group %lu (err %d), " 717 "can't update backup for group %u (err %d), "
719 "forcing fsck on next reboot", group, err); 718 "forcing fsck on next reboot", group, err);
720 sbi->s_mount_state &= ~EXT4_VALID_FS; 719 sbi->s_mount_state &= ~EXT4_VALID_FS;
721 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
747 struct inode *inode = NULL; 746 struct inode *inode = NULL;
748 handle_t *handle; 747 handle_t *handle;
749 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 750 int err, err2;
751 751
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
761 761
762 if (ext4_blocks_count(es) + input->blocks_count < 762 if (ext4_blocks_count(es) + input->blocks_count <
763 ext4_blocks_count(es)) { 763 ext4_blocks_count(es)) {
764 ext4_warning(sb, __func__, "blocks_count overflow\n"); 764 ext4_warning(sb, __func__, "blocks_count overflow");
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
769 le32_to_cpu(es->s_inodes_count)) { 769 le32_to_cpu(es->s_inodes_count)) {
770 ext4_warning(sb, __func__, "inodes_count overflow\n"); 770 ext4_warning(sb, __func__, "inodes_count overflow");
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 773
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
787 } 787 }
788 } 788 }
789 789
790
790 if ((err = verify_group_input(sb, input))) 791 if ((err = verify_group_input(sb, input)))
791 goto exit_put; 792 goto exit_put;
792 793
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
855 * using the new disk blocks. 856 * using the new disk blocks.
856 */ 857 */
857 858
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
858 /* Update group descriptor block for new group */ 860 /* Update group descriptor block for new group */
859 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
860 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
862 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
863 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
864 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
865 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 867 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
866 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
867 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
868 871
869 /* 872 /*
870 * We can allocate memory for mb_alloc based on the new group 873 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 874 * descriptor
872 */ 875 */
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 876 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
874 if (err) 877 if (err) {
878 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
875 goto exit_journal; 879 goto exit_journal;
880 }
876 881
877 /* 882 /*
878 * Make the new blocks and inodes valid next. We do this before 883 * Make the new blocks and inodes valid next. We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
914 919
915 /* Update the global fs size fields */ 920 /* Update the global fs size fields */
916 sbi->s_groups_count++; 921 sbi->s_groups_count++;
922 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
917 923
918 ext4_journal_dirty_metadata(handle, primary); 924 ext4_handle_dirty_metadata(handle, NULL, primary);
919 925
920 /* Update the reserved block counts only once the new group is 926 /* Update the reserved block counts only once the new group is
921 * active. */ 927 * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 EXT4_INODES_PER_GROUP(sb); 943 EXT4_INODES_PER_GROUP(sb);
938 } 944 }
939 945
940 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 946 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
941 sb->s_dirt = 1; 947 sb->s_dirt = 1;
942 948
943exit_journal: 949exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
975 struct buffer_head *bh; 981 struct buffer_head *bh;
976 handle_t *handle; 982 handle_t *handle;
977 int err; 983 int err;
978 unsigned long freed_blocks;
979 ext4_group_t group; 984 ext4_group_t group;
980 struct ext4_group_info *grp;
981 985
982 /* We don't need to worry about locking wrt other resizers just 986 /* We don't need to worry about locking wrt other resizers just
983 * yet: we're going to revalidate es->s_blocks_count after 987 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 1001 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 1002 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 1003 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, 1004 ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
1001 "CONFIG_LBD not enabled\n");
1002 return -EINVAL; 1005 return -EINVAL;
1003 } 1006 }
1004 1007
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1071 goto exit_put; 1074 goto exit_put;
1072 } 1075 }
1073 ext4_blocks_count_set(es, o_blocks_count + add); 1076 ext4_blocks_count_set(es, o_blocks_count + add);
1074 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1077 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1075 sb->s_dirt = 1; 1078 sb->s_dirt = 1;
1076 unlock_super(sb); 1079 unlock_super(sb);
1077 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1080 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1078 o_blocks_count + add); 1081 o_blocks_count + add);
1079 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1082 /* We add the blocks to the bitmap and set the group need init bit */
1083 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1080 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1084 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1081 o_blocks_count + add); 1085 o_blocks_count + add);
1082 if ((err = ext4_journal_stop(handle))) 1086 if ((err = ext4_journal_stop(handle)))
1083 goto exit_put; 1087 goto exit_put;
1084 1088
1085 /*
1086 * Mark mballoc pages as not up to date so that they will be updated
1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on the bitmap bh, as a way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * overloaded. A user could take a LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1095 */
1096 {
1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 struct inode *inode = sbi->s_buddy_cache;
1099 int blocks_per_page;
1100 int block;
1101 int pnum;
1102 struct page *page;
1103
1104 /* Set buddy page as not up to date */
1105 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1106 block = group * 2;
1107 pnum = block / blocks_per_page;
1108 page = find_get_page(inode->i_mapping, pnum);
1109 if (page != NULL) {
1110 ClearPageUptodate(page);
1111 page_cache_release(page);
1112 }
1113
1114 /* Set bitmap page as not up to date */
1115 block++;
1116 pnum = block / blocks_per_page;
1117 page = find_get_page(inode->i_mapping, pnum);
1118 if (page != NULL) {
1119 ClearPageUptodate(page);
1120 page_cache_release(page);
1121 }
1122
1123 /* Get the info on the last group */
1124 grp = ext4_get_group_info(sb, group);
1125
1126 /* Update free blocks in group info */
1127 ext4_mb_update_group_info(grp, add);
1128 }
1129
1130 if (test_opt(sb, DEBUG)) 1089 if (test_opt(sb, DEBUG))
1131 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1090 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1132 ext4_blocks_count(es)); 1091 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9494bb249390..8f7e0be8ab1b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root;
51 51
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 53 unsigned long journal_devnum);
54static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
55 unsigned int);
56static void ext4_commit_super(struct super_block *sb, 54static void ext4_commit_super(struct super_block *sb,
57 struct ext4_super_block *es, int sync); 55 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 56static void ext4_mark_recovery_complete(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 91 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 92}
95 93
94__u32 ext4_free_blks_count(struct super_block *sb,
95 struct ext4_group_desc *bg)
96{
97 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
98 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
99 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
100}
101
102__u32 ext4_free_inodes_count(struct super_block *sb,
103 struct ext4_group_desc *bg)
104{
105 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
106 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
107 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
108}
109
110__u32 ext4_used_dirs_count(struct super_block *sb,
111 struct ext4_group_desc *bg)
112{
113 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
114 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
115 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
116}
117
118__u32 ext4_itable_unused_count(struct super_block *sb,
119 struct ext4_group_desc *bg)
120{
121 return le16_to_cpu(bg->bg_itable_unused_lo) |
122 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
123 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
124}
125
96void ext4_block_bitmap_set(struct super_block *sb, 126void ext4_block_bitmap_set(struct super_block *sb,
97 struct ext4_group_desc *bg, ext4_fsblk_t blk) 127 struct ext4_group_desc *bg, ext4_fsblk_t blk)
98{ 128{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
117 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 147 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118} 148}
119 149
150void ext4_free_blks_set(struct super_block *sb,
151 struct ext4_group_desc *bg, __u32 count)
152{
153 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
154 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
155 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
156}
157
158void ext4_free_inodes_set(struct super_block *sb,
159 struct ext4_group_desc *bg, __u32 count)
160{
161 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
162 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
163 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
164}
165
166void ext4_used_dirs_set(struct super_block *sb,
167 struct ext4_group_desc *bg, __u32 count)
168{
169 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
170 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
171 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
172}
173
174void ext4_itable_unused_set(struct super_block *sb,
175 struct ext4_group_desc *bg, __u32 count)
176{
177 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
178 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
179 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
180}
181
120/* 182/*
121 * Wrappers for jbd2_journal_start/end. 183 * Wrappers for jbd2_journal_start/end.
122 * 184 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
136 * backs (eg. EIO in the commit thread), then we still need to 198 * backs (eg. EIO in the commit thread), then we still need to
137 * take the FS itself readonly cleanly. */ 199 * take the FS itself readonly cleanly. */
138 journal = EXT4_SB(sb)->s_journal; 200 journal = EXT4_SB(sb)->s_journal;
139 if (is_journal_aborted(journal)) { 201 if (journal) {
140 ext4_abort(sb, __func__, 202 if (is_journal_aborted(journal)) {
141 "Detected aborted journal"); 203 ext4_abort(sb, __func__,
142 return ERR_PTR(-EROFS); 204 "Detected aborted journal");
205 return ERR_PTR(-EROFS);
206 }
207 return jbd2_journal_start(journal, nblocks);
143 } 208 }
144 209 /*
145 return jbd2_journal_start(journal, nblocks); 210 * We're not journaling, return the appropriate indication.
211 */
212 current->journal_info = EXT4_NOJOURNAL_HANDLE;
213 return current->journal_info;
146} 214}
147 215
148/* 216/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
157 int err; 225 int err;
158 int rc; 226 int rc;
159 227
228 if (!ext4_handle_valid(handle)) {
229 /*
230 * Do this here since we don't call jbd2_journal_stop() in
231 * no-journal mode.
232 */
233 current->journal_info = NULL;
234 return 0;
235 }
160 sb = handle->h_transaction->t_journal->j_private; 236 sb = handle->h_transaction->t_journal->j_private;
161 err = handle->h_err; 237 err = handle->h_err;
162 rc = jbd2_journal_stop(handle); 238 rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
174 char nbuf[16]; 250 char nbuf[16];
175 const char *errstr = ext4_decode_error(NULL, err, nbuf); 251 const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 252
253 BUG_ON(!ext4_handle_valid(handle));
254
177 if (bh) 255 if (bh)
178 BUFFER_TRACE(bh, "abort"); 256 BUFFER_TRACE(bh, "abort");
179 257
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
350 va_end(args); 428 va_end(args);
351} 429}
352 430
431void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
432 const char *function, const char *fmt, ...)
433__releases(bitlock)
434__acquires(bitlock)
435{
436 va_list args;
437 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
438
439 va_start(args, fmt);
440 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
441 vprintk(fmt, args);
442 printk("\n");
443 va_end(args);
444
445 if (test_opt(sb, ERRORS_CONT)) {
446 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
447 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
448 ext4_commit_super(sb, es, 0);
449 return;
450 }
451 ext4_unlock_group(sb, grp);
452 ext4_handle_error(sb);
453 /*
454 * We only get here in the ERRORS_RO case; relocking the group
455 * may be dangerous, but nothing bad will happen since the
456 * filesystem will have already been marked read/only and the
457 * journal has been aborted. We return 1 as a hint to callers
458 * who might what to use the return value from
459 * ext4_grp_locked_error() to distinguish beween the
460 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
461 * aggressively from the ext4 function in question, with a
462 * more appropriate error code.
463 */
464 ext4_lock_group(sb, grp);
465 return;
466}
467
468
353void ext4_update_dynamic_rev(struct super_block *sb) 469void ext4_update_dynamic_rev(struct super_block *sb)
354{ 470{
355 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 471 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
389 return bdev; 505 return bdev;
390 506
391fail: 507fail:
392 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", 508 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
393 __bdevname(dev, b), PTR_ERR(bdev)); 509 __bdevname(dev, b), PTR_ERR(bdev));
394 return NULL; 510 return NULL;
395} 511}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
448 ext4_mb_release(sb); 564 ext4_mb_release(sb);
449 ext4_ext_release(sb); 565 ext4_ext_release(sb);
450 ext4_xattr_put_super(sb); 566 ext4_xattr_put_super(sb);
451 err = jbd2_journal_destroy(sbi->s_journal); 567 if (sbi->s_journal) {
452 sbi->s_journal = NULL; 568 err = jbd2_journal_destroy(sbi->s_journal);
453 if (err < 0) 569 sbi->s_journal = NULL;
454 ext4_abort(sb, __func__, "Couldn't clean up the journal"); 570 if (err < 0)
455 571 ext4_abort(sb, __func__,
572 "Couldn't clean up the journal");
573 }
456 if (!(sb->s_flags & MS_RDONLY)) { 574 if (!(sb->s_flags & MS_RDONLY)) {
457 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 575 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
458 es->s_state = cpu_to_le16(sbi->s_mount_state); 576 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
522 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 640 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
523 INIT_LIST_HEAD(&ei->i_prealloc_list); 641 INIT_LIST_HEAD(&ei->i_prealloc_list);
524 spin_lock_init(&ei->i_prealloc_lock); 642 spin_lock_init(&ei->i_prealloc_lock);
643 /*
644 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
645 * therefore it can be null here. Don't check it, just initialize
646 * jinode.
647 */
525 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); 648 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
526 ei->i_reserved_data_blocks = 0; 649 ei->i_reserved_data_blocks = 0;
527 ei->i_reserved_meta_blocks = 0; 650 ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
588 } 711 }
589#endif 712#endif
590 ext4_discard_preallocations(inode); 713 ext4_discard_preallocations(inode);
591 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 714 if (EXT4_JOURNAL(inode))
715 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
592 &EXT4_I(inode)->jinode); 716 &EXT4_I(inode)->jinode);
593} 717}
594 718
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681#endif 805#endif
682 if (!test_opt(sb, RESERVATION)) 806 if (!test_opt(sb, RESERVATION))
683 seq_puts(seq, ",noreservation"); 807 seq_puts(seq, ",noreservation");
684 if (sbi->s_commit_interval) { 808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
685 seq_printf(seq, ",commit=%u", 809 seq_printf(seq, ",commit=%u",
686 (unsigned) (sbi->s_commit_interval / HZ)); 810 (unsigned) (sbi->s_commit_interval / HZ));
687 } 811 }
812 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
813 seq_printf(seq, ",min_batch_time=%u",
814 (unsigned) sbi->s_min_batch_time);
815 }
816 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
817 seq_printf(seq, ",max_batch_time=%u",
818 (unsigned) sbi->s_min_batch_time);
819 }
820
688 /* 821 /*
689 * We're changing the default of barrier mount option, so 822 * We're changing the default of barrier mount option, so
690 * let's always display its mount state so it's clear what its 823 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
696 seq_puts(seq, ",journal_async_commit"); 829 seq_puts(seq, ",journal_async_commit");
697 if (test_opt(sb, NOBH)) 830 if (test_opt(sb, NOBH))
698 seq_puts(seq, ",nobh"); 831 seq_puts(seq, ",nobh");
699 if (!test_opt(sb, EXTENTS))
700 seq_puts(seq, ",noextents");
701 if (test_opt(sb, I_VERSION)) 832 if (test_opt(sb, I_VERSION))
702 seq_puts(seq, ",i_version"); 833 seq_puts(seq, ",i_version");
703 if (!test_opt(sb, DELALLOC)) 834 if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
772 ext4_nfs_get_inode); 903 ext4_nfs_get_inode);
773} 904}
774 905
906/*
907 * Try to release metadata pages (indirect blocks, directories) which are
908 * mapped via the block device. Since these pages could have journal heads
909 * which would prevent try_to_free_buffers() from freeing them, we must use
910 * jbd2 layer's try_to_free_buffers() function to release them.
911 */
912static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
913{
914 journal_t *journal = EXT4_SB(sb)->s_journal;
915
916 WARN_ON(PageChecked(page));
917 if (!page_has_buffers(page))
918 return 0;
919 if (journal)
920 return jbd2_journal_try_to_free_buffers(journal, page,
921 wait & ~__GFP_WAIT);
922 return try_to_free_buffers(page);
923}
924
775#ifdef CONFIG_QUOTA 925#ifdef CONFIG_QUOTA
776#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
777#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -838,6 +988,7 @@ static const struct super_operations ext4_sops = {
838 .quota_read = ext4_quota_read, 988 .quota_read = ext4_quota_read,
839 .quota_write = ext4_quota_write, 989 .quota_write = ext4_quota_write,
840#endif 990#endif
991 .bdev_try_to_free_page = bdev_try_to_free_page,
841}; 992};
842 993
843static const struct export_operations ext4_export_ops = { 994static const struct export_operations ext4_export_ops = {
@@ -852,16 +1003,17 @@ enum {
852 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
853 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
854 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
855 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev,
856 Opt_journal_checksum, Opt_journal_async_commit, 1008 Opt_journal_checksum, Opt_journal_async_commit,
857 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1009 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
858 Opt_data_err_abort, Opt_data_err_ignore, 1010 Opt_data_err_abort, Opt_data_err_ignore,
859 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
860 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
861 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
862 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 1014 Opt_grpquota, Opt_i_version,
863 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
864 Opt_inode_readahead_blks 1016 Opt_inode_readahead_blks, Opt_journal_ioprio
865}; 1017};
866 1018
867static const match_table_t tokens = { 1019static const match_table_t tokens = {
@@ -891,8 +1043,9 @@ static const match_table_t tokens = {
891 {Opt_nobh, "nobh"}, 1043 {Opt_nobh, "nobh"},
892 {Opt_bh, "bh"}, 1044 {Opt_bh, "bh"},
893 {Opt_commit, "commit=%u"}, 1045 {Opt_commit, "commit=%u"},
1046 {Opt_min_batch_time, "min_batch_time=%u"},
1047 {Opt_max_batch_time, "max_batch_time=%u"},
894 {Opt_journal_update, "journal=update"}, 1048 {Opt_journal_update, "journal=update"},
895 {Opt_journal_inum, "journal=%u"},
896 {Opt_journal_dev, "journal_dev=%u"}, 1049 {Opt_journal_dev, "journal_dev=%u"},
897 {Opt_journal_checksum, "journal_checksum"}, 1050 {Opt_journal_checksum, "journal_checksum"},
898 {Opt_journal_async_commit, "journal_async_commit"}, 1051 {Opt_journal_async_commit, "journal_async_commit"},
@@ -913,14 +1066,13 @@ static const match_table_t tokens = {
913 {Opt_quota, "quota"}, 1066 {Opt_quota, "quota"},
914 {Opt_usrquota, "usrquota"}, 1067 {Opt_usrquota, "usrquota"},
915 {Opt_barrier, "barrier=%u"}, 1068 {Opt_barrier, "barrier=%u"},
916 {Opt_extents, "extents"},
917 {Opt_noextents, "noextents"},
918 {Opt_i_version, "i_version"}, 1069 {Opt_i_version, "i_version"},
919 {Opt_stripe, "stripe=%u"}, 1070 {Opt_stripe, "stripe=%u"},
920 {Opt_resize, "resize"}, 1071 {Opt_resize, "resize"},
921 {Opt_delalloc, "delalloc"}, 1072 {Opt_delalloc, "delalloc"},
922 {Opt_nodelalloc, "nodelalloc"}, 1073 {Opt_nodelalloc, "nodelalloc"},
923 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"},
924 {Opt_err, NULL}, 1076 {Opt_err, NULL},
925}; 1077};
926 1078
@@ -945,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
945 return sb_block; 1097 return sb_block;
946} 1098}
947 1099
1100#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1101
948static int parse_options(char *options, struct super_block *sb, 1102static int parse_options(char *options, struct super_block *sb,
949 unsigned int *inum, unsigned long *journal_devnum, 1103 unsigned long *journal_devnum,
1104 unsigned int *journal_ioprio,
950 ext4_fsblk_t *n_blocks_count, int is_remount) 1105 ext4_fsblk_t *n_blocks_count, int is_remount)
951{ 1106{
952 struct ext4_sb_info *sbi = EXT4_SB(sb); 1107 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -958,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
958 int qtype, qfmt; 1113 int qtype, qfmt;
959 char *qname; 1114 char *qname;
960#endif 1115#endif
961 ext4_fsblk_t last_block;
962 1116
963 if (!options) 1117 if (!options)
964 return 1; 1118 return 1;
@@ -1070,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1070 } 1224 }
1071 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1225 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1072 break; 1226 break;
1073 case Opt_journal_inum:
1074 if (is_remount) {
1075 printk(KERN_ERR "EXT4-fs: cannot specify "
1076 "journal on remount\n");
1077 return 0;
1078 }
1079 if (match_int(&args[0], &option))
1080 return 0;
1081 *inum = option;
1082 break;
1083 case Opt_journal_dev: 1227 case Opt_journal_dev:
1084 if (is_remount) { 1228 if (is_remount) {
1085 printk(KERN_ERR "EXT4-fs: cannot specify " 1229 printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1109,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
1109 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1253 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1110 sbi->s_commit_interval = HZ * option; 1254 sbi->s_commit_interval = HZ * option;
1111 break; 1255 break;
1256 case Opt_max_batch_time:
1257 if (match_int(&args[0], &option))
1258 return 0;
1259 if (option < 0)
1260 return 0;
1261 if (option == 0)
1262 option = EXT4_DEF_MAX_BATCH_TIME;
1263 sbi->s_max_batch_time = option;
1264 break;
1265 case Opt_min_batch_time:
1266 if (match_int(&args[0], &option))
1267 return 0;
1268 if (option < 0)
1269 return 0;
1270 sbi->s_min_batch_time = option;
1271 break;
1112 case Opt_data_journal: 1272 case Opt_data_journal:
1113 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1273 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1114 goto datacheck; 1274 goto datacheck;
@@ -1279,33 +1439,6 @@ set_qf_format:
1279 case Opt_bh: 1439 case Opt_bh:
1280 clear_opt(sbi->s_mount_opt, NOBH); 1440 clear_opt(sbi->s_mount_opt, NOBH);
1281 break; 1441 break;
1282 case Opt_extents:
1283 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1284 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1285 ext4_warning(sb, __func__,
1286 "extents feature not enabled "
1287 "on this filesystem, use tune2fs\n");
1288 return 0;
1289 }
1290 set_opt(sbi->s_mount_opt, EXTENTS);
1291 break;
1292 case Opt_noextents:
1293 /*
1294 * When e2fsprogs support resizing an already existing
1295 * ext3 file system to greater than 2**32 we need to
1296 * add support to block allocator to handle growing
1297 * already existing block mapped inode so that blocks
1298 * allocated for them fall within 2**32
1299 */
1300 last_block = ext4_blocks_count(sbi->s_es) - 1;
1301 if (last_block > 0xffffffffULL) {
1302 printk(KERN_ERR "EXT4-fs: Filesystem too "
1303 "large to mount with "
1304 "-o noextents options\n");
1305 return 0;
1306 }
1307 clear_opt(sbi->s_mount_opt, EXTENTS);
1308 break;
1309 case Opt_i_version: 1442 case Opt_i_version:
1310 set_opt(sbi->s_mount_opt, I_VERSION); 1443 set_opt(sbi->s_mount_opt, I_VERSION);
1311 sb->s_flags |= MS_I_VERSION; 1444 sb->s_flags |= MS_I_VERSION;
@@ -1330,6 +1463,14 @@ set_qf_format:
1330 return 0; 1463 return 0;
1331 sbi->s_inode_readahead_blks = option; 1464 sbi->s_inode_readahead_blks = option;
1332 break; 1465 break;
1466 case Opt_journal_ioprio:
1467 if (match_int(&args[0], &option))
1468 return 0;
1469 if (option < 0 || option > 7)
1470 break;
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option);
1473 break;
1333 default: 1474 default:
1334 printk(KERN_ERR 1475 printk(KERN_ERR
1335 "EXT4-fs: Unrecognized mount option \"%s\" " 1476 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1405,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1405 printk(KERN_WARNING 1546 printk(KERN_WARNING
1406 "EXT4-fs warning: checktime reached, " 1547 "EXT4-fs warning: checktime reached, "
1407 "running e2fsck is recommended\n"); 1548 "running e2fsck is recommended\n");
1408#if 0 1549 if (!sbi->s_journal)
1409 /* @@@ We _will_ want to clear the valid bit if we find 1550 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1410 * inconsistencies, to force a fsck at reboot. But for
1411 * a plain journaled filesystem we can keep it set as
1412 * valid forever! :)
1413 */
1414 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1415#endif
1416 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1551 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1417 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1552 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1418 le16_add_cpu(&es->s_mnt_count, 1); 1553 le16_add_cpu(&es->s_mnt_count, 1);
1419 es->s_mtime = cpu_to_le32(get_seconds()); 1554 es->s_mtime = cpu_to_le32(get_seconds());
1420 ext4_update_dynamic_rev(sb); 1555 ext4_update_dynamic_rev(sb);
1421 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1556 if (sbi->s_journal)
1557 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1422 1558
1423 ext4_commit_super(sb, es, 1); 1559 ext4_commit_super(sb, es, 1);
1424 if (test_opt(sb, DEBUG)) 1560 if (test_opt(sb, DEBUG))
1425 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " 1561 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1426 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1562 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1427 sb->s_blocksize, 1563 sb->s_blocksize,
1428 sbi->s_groups_count, 1564 sbi->s_groups_count,
@@ -1430,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1430 EXT4_INODES_PER_GROUP(sb), 1566 EXT4_INODES_PER_GROUP(sb),
1431 sbi->s_mount_opt); 1567 sbi->s_mount_opt);
1432 1568
1433 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1569 if (EXT4_SB(sb)->s_journal) {
1434 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1570 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1435 "external", EXT4_SB(sb)->s_journal->j_devname); 1571 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1572 "external", EXT4_SB(sb)->s_journal->j_devname);
1573 } else {
1574 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
1575 }
1436 return res; 1576 return res;
1437} 1577}
1438 1578
@@ -1444,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1444 ext4_group_t flex_group_count; 1584 ext4_group_t flex_group_count;
1445 ext4_group_t flex_group; 1585 ext4_group_t flex_group;
1446 int groups_per_flex = 0; 1586 int groups_per_flex = 0;
1447 __u64 block_bitmap = 0;
1448 int i; 1587 int i;
1449 1588
1450 if (!sbi->s_es->s_log_groups_per_flex) { 1589 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1463,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
1463 sizeof(struct flex_groups), GFP_KERNEL); 1602 sizeof(struct flex_groups), GFP_KERNEL);
1464 if (sbi->s_flex_groups == NULL) { 1603 if (sbi->s_flex_groups == NULL) {
1465 printk(KERN_ERR "EXT4-fs: not enough memory for " 1604 printk(KERN_ERR "EXT4-fs: not enough memory for "
1466 "%lu flex groups\n", flex_group_count); 1605 "%u flex groups\n", flex_group_count);
1467 goto failed; 1606 goto failed;
1468 } 1607 }
1469 1608
1470 gdp = ext4_get_group_desc(sb, 1, &bh);
1471 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1472
1473 for (i = 0; i < sbi->s_groups_count; i++) { 1609 for (i = 0; i < sbi->s_groups_count; i++) {
1474 gdp = ext4_get_group_desc(sb, i, &bh); 1610 gdp = ext4_get_group_desc(sb, i, &bh);
1475 1611
1476 flex_group = ext4_flex_group(sbi, i); 1612 flex_group = ext4_flex_group(sbi, i);
1477 sbi->s_flex_groups[flex_group].free_inodes += 1613 sbi->s_flex_groups[flex_group].free_inodes +=
1478 le16_to_cpu(gdp->bg_free_inodes_count); 1614 ext4_free_inodes_count(sb, gdp);
1479 sbi->s_flex_groups[flex_group].free_blocks += 1615 sbi->s_flex_groups[flex_group].free_blocks +=
1480 le16_to_cpu(gdp->bg_free_blocks_count); 1616 ext4_free_blks_count(sb, gdp);
1481 } 1617 }
1482 1618
1483 return 1; 1619 return 1;
@@ -1551,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1551 block_bitmap = ext4_block_bitmap(sb, gdp); 1687 block_bitmap = ext4_block_bitmap(sb, gdp);
1552 if (block_bitmap < first_block || block_bitmap > last_block) { 1688 if (block_bitmap < first_block || block_bitmap > last_block) {
1553 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1689 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1554 "Block bitmap for group %lu not in group " 1690 "Block bitmap for group %u not in group "
1555 "(block %llu)!\n", i, block_bitmap); 1691 "(block %llu)!\n", i, block_bitmap);
1556 return 0; 1692 return 0;
1557 } 1693 }
1558 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1694 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1559 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1695 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1560 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1696 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1561 "Inode bitmap for group %lu not in group " 1697 "Inode bitmap for group %u not in group "
1562 "(block %llu)!\n", i, inode_bitmap); 1698 "(block %llu)!\n", i, inode_bitmap);
1563 return 0; 1699 return 0;
1564 } 1700 }
@@ -1566,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1566 if (inode_table < first_block || 1702 if (inode_table < first_block ||
1567 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1703 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1568 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1704 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1569 "Inode table for group %lu not in group " 1705 "Inode table for group %u not in group "
1570 "(block %llu)!\n", i, inode_table); 1706 "(block %llu)!\n", i, inode_table);
1571 return 0; 1707 return 0;
1572 } 1708 }
1573 spin_lock(sb_bgl_lock(sbi, i)); 1709 spin_lock(sb_bgl_lock(sbi, i));
1574 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1710 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1575 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1711 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1576 "Checksum for group %lu failed (%u!=%u)\n", 1712 "Checksum for group %u failed (%u!=%u)\n",
1577 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1713 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1578 gdp)), le16_to_cpu(gdp->bg_checksum)); 1714 gdp)), le16_to_cpu(gdp->bg_checksum));
1579 if (!(sb->s_flags & MS_RDONLY)) { 1715 if (!(sb->s_flags & MS_RDONLY)) {
@@ -1865,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1865 ext4_fsblk_t sb_block = get_sb_block(&data); 2001 ext4_fsblk_t sb_block = get_sb_block(&data);
1866 ext4_fsblk_t logical_sb_block; 2002 ext4_fsblk_t logical_sb_block;
1867 unsigned long offset = 0; 2003 unsigned long offset = 0;
1868 unsigned int journal_inum = 0;
1869 unsigned long journal_devnum = 0; 2004 unsigned long journal_devnum = 0;
1870 unsigned long def_mount_opts; 2005 unsigned long def_mount_opts;
1871 struct inode *root; 2006 struct inode *root;
1872 char *cp; 2007 char *cp;
2008 const char *descr;
1873 int ret = -EINVAL; 2009 int ret = -EINVAL;
1874 int blocksize; 2010 int blocksize;
1875 int db_count; 2011 unsigned int db_count;
1876 int i; 2012 unsigned int i;
1877 int needs_recovery, has_huge_files; 2013 int needs_recovery, has_huge_files;
1878 __le32 features; 2014 int features;
1879 __u64 blocks_count; 2015 __u64 blocks_count;
1880 int err; 2016 int err;
2017 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
1881 2018
1882 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1883 if (!sbi) 2020 if (!sbi)
@@ -1958,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1958 2095
1959 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2096 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1960 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2097 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2098 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1961 2101
1962 set_opt(sbi->s_mount_opt, RESERVATION); 2102 set_opt(sbi->s_mount_opt, RESERVATION);
1963 set_opt(sbi->s_mount_opt, BARRIER); 2103 set_opt(sbi->s_mount_opt, BARRIER);
1964 2104
1965 /* 2105 /*
1966 * turn on extents feature by default in ext4 filesystem
1967 * only if feature flag already set by mkfs or tune2fs.
1968 * Use -o noextents to turn it off
1969 */
1970 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
1971 set_opt(sbi->s_mount_opt, EXTENTS);
1972 else
1973 ext4_warning(sb, __func__,
1974 "extents feature not enabled on this filesystem, "
1975 "use tune2fs.\n");
1976
1977 /*
1978 * enable delayed allocation by default 2106 * enable delayed allocation by default
1979 * Use -o nodelalloc to turn it off 2107 * Use -o nodelalloc to turn it off
1980 */ 2108 */
1981 set_opt(sbi->s_mount_opt, DELALLOC); 2109 set_opt(sbi->s_mount_opt, DELALLOC);
1982 2110
1983 2111
1984 if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, 2112 if (!parse_options((char *) data, sb, &journal_devnum,
1985 NULL, 0)) 2113 &journal_ioprio, NULL, 0))
1986 goto failed_mount; 2114 goto failed_mount;
1987 2115
1988 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2116 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2004,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2004 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2132 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2005 if (features) { 2133 if (features) {
2006 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2134 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2007 "unsupported optional features (%x).\n", 2135 "unsupported optional features (%x).\n", sb->s_id,
2008 sb->s_id, le32_to_cpu(features)); 2136 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2137 ~EXT4_FEATURE_INCOMPAT_SUPP));
2009 goto failed_mount; 2138 goto failed_mount;
2010 } 2139 }
2011 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2140 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2012 if (!(sb->s_flags & MS_RDONLY) && features) { 2141 if (!(sb->s_flags & MS_RDONLY) && features) {
2013 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2142 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2014 "unsupported optional features (%x).\n", 2143 "unsupported optional features (%x).\n", sb->s_id,
2015 sb->s_id, le32_to_cpu(features)); 2144 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2145 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2016 goto failed_mount; 2146 goto failed_mount;
2017 } 2147 }
2018 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2148 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2117,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2117 for (i = 0; i < 4; i++) 2247 for (i = 0; i < 4; i++)
2118 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2248 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2119 sbi->s_def_hash_version = es->s_def_hash_version; 2249 sbi->s_def_hash_version = es->s_def_hash_version;
2250 i = le32_to_cpu(es->s_flags);
2251 if (i & EXT2_FLAGS_UNSIGNED_HASH)
2252 sbi->s_hash_unsigned = 3;
2253 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2254#ifdef __CHAR_UNSIGNED__
2255 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2256 sbi->s_hash_unsigned = 3;
2257#else
2258 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2259#endif
2260 sb->s_dirt = 1;
2261 }
2120 2262
2121 if (sbi->s_blocks_per_group > blocksize * 8) { 2263 if (sbi->s_blocks_per_group > blocksize * 8) {
2122 printk(KERN_ERR 2264 printk(KERN_ERR
@@ -2144,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2144 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2286 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2145 goto cantfind_ext4; 2287 goto cantfind_ext4;
2146 2288
2147 /* ensure blocks_count calculation below doesn't sign-extend */ 2289 /*
2148 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < 2290 * It makes no sense for the first data block to be beyond the end
2149 le32_to_cpu(es->s_first_data_block) + 1) { 2291 * of the filesystem.
2150 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " 2292 */
2151 "first data block %u, blocks per group %lu\n", 2293 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2152 ext4_blocks_count(es), 2294 printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
2153 le32_to_cpu(es->s_first_data_block), 2295 "block %u is beyond end of filesystem (%llu)\n",
2154 EXT4_BLOCKS_PER_GROUP(sb)); 2296 le32_to_cpu(es->s_first_data_block),
2297 ext4_blocks_count(es));
2155 goto failed_mount; 2298 goto failed_mount;
2156 } 2299 }
2157 blocks_count = (ext4_blocks_count(es) - 2300 blocks_count = (ext4_blocks_count(es) -
2158 le32_to_cpu(es->s_first_data_block) + 2301 le32_to_cpu(es->s_first_data_block) +
2159 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2302 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2160 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2303 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2304 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2305 printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2306 "(block count %llu, first data block %u, "
2307 "blocks per group %lu)\n", sbi->s_groups_count,
2308 ext4_blocks_count(es),
2309 le32_to_cpu(es->s_first_data_block),
2310 EXT4_BLOCKS_PER_GROUP(sb));
2311 goto failed_mount;
2312 }
2161 sbi->s_groups_count = blocks_count; 2313 sbi->s_groups_count = blocks_count;
2162 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2314 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2163 EXT4_DESC_PER_BLOCK(sb); 2315 EXT4_DESC_PER_BLOCK(sb);
@@ -2269,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2269 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2421 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2270 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2422 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2271 ext4_commit_super(sb, es, 1); 2423 ext4_commit_super(sb, es, 1);
2272 printk(KERN_CRIT
2273 "EXT4-fs (device %s): mount failed\n",
2274 sb->s_id);
2275 goto failed_mount4; 2424 goto failed_mount4;
2276 } 2425 }
2277 } 2426 }
2278 } else if (journal_inum) { 2427 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2279 if (ext4_create_journal(sb, es, journal_inum)) 2428 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2280 goto failed_mount3; 2429 printk(KERN_ERR "EXT4-fs: required journal recovery "
2430 "suppressed and not mounted read-only\n");
2431 goto failed_mount4;
2281 } else { 2432 } else {
2282 if (!silent) 2433 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2283 printk(KERN_ERR 2434 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2284 "ext4: No journal on filesystem on %s\n", 2435 sbi->s_journal = NULL;
2285 sb->s_id); 2436 needs_recovery = 0;
2286 goto failed_mount3; 2437 goto no_journal;
2287 } 2438 }
2288 2439
2289 if (ext4_blocks_count(es) > 0xffffffffULL && 2440 if (ext4_blocks_count(es) > 0xffffffffULL &&
2290 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2441 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2291 JBD2_FEATURE_INCOMPAT_64BIT)) { 2442 JBD2_FEATURE_INCOMPAT_64BIT)) {
2292 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); 2443 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
2293 goto failed_mount4; 2444 goto failed_mount4;
2294 } 2445 }
2295 2446
@@ -2334,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2334 default: 2485 default:
2335 break; 2486 break;
2336 } 2487 }
2488 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2489
2490no_journal:
2337 2491
2338 if (test_opt(sb, NOBH)) { 2492 if (test_opt(sb, NOBH)) {
2339 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2493 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2419,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2419 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2573 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2420 ext4_orphan_cleanup(sb, es); 2574 ext4_orphan_cleanup(sb, es);
2421 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2575 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2422 if (needs_recovery) 2576 if (needs_recovery) {
2423 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2577 printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2424 ext4_mark_recovery_complete(sb, es); 2578 ext4_mark_recovery_complete(sb, es);
2425 printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", 2579 }
2426 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2580 if (EXT4_SB(sb)->s_journal) {
2427 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2581 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2428 "writeback"); 2582 descr = " journalled data mode";
2583 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2584 descr = " ordered data mode";
2585 else
2586 descr = " writeback data mode";
2587 } else
2588 descr = "out journal";
2589
2590 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
2591 sb->s_id, descr);
2429 2592
2430 lock_kernel(); 2593 lock_kernel();
2431 return 0; 2594 return 0;
@@ -2437,8 +2600,11 @@ cantfind_ext4:
2437 goto failed_mount; 2600 goto failed_mount;
2438 2601
2439failed_mount4: 2602failed_mount4:
2440 jbd2_journal_destroy(sbi->s_journal); 2603 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
2441 sbi->s_journal = NULL; 2604 if (sbi->s_journal) {
2605 jbd2_journal_destroy(sbi->s_journal);
2606 sbi->s_journal = NULL;
2607 }
2442failed_mount3: 2608failed_mount3:
2443 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2609 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2444 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2610 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2475,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2475{ 2641{
2476 struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 struct ext4_sb_info *sbi = EXT4_SB(sb);
2477 2643
2478 if (sbi->s_commit_interval) 2644 journal->j_commit_interval = sbi->s_commit_interval;
2479 journal->j_commit_interval = sbi->s_commit_interval; 2645 journal->j_min_batch_time = sbi->s_min_batch_time;
2480 /* We could also set up an ext4-specific default for the commit 2646 journal->j_max_batch_time = sbi->s_max_batch_time;
2481 * interval here, but for now we'll just fall back to the jbd
2482 * default. */
2483 2647
2484 spin_lock(&journal->j_state_lock); 2648 spin_lock(&journal->j_state_lock);
2485 if (test_opt(sb, BARRIER)) 2649 if (test_opt(sb, BARRIER))
@@ -2499,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2499 struct inode *journal_inode; 2663 struct inode *journal_inode;
2500 journal_t *journal; 2664 journal_t *journal;
2501 2665
2666 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2667
2502 /* First, test for the existence of a valid inode on disk. Bad 2668 /* First, test for the existence of a valid inode on disk. Bad
2503 * things happen if we iget() an unused inode, as the subsequent 2669 * things happen if we iget() an unused inode, as the subsequent
2504 * iput() will try to delete it. */ 2670 * iput() will try to delete it. */
@@ -2547,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2547 struct ext4_super_block *es; 2713 struct ext4_super_block *es;
2548 struct block_device *bdev; 2714 struct block_device *bdev;
2549 2715
2716 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2717
2550 bdev = ext4_blkdev_get(j_dev); 2718 bdev = ext4_blkdev_get(j_dev);
2551 if (bdev == NULL) 2719 if (bdev == NULL)
2552 return NULL; 2720 return NULL;
2553 2721
2554 if (bd_claim(bdev, sb)) { 2722 if (bd_claim(bdev, sb)) {
2555 printk(KERN_ERR 2723 printk(KERN_ERR
2556 "EXT4: failed to claim external journal device.\n"); 2724 "EXT4-fs: failed to claim external journal device.\n");
2557 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2725 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2558 return NULL; 2726 return NULL;
2559 } 2727 }
@@ -2634,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
2634 int err = 0; 2802 int err = 0;
2635 int really_read_only; 2803 int really_read_only;
2636 2804
2805 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2806
2637 if (journal_devnum && 2807 if (journal_devnum &&
2638 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2808 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2639 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 2809 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2718,48 +2888,6 @@ static int ext4_load_journal(struct super_block *sb,
2718 return 0; 2888 return 0;
2719} 2889}
2720 2890
2721static int ext4_create_journal(struct super_block *sb,
2722 struct ext4_super_block *es,
2723 unsigned int journal_inum)
2724{
2725 journal_t *journal;
2726 int err;
2727
2728 if (sb->s_flags & MS_RDONLY) {
2729 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2730 "create journal.\n");
2731 return -EROFS;
2732 }
2733
2734 journal = ext4_get_journal(sb, journal_inum);
2735 if (!journal)
2736 return -EINVAL;
2737
2738 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2739 journal_inum);
2740
2741 err = jbd2_journal_create(journal);
2742 if (err) {
2743 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2744 jbd2_journal_destroy(journal);
2745 return -EIO;
2746 }
2747
2748 EXT4_SB(sb)->s_journal = journal;
2749
2750 ext4_update_dynamic_rev(sb);
2751 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2752 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2753
2754 es->s_journal_inum = cpu_to_le32(journal_inum);
2755 sb->s_dirt = 1;
2756
2757 /* Make sure we flush the recovery flag to disk. */
2758 ext4_commit_super(sb, es, 1);
2759
2760 return 0;
2761}
2762
2763static void ext4_commit_super(struct super_block *sb, 2891static void ext4_commit_super(struct super_block *sb,
2764 struct ext4_super_block *es, int sync) 2892 struct ext4_super_block *es, int sync)
2765{ 2893{
@@ -2776,20 +2904,23 @@ static void ext4_commit_super(struct super_block *sb,
2776 * be remapped. Nothing we can do but to retry the 2904 * be remapped. Nothing we can do but to retry the
2777 * write and hope for the best. 2905 * write and hope for the best.
2778 */ 2906 */
2779 printk(KERN_ERR "ext4: previous I/O error to " 2907 printk(KERN_ERR "EXT4-fs: previous I/O error to "
2780 "superblock detected for %s.\n", sb->s_id); 2908 "superblock detected for %s.\n", sb->s_id);
2781 clear_buffer_write_io_error(sbh); 2909 clear_buffer_write_io_error(sbh);
2782 set_buffer_uptodate(sbh); 2910 set_buffer_uptodate(sbh);
2783 } 2911 }
2784 es->s_wtime = cpu_to_le32(get_seconds()); 2912 es->s_wtime = cpu_to_le32(get_seconds());
2785 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2913 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2786 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2914 &EXT4_SB(sb)->s_freeblocks_counter));
2915 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
2916 &EXT4_SB(sb)->s_freeinodes_counter));
2917
2787 BUFFER_TRACE(sbh, "marking dirty"); 2918 BUFFER_TRACE(sbh, "marking dirty");
2788 mark_buffer_dirty(sbh); 2919 mark_buffer_dirty(sbh);
2789 if (sync) { 2920 if (sync) {
2790 sync_dirty_buffer(sbh); 2921 sync_dirty_buffer(sbh);
2791 if (buffer_write_io_error(sbh)) { 2922 if (buffer_write_io_error(sbh)) {
2792 printk(KERN_ERR "ext4: I/O error while writing " 2923 printk(KERN_ERR "EXT4-fs: I/O error while writing "
2793 "superblock for %s.\n", sb->s_id); 2924 "superblock for %s.\n", sb->s_id);
2794 clear_buffer_write_io_error(sbh); 2925 clear_buffer_write_io_error(sbh);
2795 set_buffer_uptodate(sbh); 2926 set_buffer_uptodate(sbh);
@@ -2808,6 +2939,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2808{ 2939{
2809 journal_t *journal = EXT4_SB(sb)->s_journal; 2940 journal_t *journal = EXT4_SB(sb)->s_journal;
2810 2941
2942 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2943 BUG_ON(journal != NULL);
2944 return;
2945 }
2811 jbd2_journal_lock_updates(journal); 2946 jbd2_journal_lock_updates(journal);
2812 if (jbd2_journal_flush(journal) < 0) 2947 if (jbd2_journal_flush(journal) < 0)
2813 goto out; 2948 goto out;
@@ -2837,6 +2972,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
2837 int j_errno; 2972 int j_errno;
2838 const char *errstr; 2973 const char *errstr;
2839 2974
2975 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2976
2840 journal = EXT4_SB(sb)->s_journal; 2977 journal = EXT4_SB(sb)->s_journal;
2841 2978
2842 /* 2979 /*
@@ -2869,14 +3006,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
2869int ext4_force_commit(struct super_block *sb) 3006int ext4_force_commit(struct super_block *sb)
2870{ 3007{
2871 journal_t *journal; 3008 journal_t *journal;
2872 int ret; 3009 int ret = 0;
2873 3010
2874 if (sb->s_flags & MS_RDONLY) 3011 if (sb->s_flags & MS_RDONLY)
2875 return 0; 3012 return 0;
2876 3013
2877 journal = EXT4_SB(sb)->s_journal; 3014 journal = EXT4_SB(sb)->s_journal;
2878 sb->s_dirt = 0; 3015 if (journal) {
2879 ret = ext4_journal_force_commit(journal); 3016 sb->s_dirt = 0;
3017 ret = ext4_journal_force_commit(journal);
3018 }
3019
2880 return ret; 3020 return ret;
2881} 3021}
2882 3022
@@ -2888,9 +3028,13 @@ int ext4_force_commit(struct super_block *sb)
2888 */ 3028 */
2889static void ext4_write_super(struct super_block *sb) 3029static void ext4_write_super(struct super_block *sb)
2890{ 3030{
2891 if (mutex_trylock(&sb->s_lock) != 0) 3031 if (EXT4_SB(sb)->s_journal) {
2892 BUG(); 3032 if (mutex_trylock(&sb->s_lock) != 0)
2893 sb->s_dirt = 0; 3033 BUG();
3034 sb->s_dirt = 0;
3035 } else {
3036 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3037 }
2894} 3038}
2895 3039
2896static int ext4_sync_fs(struct super_block *sb, int wait) 3040static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2899,10 +3043,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2899 3043
2900 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3044 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2901 sb->s_dirt = 0; 3045 sb->s_dirt = 0;
2902 if (wait) 3046 if (EXT4_SB(sb)->s_journal) {
2903 ret = ext4_force_commit(sb); 3047 if (wait)
2904 else 3048 ret = ext4_force_commit(sb);
2905 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3049 else
3050 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
3051 } else {
3052 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3053 }
2906 return ret; 3054 return ret;
2907} 3055}
2908 3056
@@ -2917,15 +3065,17 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2917 if (!(sb->s_flags & MS_RDONLY)) { 3065 if (!(sb->s_flags & MS_RDONLY)) {
2918 journal_t *journal = EXT4_SB(sb)->s_journal; 3066 journal_t *journal = EXT4_SB(sb)->s_journal;
2919 3067
2920 /* Now we set up the journal barrier. */ 3068 if (journal) {
2921 jbd2_journal_lock_updates(journal); 3069 /* Now we set up the journal barrier. */
3070 jbd2_journal_lock_updates(journal);
2922 3071
2923 /* 3072 /*
2924 * We don't want to clear needs_recovery flag when we failed 3073 * We don't want to clear needs_recovery flag when we
2925 * to flush the journal. 3074 * failed to flush the journal.
2926 */ 3075 */
2927 if (jbd2_journal_flush(journal) < 0) 3076 if (jbd2_journal_flush(journal) < 0)
2928 return; 3077 return;
3078 }
2929 3079
2930 /* Journal blocked and flushed, clear needs_recovery flag. */ 3080 /* Journal blocked and flushed, clear needs_recovery flag. */
2931 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3081 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2939,7 +3089,7 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2939 */ 3089 */
2940static void ext4_unlockfs(struct super_block *sb) 3090static void ext4_unlockfs(struct super_block *sb)
2941{ 3091{
2942 if (!(sb->s_flags & MS_RDONLY)) { 3092 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
2943 lock_super(sb); 3093 lock_super(sb);
2944 /* Reser the needs_recovery flag before the fs is unlocked. */ 3094 /* Reser the needs_recovery flag before the fs is unlocked. */
2945 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3095 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2957,6 +3107,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2957 unsigned long old_sb_flags; 3107 unsigned long old_sb_flags;
2958 struct ext4_mount_options old_opts; 3108 struct ext4_mount_options old_opts;
2959 ext4_group_t g; 3109 ext4_group_t g;
3110 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2960 int err; 3111 int err;
2961#ifdef CONFIG_QUOTA 3112#ifdef CONFIG_QUOTA
2962 int i; 3113 int i;
@@ -2968,16 +3119,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2968 old_opts.s_resuid = sbi->s_resuid; 3119 old_opts.s_resuid = sbi->s_resuid;
2969 old_opts.s_resgid = sbi->s_resgid; 3120 old_opts.s_resgid = sbi->s_resgid;
2970 old_opts.s_commit_interval = sbi->s_commit_interval; 3121 old_opts.s_commit_interval = sbi->s_commit_interval;
3122 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3123 old_opts.s_max_batch_time = sbi->s_max_batch_time;
2971#ifdef CONFIG_QUOTA 3124#ifdef CONFIG_QUOTA
2972 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3125 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2973 for (i = 0; i < MAXQUOTAS; i++) 3126 for (i = 0; i < MAXQUOTAS; i++)
2974 old_opts.s_qf_names[i] = sbi->s_qf_names[i]; 3127 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2975#endif 3128#endif
3129 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
3130 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
2976 3131
2977 /* 3132 /*
2978 * Allow the "check" option to be passed as a remount option. 3133 * Allow the "check" option to be passed as a remount option.
2979 */ 3134 */
2980 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { 3135 if (!parse_options(data, sb, NULL, &journal_ioprio,
3136 &n_blocks_count, 1)) {
2981 err = -EINVAL; 3137 err = -EINVAL;
2982 goto restore_opts; 3138 goto restore_opts;
2983 } 3139 }
@@ -2990,7 +3146,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2990 3146
2991 es = sbi->s_es; 3147 es = sbi->s_es;
2992 3148
2993 ext4_init_journal_params(sb, sbi->s_journal); 3149 if (sbi->s_journal) {
3150 ext4_init_journal_params(sb, sbi->s_journal);
3151 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3152 }
2994 3153
2995 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3154 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2996 n_blocks_count > ext4_blocks_count(es)) { 3155 n_blocks_count > ext4_blocks_count(es)) {
@@ -3019,17 +3178,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3019 * We have to unlock super so that we can wait for 3178 * We have to unlock super so that we can wait for
3020 * transactions. 3179 * transactions.
3021 */ 3180 */
3022 unlock_super(sb); 3181 if (sbi->s_journal) {
3023 ext4_mark_recovery_complete(sb, es); 3182 unlock_super(sb);
3024 lock_super(sb); 3183 ext4_mark_recovery_complete(sb, es);
3184 lock_super(sb);
3185 }
3025 } else { 3186 } else {
3026 __le32 ret; 3187 int ret;
3027 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3188 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3028 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3189 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3029 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3190 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3030 "remount RDWR because of unsupported " 3191 "remount RDWR because of unsupported "
3031 "optional features (%x).\n", 3192 "optional features (%x).\n", sb->s_id,
3032 sb->s_id, le32_to_cpu(ret)); 3193 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3194 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3033 err = -EROFS; 3195 err = -EROFS;
3034 goto restore_opts; 3196 goto restore_opts;
3035 } 3197 }
@@ -3046,7 +3208,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3046 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3208 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3047 printk(KERN_ERR 3209 printk(KERN_ERR
3048 "EXT4-fs: ext4_remount: " 3210 "EXT4-fs: ext4_remount: "
3049 "Checksum for group %lu failed (%u!=%u)\n", 3211 "Checksum for group %u failed (%u!=%u)\n",
3050 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3212 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3051 le16_to_cpu(gdp->bg_checksum)); 3213 le16_to_cpu(gdp->bg_checksum));
3052 err = -EINVAL; 3214 err = -EINVAL;
@@ -3075,7 +3237,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3075 * been changed by e2fsck since we originally mounted 3237 * been changed by e2fsck since we originally mounted
3076 * the partition.) 3238 * the partition.)
3077 */ 3239 */
3078 ext4_clear_journal_err(sb, es); 3240 if (sbi->s_journal)
3241 ext4_clear_journal_err(sb, es);
3079 sbi->s_mount_state = le16_to_cpu(es->s_state); 3242 sbi->s_mount_state = le16_to_cpu(es->s_state);
3080 if ((err = ext4_group_extend(sb, es, n_blocks_count))) 3243 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3081 goto restore_opts; 3244 goto restore_opts;
@@ -3083,6 +3246,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3083 sb->s_flags &= ~MS_RDONLY; 3246 sb->s_flags &= ~MS_RDONLY;
3084 } 3247 }
3085 } 3248 }
3249 if (sbi->s_journal == NULL)
3250 ext4_commit_super(sb, es, 1);
3251
3086#ifdef CONFIG_QUOTA 3252#ifdef CONFIG_QUOTA
3087 /* Release old quota file names */ 3253 /* Release old quota file names */
3088 for (i = 0; i < MAXQUOTAS; i++) 3254 for (i = 0; i < MAXQUOTAS; i++)
@@ -3097,6 +3263,8 @@ restore_opts:
3097 sbi->s_resuid = old_opts.s_resuid; 3263 sbi->s_resuid = old_opts.s_resuid;
3098 sbi->s_resgid = old_opts.s_resgid; 3264 sbi->s_resgid = old_opts.s_resgid;
3099 sbi->s_commit_interval = old_opts.s_commit_interval; 3265 sbi->s_commit_interval = old_opts.s_commit_interval;
3266 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3267 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3100#ifdef CONFIG_QUOTA 3268#ifdef CONFIG_QUOTA
3101 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3269 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3102 for (i = 0; i < MAXQUOTAS; i++) { 3270 for (i = 0; i < MAXQUOTAS; i++) {
@@ -3359,7 +3527,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3359 * When we journal data on quota file, we have to flush journal to see 3527 * When we journal data on quota file, we have to flush journal to see
3360 * all updates to the file when we bypass pagecache... 3528 * all updates to the file when we bypass pagecache...
3361 */ 3529 */
3362 if (ext4_should_journal_data(path.dentry->d_inode)) { 3530 if (EXT4_SB(sb)->s_journal &&
3531 ext4_should_journal_data(path.dentry->d_inode)) {
3363 /* 3532 /*
3364 * We don't need to lock updates but journal_flush() could 3533 * We don't need to lock updates but journal_flush() could
3365 * otherwise be livelocked... 3534 * otherwise be livelocked...
@@ -3433,7 +3602,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3433 struct buffer_head *bh; 3602 struct buffer_head *bh;
3434 handle_t *handle = journal_current_handle(); 3603 handle_t *handle = journal_current_handle();
3435 3604
3436 if (!handle) { 3605 if (EXT4_SB(sb)->s_journal && !handle) {
3437 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3606 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3438 " cancelled because transaction is not started.\n", 3607 " cancelled because transaction is not started.\n",
3439 (unsigned long long)off, (unsigned long long)len); 3608 (unsigned long long)off, (unsigned long long)len);
@@ -3458,7 +3627,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3458 flush_dcache_page(bh->b_page); 3627 flush_dcache_page(bh->b_page);
3459 unlock_buffer(bh); 3628 unlock_buffer(bh);
3460 if (journal_quota) 3629 if (journal_quota)
3461 err = ext4_journal_dirty_metadata(handle, bh); 3630 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3462 else { 3631 else {
3463 /* Always do at least ordered writes for quotas */ 3632 /* Always do at least ordered writes for quotas */
3464 err = ext4_jbd2_file_inode(handle, inode); 3633 err = ext4_jbd2_file_inode(handle, inode);
@@ -3512,18 +3681,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3512static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, 3681static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3513 size_t cnt, loff_t *ppos) 3682 size_t cnt, loff_t *ppos)
3514{ 3683{
3515 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; 3684 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3516 char str[32]; 3685 char str[32];
3517 unsigned long value;
3518 3686
3519 if (cnt >= sizeof(str)) 3687 if (cnt >= sizeof(str))
3520 return -EINVAL; 3688 return -EINVAL;
3521 if (copy_from_user(str, buf, cnt)) 3689 if (copy_from_user(str, buf, cnt))
3522 return -EFAULT; 3690 return -EFAULT;
3523 value = simple_strtol(str, NULL, 0); 3691
3524 if (value < 0) 3692 *p = simple_strtoul(str, NULL, 0);
3525 return -ERANGE;
3526 *p = value;
3527 return cnt; 3693 return cnt;
3528} 3694}
3529 3695
@@ -3614,7 +3780,7 @@ static void __exit exit_ext4_fs(void)
3614} 3780}
3615 3781
3616MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3782MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3617MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); 3783MODULE_DESCRIPTION("Fourth Extended Filesystem");
3618MODULE_LICENSE("GPL"); 3784MODULE_LICENSE("GPL");
3619module_init(init_ext4_fs) 3785module_init(init_ext4_fs)
3620module_exit(exit_ext4_fs) 3786module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
459 sb->s_dirt = 1; 459 sb->s_dirt = 1;
460 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 460 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
461 } 461 }
462} 462}
463 463
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
488 } else { 488 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_journal_dirty_metadata(handle, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 handle->h_sync = 1; 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 DQUOT_FREE_BLOCK(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
724 if (error == -EIO) 724 if (error == -EIO)
725 goto bad_block; 725 goto bad_block;
726 if (!error) 726 if (!error)
727 error = ext4_journal_dirty_metadata(handle, 727 error = ext4_handle_dirty_metadata(handle,
728 bs->bh); 728 inode,
729 bs->bh);
729 if (error) 730 if (error)
730 goto cleanup; 731 goto cleanup;
731 goto inserted; 732 goto inserted;
@@ -794,8 +795,9 @@ inserted:
794 ea_bdebug(new_bh, "reusing; refcount now=%d", 795 ea_bdebug(new_bh, "reusing; refcount now=%d",
795 le32_to_cpu(BHDR(new_bh)->h_refcount)); 796 le32_to_cpu(BHDR(new_bh)->h_refcount));
796 unlock_buffer(new_bh); 797 unlock_buffer(new_bh);
797 error = ext4_journal_dirty_metadata(handle, 798 error = ext4_handle_dirty_metadata(handle,
798 new_bh); 799 inode,
800 new_bh);
799 if (error) 801 if (error)
800 goto cleanup_dquot; 802 goto cleanup_dquot;
801 } 803 }
@@ -810,8 +812,8 @@ inserted:
810 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 814 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode, 815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
814 goal, &error); 816 goal, NULL, &error);
815 if (error) 817 if (error)
816 goto cleanup; 818 goto cleanup;
817 ea_idebug(inode, "creating block %d", block); 819 ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
833 set_buffer_uptodate(new_bh); 835 set_buffer_uptodate(new_bh);
834 unlock_buffer(new_bh); 836 unlock_buffer(new_bh);
835 ext4_xattr_cache_insert(new_bh); 837 ext4_xattr_cache_insert(new_bh);
836 error = ext4_journal_dirty_metadata(handle, new_bh); 838 error = ext4_handle_dirty_metadata(handle,
839 inode, new_bh);
837 if (error) 840 if (error)
838 goto cleanup; 841 goto cleanup;
839 } 842 }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1040 */ 1043 */
1041 is.iloc.bh = NULL; 1044 is.iloc.bh = NULL;
1042 if (IS_SYNC(inode)) 1045 if (IS_SYNC(inode))
1043 handle->h_sync = 1; 1046 ext4_handle_sync(handle);
1044 } 1047 }
1045 1048
1046cleanup: 1049cleanup:
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..1a39ac370942 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29 29
30static int set_task_ioprio(struct task_struct *task, int ioprio) 30int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
70 task_unlock(task); 70 task_unlock(task);
71 return err; 71 return err;
72} 72}
73EXPORT_SYMBOL_GPL(set_task_ioprio);
73 74
74asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 75asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
75{ 76{
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c8a1bace685a..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -803,7 +829,7 @@ wait_for_iobuf:
803 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
804 } 830 }
805 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
806 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
807 833
808 if (err) 834 if (err)
809 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -981,14 +1007,23 @@ restart_loop:
981 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
982 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
983 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
984 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
985 1011
986 if (journal->j_commit_callback) 1012 /*
987 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the commit time higher than the average time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
988 1022
989 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
990 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
991 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
992 } else { 1027 } else {
993 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
994 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -1007,11 +1042,16 @@ restart_loop:
1007 } 1042 }
1008 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1009 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1010 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1011 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1012 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1013 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1014 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1015 1055
1016 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1017} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f6bff9d6f8df..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <asm/page.h> 42#include <asm/page.h>
43#include <asm/div64.h>
43 44
44EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
45EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -66,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
66EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
67EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
68EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
69EXPORT_SYMBOL(jbd2_journal_create);
70EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
71EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
72EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -132,8 +132,9 @@ static int kjournald2(void *arg)
132 journal->j_task = current; 132 journal->j_task = current;
133 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
134 134
135 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
136 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
137 138
138 /* 139 /*
139 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -650,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
650 return NULL; 651 return NULL;
651 652
652 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
653 lock_buffer(bh); 656 lock_buffer(bh);
654 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
655 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -843,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
843 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
844 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
845 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000));
846 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
847 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
848 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -980,6 +985,8 @@ static journal_t * journal_init_common (void)
980 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
981 986
982 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
983 990
984 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
985 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1035,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1035 1042
1036 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1037 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1038 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1039 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1040 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1041 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1042 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1043 __func__); 1051 __func__);
1044 kfree(journal); 1052 goto out_err;
1045 journal = NULL;
1046 goto out;
1047 } 1053 }
1048 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1049 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1053,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1053 p = journal->j_devname; 1059 p = journal->j_devname;
1054 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1055 *p = '!'; 1061 *p = '!';
1056 jbd2_stats_proc_init(journal);
1057 1062
1058 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1059 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1060 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1061 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1062out: 1072
1063 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1064} 1078}
1065 1079
1066/** 1080/**
@@ -1108,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1108 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1109 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1110 __func__); 1124 __func__);
1111 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1112 kfree(journal);
1113 return NULL;
1114 } 1126 }
1115 1127
1116 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1118,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1118 if (err) { 1130 if (err) {
1119 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1120 __func__); 1132 __func__);
1121 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1122 kfree(journal);
1123 return NULL;
1124 } 1134 }
1125 1135
1126 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1127 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1128 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1129 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1130 1145
1131 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1132} 1151}
1133 1152
1134/* 1153/*
@@ -1177,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1177} 1196}
1178 1197
1179/** 1198/**
1180 * int jbd2_journal_create() - Initialise the new journal file
1181 * @journal: Journal to create. This structure must have been initialised
1182 *
1183 * Given a journal_t structure which tells us which disk blocks we can
1184 * use, create a new journal superblock and initialise all of the
1185 * journal fields from scratch.
1186 **/
1187int jbd2_journal_create(journal_t *journal)
1188{
1189 unsigned long long blocknr;
1190 struct buffer_head *bh;
1191 journal_superblock_t *sb;
1192 int i, err;
1193
1194 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1195 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1196 journal->j_maxlen);
1197 journal_fail_superblock(journal);
1198 return -EINVAL;
1199 }
1200
1201 if (journal->j_inode == NULL) {
1202 /*
1203 * We don't know what block to start at!
1204 */
1205 printk(KERN_EMERG
1206 "%s: creation of journal on external device!\n",
1207 __func__);
1208 BUG();
1209 }
1210
1211 /* Zero out the entire journal on disk. We cannot afford to
1212 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1213 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1214 for (i = 0; i < journal->j_maxlen; i++) {
1215 err = jbd2_journal_bmap(journal, i, &blocknr);
1216 if (err)
1217 return err;
1218 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1219 lock_buffer(bh);
1220 memset (bh->b_data, 0, journal->j_blocksize);
1221 BUFFER_TRACE(bh, "marking dirty");
1222 mark_buffer_dirty(bh);
1223 BUFFER_TRACE(bh, "marking uptodate");
1224 set_buffer_uptodate(bh);
1225 unlock_buffer(bh);
1226 __brelse(bh);
1227 }
1228
1229 sync_blockdev(journal->j_dev);
1230 jbd_debug(1, "JBD: journal cleared.\n");
1231
1232 /* OK, fill in the initial static fields in the new superblock */
1233 sb = journal->j_superblock;
1234
1235 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1236 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1237
1238 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1239 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1240 sb->s_first = cpu_to_be32(1);
1241
1242 journal->j_transaction_sequence = 1;
1243
1244 journal->j_flags &= ~JBD2_ABORT;
1245 journal->j_format_version = 2;
1246
1247 return journal_reset(journal);
1248}
1249
1250/**
1251 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1252 * @journal: The journal to update. 1200 * @journal: The journal to update.
1253 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1491,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1491 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1492 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1493 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1494 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1495 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1496 } 1446 }
1497 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f925a4f3d05..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1240{ 1242{
1241 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1242 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1243 int old_handle_count, err; 1245 int err;
1244 pid_t pid; 1246 pid_t pid;
1245 1247
1246 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1263 /* 1265 /*
1264 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1265 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1266 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1267 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1268 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1269 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1270 * by 30x or more... 1272 * operations by 30x or more...
1273 *
1274 * We try and optimize the sleep time against what the
1275 * underlying disk can do, instead of having a static sleep
1276 * time. This is useful for the case where our storage is so
1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1271 * 1286 *
1272 * But don't do this if this process was the most recent one to 1287 * But don't do this if this process was the most recent one
1273 * perform a synchronous write. We do this to detect the case where a 1288 * to perform a synchronous write. We do this to detect the
1274 * single process is doing a stream of sync writes. No point in waiting 1289 * case where a single process is doing a stream of sync
1275 * for joiners in that case. 1290 * writes. No point in waiting for joiners in that case.
1276 */ 1291 */
1277 pid = current->pid; 1292 pid = current->pid;
1278 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1279 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1280 do { 1297
1281 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1282 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1283 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1284 } 1316 }
1285 1317
1286 current->journal_info = NULL; 1318 current->journal_info = NULL;
diff --git a/fs/super.c b/fs/super.c
index 7d67387496cb..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -810,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
810 } 810 }
811 811
812 s->s_flags |= MS_ACTIVE; 812 s->s_flags |= MS_ACTIVE;
813 bdev->bd_super = s;
813 } 814 }
814 815
815 return simple_set_mnt(mnt, s); 816 return simple_set_mnt(mnt, s);
@@ -829,6 +830,7 @@ void kill_block_super(struct super_block *sb)
829 struct block_device *bdev = sb->s_bdev; 830 struct block_device *bdev = sb->s_bdev;
830 fmode_t mode = sb->s_mode; 831 fmode_t mode = sb->s_mode;
831 832
833 bdev->bd_super = 0;
832 generic_shutdown_super(sb); 834 generic_shutdown_super(sb);
833 sync_blockdev(bdev); 835 sync_blockdev(bdev);
834 close_bdev_exclusive(bdev, mode); 836 close_bdev_exclusive(bdev, mode);
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index d76800f6ecf0..dd495b8c3091 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -378,6 +378,13 @@ struct ext3_inode {
378#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ 378#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
379 379
380/* 380/*
381 * Misc. filesystem flags
382 */
383#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
384#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
385#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
386
387/*
381 * Mount flags 388 * Mount flags
382 */ 389 */
383#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ 390#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
@@ -513,7 +520,23 @@ struct ext3_super_block {
513 __u16 s_reserved_word_pad; 520 __u16 s_reserved_word_pad;
514 __le32 s_default_mount_opts; 521 __le32 s_default_mount_opts;
515 __le32 s_first_meta_bg; /* First metablock block group */ 522 __le32 s_first_meta_bg; /* First metablock block group */
516 __u32 s_reserved[190]; /* Padding to the end of the block */ 523 __le32 s_mkfs_time; /* When the filesystem was created */
524 __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
525 /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
526/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
527 __le32 s_r_blocks_count_hi; /* Reserved blocks count */
528 __le32 s_free_blocks_count_hi; /* Free blocks count */
529 __le16 s_min_extra_isize; /* All inodes have at least # bytes */
530 __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
531 __le32 s_flags; /* Miscellaneous flags */
532 __le16 s_raid_stride; /* RAID stride */
533 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
534 __le64 s_mmp_block; /* Block for multi-mount protection */
535 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
536 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
537 __u8 s_reserved_char_pad2;
538 __le16 s_reserved_pad;
539 __u32 s_reserved[162]; /* Padding to the end of the block */
517}; 540};
518 541
519#ifdef __KERNEL__ 542#ifdef __KERNEL__
@@ -718,6 +741,9 @@ static inline __le16 ext3_rec_len_to_disk(unsigned len)
718#define DX_HASH_LEGACY 0 741#define DX_HASH_LEGACY 0
719#define DX_HASH_HALF_MD4 1 742#define DX_HASH_HALF_MD4 1
720#define DX_HASH_TEA 2 743#define DX_HASH_TEA 2
744#define DX_HASH_LEGACY_UNSIGNED 3
745#define DX_HASH_HALF_MD4_UNSIGNED 4
746#define DX_HASH_TEA_UNSIGNED 5
721 747
722#ifdef __KERNEL__ 748#ifdef __KERNEL__
723 749
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index 76fdc0f4b028..f07f34de2f0e 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -57,6 +57,7 @@ struct ext3_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e38a64d71eff..0b87b29f4797 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -565,6 +565,7 @@ struct address_space {
565struct block_device { 565struct block_device {
566 dev_t bd_dev; /* not a kdev_t - it's a search key */ 566 dev_t bd_dev; /* not a kdev_t - it's a search key */
567 struct inode * bd_inode; /* will die */ 567 struct inode * bd_inode; /* will die */
568 struct super_block * bd_super;
568 int bd_openers; 569 int bd_openers;
569 struct mutex bd_mutex; /* open/close mutex */ 570 struct mutex bd_mutex; /* open/close mutex */
570 struct semaphore bd_mount_sem; 571 struct semaphore bd_mount_sem;
@@ -1389,6 +1390,7 @@ struct super_operations {
1389 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 1390 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
1390 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 1391 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
1391#endif 1392#endif
1393 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
1392}; 1394};
1393 1395
1394/* 1396/*
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index f98a656b17e5..76dad4808847 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task)
86 */ 86 */
87extern int ioprio_best(unsigned short aprio, unsigned short bprio); 87extern int ioprio_best(unsigned short aprio, unsigned short bprio);
88 88
89extern int set_task_ioprio(struct task_struct *task, int ioprio);
90
89#endif 91#endif
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 34456476e761..b45109c61fba 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -638,6 +638,11 @@ struct transaction_s
638 unsigned long t_expires; 638 unsigned long t_expires;
639 639
640 /* 640 /*
641 * When this transaction started, in nanoseconds [no locking]
642 */
643 ktime_t t_start_time;
644
645 /*
641 * How many handles used this transaction? [t_handle_lock] 646 * How many handles used this transaction? [t_handle_lock]
642 */ 647 */
643 int t_handle_count; 648 int t_handle_count;
@@ -682,6 +687,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
682 return end + (MAX_JIFFY_OFFSET - start); 687 return end + (MAX_JIFFY_OFFSET - start);
683} 688}
684 689
690#define JBD2_NR_BATCH 64
691
685/** 692/**
686 * struct journal_s - The journal_s type is the concrete type associated with 693 * struct journal_s - The journal_s type is the concrete type associated with
687 * journal_t. 694 * journal_t.
@@ -826,6 +833,14 @@ struct journal_s
826 struct mutex j_checkpoint_mutex; 833 struct mutex j_checkpoint_mutex;
827 834
828 /* 835 /*
836 * List of buffer heads used by the checkpoint routine. This
837 * was moved from jbd2_log_do_checkpoint() to reduce stack
838 * usage. Access to this array is controlled by the
839 * j_checkpoint_mutex. [j_checkpoint_mutex]
840 */
841 struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH];
842
843 /*
829 * Journal head: identifies the first unused block in the journal. 844 * Journal head: identifies the first unused block in the journal.
830 * [j_state_lock] 845 * [j_state_lock]
831 */ 846 */
@@ -939,8 +954,26 @@ struct journal_s
939 struct buffer_head **j_wbuf; 954 struct buffer_head **j_wbuf;
940 int j_wbufsize; 955 int j_wbufsize;
941 956
957 /*
958 * this is the pid of the last person to run a synchronous operation
959 * through the journal
960 */
942 pid_t j_last_sync_writer; 961 pid_t j_last_sync_writer;
943 962
963 /*
964 * the average amount of time in nanoseconds it takes to commit a
965 * transaction to disk. [j_state_lock]
966 */
967 u64 j_average_commit_time;
968
969 /*
970 * minimum and maximum times that we should wait for
971 * additional filesystem operations to get batched into a
972 * synchronous handle in microseconds
973 */
974 u32 j_min_batch_time;
975 u32 j_max_batch_time;
976
944 /* This function is called when a transaction is closed */ 977 /* This function is called when a transaction is closed */
945 void (*j_commit_callback)(journal_t *, 978 void (*j_commit_callback)(journal_t *,
946 transaction_t *); 979 transaction_t *);
@@ -1102,7 +1135,6 @@ extern int jbd2_journal_set_features
1102 (journal_t *, unsigned long, unsigned long, unsigned long); 1135 (journal_t *, unsigned long, unsigned long, unsigned long);
1103extern void jbd2_journal_clear_features 1136extern void jbd2_journal_clear_features
1104 (journal_t *, unsigned long, unsigned long, unsigned long); 1137 (journal_t *, unsigned long, unsigned long, unsigned long);
1105extern int jbd2_journal_create (journal_t *);
1106extern int jbd2_journal_load (journal_t *journal); 1138extern int jbd2_journal_load (journal_t *journal);
1107extern int jbd2_journal_destroy (journal_t *); 1139extern int jbd2_journal_destroy (journal_t *);
1108extern int jbd2_journal_recover (journal_t *journal); 1140extern int jbd2_journal_recover (journal_t *journal);
@@ -1177,8 +1209,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1177int jbd2_log_do_checkpoint(journal_t *journal); 1209int jbd2_log_do_checkpoint(journal_t *journal);
1178 1210
1179void __jbd2_log_wait_for_space(journal_t *journal); 1211void __jbd2_log_wait_for_space(journal_t *journal);
1180extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); 1212extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
1181extern int jbd2_cleanup_journal_tail(journal_t *); 1213extern int jbd2_cleanup_journal_tail(journal_t *);
1182 1214
1183/* Debugging code only: */ 1215/* Debugging code only: */
1184 1216