author    Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 13:02:55 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 13:02:55 -0400
commit    69e1aaddd63104f37021d0b0f6abfd9623c9134c (patch)
tree      14ad49741b428d270b681694bb2df349465455b9 /fs
parent    56b59b429b4c26e5e730bc8c3d837de9f7d0a966 (diff)
parent    9d547c35799a4ddd235f1565cec2fff6c9263504 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates for 3.4 from Ted Ts'o:
 "Ext4 commits for 3.3 merge window; mostly cleanups and bug fixes

  The changes to export dirty_writeback_interval are from Artem's
  s_dirt cleanup patch series.  The same is true of the change to
  remove the s_dirt helper functions which never got used by anyone
  in-tree.  I've run these changes by Al Viro, and am carrying them so
  that Artem can more easily fix up the rest of the file systems during
  the next merge window.

  (Originally we had hoped to remove the use of s_dirt from ext4 during
  this merge window, but his patches had some bugs, so I ultimately
  ended up dropping them from the ext4 tree.)"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (66 commits)
  vfs: remove unused superblock helpers
  mm: export dirty_writeback_interval
  ext4: remove useless s_dirt assignment
  ext4: write superblock only once on unmount
  ext4: do not mark superblock as dirty unnecessarily
  ext4: correct ext4_punch_hole return codes
  ext4: remove restrictive checks for EOFBLOCKS_FL
  ext4: always set then trimmed blocks count into len
  ext4: fix trimmed block count accunting
  ext4: fix start and len arguments handling in ext4_trim_fs()
  ext4: update s_free_{inodes,blocks}_count during online resize
  ext4: change some printk() calls to use ext4_msg() instead
  ext4: avoid output message interleaving in ext4_error_<foo>()
  ext4: remove trailing newlines from ext4_msg() and ext4_error() messages
  ext4: add no_printk argument validation, fix fallout
  ext4: remove redundant "EXT4-fs: " from uses of ext4_msg
  ext4: give more helpful error message in ext4_ext_rm_leaf()
  ext4: remove unused code from ext4_ext_map_blocks()
  ext4: rewrite punch hole to use ext4_ext_remove_space()
  jbd2: cleanup journal tail after transaction commit
  ...
Diffstat (limited to 'fs')
-rw-r--r--  fs/ext4/balloc.c       |   63
-rw-r--r--  fs/ext4/dir.c          |   13
-rw-r--r--  fs/ext4/ext4.h         |   34
-rw-r--r--  fs/ext4/ext4_extents.h |    4
-rw-r--r--  fs/ext4/ext4_jbd2.h    |  128
-rw-r--r--  fs/ext4/extents.c      |  330
-rw-r--r--  fs/ext4/fsync.c        |    2
-rw-r--r--  fs/ext4/ialloc.c       |  260
-rw-r--r--  fs/ext4/inode.c        |   95
-rw-r--r--  fs/ext4/mballoc.c      |  342
-rw-r--r--  fs/ext4/mballoc.h      |   20
-rw-r--r--  fs/ext4/migrate.c      |    2
-rw-r--r--  fs/ext4/mmp.c          |    4
-rw-r--r--  fs/ext4/namei.c        |    2
-rw-r--r--  fs/ext4/page-io.c      |   18
-rw-r--r--  fs/ext4/resize.c       |   37
-rw-r--r--  fs/ext4/super.c        | 1075
-rw-r--r--  fs/ext4/xattr.c        |   25
-rw-r--r--  fs/jbd2/checkpoint.c   |  140
-rw-r--r--  fs/jbd2/commit.c       |   47
-rw-r--r--  fs/jbd2/journal.c      |  361
-rw-r--r--  fs/jbd2/recovery.c     |    5
-rw-r--r--  fs/jbd2/revoke.c       |   12
-rw-r--r--  fs/jbd2/transaction.c  |   48
24 files changed, 1479 insertions(+), 1588 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8cf711..4bbd07a6fa18 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@ err_out:
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
 	}
 
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
+	set_buffer_new(bh);
 	trace_ext4_read_block_bitmap_load(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
-		put_bh(bh);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
-			   block_group, bitmap_blk);
-		return NULL;
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
 	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
 	ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error,
-	 * continue with corrupt bitmap
-	 */
+	return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
 	return bh;
 }
 
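The read path is now split so callers can overlap bitmap I/O: ext4_read_block_bitmap_nowait() submits the read and returns immediately, and ext4_wait_block_bitmap() blocks until it completes. A sketch of a hypothetical caller (prefetch_bitmaps and its simplified error handling are illustrative only, not part of this series) that batches several groups:

static int prefetch_bitmaps(struct super_block *sb, ext4_group_t first,
			    int count, struct buffer_head **bhs)
{
	int i;

	/* fire off all the reads first, so they proceed in parallel */
	for (i = 0; i < count; i++) {
		bhs[i] = ext4_read_block_bitmap_nowait(sb, first + i);
		if (!bhs[i])
			return -EIO;	/* simplified error handling */
	}
	/* then wait for each buffer to become uptodate */
	for (i = 0; i < count; i++)
		if (ext4_wait_block_bitmap(sb, first + i, bhs[i]))
			return -EIO;
	return 0;
}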
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c56092e58..ad56866d729a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		return 0;
 
 	if (filp)
-		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
-		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
-		       "null fname?!?\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004fc3d84..ded731ac8a32 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
 		printk(KERN_DEBUG f, ## a);	\
 	} while (0)
 #else
-#define ext4_debug(f, a...)	do {} while (0)
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -184,6 +184,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
+#define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_IN_FSYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -192,18 +194,25 @@ struct ext4_io_page {
 
 #define MAX_IO_PAGES 128
 
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* page struct for buffer write */
+	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;
-	struct ext4_io_page	*pages[MAX_IO_PAGES];
+	int			num_io_pages;	/* for writepages() */
+	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -923,6 +932,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
 #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
@@ -941,7 +951,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 					    ext4_group_t block_group,
 					    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-					   ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						  ext4_group_t block_group);
 extern void ext4_init_block_bitmap(struct super_block *sb,
 				   struct buffer_head *bh,
 				   ext4_group_t group,
@@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a69a30..0f58b86e3a02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
-#define ext_debug(a...)		printk(a)
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
 #else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 /*
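Replacing the empty ext4_debug()/ext_debug() stubs with no_printk() keeps compile-time checking of the format string and its arguments even when debugging is disabled. A minimal userspace analogue (an assumption for illustration, not the kernel's actual definition) shows the idea:

/* userspace sketch of the no_printk() pattern */
#include <stdio.h>

#define no_printk(fmt, ...)				\
do {							\
	if (0)	/* never executes, but still type-checked */	\
		printf(fmt, ##__VA_ARGS__);		\
} while (0)

#ifdef EXT_DEBUG
#define ext_debug(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
	int len = 3;
	/* with an empty macro a mismatched "%d" argument would never be
	 * diagnosed in non-debug builds; with no_printk() it is */
	ext_debug("extent len %d\n", len);
	return 0;
}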
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1dab18..83b20fcf9400 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb)	(MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for use with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as its first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/* Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: journal callback data (internal and function private data struct)
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting
+ * for the next transaction to commit. No journaling functions can be used,
+ * or there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
 {
 	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 1;
-	return 0;
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
 }
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
 }
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 1;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
 /*
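The new callback entry is designed to be embedded as the first member of a caller-defined struct and recovered with container_of() in the completion function. A sketch of a hypothetical user (my_commit_cb, my_done and register_commit_cb are invented names; assumes the usual ext4 headers):

/* hypothetical caller of the new callback API, not part of this series */
struct my_commit_cb {
	struct ext4_journal_cb_entry jce;	/* must be the first member */
	unsigned long ino;			/* private payload */
};

static void my_done(struct super_block *sb,
		    struct ext4_journal_cb_entry *jce, int error)
{
	struct my_commit_cb *cb = container_of(jce, struct my_commit_cb, jce);

	/* runs in the journal thread, after the transaction has committed */
	pr_info("inode %lu committed, error=%d\n", cb->ino, error);
	kfree(cb);
}

static int register_commit_cb(handle_t *handle, struct inode *inode)
{
	struct my_commit_cb *cb = kzalloc(sizeof(*cb), GFP_NOFS);

	if (!cb)
		return -ENOMEM;
	cb->ino = inode->i_ino;
	ext4_journal_callback_add(handle, my_done, &cb->jce);
	return 0;
}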
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c292e1b..1421938e6792 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
 
 #include <trace/events/ext4.h>
 
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
+					due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
+
 static int ext4_split_extent(handle_t *handle,
 			     struct inode *inode,
 			     struct ext4_ext_path *path,
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,
 			     int split_flag,
 			     int flags);
 
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path *path,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
+	if (len == 0)
+		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	struct ext4_extent *ex;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
-	ext_debug("truncate since %u in leaf\n", start);
+	ext_debug("truncate since %u in leaf to %u\n", start, end);
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		ext_debug("  border %u:%u\n", a, b);
 
 		/* If this extent is beyond the end of the hole, skip it */
-		if (end <= ex_ee_block) {
+		if (end < ex_ee_block) {
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
 			continue;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
-			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
-					 start, end);
+			EXT4_ERROR_INODE(inode,
+					 "can not handle truncate %u:%u "
+					 "on extent %u:%u",
+					 start, end, ex_ee_block,
+					 ex_ee_block + ex_ee_len - 1);
 			err = -EIO;
 			goto out;
 		} else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	handle_t *handle;
 	int i, err;
 
-	ext_debug("truncate since %u\n", start);
+	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
 	handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@ again:
 	trace_ext4_ext_remove_space(inode, start, depth);
 
 	/*
+	 * Check if we are removing extents inside the extent tree. If that
+	 * is the case, we are going to punch a hole inside the extent tree
+	 * so we have to check whether we need to split the extent covering
+	 * the last block to remove so we can easily remove the part of it
+	 * in ext4_ext_rm_leaf().
+	 */
+	if (end < EXT_MAX_BLOCKS - 1) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block;
+
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, end, NULL);
+		if (IS_ERR(path)) {
+			ext4_journal_stop(handle);
+			return PTR_ERR(path);
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex)
+			goto cont;
+
+		ee_block = le32_to_cpu(ex->ee_block);
+
+		/*
+		 * See if the last block is inside the extent, if so split
+		 * the extent at 'end' block so we can easily remove the
+		 * tail of the first part of the split extent in
+		 * ext4_ext_rm_leaf().
+		 */
+		if (end >= ee_block &&
+		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+			int split_flag = 0;
+
+			if (ext4_ext_is_uninitialized(ex))
+				split_flag = EXT4_EXT_MARK_UNINIT1 |
+					     EXT4_EXT_MARK_UNINIT2;
+
+			/*
+			 * Split the extent in two so that 'end' is the last
+			 * block in the first new extent
+			 */
+			err = ext4_split_extent_at(handle, inode, path,
+						end + 1, split_flag,
+						EXT4_GET_BLOCKS_PRE_IO |
+						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+			if (err < 0)
+				goto out;
+		}
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+cont:
+
+	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
@@ -2515,6 +2591,7 @@ again:
 	}
 	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
+
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
@@ -2526,7 +2603,7 @@ again:
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
 					       &partial_cluster, start,
-					       EXT_MAX_BLOCKS - 1);
+					       end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
-		printk(KERN_INFO "EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled"
 #ifdef AGGRESSIVE_TEST
-		printk(", aggressive tests");
+		       ", aggressive tests"
 #endif
 #ifdef CHECK_BINSEARCH
-		printk(", check binsearch");
+		       ", check binsearch"
 #endif
 #ifdef EXTENTS_STATS
-		printk(", stats");
+		       ", stats"
 #endif
-		printk("\n");
+		       "\n");
 #endif
 #ifdef EXTENTS_STATS
 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 }
 
 /*
- * used by extent splitting.
- */
-#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
-					due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
-
-/*
  * ext4_split_extent_at() splits an extent at given block.
  *
  * @handle: the journal handle
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 
-	if (unlikely(!eh->eh_entries)) {
-		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
-				 "EOFBLOCKS_FL set");
-		return -EIO;
-	}
+	/*
+	 * We're going to remove EOFBLOCKS_FL entirely in future so we
+	 * do not care for this case anymore. Simply remove the flag
+	 * if there are no extents.
+	 */
+	if (unlikely(!eh->eh_entries))
+		goto out;
 	last_ex = EXT_LAST_EXTENT(eh);
 	/*
 	 * We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	for (i = depth-1; i >= 0; i--)
 		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
 			return 0;
+out:
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	return ext4_mark_inode_dirty(handle, inode);
 }
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
-	unsigned int punched_out = 0;
-	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 	ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
-	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
-	    ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
 			if ((sbi->s_cluster_ratio > 1) &&
 			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 		/* if found extent covers block, simply return it */
 		if (in_range(map->m_lblk, ee_block, ee_len)) {
-			struct ext4_map_blocks punch_map;
-			ext4_fsblk_t partial_cluster = 0;
-
 			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (map->m_lblk - ee_block);
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
-				/*
-				 * Do not put uninitialized extent
-				 * in the cache
-				 */
-				if (!ext4_ext_is_uninitialized(ex)) {
-					ext4_ext_put_in_cache(inode, ee_block,
-						ee_len, ee_start);
-					goto out;
-				}
-				ret = ext4_ext_handle_uninitialized_extents(
-					handle, inode, map, path, flags,
-					allocated, newblock);
-				return ret;
-			}
-
-			/*
-			 * Punch out the map length, but only to the
-			 * end of the extent
-			 */
-			punched_out = allocated < map->m_len ?
-				allocated : map->m_len;
-
 			/*
-			 * Sense extents need to be converted to
-			 * uninitialized, they must fit in an
-			 * uninitialized extent
+			 * Do not put uninitialized extent
+			 * in the cache
 			 */
-			if (punched_out > EXT_UNINIT_MAX_LEN)
-				punched_out = EXT_UNINIT_MAX_LEN;
-
-			punch_map.m_lblk = map->m_lblk;
-			punch_map.m_pblk = newblock;
-			punch_map.m_len = punched_out;
-			punch_map.m_flags = 0;
-
-			/* Check to see if the extent needs to be split */
-			if (punch_map.m_len != ee_len ||
-			    punch_map.m_lblk != ee_block) {
-
-				ret = ext4_split_extent(handle, inode,
-						path, &punch_map, 0,
-						EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-						EXT4_GET_BLOCKS_PRE_IO);
-
-				if (ret < 0) {
-					err = ret;
-					goto out2;
-				}
-				/*
-				 * find extent for the block at
-				 * the start of the hole
-				 */
-				ext4_ext_drop_refs(path);
-				kfree(path);
-
-				path = ext4_ext_find_extent(inode,
-							    map->m_lblk, NULL);
-				if (IS_ERR(path)) {
-					err = PTR_ERR(path);
-					path = NULL;
-					goto out2;
-				}
-
-				depth = ext_depth(inode);
-				ex = path[depth].p_ext;
-				ee_len = ext4_ext_get_actual_len(ex);
-				ee_block = le32_to_cpu(ex->ee_block);
-				ee_start = ext4_ext_pblock(ex);
-
-			}
-
-			ext4_ext_mark_uninitialized(ex);
-
-			ext4_ext_invalidate_cache(inode);
-
-			err = ext4_ext_rm_leaf(handle, inode, path,
-					       &partial_cluster, map->m_lblk,
-					       map->m_lblk + punched_out);
-
-			if (!err && path->p_hdr->eh_entries == 0) {
-				/*
-				 * Punch hole freed all of this sub tree,
-				 * so we need to correct eh_depth
-				 */
-				err = ext4_ext_get_access(handle, inode, path);
-				if (err == 0) {
-					ext_inode_hdr(inode)->eh_depth = 0;
-					ext_inode_hdr(inode)->eh_max =
-						cpu_to_le16(ext4_ext_space_root(
-							inode, 0));
-
-					err = ext4_ext_dirty(
-						handle, inode, path);
-				}
+			if (!ext4_ext_is_uninitialized(ex)) {
+				ext4_ext_put_in_cache(inode, ee_block,
+						      ee_len, ee_start);
+				goto out;
 			}
-
-			goto out2;
+			ret = ext4_ext_handle_uninitialized_extents(
+				handle, inode, map, path, flags,
+				allocated, newblock);
+			return ret;
 		}
 	}
 
@@ -4165,13 +4146,11 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
-	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
-		 punched_out : allocated;
 
 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-		newblock, map->m_len, err ? err : result);
+		newblock, map->m_len, err ? err : allocated);
 
-	return err ? err : result;
+	return err ? err : allocated;
 }
 
 void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block);
+	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 
 	/* In a multi-transaction truncate, we only make the final
 	 * transaction synchronous.
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 					    EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_map_blocks "
-			       "returned error inode#%lu, block=%u, "
-			       "max_blocks=%u", __func__,
-			       inode->i_ino, map.m_lblk, map.m_len);
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "%s:%d: inode #%lu: block %u: len %u: "
+				 "ext4_ext_map_blocks returned %d",
+				 __func__, __LINE__, inode->i_ino, map.m_lblk,
+				 map.m_len, ret);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_ext_cache cache_ex;
-	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	struct ext4_map_blocks map;
 	handle_t *handle;
 	loff_t first_page, last_page, page_len;
 	loff_t first_page_offset, last_page_offset;
-	int ret, credits, blocks_released, err = 0;
+	int credits, err = 0;
 
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 			offset;
 	}
 
-	first_block = (offset + sb->s_blocksize - 1) >>
-		EXT4_BLOCK_SIZE_BITS(sb);
-	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
 	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
 
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		}
 	}
 
-
 	/*
 	 * If i_size is contained in the last page, we need to
 	 * unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		}
 	}
 
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
 	/* If there are no blocks to remove, return now */
-	if (first_block >= last_block)
+	if (first_block >= stop_block)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 	ext4_discard_preallocations(inode);
 
-	/*
-	 * Loop over all the blocks and identify blocks
-	 * that need to be punched out
-	 */
-	iblock = first_block;
-	blocks_released = 0;
-	while (iblock < last_block) {
-		max_blocks = last_block - iblock;
-		num_blocks = 1;
-		memset(&map, 0, sizeof(map));
-		map.m_lblk = iblock;
-		map.m_len = max_blocks;
-		ret = ext4_ext_map_blocks(handle, inode, &map,
-					  EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
-
-		if (ret > 0) {
-			blocks_released += ret;
-			num_blocks = ret;
-		} else if (ret == 0) {
-			/*
-			 * If map blocks could not find the block,
-			 * then it is in a hole. If the hole was
-			 * not already cached, then map blocks should
-			 * put it in the cache. So we can get the hole
-			 * out of the cache
-			 */
-			memset(&cache_ex, 0, sizeof(cache_ex));
-			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
-				!cache_ex.ec_start) {
-
-				/* The hole is cached */
-				num_blocks = cache_ex.ec_block +
-					cache_ex.ec_len - iblock;
-
-			} else {
-				/* The block could not be identified */
-				err = -EIO;
-				break;
-			}
-		} else {
-			/* Map blocks error */
-			err = ret;
-			break;
-		}
-
-		if (num_blocks == 0) {
-			/* This condition should never happen */
-			ext_debug("Block lookup failed");
-			err = -EIO;
-			break;
-		}
-
-		iblock += num_blocks;
-	}
+	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 
-	if (blocks_released > 0) {
-		ext4_ext_invalidate_cache(inode);
-		ext4_discard_preallocations(inode);
-	}
+	ext4_ext_invalidate_cache(inode);
+	ext4_discard_preallocations(inode);
 
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
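With ext4_ext_remove_space() now taking an end block, punch hole reduces to computing the range of whole blocks covered by the byte range; partial blocks at either edge are zeroed through the page cache rather than freed. A standalone sketch of that arithmetic (example values only):

#include <stdio.h>

int main(void)
{
	unsigned long long offset = 5000, length = 9000; /* example values */
	unsigned int blocksize = 4096;

	/* first whole block inside the hole: round the start up */
	unsigned long long first_block = (offset + blocksize - 1) / blocksize;
	/* first block past the hole: round the end down */
	unsigned long long stop_block = (offset + length) / blocksize;

	if (first_block >= stop_block)
		printf("nothing to remove, hole fits inside partial blocks\n");
	else
		/* corresponds to ext4_ext_remove_space(inode, first_block,
		 * stop_block - 1) in the patch above */
		printf("remove blocks %llu..%llu\n", first_block,
		       stop_block - 1);
	return 0;
}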
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753efd..bb6c7d811313 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
 		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		list_del_init(&io->list);
+		io->flag |= EXT4_IO_END_IN_FSYNC;
 		/*
 		 * Calling ext4_end_io_nolock() to convert completed
 		 * IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
 		if (ret < 0)
 			ret2 = ret;
 		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+		io->flag &= ~EXT4_IO_END_IN_FSYNC;
 	}
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c9781ad9..409c2ee7750a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 	return EXT4_INODES_PER_GROUP(sb);
 }
 
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		set_bitmap_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
 /*
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
 	trace_ext4_load_inode_bitmap(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_error(sb, "Cannot read inode bitmap - "
 			   "block_group = %u, inode_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
 	}
 	return bh;
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
 
-	if (atomic_read(&inode->i_count) > 1) {
-		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
-		       atomic_read(&inode->i_count));
+	if (!sb) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+		       "nonexistent device\n", __func__, __LINE__);
 		return;
 	}
-	if (inode->i_nlink) {
-		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
-		       inode->i_nlink);
+	if (atomic_read(&inode->i_count) > 1) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+			 __func__, __LINE__, inode->i_ino,
+			 atomic_read(&inode->i_count));
 		return;
 	}
-	if (!sb) {
-		printk(KERN_ERR "ext4_free_inode: inode on "
-		       "nonexistent device\n");
+	if (inode->i_nlink) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
 		return;
 	}
 	sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
-			struct buffer_head *inode_bitmap_bh,
-			unsigned long ino, ext4_group_t group, umode_t mode)
-{
-	int free = 0, retval = 0, count;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
-	/*
-	 * We have to be sure that new inode allocation does not race with
-	 * inode table initialization, because otherwise we may end up
-	 * allocating and writing new inode right before sb_issue_zeroout
-	 * takes place and overwriting our new inode with zeroes. So we
-	 * take alloc_sem to prevent it.
-	 */
-	down_read(&grp->alloc_sem);
-	ext4_lock_group(sb, group);
-	if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
-		/* not a free inode */
-		retval = 1;
-		goto err_ret;
-	}
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-			ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_unlock_group(sb, group);
-		up_read(&grp->alloc_sem);
-		ext4_error(sb, "reserved inode or inode > inodes count - "
-			   "block_group = %u, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		return 1;
-	}
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				ext4_itable_unused_count(sb, gdp);
-		}
-
-		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to update the bg_itable_unused count
-		 *
-		 */
-		if (ino > free)
-			ext4_itable_unused_set(sb, gdp,
-					(EXT4_INODES_PER_GROUP(sb) - ino));
-	}
-	count = ext4_free_inodes_count(sb, gdp) - 1;
-	ext4_free_inodes_set(sb, gdp, count);
-	if (S_ISDIR(mode)) {
-		count = ext4_used_dirs_count(sb, gdp) + 1;
-		ext4_used_dirs_set(sb, gdp, count);
-		if (sbi->s_log_groups_per_flex) {
-			ext4_group_t f = ext4_flex_group(sbi, group);
-
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
-		}
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
-	ext4_unlock_group(sb, group);
-	up_read(&grp->alloc_sem);
-	return retval;
-}
-
-/*
  * There are two policies for allocating an inode. If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@ got_group:
 	if (ret2 == -1)
 		goto out;
 
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
 	for (i = 0; i < ngroups; i++, ino = 0) {
 		err = -EIO;
 
@@ -757,51 +685,24 @@ repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-
-		if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
-			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-							    inode_bitmap_bh);
-			if (err)
-				goto fail;
-
-			BUFFER_TRACE(group_desc_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-							    group_desc_bh);
-			if (err)
-				goto fail;
-			if (!ext4_claim_inode(sb, inode_bitmap_bh,
-					      ino, group, mode)) {
-				/* we won it */
-				BUFFER_TRACE(inode_bitmap_bh,
-					     "call ext4_handle_dirty_metadata");
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-								 inode_bitmap_bh);
-				if (err)
-					goto fail;
-				/* zero bit is inode number 1*/
-				ino++;
-				goto got;
-			}
-			/* we lost it */
-			ext4_handle_release_buffer(handle, inode_bitmap_bh);
-			ext4_handle_release_buffer(handle, group_desc_bh);
-
-			if (++ino < EXT4_INODES_PER_GROUP(sb))
-				goto repeat_in_this_group;
+		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
 		}
-
-		/*
-		 * This case is possible in concurrent environment. It is very
-		 * rare. We cannot repeat the find_group_xxx() call because
-		 * that will simply return the same blockgroup, because the
-		 * group descriptor metadata has not yet been updated.
-		 * So we just go onto the next blockgroup.
-		 */
-		if (++group == ngroups)
-			group = 0;
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got; /* we grabbed the inode! */
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
 	}
 	err = -ENOSPC;
 	goto out;
@@ -838,6 +739,59 @@ got:
 		if (err)
 			goto fail;
 	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err)
+		goto fail;
+
+	/* Update the relevant bg descriptor fields */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to update the bg_itable_unused count
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	}
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+		ext4_unlock_group(sb, group);
+	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err)
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1101 * where it is called from on active part of filesystem is ext4lazyinit 1055 * where it is called from on active part of filesystem is ext4lazyinit
1102 * thread, so we do not need any special locks, however we have to prevent 1056 * thread, so we do not need any special locks, however we have to prevent
1103 * inode allocation from the current group, so we take alloc_sem lock, to 1057 * inode allocation from the current group, so we take alloc_sem lock, to
1104 * block ext4_claim_inode until we are finished. 1058 * block ext4_new_inode() until we are finished.
1105 */ 1059 */
1106int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1060int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1107 int barrier) 1061 int barrier)
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1149 sbi->s_inodes_per_block); 1103 sbi->s_inodes_per_block);
1150 1104
1151 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { 1105 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1152 ext4_error(sb, "Something is wrong with group %u\n" 1106 ext4_error(sb, "Something is wrong with group %u: "
1153 "Used itable blocks: %d" 1107 "used itable blocks: %d; "
1154 "itable unused count: %u\n", 1108 "itable unused count: %u",
1155 group, used_blks, 1109 group, used_blks,
1156 ext4_itable_unused_count(sb, gdp)); 1110 ext4_itable_unused_count(sb, gdp));
1157 ret = 1; 1111 ret = 1;
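
[Note: The ialloc.c hunks above replace the old claim sequence (journal write access on both buffers before every attempt, ext4_claim_inode(), release on loss) with an optimistic scan: find a candidate bit, decide the winner with an atomic test-and-set under the group lock, and take journal credits only once the inode is actually won. A minimal userspace sketch of that claim loop follows; GCC atomics stand in for ext4_test_and_set_bit() plus the group lock, and the bitmap word, claim_inode() and the constants are illustrative, not the kernel's.]

    #include <stdint.h>
    #include <stdio.h>

    #define INODES_PER_GROUP 64

    /* One 64-bit word stands in for the group's on-disk inode bitmap. */
    static uint64_t bitmap;

    static int find_next_zero_bit(uint64_t map, int start)
    {
            for (int i = start; i < INODES_PER_GROUP; i++)
                    if (!(map & (1ULL << i)))
                            return i;
            return INODES_PER_GROUP;
    }

    /* Returns a 1-based inode number, or -1 when the group is exhausted. */
    static int claim_inode(void)
    {
            int ino = 0;

    repeat_in_this_group:
            ino = find_next_zero_bit(__atomic_load_n(&bitmap, __ATOMIC_RELAXED),
                                     ino);
            if (ino >= INODES_PER_GROUP)
                    return -1;      /* caller would advance to the next group */

            /* atomic test-and-set replaces the old claim-under-journal dance */
            if (__atomic_fetch_or(&bitmap, 1ULL << ino, __ATOMIC_ACQ_REL)
                & (1ULL << ino)) {
                    if (++ino < INODES_PER_GROUP)
                            goto repeat_in_this_group;
                    return -1;      /* lost every race in this group */
            }
            return ino + 1;         /* the inode bitmap is zero-based */
    }

    int main(void)
    {
            printf("claimed inode %d\n", claim_inode());
            printf("claimed inode %d\n", claim_inode());
            return 0;
    }

[Losers simply advance to the next bit, which is why the reserved-inode check and the group-advance logic can now live outside any journal state.]
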
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82fe629d..c77b0bd2c711 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
272 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 272 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
273 if (unlikely(used > ei->i_reserved_data_blocks)) { 273 if (unlikely(used > ei->i_reserved_data_blocks)) {
274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
275 "with only %d reserved data blocks\n", 275 "with only %d reserved data blocks",
276 __func__, inode->i_ino, used, 276 __func__, inode->i_ino, used,
277 ei->i_reserved_data_blocks); 277 ei->i_reserved_data_blocks);
278 WARN_ON(1); 278 WARN_ON(1);
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1165 */ 1165 */
1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1167 "ino %lu, to_free %d with only %d reserved " 1167 "ino %lu, to_free %d with only %d reserved "
1168 "data blocks\n", inode->i_ino, to_free, 1168 "data blocks", inode->i_ino, to_free,
1169 ei->i_reserved_data_blocks); 1169 ei->i_reserved_data_blocks);
1170 WARN_ON(1); 1170 WARN_ON(1);
1171 to_free = ei->i_reserved_data_blocks; 1171 to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1428static void ext4_print_free_blocks(struct inode *inode) 1428static void ext4_print_free_blocks(struct inode *inode)
1429{ 1429{
1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1431 printk(KERN_CRIT "Total free blocks count %lld\n", 1431 struct super_block *sb = inode->i_sb;
1432
1433 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1432 EXT4_C2B(EXT4_SB(inode->i_sb), 1434 EXT4_C2B(EXT4_SB(inode->i_sb),
1433 ext4_count_free_clusters(inode->i_sb))); 1435 ext4_count_free_clusters(inode->i_sb)));
1434 printk(KERN_CRIT "Free/Dirty block details\n"); 1436 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1435 printk(KERN_CRIT "free_blocks=%lld\n", 1437 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1436 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1438 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1437 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1439 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1438 printk(KERN_CRIT "dirty_blocks=%lld\n", 1440 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1439 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1441 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1440 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1442 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1441 printk(KERN_CRIT "Block reservation details\n"); 1443 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1442 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1444 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1443 EXT4_I(inode)->i_reserved_data_blocks); 1445 EXT4_I(inode)->i_reserved_data_blocks);
1444 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 1446 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1445 EXT4_I(inode)->i_reserved_meta_blocks); 1447 EXT4_I(inode)->i_reserved_meta_blocks);
1446 return; 1448 return;
1447} 1449}
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,
2482 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2483 2485
2484 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2486 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2485 if (ext4_should_order_data(inode)) { 2487 switch (ext4_inode_journal_mode(inode)) {
2488 case EXT4_INODE_ORDERED_DATA_MODE:
2486 return ext4_ordered_write_end(file, mapping, pos, 2489 return ext4_ordered_write_end(file, mapping, pos,
2487 len, copied, page, fsdata); 2490 len, copied, page, fsdata);
2488 } else if (ext4_should_writeback_data(inode)) { 2491 case EXT4_INODE_WRITEBACK_DATA_MODE:
2489 return ext4_writeback_write_end(file, mapping, pos, 2492 return ext4_writeback_write_end(file, mapping, pos,
2490 len, copied, page, fsdata); 2493 len, copied, page, fsdata);
2491 } else { 2494 default:
2492 BUG(); 2495 BUG();
2493 } 2496 }
2494 } 2497 }
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2763 goto out; 2766 goto out;
2764 2767
2765 ext_debug("ext4_end_io_dio(): io_end 0x%p " 2768 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2769 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
2767 iocb->private, io_end->inode->i_ino, iocb, offset, 2770 iocb->private, io_end->inode->i_ino, iocb, offset,
2768 size); 2771 size);
2769 2772
@@ -2795,9 +2798,6 @@ out:
2795 2798
2796 /* queue the work to convert unwritten extents to written */ 2799 /* queue the work to convert unwritten extents to written */
2797 queue_work(wq, &io_end->work); 2800 queue_work(wq, &io_end->work);
2798
2799 /* XXX: probably should move into the real I/O completion handler */
2800 inode_dio_done(inode);
2801} 2801}
2802 2802
2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2811 goto out; 2811 goto out;
2812 2812
2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2814 printk("sb umounted, discard end_io request for inode %lu\n", 2814 ext4_msg(io_end->inode->i_sb, KERN_INFO,
2815 io_end->inode->i_ino); 2815 "sb umounted, discard end_io request for inode %lu",
2816 io_end->inode->i_ino);
2816 ext4_free_io_end(io_end); 2817 ext4_free_io_end(io_end);
2817 goto out; 2818 goto out;
2818 } 2819 }
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2921 iocb->private = NULL; 2922 iocb->private = NULL;
2922 EXT4_I(inode)->cur_aio_dio = NULL; 2923 EXT4_I(inode)->cur_aio_dio = NULL;
2923 if (!is_sync_kiocb(iocb)) { 2924 if (!is_sync_kiocb(iocb)) {
2924 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 2925 ext4_io_end_t *io_end =
2925 if (!iocb->private) 2926 ext4_init_io_end(inode, GFP_NOFS);
2927 if (!io_end)
2926 return -ENOMEM; 2928 return -ENOMEM;
2929 io_end->flag |= EXT4_IO_END_DIRECT;
2930 iocb->private = io_end;
2927 /* 2931 /*
2928 * we save the io structure for current async 2932 * we save the io structure for current async
2929 * direct IO, so that later ext4_map_blocks() 2933 * direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2940 ext4_get_block_write, 2944 ext4_get_block_write,
2941 ext4_end_io_dio, 2945 ext4_end_io_dio,
2942 NULL, 2946 NULL,
2943 DIO_LOCKING | DIO_SKIP_HOLES); 2947 DIO_LOCKING);
2944 if (iocb->private) 2948 if (iocb->private)
2945 EXT4_I(inode)->cur_aio_dio = NULL; 2949 EXT4_I(inode)->cur_aio_dio = NULL;
2946 /* 2950 /*
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {
3086 3090
3087void ext4_set_aops(struct inode *inode) 3091void ext4_set_aops(struct inode *inode)
3088{ 3092{
3089 if (ext4_should_order_data(inode) && 3093 switch (ext4_inode_journal_mode(inode)) {
3090 test_opt(inode->i_sb, DELALLOC)) 3094 case EXT4_INODE_ORDERED_DATA_MODE:
3091 inode->i_mapping->a_ops = &ext4_da_aops; 3095 if (test_opt(inode->i_sb, DELALLOC))
3092 else if (ext4_should_order_data(inode)) 3096 inode->i_mapping->a_ops = &ext4_da_aops;
3093 inode->i_mapping->a_ops = &ext4_ordered_aops; 3097 else
3094 else if (ext4_should_writeback_data(inode) && 3098 inode->i_mapping->a_ops = &ext4_ordered_aops;
3095 test_opt(inode->i_sb, DELALLOC)) 3099 break;
3096 inode->i_mapping->a_ops = &ext4_da_aops; 3100 case EXT4_INODE_WRITEBACK_DATA_MODE:
3097 else if (ext4_should_writeback_data(inode)) 3101 if (test_opt(inode->i_sb, DELALLOC))
3098 inode->i_mapping->a_ops = &ext4_writeback_aops; 3102 inode->i_mapping->a_ops = &ext4_da_aops;
3099 else 3103 else
3104 inode->i_mapping->a_ops = &ext4_writeback_aops;
3105 break;
3106 case EXT4_INODE_JOURNAL_DATA_MODE:
3100 inode->i_mapping->a_ops = &ext4_journalled_aops; 3107 inode->i_mapping->a_ops = &ext4_journalled_aops;
3108 break;
3109 default:
3110 BUG();
3111 }
3101} 3112}
3102 3113
3103 3114
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3329{ 3340{
3330 struct inode *inode = file->f_path.dentry->d_inode; 3341 struct inode *inode = file->f_path.dentry->d_inode;
3331 if (!S_ISREG(inode->i_mode)) 3342 if (!S_ISREG(inode->i_mode))
3332 return -ENOTSUPP; 3343 return -EOPNOTSUPP;
3333 3344
3334 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3345 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3335 /* TODO: Add support for non extent hole punching */ 3346 /* TODO: Add support for non extent hole punching */
3336 return -ENOTSUPP; 3347 return -EOPNOTSUPP;
3337 } 3348 }
3338 3349
3339 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { 3350 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3340 /* TODO: Add support for bigalloc file systems */ 3351 /* TODO: Add support for bigalloc file systems */
3341 return -ENOTSUPP; 3352 return -EOPNOTSUPP;
3342 } 3353 }
3343 3354
3344 return ext4_ext_punch_hole(file, offset, length); 3355 return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,
3924 ext4_update_dynamic_rev(sb); 3935 ext4_update_dynamic_rev(sb);
3925 EXT4_SET_RO_COMPAT_FEATURE(sb, 3936 EXT4_SET_RO_COMPAT_FEATURE(sb,
3926 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 3937 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3927 sb->s_dirt = 1;
3928 ext4_handle_sync(handle); 3938 ext4_handle_sync(handle);
3929 err = ext4_handle_dirty_metadata(handle, NULL, 3939 err = ext4_handle_dirty_super(handle, sb);
3930 EXT4_SB(sb)->s_sbh);
3931 } 3940 }
3932 } 3941 }
3933 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3942 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4152 } 4161 }
4153 4162
4154 if (attr->ia_valid & ATTR_SIZE) { 4163 if (attr->ia_valid & ATTR_SIZE) {
4155 if (attr->ia_size != i_size_read(inode)) { 4164 if (attr->ia_size != i_size_read(inode))
4156 truncate_setsize(inode, attr->ia_size); 4165 truncate_setsize(inode, attr->ia_size);
4157 ext4_truncate(inode); 4166 ext4_truncate(inode);
4158 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
4159 ext4_truncate(inode);
4160 } 4167 }
4161 4168
4162 if (!rc) { 4169 if (!rc) {
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4314{ 4321{
4315 int err = 0; 4322 int err = 0;
4316 4323
4317 if (test_opt(inode->i_sb, I_VERSION)) 4324 if (IS_I_VERSION(inode))
4318 inode_inc_iversion(inode); 4325 inode_inc_iversion(inode);
4319 4326
4320 /* the do_update_inode consumes one bh->b_count */ 4327 /* the do_update_inode consumes one bh->b_count */
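
[Note: Two of the inode.c hunks (ext4_da_write_end and ext4_set_aops) trade chained ext4_should_*_data() predicates for a single switch on ext4_inode_journal_mode(), so each data-journaling mode is handled exactly once and an unexpected value hits BUG(). A compact sketch of that dispatch shape; the enum values mirror the patch, while the function and the returned strings are stand-ins.]

    #include <stdio.h>

    enum journal_mode {
            INODE_JOURNAL_DATA_MODE,
            INODE_ORDERED_DATA_MODE,
            INODE_WRITEBACK_DATA_MODE,
    };

    static const char *pick_aops(enum journal_mode mode, int delalloc)
    {
            switch (mode) {
            case INODE_ORDERED_DATA_MODE:
                    return delalloc ? "ext4_da_aops" : "ext4_ordered_aops";
            case INODE_WRITEBACK_DATA_MODE:
                    return delalloc ? "ext4_da_aops" : "ext4_writeback_aops";
            case INODE_JOURNAL_DATA_MODE:
                    return "ext4_journalled_aops"; /* delalloc is irrelevant */
            }
            return NULL; /* unreachable; the kernel BUG()s here */
    }

    int main(void)
    {
            printf("%s\n", pick_aops(INODE_ORDERED_DATA_MODE, 1));
            return 0;
    }
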
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b21c698..99ab428bcfa0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
21 * mballoc.c contains the multiblocks allocation routines 21 * mballoc.c contains the multiblocks allocation routines
22 */ 22 */
23 23
24#include "ext4_jbd2.h"
24#include "mballoc.h" 25#include "mballoc.h"
25#include <linux/debugfs.h> 26#include <linux/debugfs.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -339,7 +340,7 @@
339 */ 340 */
340static struct kmem_cache *ext4_pspace_cachep; 341static struct kmem_cache *ext4_pspace_cachep;
341static struct kmem_cache *ext4_ac_cachep; 342static struct kmem_cache *ext4_ac_cachep;
342static struct kmem_cache *ext4_free_ext_cachep; 343static struct kmem_cache *ext4_free_data_cachep;
343 344
344/* We create slab caches for groupinfo data structures based on the 345/* We create slab caches for groupinfo data structures based on the
345 * superblock block size. There will be one per mounted filesystem for 346 * superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
357 ext4_group_t group); 358 ext4_group_t group);
358static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 359static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
359 ext4_group_t group); 360 ext4_group_t group);
360static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 361static void ext4_free_data_callback(struct super_block *sb,
362 struct ext4_journal_cb_entry *jce, int rc);
361 363
362static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 364static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
363{ 365{
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
425{ 427{
426 char *bb; 428 char *bb;
427 429
428 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 430 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
429 BUG_ON(max == NULL); 431 BUG_ON(max == NULL);
430 432
431 if (order > e4b->bd_blkbits + 1) { 433 if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
436 /* at order 0 we see each particular block */ 438 /* at order 0 we see each particular block */
437 if (order == 0) { 439 if (order == 0) {
438 *max = 1 << (e4b->bd_blkbits + 3); 440 *max = 1 << (e4b->bd_blkbits + 3);
439 return EXT4_MB_BITMAP(e4b); 441 return e4b->bd_bitmap;
440 } 442 }
441 443
442 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 444 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
443 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 445 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
444 446
445 return bb; 447 return bb;
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
588 for (j = 0; j < (1 << order); j++) { 590 for (j = 0; j < (1 << order); j++) {
589 k = (i * (1 << order)) + j; 591 k = (i * (1 << order)) + j;
590 MB_CHECK_ASSERT( 592 MB_CHECK_ASSERT(
591 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 593 !mb_test_bit(k, e4b->bd_bitmap));
592 } 594 }
593 count++; 595 count++;
594 } 596 }
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 int groups_per_page; 784 int groups_per_page;
783 int err = 0; 785 int err = 0;
784 int i; 786 int i;
785 ext4_group_t first_group; 787 ext4_group_t first_group, group;
786 int first_block; 788 int first_block;
787 struct super_block *sb; 789 struct super_block *sb;
788 struct buffer_head *bhs; 790 struct buffer_head *bhs;
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
806 808
807 /* allocate buffer_heads to read bitmaps */ 809 /* allocate buffer_heads to read bitmaps */
808 if (groups_per_page > 1) { 810 if (groups_per_page > 1) {
809 err = -ENOMEM;
810 i = sizeof(struct buffer_head *) * groups_per_page; 811 i = sizeof(struct buffer_head *) * groups_per_page;
811 bh = kzalloc(i, GFP_NOFS); 812 bh = kzalloc(i, GFP_NOFS);
812 if (bh == NULL) 813 if (bh == NULL) {
814 err = -ENOMEM;
813 goto out; 815 goto out;
816 }
814 } else 817 } else
815 bh = &bhs; 818 bh = &bhs;
816 819
817 first_group = page->index * blocks_per_page / 2; 820 first_group = page->index * blocks_per_page / 2;
818 821
819 /* read all groups the page covers into the cache */ 822 /* read all groups the page covers into the cache */
820 for (i = 0; i < groups_per_page; i++) { 823 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
821 struct ext4_group_desc *desc; 824 if (group >= ngroups)
822
823 if (first_group + i >= ngroups)
824 break; 825 break;
825 826
826 grinfo = ext4_get_group_info(sb, first_group + i); 827 grinfo = ext4_get_group_info(sb, group);
827 /* 828 /*
828 * If page is uptodate then we came here after online resize 829 * If page is uptodate then we came here after online resize
829 * which added some new uninitialized group info structs, so 830 * which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
834 bh[i] = NULL; 835 bh[i] = NULL;
835 continue; 836 continue;
836 } 837 }
837 838 if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
838 err = -EIO; 839 err = -ENOMEM;
839 desc = ext4_get_group_desc(sb, first_group + i, NULL);
840 if (desc == NULL)
841 goto out;
842
843 err = -ENOMEM;
844 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
845 if (bh[i] == NULL)
846 goto out; 840 goto out;
847
848 if (bitmap_uptodate(bh[i]))
849 continue;
850
851 lock_buffer(bh[i]);
852 if (bitmap_uptodate(bh[i])) {
853 unlock_buffer(bh[i]);
854 continue;
855 }
856 ext4_lock_group(sb, first_group + i);
857 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
858 ext4_init_block_bitmap(sb, bh[i],
859 first_group + i, desc);
860 set_bitmap_uptodate(bh[i]);
861 set_buffer_uptodate(bh[i]);
862 ext4_unlock_group(sb, first_group + i);
863 unlock_buffer(bh[i]);
864 continue;
865 } 841 }
866 ext4_unlock_group(sb, first_group + i); 842 mb_debug(1, "read bitmap for group %u\n", group);
867 if (buffer_uptodate(bh[i])) {
868 /*
869 * if not uninit if bh is uptodate,
870 * bitmap is also uptodate
871 */
872 set_bitmap_uptodate(bh[i]);
873 unlock_buffer(bh[i]);
874 continue;
875 }
876 get_bh(bh[i]);
877 /*
878 * submit the buffer_head for read. We can
879 * safely mark the bitmap as uptodate now.
880 * We do it here so the bitmap uptodate bit
881 * get set with buffer lock held.
882 */
883 set_bitmap_uptodate(bh[i]);
884 bh[i]->b_end_io = end_buffer_read_sync;
885 submit_bh(READ, bh[i]);
886 mb_debug(1, "read bitmap for group %u\n", first_group + i);
887 } 843 }
888 844
889 /* wait for I/O completion */ 845 /* wait for I/O completion */
890 for (i = 0; i < groups_per_page; i++) 846 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
891 if (bh[i]) 847 if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
892 wait_on_buffer(bh[i]); 848 err = -EIO;
893
894 err = -EIO;
895 for (i = 0; i < groups_per_page; i++)
896 if (bh[i] && !buffer_uptodate(bh[i]))
897 goto out; 849 goto out;
850 }
851 }
898 852
899 err = 0;
900 first_block = page->index * blocks_per_page; 853 first_block = page->index * blocks_per_page;
901 for (i = 0; i < blocks_per_page; i++) { 854 for (i = 0; i < blocks_per_page; i++) {
902 int group; 855 int group;
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1250 int order = 1; 1203 int order = 1;
1251 void *bb; 1204 void *bb;
1252 1205
1253 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1206 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1254 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1207 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1255 1208
1256 bb = EXT4_MB_BUDDY(e4b); 1209 bb = e4b->bd_buddy;
1257 while (order <= e4b->bd_blkbits + 1) { 1210 while (order <= e4b->bd_blkbits + 1) {
1258 block = block >> 1; 1211 block = block >> 1;
1259 if (!mb_test_bit(block, bb)) { 1212 if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1323 1276
1324 /* let's maintain fragments counter */ 1277 /* let's maintain fragments counter */
1325 if (first != 0) 1278 if (first != 0)
1326 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1279 block = !mb_test_bit(first - 1, e4b->bd_bitmap);
1327 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1280 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1328 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1281 max = !mb_test_bit(first + count, e4b->bd_bitmap);
1329 if (block && max) 1282 if (block && max)
1330 e4b->bd_info->bb_fragments--; 1283 e4b->bd_info->bb_fragments--;
1331 else if (!block && !max) 1284 else if (!block && !max)
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1336 block = first++; 1289 block = first++;
1337 order = 0; 1290 order = 0;
1338 1291
1339 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1292 if (!mb_test_bit(block, e4b->bd_bitmap)) {
1340 ext4_fsblk_t blocknr; 1293 ext4_fsblk_t blocknr;
1341 1294
1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1295 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1347 "freeing already freed block " 1300 "freeing already freed block "
1348 "(bit %u)", block); 1301 "(bit %u)", block);
1349 } 1302 }
1350 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1303 mb_clear_bit(block, e4b->bd_bitmap);
1351 e4b->bd_info->bb_counters[order]++; 1304 e4b->bd_info->bb_counters[order]++;
1352 1305
1353 /* start of the buddy */ 1306 /* start of the buddy */
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1429 break; 1382 break;
1430 1383
1431 next = (block + 1) * (1 << order); 1384 next = (block + 1) * (1 << order);
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1385 if (mb_test_bit(next, e4b->bd_bitmap))
1433 break; 1386 break;
1434 1387
1435 order = mb_find_order_for_block(e4b, next); 1388 order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1466 1419
1467 /* let's maintain fragments counter */ 1420 /* let's maintain fragments counter */
1468 if (start != 0) 1421 if (start != 0)
1469 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1422 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1470 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1423 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1471 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1424 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1472 if (mlen && max) 1425 if (mlen && max)
1473 e4b->bd_info->bb_fragments++; 1426 e4b->bd_info->bb_fragments++;
1474 else if (!mlen && !max) 1427 else if (!mlen && !max)
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1511 } 1464 }
1512 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1465 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1513 1466
1514 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1467 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1515 mb_check_buddy(e4b); 1468 mb_check_buddy(e4b);
1516 1469
1517 return ret; 1470 return ret;
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1810 struct ext4_buddy *e4b) 1763 struct ext4_buddy *e4b)
1811{ 1764{
1812 struct super_block *sb = ac->ac_sb; 1765 struct super_block *sb = ac->ac_sb;
1813 void *bitmap = EXT4_MB_BITMAP(e4b); 1766 void *bitmap = e4b->bd_bitmap;
1814 struct ext4_free_extent ex; 1767 struct ext4_free_extent ex;
1815 int i; 1768 int i;
1816 int free; 1769 int free;
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1870{ 1823{
1871 struct super_block *sb = ac->ac_sb; 1824 struct super_block *sb = ac->ac_sb;
1872 struct ext4_sb_info *sbi = EXT4_SB(sb); 1825 struct ext4_sb_info *sbi = EXT4_SB(sb);
1873 void *bitmap = EXT4_MB_BITMAP(e4b); 1826 void *bitmap = e4b->bd_bitmap;
1874 struct ext4_free_extent ex; 1827 struct ext4_free_extent ex;
1875 ext4_fsblk_t first_group_block; 1828 ext4_fsblk_t first_group_block;
1876 ext4_fsblk_t a; 1829 ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2224 EXT4_DESC_PER_BLOCK_BITS(sb); 2177 EXT4_DESC_PER_BLOCK_BITS(sb);
2225 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2178 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2226 if (meta_group_info == NULL) { 2179 if (meta_group_info == NULL) {
2227 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " 2180 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2228 "for a buddy group"); 2181 "for a buddy group");
2229 goto exit_meta_group_info; 2182 goto exit_meta_group_info;
2230 } 2183 }
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2238 2191
2239 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2192 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2240 if (meta_group_info[i] == NULL) { 2193 if (meta_group_info[i] == NULL) {
2241 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); 2194 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2242 goto exit_group_info; 2195 goto exit_group_info;
2243 } 2196 }
2244 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2197 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2475 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2523 &ext4_mb_seq_groups_fops, sb); 2476 &ext4_mb_seq_groups_fops, sb);
2524 2477
2525 if (sbi->s_journal)
2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0; 2478 return 0;
2529 2479
2530out_free_locality_groups: 2480out_free_locality_groups:
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
2637 * This function is called by the jbd2 layer once the commit has finished, 2587 * This function is called by the jbd2 layer once the commit has finished,
2638 * so we know we can free the blocks that were released with that commit. 2588 * so we know we can free the blocks that were released with that commit.
2639 */ 2589 */
2640static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2590static void ext4_free_data_callback(struct super_block *sb,
2591 struct ext4_journal_cb_entry *jce,
2592 int rc)
2641{ 2593{
2642 struct super_block *sb = journal->j_private; 2594 struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2643 struct ext4_buddy e4b; 2595 struct ext4_buddy e4b;
2644 struct ext4_group_info *db; 2596 struct ext4_group_info *db;
2645 int err, count = 0, count2 = 0; 2597 int err, count = 0, count2 = 0;
2646 struct ext4_free_data *entry;
2647 struct list_head *l, *ltmp;
2648 2598
2649 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2599 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2650 entry = list_entry(l, struct ext4_free_data, list); 2600 entry->efd_count, entry->efd_group, entry);
2651 2601
2652 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2602 if (test_opt(sb, DISCARD))
2653 entry->count, entry->group, entry); 2603 ext4_issue_discard(sb, entry->efd_group,
2604 entry->efd_start_cluster, entry->efd_count);
2654 2605
2655 if (test_opt(sb, DISCARD)) 2606 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2656 ext4_issue_discard(sb, entry->group, 2607 /* we expect to find existing buddy because it's pinned */
2657 entry->start_cluster, entry->count); 2608 BUG_ON(err != 0);
2658 2609
2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2660 /* we expect to find existing buddy because it's pinned */
2661 BUG_ON(err != 0);
2662 2610
2663 db = e4b.bd_info; 2611 db = e4b.bd_info;
2664 /* there are blocks to put in buddy to make them really free */ 2612 /* there are blocks to put in buddy to make them really free */
2665 count += entry->count; 2613 count += entry->efd_count;
2666 count2++; 2614 count2++;
2667 ext4_lock_group(sb, entry->group); 2615 ext4_lock_group(sb, entry->efd_group);
2668 /* Take it out of per group rb tree */ 2616 /* Take it out of per group rb tree */
2669 rb_erase(&entry->node, &(db->bb_free_root)); 2617 rb_erase(&entry->efd_node, &(db->bb_free_root));
2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); 2618 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2671 2619
2672 /* 2620 /*
2673 * Clear the trimmed flag for the group so that the next 2621 * Clear the trimmed flag for the group so that the next
2674 * ext4_trim_fs can trim it. 2622 * ext4_trim_fs can trim it.
2675 * If the volume is mounted with -o discard, online discard 2623 * If the volume is mounted with -o discard, online discard
2676 * is supported and the free blocks will be trimmed online. 2624 * is supported and the free blocks will be trimmed online.
2677 */ 2625 */
2678 if (!test_opt(sb, DISCARD)) 2626 if (!test_opt(sb, DISCARD))
2679 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2627 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2680 2628
2681 if (!db->bb_free_root.rb_node) { 2629 if (!db->bb_free_root.rb_node) {
2682 /* No more items in the per group rb tree 2630 /* No more items in the per group rb tree
2683 * balance refcounts from ext4_mb_free_metadata() 2631 * balance refcounts from ext4_mb_free_metadata()
2684 */ 2632 */
2685 page_cache_release(e4b.bd_buddy_page); 2633 page_cache_release(e4b.bd_buddy_page);
2686 page_cache_release(e4b.bd_bitmap_page); 2634 page_cache_release(e4b.bd_bitmap_page);
2687 }
2688 ext4_unlock_group(sb, entry->group);
2689 kmem_cache_free(ext4_free_ext_cachep, entry);
2690 ext4_mb_unload_buddy(&e4b);
2691 } 2635 }
2636 ext4_unlock_group(sb, entry->efd_group);
2637 kmem_cache_free(ext4_free_data_cachep, entry);
2638 ext4_mb_unload_buddy(&e4b);
2692 2639
2693 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2640 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2694} 2641}
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)
2741 return -ENOMEM; 2688 return -ENOMEM;
2742 } 2689 }
2743 2690
2744 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2691 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2745 SLAB_RECLAIM_ACCOUNT); 2692 SLAB_RECLAIM_ACCOUNT);
2746 if (ext4_free_ext_cachep == NULL) { 2693 if (ext4_free_data_cachep == NULL) {
2747 kmem_cache_destroy(ext4_pspace_cachep); 2694 kmem_cache_destroy(ext4_pspace_cachep);
2748 kmem_cache_destroy(ext4_ac_cachep); 2695 kmem_cache_destroy(ext4_ac_cachep);
2749 return -ENOMEM; 2696 return -ENOMEM;
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)
2761 rcu_barrier(); 2708 rcu_barrier();
2762 kmem_cache_destroy(ext4_pspace_cachep); 2709 kmem_cache_destroy(ext4_pspace_cachep);
2763 kmem_cache_destroy(ext4_ac_cachep); 2710 kmem_cache_destroy(ext4_ac_cachep);
2764 kmem_cache_destroy(ext4_free_ext_cachep); 2711 kmem_cache_destroy(ext4_free_data_cachep);
2765 ext4_groupinfo_destroy_slabs(); 2712 ext4_groupinfo_destroy_slabs();
2766 ext4_remove_debugfs_entry(); 2713 ext4_remove_debugfs_entry();
2767} 2714}
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2762 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2816 if (!ext4_data_block_valid(sbi, block, len)) { 2763 if (!ext4_data_block_valid(sbi, block, len)) {
2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2764 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2818 "fs metadata\n", block, block+len); 2765 "fs metadata", block, block+len);
2819 /* File system mounted not to panic on error 2766 /* File system mounted not to panic on error
2820 * Fix the bitmap and repeat the block allocation 2767 * Fix the bitmap and repeat the block allocation
2821 * We leak some of the blocks here. 2768 * We leak some of the blocks here.
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2858 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2912 int bsbits, max; 2859 int bsbits, max;
2913 ext4_lblk_t end; 2860 ext4_lblk_t end;
2914 loff_t size, orig_size, start_off; 2861 loff_t size, start_off;
2862 loff_t orig_size __maybe_unused;
2915 ext4_lblk_t start; 2863 ext4_lblk_t start;
2916 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2864 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2917 struct ext4_prealloc_space *pa; 2865 struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3321 n = rb_first(&(grp->bb_free_root)); 3269 n = rb_first(&(grp->bb_free_root));
3322 3270
3323 while (n) { 3271 while (n) {
3324 entry = rb_entry(n, struct ext4_free_data, node); 3272 entry = rb_entry(n, struct ext4_free_data, efd_node);
3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count); 3273 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3326 n = rb_next(n); 3274 n = rb_next(n);
3327 } 3275 }
3328 return; 3276 return;
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3864 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3917 return; 3865 return;
3918 3866
3919 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" 3867 ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3920 " Allocation context details:"); 3868 " Allocation context details:");
3921 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", 3869 ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3922 ac->ac_status, ac->ac_flags); 3870 ac->ac_status, ac->ac_flags);
3923 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " 3871 ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3924 "goal %lu/%lu/%lu@%lu, " 3872 "goal %lu/%lu/%lu@%lu, "
3925 "best %lu/%lu/%lu@%lu cr %d", 3873 "best %lu/%lu/%lu@%lu cr %d",
3926 (unsigned long)ac->ac_o_ex.fe_group, 3874 (unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3936 (unsigned long)ac->ac_b_ex.fe_len, 3884 (unsigned long)ac->ac_b_ex.fe_len,
3937 (unsigned long)ac->ac_b_ex.fe_logical, 3885 (unsigned long)ac->ac_b_ex.fe_logical,
3938 (int)ac->ac_criteria); 3886 (int)ac->ac_criteria);
3939 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", 3887 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
3940 ac->ac_ex_scanned, ac->ac_found); 3888 ac->ac_ex_scanned, ac->ac_found);
3941 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); 3889 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
3942 ngroups = ext4_get_groups_count(sb); 3890 ngroups = ext4_get_groups_count(sb);
3943 for (i = 0; i < ngroups; i++) { 3891 for (i = 0; i < ngroups; i++) {
3944 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3892 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@ out:
4428static int can_merge(struct ext4_free_data *entry1, 4376static int can_merge(struct ext4_free_data *entry1,
4429 struct ext4_free_data *entry2) 4377 struct ext4_free_data *entry2)
4430{ 4378{
4431 if ((entry1->t_tid == entry2->t_tid) && 4379 if ((entry1->efd_tid == entry2->efd_tid) &&
4432 (entry1->group == entry2->group) && 4380 (entry1->efd_group == entry2->efd_group) &&
4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) 4381 ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4434 return 1; 4382 return 1;
4435 return 0; 4383 return 0;
4436} 4384}
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4452 BUG_ON(e4b->bd_bitmap_page == NULL); 4400 BUG_ON(e4b->bd_bitmap_page == NULL);
4453 BUG_ON(e4b->bd_buddy_page == NULL); 4401 BUG_ON(e4b->bd_buddy_page == NULL);
4454 4402
4455 new_node = &new_entry->node; 4403 new_node = &new_entry->efd_node;
4456 cluster = new_entry->start_cluster; 4404 cluster = new_entry->efd_start_cluster;
4457 4405
4458 if (!*n) { 4406 if (!*n) {
4459 /* first free block extent. We need to 4407
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4466 } 4414 }
4467 while (*n) { 4415 while (*n) {
4468 parent = *n; 4416 parent = *n;
4469 entry = rb_entry(parent, struct ext4_free_data, node); 4417 entry = rb_entry(parent, struct ext4_free_data, efd_node);
4470 if (cluster < entry->start_cluster) 4418 if (cluster < entry->efd_start_cluster)
4471 n = &(*n)->rb_left; 4419 n = &(*n)->rb_left;
4472 else if (cluster >= (entry->start_cluster + entry->count)) 4420 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4473 n = &(*n)->rb_right; 4421 n = &(*n)->rb_right;
4474 else { 4422 else {
4475 ext4_grp_locked_error(sb, group, 0, 4423 ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4486 /* Now try to see the extent can be merged to left and right */ 4434 /* Now try to see the extent can be merged to left and right */
4487 node = rb_prev(new_node); 4435 node = rb_prev(new_node);
4488 if (node) { 4436 if (node) {
4489 entry = rb_entry(node, struct ext4_free_data, node); 4437 entry = rb_entry(node, struct ext4_free_data, efd_node);
4490 if (can_merge(entry, new_entry)) { 4438 if (can_merge(entry, new_entry)) {
4491 new_entry->start_cluster = entry->start_cluster; 4439 new_entry->efd_start_cluster = entry->efd_start_cluster;
4492 new_entry->count += entry->count; 4440 new_entry->efd_count += entry->efd_count;
4493 rb_erase(node, &(db->bb_free_root)); 4441 rb_erase(node, &(db->bb_free_root));
4494 spin_lock(&sbi->s_md_lock); 4442 ext4_journal_callback_del(handle, &entry->efd_jce);
4495 list_del(&entry->list); 4443 kmem_cache_free(ext4_free_data_cachep, entry);
4496 spin_unlock(&sbi->s_md_lock);
4497 kmem_cache_free(ext4_free_ext_cachep, entry);
4498 } 4444 }
4499 } 4445 }
4500 4446
4501 node = rb_next(new_node); 4447 node = rb_next(new_node);
4502 if (node) { 4448 if (node) {
4503 entry = rb_entry(node, struct ext4_free_data, node); 4449 entry = rb_entry(node, struct ext4_free_data, efd_node);
4504 if (can_merge(new_entry, entry)) { 4450 if (can_merge(new_entry, entry)) {
4505 new_entry->count += entry->count; 4451 new_entry->efd_count += entry->efd_count;
4506 rb_erase(node, &(db->bb_free_root)); 4452 rb_erase(node, &(db->bb_free_root));
4507 spin_lock(&sbi->s_md_lock); 4453 ext4_journal_callback_del(handle, &entry->efd_jce);
4508 list_del(&entry->list); 4454 kmem_cache_free(ext4_free_data_cachep, entry);
4509 spin_unlock(&sbi->s_md_lock);
4510 kmem_cache_free(ext4_free_ext_cachep, entry);
4511 } 4455 }
4512 } 4456 }
4513 /* Add the extent to transaction's private list */ 4457 /* Add the extent to transaction's private list */
4514 spin_lock(&sbi->s_md_lock); 4458 ext4_journal_callback_add(handle, ext4_free_data_callback,
4515 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4459 &new_entry->efd_jce);
4516 spin_unlock(&sbi->s_md_lock);
4517 return 0; 4460 return 0;
4518} 4461}
4519 4462
@@ -4691,15 +4634,15 @@ do_more:
4691 * blocks being freed are metadata. These blocks shouldn't 4634
4692 * be used until this transaction is committed 4635 * be used until this transaction is committed
4693 */ 4636 */
4694 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4637 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4695 if (!new_entry) { 4638 if (!new_entry) {
4696 err = -ENOMEM; 4639 err = -ENOMEM;
4697 goto error_return; 4640 goto error_return;
4698 } 4641 }
4699 new_entry->start_cluster = bit; 4642 new_entry->efd_start_cluster = bit;
4700 new_entry->group = block_group; 4643 new_entry->efd_group = block_group;
4701 new_entry->count = count_clusters; 4644 new_entry->efd_count = count_clusters;
4702 new_entry->t_tid = handle->h_transaction->t_tid; 4645 new_entry->efd_tid = handle->h_transaction->t_tid;
4703 4646
4704 ext4_lock_group(sb, block_group); 4647 ext4_lock_group(sb, block_group);
4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4648 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4971 start = (e4b.bd_info->bb_first_free > start) ? 4914 start = (e4b.bd_info->bb_first_free > start) ?
4972 e4b.bd_info->bb_first_free : start; 4915 e4b.bd_info->bb_first_free : start;
4973 4916
4974 while (start < max) { 4917 while (start <= max) {
4975 start = mb_find_next_zero_bit(bitmap, max, start); 4918 start = mb_find_next_zero_bit(bitmap, max + 1, start);
4976 if (start >= max) 4919 if (start > max)
4977 break; 4920 break;
4978 next = mb_find_next_bit(bitmap, max, start); 4921 next = mb_find_next_bit(bitmap, max + 1, start);
4979 4922
4980 if ((next - start) >= minblocks) { 4923 if ((next - start) >= minblocks) {
4981 ext4_trim_extent(sb, start, 4924 ext4_trim_extent(sb, start,
@@ -5027,37 +4970,36 @@ out:
5027int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4970int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5028{ 4971{
5029 struct ext4_group_info *grp; 4972 struct ext4_group_info *grp;
5030 ext4_group_t first_group, last_group; 4973 ext4_group_t group, first_group, last_group;
5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 4974 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5033 uint64_t start, len, minlen, trimmed = 0; 4975 uint64_t start, end, minlen, trimmed = 0;
5034 ext4_fsblk_t first_data_blk = 4976 ext4_fsblk_t first_data_blk =
5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4977 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4978 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5036 int ret = 0; 4979 int ret = 0;
5037 4980
5038 start = range->start >> sb->s_blocksize_bits; 4981 start = range->start >> sb->s_blocksize_bits;
5039 len = range->len >> sb->s_blocksize_bits; 4982 end = start + (range->len >> sb->s_blocksize_bits) - 1;
5040 minlen = range->minlen >> sb->s_blocksize_bits; 4983 minlen = range->minlen >> sb->s_blocksize_bits;
5041 4984
5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) 4985 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4986 unlikely(start >= max_blks))
5043 return -EINVAL; 4987 return -EINVAL;
5044 if (start + len <= first_data_blk) 4988 if (end >= max_blks)
4989 end = max_blks - 1;
4990 if (end <= first_data_blk)
5045 goto out; 4991 goto out;
5046 if (start < first_data_blk) { 4992 if (start < first_data_blk)
5047 len -= first_data_blk - start;
5048 start = first_data_blk; 4993 start = first_data_blk;
5049 }
5050 4994
5051 /* Determine first and last group to examine based on start and len */ 4995 /* Determine first and last group to examine based on start and end */
5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4996 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5053 &first_group, &first_cluster); 4997 &first_group, &first_cluster);
5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 4998 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5055 &last_group, &last_cluster); 4999 &last_group, &last_cluster);
5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
5058 5000
5059 if (first_group > last_group) 5001 /* end now represents the last cluster to discard in this group */
5060 return -EINVAL; 5002 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5061 5003
5062 for (group = first_group; group <= last_group; group++) { 5004 for (group = first_group; group <= last_group; group++) {
5063 grp = ext4_get_group_info(sb, group); 5005 grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5069 } 5011 }
5070 5012
5071 /* 5013 /*
5072 * For all the groups except the last one, last block will 5014 * For all the groups except the last one, last cluster will
5073 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to 5015 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5074 * change it for the last group in which case start + 5016 * change it for the last group; note that last_cluster is
5075 * len < EXT4_BLOCKS_PER_GROUP(sb). 5017 * already computed earlier by ext4_get_group_no_and_offset()
5076 */ 5018 */
5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) 5019 if (group == last_group)
5078 last_cluster = first_cluster + len; 5020 end = last_cluster;
5079 len -= last_cluster - first_cluster;
5080 5021
5081 if (grp->bb_free >= minlen) { 5022 if (grp->bb_free >= minlen) {
5082 cnt = ext4_trim_all_free(sb, group, first_cluster, 5023 cnt = ext4_trim_all_free(sb, group, first_cluster,
5083 last_cluster, minlen); 5024 end, minlen);
5084 if (cnt < 0) { 5025 if (cnt < 0) {
5085 ret = cnt; 5026 ret = cnt;
5086 break; 5027 break;
5087 } 5028 }
5029 trimmed += cnt;
5088 } 5030 }
5089 trimmed += cnt; 5031
5032 /*
5033 * For every group except the first one, we are sure
5034 * that the first cluster to discard will be cluster #0.
5035 */
5090 first_cluster = 0; 5036 first_cluster = 0;
5091 } 5037 }
5092 range->len = trimmed * sb->s_blocksize;
5093 5038
5094 if (!ret) 5039 if (!ret)
5095 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5040 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5096 5041
5097out: 5042out:
5043 range->len = trimmed * sb->s_blocksize;
5098 return ret; 5044 return ret;
5099} 5045}
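
[Note: The largest mballoc.c change swaps the single release_blocks_on_commit() hook, which walked the transaction's t_private_list under s_md_lock, for per-entry jbd2 commit callbacks: each freed extent registers ext4_free_data_callback() on the running transaction via ext4_journal_callback_add(), and merging two extents now deletes one callback entry instead of manipulating the shared list. A userspace model of that callback pattern follows; cb_entry, txn_commit and the other names are illustrative, not the jbd2 API.]

    #include <stdio.h>

    struct cb_entry {
            struct cb_entry *next;
            void (*func)(struct cb_entry *jce, int rc);
    };

    struct transaction {
            struct cb_entry *head;  /* models the transaction callback list */
    };

    static void callback_add(struct transaction *t, struct cb_entry *jce,
                             void (*func)(struct cb_entry *, int))
    {
            jce->func = func;
            jce->next = t->head;
            t->head = jce;
    }

    static void txn_commit(struct transaction *t, int rc)
    {
            struct cb_entry *jce = t->head;

            while (jce) {
                    struct cb_entry *next = jce->next;
                    jce->func(jce, rc);     /* e.g. ext4_free_data_callback */
                    jce = next;
            }
            t->head = NULL;
    }

    static void free_extent_cb(struct cb_entry *jce, int rc)
    {
            printf("commit done (rc=%d): release pinned blocks\n", rc);
    }

    int main(void)
    {
            struct transaction t = { 0 };
            struct cb_entry entry;

            callback_add(&t, &entry, free_extent_cb);
            txn_commit(&t, 0);
            return 0;
    }

[The same hunk set also reworks ext4_trim_fs() to treat range->start + range->len as an inclusive end clamped to the block count, which is why the running len bookkeeping disappears and range->len is set once at the out label.]
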
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3285e3..c070618c21ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
96 96
97 97
98struct ext4_free_data { 98struct ext4_free_data {
99 /* this links the free block information from group_info */ 99 /* MUST be the first member */
100 struct rb_node node; 100 struct ext4_journal_cb_entry efd_jce;
101
102 /* ext4_free_data private data starts from here */
101 103
102 /* this links the free block information from ext4_sb_info */ 104 /* this links the free block information from group_info */
103 struct list_head list; 105 struct rb_node efd_node;
104 106
105 /* group which free block extent belongs */ 107 /* group which free block extent belongs */
106 ext4_group_t group; 108 ext4_group_t efd_group;
107 109
108 /* free block extent */ 110 /* free block extent */
109 ext4_grpblk_t start_cluster; 111 ext4_grpblk_t efd_start_cluster;
110 ext4_grpblk_t count; 112 ext4_grpblk_t efd_count;
111 113
112 /* transaction which freed this extent */ 114 /* transaction which freed this extent */
113 tid_t t_tid; 115 tid_t efd_tid;
114}; 116};
115 117
116struct ext4_prealloc_space { 118struct ext4_prealloc_space {
@@ -210,8 +212,6 @@ struct ext4_buddy {
210 __u16 bd_blkbits; 212 __u16 bd_blkbits;
211 ext4_group_t bd_group; 213 ext4_group_t bd_group;
212}; 214};
213#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
214#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
215 215
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
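
[Note: The "MUST be the first member" comment on efd_jce is load-bearing: the journal layer hands the callback nothing but the embedded ext4_journal_cb_entry, and ext4_free_data_callback() casts that pointer straight back to the containing ext4_free_data. Keeping the entry at offset zero makes the cast equivalent to container_of(). A small self-contained illustration, with simplified struct and function names:]

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    struct journal_cb_entry { int dummy; };

    struct free_data {
            struct journal_cb_entry efd_jce;   /* MUST be the first member */
            unsigned int efd_group;
            unsigned int efd_start_cluster;
            unsigned int efd_count;
    };

    static void free_data_callback(struct journal_cb_entry *jce)
    {
            /* valid only because efd_jce sits at offset 0 */
            struct free_data *entry = (struct free_data *)jce;

            printf("freeing extent in group %u\n", entry->efd_group);
    }

    int main(void)
    {
            struct free_data fd = { .efd_group = 7 };

            assert(offsetof(struct free_data, efd_jce) == 0);
            free_data_callback(&fd.efd_jce);
            return 0;
    }
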
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0acfa6..f39f80f8f2c5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
472 S_IFREG, NULL, goal, owner); 472 S_IFREG, NULL, goal, owner);
473 if (IS_ERR(tmp_inode)) { 473 if (IS_ERR(tmp_inode)) {
474 retval = PTR_ERR(inode); 474 retval = PTR_ERR(tmp_inode);
475 ext4_journal_stop(handle); 475 ext4_journal_stop(handle);
476 return retval; 476 return retval;
477 } 477 }
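
[Note: The migrate.c fix is a classic ERR_PTR slip: on failure the errno was read out of inode, a valid pointer, instead of tmp_inode, which actually encodes the error. A tiny model of the kernel's pointer-encoded-errno convention, with simplified constants:]

    #include <stdio.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long error) { return (void *)error; }
    static long PTR_ERR(const void *ptr) { return (long)ptr; }
    static int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
            void *tmp_inode = ERR_PTR(-12 /* ENOMEM */);

            if (IS_ERR(tmp_inode))
                    printf("retval = %ld\n", PTR_ERR(tmp_inode));
            return 0;
    }
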
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4eff2a..ed6548d89165 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,
257 * If check_interval in MMP block is larger, use that instead of 257 * If check_interval in MMP block is larger, use that instead of
258 * update_interval from the superblock. 258 * update_interval from the superblock.
259 */ 259 */
260 if (mmp->mmp_check_interval > mmp_check_interval) 260 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
261 mmp_check_interval = mmp->mmp_check_interval; 261 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
262 262
263 seq = le32_to_cpu(mmp->mmp_seq); 263 seq = le32_to_cpu(mmp->mmp_seq);
264 if (seq == EXT4_MMP_SEQ_CLEAN) 264 if (seq == EXT4_MMP_SEQ_CLEAN)
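
[Note: The mmp.c fix matters only on big-endian hosts: mmp_check_interval is stored as a little-endian 16-bit field, so comparing or assigning the raw value without le16_to_cpu() reads the bytes in the wrong order. A portable sketch of the decode, with the buggy raw reuse shown alongside; the field layout and values are illustrative.]

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Decode a 16-bit little-endian on-disk field regardless of host order. */
    static uint16_t le16_to_cpu(const uint8_t raw[2])
    {
            return (uint16_t)(raw[0] | ((uint16_t)raw[1] << 8));
    }

    int main(void)
    {
            uint8_t raw[2] = { 0x2c, 0x01 };   /* 300, stored little-endian */
            uint16_t host;

            memcpy(&host, raw, 2);             /* the buggy pattern: raw reuse */
            printf("decoded: %u, raw-as-host: %u\n", le16_to_cpu(raw), host);
            return 0;                          /* differs on big-endian CPUs */
    }
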
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f482375d..349d7b3671c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@ fail2:
468fail: 468fail:
469 if (*err == ERR_BAD_DX_DIR) 469 if (*err == ERR_BAD_DX_DIR)
470 ext4_warning(dir->i_sb, 470 ext4_warning(dir->i_sb,
471 "Corrupt dir inode %ld, running e2fsck is " 471 "Corrupt dir inode %lu, running e2fsck is "
472 "recommended.", dir->i_ino); 472 "recommended.", dir->i_ino);
473 return NULL; 473 return NULL;
474} 474}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 475851896518..74cd1f7f1f88 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode)
60static void put_io_page(struct ext4_io_page *io_page) 60static void put_io_page(struct ext4_io_page *io_page)
61{ 61{
62 if (atomic_dec_and_test(&io_page->p_count)) { 62 if (atomic_dec_and_test(&io_page->p_count)) {
63 end_page_writeback(io_page->p_page);
64 put_page(io_page->p_page); 63 put_page(io_page->p_page);
65 kmem_cache_free(io_page_cachep, io_page); 64 kmem_cache_free(io_page_cachep, io_page);
66 } 65 }
@@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
110 if (io->iocb) 109 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 110 aio_complete(io->iocb, io->result, 0);
112 111
112 if (io->flag & EXT4_IO_END_DIRECT)
113 inode_dio_done(inode);
113 /* Wake up anyone waiting on unwritten extent conversion */ 114 /* Wake up anyone waiting on unwritten extent conversion */
114 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
115 wake_up_all(ext4_ioend_wq(io->inode)); 116 wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work)
127 unsigned long flags; 128 unsigned long flags;
128 129
129 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (io->flag & EXT4_IO_END_IN_FSYNC)
132 goto requeue;
130 if (list_empty(&io->list)) { 133 if (list_empty(&io->list)) {
131 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 134 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
132 goto free; 135 goto free;
133 } 136 }
134 137
135 if (!mutex_trylock(&inode->i_mutex)) { 138 if (!mutex_trylock(&inode->i_mutex)) {
139 bool was_queued;
140requeue:
141 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
142 io->flag |= EXT4_IO_END_QUEUED;
136 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 143 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
137 /* 144 /*
138 * Requeue the work instead of waiting so that the work 145 * Requeue the work instead of waiting so that the work
@@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work)
145 * yield the cpu if it sees an end_io request that has already 152 * yield the cpu if it sees an end_io request that has already
146 * been requeued. 153 * been requeued.
147 */ 154 */
148 if (io->flag & EXT4_IO_END_QUEUED) 155 if (was_queued)
149 yield(); 156 yield();
150 io->flag |= EXT4_IO_END_QUEUED;
151 return; 157 return;
152 } 158 }
153 list_del_init(&io->list); 159 list_del_init(&io->list);
@@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error)
227 } while (bh != head); 233 } while (bh != head);
228 } 234 }
229 235
230 put_io_page(io_end->pages[i]); 236 if (atomic_read(&io_end->pages[i]->p_count) == 1)
237 end_page_writeback(io_end->pages[i]->p_page);
231 } 238 }
232 io_end->num_io_pages = 0;
233 inode = io_end->inode; 239 inode = io_end->inode;
234 240
235 if (error) { 241 if (error) {
@@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 427 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 428 * wedging later on.
423 */ 429 */
430 if (atomic_read(&io_page->p_count) == 1)
431 end_page_writeback(page);
424 put_io_page(io_page); 432 put_io_page(io_page);
425 return ret; 433 return ret;
426} 434}
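
[Note: The page-io.c hunks decouple end_page_writeback() from the refcount release: the writeback bit is now cleared by whichever path observes the last remaining reference (p_count == 1), while put_io_page() only releases, and an io_end that arrives during fsync is requeued instead of racing it. A loose single-threaded model of the first idea; the struct layout and call sites are simplified.]

    #include <stdio.h>

    struct io_page {
            int p_count;            /* models atomic_t; single-threaded here */
            int writeback;
    };

    static void put_io_page(struct io_page *p)
    {
            if (--p->p_count == 0)
                    printf("page released\n");  /* no longer ends writeback */
    }

    static void end_bio(struct io_page *p)
    {
            /* new scheme: the last holder ends writeback before dropping */
            if (p->p_count == 1) {
                    p->writeback = 0;
                    printf("writeback ended\n");
            }
            put_io_page(p);
    }

    int main(void)
    {
            struct io_page page = { .p_count = 2, .writeback = 1 };

            put_io_page(&page);     /* submission path drops its reference */
            end_bio(&page);         /* I/O completion ends writeback, frees */
            return 0;
    }
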
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f0eb86..59fa0be27251 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,
1163 do_div(reserved_blocks, 100); 1163 do_div(reserved_blocks, 100);
1164 1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); 1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1167 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count); 1168 flex_gd->count);
1169 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1170 flex_gd->count);
1168 1171
1169 /* 1172 /*
1170 * We need to protect s_groups_count against other CPUs seeing 1173 * We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
1465 } 1468 }
1466 1469
1467 ext4_blocks_count_set(es, o_blocks_count + add); 1470 ext4_blocks_count_set(es, o_blocks_count + add);
1471 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1472 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1469 o_blocks_count + add); 1473 o_blocks_count + add);
1470 /* We add the blocks to the bitmap and set the group need init bit */ 1474 /* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1512 o_blocks_count = ext4_blocks_count(es); 1516 o_blocks_count = ext4_blocks_count(es);
1513 1517
1514 if (test_opt(sb, DEBUG)) 1518 if (test_opt(sb, DEBUG))
1515 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 1519 ext4_msg(sb, KERN_DEBUG,
1516 o_blocks_count, n_blocks_count); 1520 "extending last group from %llu to %llu blocks",
1521 o_blocks_count, n_blocks_count);
1517 1522
1518 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1523 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1519 return 0; 1524 return 0;
1520 1525
1521 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1526 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1522 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 1527 ext4_msg(sb, KERN_ERR,
1523 " too large to resize to %llu blocks safely\n", 1528 "filesystem too large to resize to %llu blocks safely",
1524 sb->s_id, n_blocks_count); 1529 n_blocks_count);
1525 if (sizeof(sector_t) < 8) 1530 if (sizeof(sector_t) < 8)
1526 ext4_warning(sb, "CONFIG_LBDAF not enabled"); 1531 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1527 return -EINVAL; 1532 return -EINVAL;
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1582 ext4_fsblk_t o_blocks_count; 1587 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group; 1588 ext4_group_t o_group;
1584 ext4_group_t n_group; 1589 ext4_group_t n_group;
1585 ext4_grpblk_t offset; 1590 ext4_grpblk_t offset, add;
1586 unsigned long n_desc_blocks; 1591 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks; 1592 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks; 1593 unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1591 o_blocks_count = ext4_blocks_count(es); 1596 o_blocks_count = ext4_blocks_count(es);
1592 1597
1593 if (test_opt(sb, DEBUG)) 1598 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " 1599 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1600 "to %llu blocks", o_blocks_count, n_blocks_count);
1596 1601
1597 if (n_blocks_count < o_blocks_count) { 1602 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */ 1603 /* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1605 return 0; 1610 return 0;
1606 1611
1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1612 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); 1613 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1609 1614
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1615 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb); 1616 EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1634 } 1639 }
1635 brelse(bh); 1640 brelse(bh);
1636 1641
1637 if (offset != 0) { 1642 /* extend the last group */
1638 /* extend the last group */ 1643 if (n_group == o_group)
1639 ext4_grpblk_t add; 1644 add = n_blocks_count - o_blocks_count;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset; 1645 else
1646 add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
1647 if (add > 0) {
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add); 1648 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err) 1649 if (err)
1643 goto out; 1650 goto out;
@@ -1674,7 +1681,7 @@ out:
1674 1681
1675 iput(resize_inode); 1682 iput(resize_inode);
1676 if (test_opt(sb, DEBUG)) 1683 if (test_opt(sb, DEBUG))
1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " 1684 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1678 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1685 "upto %llu blocks", o_blocks_count, n_blocks_count);
1679 return err; 1686 return err;
1680} 1687}
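
The rewritten tail-group logic in ext4_resize_fs() fixes over-extension: the old code grew the last group to its end whenever offset was non-zero, even if the requested new size fell inside that same group. A worked example with invented sizes, 32768 blocks per group and o_blocks_count = 40000, so the last existing block (39999) sits in group 1 at offset 7231:

	if (n_group == o_group)		/* e.g. n_blocks_count = 45000   */
		add = n_blocks_count - o_blocks_count;	/* exactly 5000  */
	else				/* e.g. n_blocks_count = 200000  */
		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);	/* 25536 */
	if (add > 0)			/* 0 once the old group is full  */
		err = ext4_group_extend_no_check(sb, o_blocks_count, add);

Note that offset is now derived from o_blocks_count - 1 (the last existing block), which is why the room left in the group is BLOCKS_PER_GROUP - (offset + 1).
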
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 933900909ed0..ceebaf853beb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;
62 62
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 63static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 64 unsigned long journal_devnum);
65static int ext4_show_options(struct seq_file *seq, struct dentry *root);
65static int ext4_commit_super(struct super_block *sb, int sync); 66static int ext4_commit_super(struct super_block *sb, int sync);
66static void ext4_mark_recovery_complete(struct super_block *sb, 67static void ext4_mark_recovery_complete(struct super_block *sb,
67 struct ext4_super_block *es); 68 struct ext4_super_block *es);
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
375 if (is_handle_aborted(handle)) 376 if (is_handle_aborted(handle))
376 return; 377 return;
377 378
378 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", 379 printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
379 caller, line, errstr, err_fn); 380 caller, line, errstr, err_fn);
380 381
381 jbd2_journal_abort_handle(handle); 382 jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)
431 return bdi->dev == NULL; 432 return bdi->dev == NULL;
432} 433}
433 434
435static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
436{
437 struct super_block *sb = journal->j_private;
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 int error = is_journal_aborted(journal);
440 struct ext4_journal_cb_entry *jce, *tmp;
441
442 spin_lock(&sbi->s_md_lock);
443 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
444 list_del_init(&jce->jce_list);
445 spin_unlock(&sbi->s_md_lock);
446 jce->jce_func(sb, jce, error);
447 spin_lock(&sbi->s_md_lock);
448 }
449 spin_unlock(&sbi->s_md_lock);
450}
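
ext4_journal_commit_callback() is the consumer half of a new per-transaction callback list. Note the lock dance: each entry is unlinked with list_del_init() and s_md_lock is dropped before jce_func runs, so a callback may sleep or take other locks without deadlocking the walk. The producer half lives in ext4_jbd2.h in this series; a sketch of its shape (treat the exact name and signature here as illustrative, not as the committed API):

	static inline void ext4_journal_callback_add(handle_t *handle,
			void (*func)(struct super_block *sb,
				     struct ext4_journal_cb_entry *jce,
				     int rc),
			struct ext4_journal_cb_entry *jce)
	{
		struct ext4_sb_info *sbi =
			EXT4_SB(handle->h_transaction->t_journal->j_private);

		/* same s_md_lock that the commit-time walker takes */
		spin_lock(&sbi->s_md_lock);
		jce->jce_func = func;
		list_add_tail(&jce->jce_list,
			      &handle->h_transaction->t_private_list);
		spin_unlock(&sbi->s_md_lock);
	}
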
434 451
435/* Deal with the reporting of failure conditions on a filesystem such as 452/* Deal with the reporting of failure conditions on a filesystem such as
436 * inconsistencies detected or read IO failures. 453 * inconsistencies detected or read IO failures.
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,
498 va_start(args, fmt); 515 va_start(args, fmt);
499 vaf.fmt = fmt; 516 vaf.fmt = fmt;
500 vaf.va = &args; 517 vaf.va = &args;
501 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
502 inode->i_sb->s_id, function, line, inode->i_ino);
503 if (block) 518 if (block)
504 printk(KERN_CONT "block %llu: ", block); 519 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
505 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); 520 "inode #%lu: block %llu: comm %s: %pV\n",
521 inode->i_sb->s_id, function, line, inode->i_ino,
522 block, current->comm, &vaf);
523 else
524 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
525 "inode #%lu: comm %s: %pV\n",
526 inode->i_sb->s_id, function, line, inode->i_ino,
527 current->comm, &vaf);
506 va_end(args); 528 va_end(args);
507 529
508 ext4_handle_error(inode->i_sb); 530 ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,
524 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 546 path = d_path(&(file->f_path), pathname, sizeof(pathname));
525 if (IS_ERR(path)) 547 if (IS_ERR(path))
526 path = "(unknown)"; 548 path = "(unknown)";
527 printk(KERN_CRIT
528 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
529 inode->i_sb->s_id, function, line, inode->i_ino);
530 if (block)
531 printk(KERN_CONT "block %llu: ", block);
532 va_start(args, fmt); 549 va_start(args, fmt);
533 vaf.fmt = fmt; 550 vaf.fmt = fmt;
534 vaf.va = &args; 551 vaf.va = &args;
535 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); 552 if (block)
553 printk(KERN_CRIT
554 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
555 "block %llu: comm %s: path %s: %pV\n",
556 inode->i_sb->s_id, function, line, inode->i_ino,
557 block, current->comm, path, &vaf);
558 else
559 printk(KERN_CRIT
560 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
561 "comm %s: path %s: %pV\n",
562 inode->i_sb->s_id, function, line, inode->i_ino,
563 current->comm, path, &vaf);
536 va_end(args); 564 va_end(args);
537 565
538 ext4_handle_error(inode->i_sb); 566 ext4_handle_error(inode->i_sb);
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)
808 destroy_workqueue(sbi->dio_unwritten_wq); 836 destroy_workqueue(sbi->dio_unwritten_wq);
809 837
810 lock_super(sb); 838 lock_super(sb);
811 if (sb->s_dirt)
812 ext4_commit_super(sb, 1);
813
814 if (sbi->s_journal) { 839 if (sbi->s_journal) {
815 err = jbd2_journal_destroy(sbi->s_journal); 840 err = jbd2_journal_destroy(sbi->s_journal);
816 sbi->s_journal = NULL; 841 sbi->s_journal = NULL;
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)
827 if (!(sb->s_flags & MS_RDONLY)) { 852 if (!(sb->s_flags & MS_RDONLY)) {
828 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 853 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
829 es->s_state = cpu_to_le16(sbi->s_mount_state); 854 es->s_state = cpu_to_le16(sbi->s_mount_state);
830 ext4_commit_super(sb, 1);
831 } 855 }
856 if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
857 ext4_commit_super(sb, 1);
858
832 if (sbi->s_proc) { 859 if (sbi->s_proc) {
860 remove_proc_entry("options", sbi->s_proc);
833 remove_proc_entry(sb->s_id, ext4_proc_root); 861 remove_proc_entry(sb->s_id, ext4_proc_root);
834 } 862 }
835 kobject_del(&sbi->s_kobj); 863 kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)
990 } 1018 }
991} 1019}
992 1020
993static inline void ext4_show_quota_options(struct seq_file *seq,
994 struct super_block *sb)
995{
996#if defined(CONFIG_QUOTA)
997 struct ext4_sb_info *sbi = EXT4_SB(sb);
998
999 if (sbi->s_jquota_fmt) {
1000 char *fmtname = "";
1001
1002 switch (sbi->s_jquota_fmt) {
1003 case QFMT_VFS_OLD:
1004 fmtname = "vfsold";
1005 break;
1006 case QFMT_VFS_V0:
1007 fmtname = "vfsv0";
1008 break;
1009 case QFMT_VFS_V1:
1010 fmtname = "vfsv1";
1011 break;
1012 }
1013 seq_printf(seq, ",jqfmt=%s", fmtname);
1014 }
1015
1016 if (sbi->s_qf_names[USRQUOTA])
1017 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1018
1019 if (sbi->s_qf_names[GRPQUOTA])
1020 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1021
1022 if (test_opt(sb, USRQUOTA))
1023 seq_puts(seq, ",usrquota");
1024
1025 if (test_opt(sb, GRPQUOTA))
1026 seq_puts(seq, ",grpquota");
1027#endif
1028}
1029
1030/*
1031 * Show an option if
1032 * - it's set to a non-default value OR
1033 * - if the per-sb default is different from the global default
1034 */
1035static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1036{
1037 int def_errors;
1038 unsigned long def_mount_opts;
1039 struct super_block *sb = root->d_sb;
1040 struct ext4_sb_info *sbi = EXT4_SB(sb);
1041 struct ext4_super_block *es = sbi->s_es;
1042
1043 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1044 def_errors = le16_to_cpu(es->s_errors);
1045
1046 if (sbi->s_sb_block != 1)
1047 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
1048 if (test_opt(sb, MINIX_DF))
1049 seq_puts(seq, ",minixdf");
1050 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
1051 seq_puts(seq, ",grpid");
1052 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
1053 seq_puts(seq, ",nogrpid");
1054 if (sbi->s_resuid != EXT4_DEF_RESUID ||
1055 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
1056 seq_printf(seq, ",resuid=%u", sbi->s_resuid);
1057 }
1058 if (sbi->s_resgid != EXT4_DEF_RESGID ||
1059 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
1060 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
1061 }
1062 if (test_opt(sb, ERRORS_RO)) {
1063 if (def_errors == EXT4_ERRORS_PANIC ||
1064 def_errors == EXT4_ERRORS_CONTINUE) {
1065 seq_puts(seq, ",errors=remount-ro");
1066 }
1067 }
1068 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1069 seq_puts(seq, ",errors=continue");
1070 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1071 seq_puts(seq, ",errors=panic");
1072 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
1073 seq_puts(seq, ",nouid32");
1074 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1075 seq_puts(seq, ",debug");
1076#ifdef CONFIG_EXT4_FS_XATTR
1077 if (test_opt(sb, XATTR_USER))
1078 seq_puts(seq, ",user_xattr");
1079 if (!test_opt(sb, XATTR_USER))
1080 seq_puts(seq, ",nouser_xattr");
1081#endif
1082#ifdef CONFIG_EXT4_FS_POSIX_ACL
1083 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
1084 seq_puts(seq, ",acl");
1085 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
1086 seq_puts(seq, ",noacl");
1087#endif
1088 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
1089 seq_printf(seq, ",commit=%u",
1090 (unsigned) (sbi->s_commit_interval / HZ));
1091 }
1092 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
1093 seq_printf(seq, ",min_batch_time=%u",
1094 (unsigned) sbi->s_min_batch_time);
1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_max_batch_time);
1099 }
1100
1101 /*
1102 * We're changing the default of barrier mount option, so
1103 * let's always display its mount state so it's clear what its
1104 * status is.
1105 */
1106 seq_puts(seq, ",barrier=");
1107 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
1108 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
1109 seq_puts(seq, ",journal_async_commit");
1110 else if (test_opt(sb, JOURNAL_CHECKSUM))
1111 seq_puts(seq, ",journal_checksum");
1112 if (test_opt(sb, I_VERSION))
1113 seq_puts(seq, ",i_version");
1114 if (!test_opt(sb, DELALLOC) &&
1115 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1116 seq_puts(seq, ",nodelalloc");
1117
1118 if (!test_opt(sb, MBLK_IO_SUBMIT))
1119 seq_puts(seq, ",nomblk_io_submit");
1120 if (sbi->s_stripe)
1121 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1122 /*
1123 * journal mode get enabled in different ways
1124 * So just print the value even if we didn't specify it
1125 */
1126 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1127 seq_puts(seq, ",data=journal");
1128 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1129 seq_puts(seq, ",data=ordered");
1130 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1131 seq_puts(seq, ",data=writeback");
1132
1133 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1134 seq_printf(seq, ",inode_readahead_blks=%u",
1135 sbi->s_inode_readahead_blks);
1136
1137 if (test_opt(sb, DATA_ERR_ABORT))
1138 seq_puts(seq, ",data_err=abort");
1139
1140 if (test_opt(sb, NO_AUTO_DA_ALLOC))
1141 seq_puts(seq, ",noauto_da_alloc");
1142
1143 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
1144 seq_puts(seq, ",discard");
1145
1146 if (test_opt(sb, NOLOAD))
1147 seq_puts(seq, ",norecovery");
1148
1149 if (test_opt(sb, DIOREAD_NOLOCK))
1150 seq_puts(seq, ",dioread_nolock");
1151
1152 if (test_opt(sb, BLOCK_VALIDITY) &&
1153 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1154 seq_puts(seq, ",block_validity");
1155
1156 if (!test_opt(sb, INIT_INODE_TABLE))
1157 seq_puts(seq, ",noinit_itable");
1158 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1159 seq_printf(seq, ",init_itable=%u",
1160 (unsigned) sbi->s_li_wait_mult);
1161
1162 ext4_show_quota_options(seq, sb);
1163
1164 return 0;
1165}
1166
1167static struct inode *ext4_nfs_get_inode(struct super_block *sb, 1021static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1168 u64 ino, u32 generation) 1022 u64 ino, u32 generation)
1169{ 1023{
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {
1316enum { 1170enum {
1317 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 1171 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
1318 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1172 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1319 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1173 Opt_nouid32, Opt_debug, Opt_removed,
1320 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1174 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1321 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, 1175 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1322 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1176 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1323 Opt_journal_update, Opt_journal_dev, 1177 Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
1324 Opt_journal_checksum, Opt_journal_async_commit,
1325 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1178 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1326 Opt_data_err_abort, Opt_data_err_ignore, 1179 Opt_data_err_abort, Opt_data_err_ignore,
1327 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1180 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1328 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1181 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1329 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1182 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1330 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1183 Opt_usrquota, Opt_grpquota, Opt_i_version,
1331 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1184 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1332 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1185 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1333 Opt_inode_readahead_blks, Opt_journal_ioprio, 1186 Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = {
1350 {Opt_err_ro, "errors=remount-ro"}, 1203 {Opt_err_ro, "errors=remount-ro"},
1351 {Opt_nouid32, "nouid32"}, 1204 {Opt_nouid32, "nouid32"},
1352 {Opt_debug, "debug"}, 1205 {Opt_debug, "debug"},
1353 {Opt_oldalloc, "oldalloc"}, 1206 {Opt_removed, "oldalloc"},
1354 {Opt_orlov, "orlov"}, 1207 {Opt_removed, "orlov"},
1355 {Opt_user_xattr, "user_xattr"}, 1208 {Opt_user_xattr, "user_xattr"},
1356 {Opt_nouser_xattr, "nouser_xattr"}, 1209 {Opt_nouser_xattr, "nouser_xattr"},
1357 {Opt_acl, "acl"}, 1210 {Opt_acl, "acl"},
1358 {Opt_noacl, "noacl"}, 1211 {Opt_noacl, "noacl"},
1359 {Opt_noload, "noload"},
1360 {Opt_noload, "norecovery"}, 1212 {Opt_noload, "norecovery"},
1361 {Opt_nobh, "nobh"}, 1213 {Opt_noload, "noload"},
1362 {Opt_bh, "bh"}, 1214 {Opt_removed, "nobh"},
1215 {Opt_removed, "bh"},
1363 {Opt_commit, "commit=%u"}, 1216 {Opt_commit, "commit=%u"},
1364 {Opt_min_batch_time, "min_batch_time=%u"}, 1217 {Opt_min_batch_time, "min_batch_time=%u"},
1365 {Opt_max_batch_time, "max_batch_time=%u"}, 1218 {Opt_max_batch_time, "max_batch_time=%u"},
1366 {Opt_journal_update, "journal=update"},
1367 {Opt_journal_dev, "journal_dev=%u"}, 1219 {Opt_journal_dev, "journal_dev=%u"},
1368 {Opt_journal_checksum, "journal_checksum"}, 1220 {Opt_journal_checksum, "journal_checksum"},
1369 {Opt_journal_async_commit, "journal_async_commit"}, 1221 {Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = {
1389 {Opt_nobarrier, "nobarrier"}, 1241 {Opt_nobarrier, "nobarrier"},
1390 {Opt_i_version, "i_version"}, 1242 {Opt_i_version, "i_version"},
1391 {Opt_stripe, "stripe=%u"}, 1243 {Opt_stripe, "stripe=%u"},
1392 {Opt_resize, "resize"},
1393 {Opt_delalloc, "delalloc"}, 1244 {Opt_delalloc, "delalloc"},
1394 {Opt_nodelalloc, "nodelalloc"}, 1245 {Opt_nodelalloc, "nodelalloc"},
1395 {Opt_mblk_io_submit, "mblk_io_submit"}, 1246 {Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = {
1408 {Opt_init_itable, "init_itable=%u"}, 1259 {Opt_init_itable, "init_itable=%u"},
1409 {Opt_init_itable, "init_itable"}, 1260 {Opt_init_itable, "init_itable"},
1410 {Opt_noinit_itable, "noinit_itable"}, 1261 {Opt_noinit_itable, "noinit_itable"},
1262 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1263 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1264 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
1265 {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1266 {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
1411 {Opt_err, NULL}, 1267 {Opt_err, NULL},
1412}; 1268};
1413 1269
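
Worth noting in the reworked tokens[] table: retired options ("oldalloc", "nobh", and the ext2/3-era "check=none" family) are not deleted but remapped to Opt_removed, so a stale fstab still mounts and the user gets a single warning instead of a failed mount. Retiring a future option needs only a table line (the option name below is made up for illustration):

	{Opt_removed, "frobnicate=%u"},	/* hypothetical retired option */

handle_mount_opt() then answers every Opt_removed uniformly with "Ignoring removed %s option", so no per-option code is required.
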
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1496} 1352}
1497#endif 1353#endif
1498 1354
1499static int parse_options(char *options, struct super_block *sb, 1355#define MOPT_SET 0x0001
1500 unsigned long *journal_devnum, 1356#define MOPT_CLEAR 0x0002
1501 unsigned int *journal_ioprio, 1357#define MOPT_NOSUPPORT 0x0004
1502 ext4_fsblk_t *n_blocks_count, int is_remount) 1358#define MOPT_EXPLICIT 0x0008
1503{ 1359#define MOPT_CLEAR_ERR 0x0010
1504 struct ext4_sb_info *sbi = EXT4_SB(sb); 1360#define MOPT_GTE0 0x0020
1505 char *p;
1506 substring_t args[MAX_OPT_ARGS];
1507 int data_opt = 0;
1508 int option;
1509#ifdef CONFIG_QUOTA 1361#ifdef CONFIG_QUOTA
1510 int qfmt; 1362#define MOPT_Q 0
1363#define MOPT_QFMT 0x0040
1364#else
1365#define MOPT_Q MOPT_NOSUPPORT
1366#define MOPT_QFMT MOPT_NOSUPPORT
1511#endif 1367#endif
1512 1368#define MOPT_DATAJ 0x0080
1513 if (!options) 1369
1514 return 1; 1370static const struct mount_opts {
1515 1371 int token;
1516 while ((p = strsep(&options, ",")) != NULL) { 1372 int mount_opt;
1517 int token; 1373 int flags;
1518 if (!*p) 1374} ext4_mount_opts[] = {
1519 continue; 1375 {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1520 1376 {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1521 /* 1377 {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1522 * Initialize args struct so we know whether arg was 1378 {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1523 * found; some options take optional arguments. 1379 {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
1524 */ 1380 {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
1525 args[0].to = args[0].from = NULL; 1381 {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1526 token = match_token(p, tokens, args); 1382 {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1527 switch (token) { 1383 {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
1528 case Opt_bsd_df: 1384 {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
1529 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1385 {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1530 clear_opt(sb, MINIX_DF); 1386 {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1531 break; 1387 {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
1532 case Opt_minix_df: 1388 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
1533 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1389 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
1534 set_opt(sb, MINIX_DF); 1390 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1535 1391 EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
1536 break; 1392 {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
1537 case Opt_grpid: 1393 {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1538 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1394 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1539 set_opt(sb, GRPID); 1395 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1540 1396 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
1541 break; 1397 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
1542 case Opt_nogrpid: 1398 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1543 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1399 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1544 clear_opt(sb, GRPID); 1400 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1545 1401 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1546 break; 1402 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1547 case Opt_resuid: 1403 {Opt_commit, 0, MOPT_GTE0},
1548 if (match_int(&args[0], &option)) 1404 {Opt_max_batch_time, 0, MOPT_GTE0},
1549 return 0; 1405 {Opt_min_batch_time, 0, MOPT_GTE0},
1550 sbi->s_resuid = option; 1406 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1551 break; 1407 {Opt_init_itable, 0, MOPT_GTE0},
1552 case Opt_resgid: 1408 {Opt_stripe, 0, MOPT_GTE0},
1553 if (match_int(&args[0], &option)) 1409 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1554 return 0; 1410 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1555 sbi->s_resgid = option; 1411 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1556 break;
1557 case Opt_sb:
1558 /* handled by get_sb_block() instead of here */
1559 /* *sb_block = match_int(&args[0]); */
1560 break;
1561 case Opt_err_panic:
1562 clear_opt(sb, ERRORS_CONT);
1563 clear_opt(sb, ERRORS_RO);
1564 set_opt(sb, ERRORS_PANIC);
1565 break;
1566 case Opt_err_ro:
1567 clear_opt(sb, ERRORS_CONT);
1568 clear_opt(sb, ERRORS_PANIC);
1569 set_opt(sb, ERRORS_RO);
1570 break;
1571 case Opt_err_cont:
1572 clear_opt(sb, ERRORS_RO);
1573 clear_opt(sb, ERRORS_PANIC);
1574 set_opt(sb, ERRORS_CONT);
1575 break;
1576 case Opt_nouid32:
1577 set_opt(sb, NO_UID32);
1578 break;
1579 case Opt_debug:
1580 set_opt(sb, DEBUG);
1581 break;
1582 case Opt_oldalloc:
1583 ext4_msg(sb, KERN_WARNING,
1584 "Ignoring deprecated oldalloc option");
1585 break;
1586 case Opt_orlov:
1587 ext4_msg(sb, KERN_WARNING,
1588 "Ignoring deprecated orlov option");
1589 break;
1590#ifdef CONFIG_EXT4_FS_XATTR 1412#ifdef CONFIG_EXT4_FS_XATTR
1591 case Opt_user_xattr: 1413 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1592 set_opt(sb, XATTR_USER); 1414 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1593 break;
1594 case Opt_nouser_xattr:
1595 clear_opt(sb, XATTR_USER);
1596 break;
1597#else 1415#else
1598 case Opt_user_xattr: 1416 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1599 case Opt_nouser_xattr: 1417 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1600 ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
1601 break;
1602#endif 1418#endif
1603#ifdef CONFIG_EXT4_FS_POSIX_ACL 1419#ifdef CONFIG_EXT4_FS_POSIX_ACL
1604 case Opt_acl: 1420 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1605 set_opt(sb, POSIX_ACL); 1421 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1606 break;
1607 case Opt_noacl:
1608 clear_opt(sb, POSIX_ACL);
1609 break;
1610#else 1422#else
1611 case Opt_acl: 1423 {Opt_acl, 0, MOPT_NOSUPPORT},
1612 case Opt_noacl: 1424 {Opt_noacl, 0, MOPT_NOSUPPORT},
1613 ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
1614 break;
1615#endif 1425#endif
1616 case Opt_journal_update: 1426 {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1617 /* @@@ FIXME */ 1427 {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1618 /* Eventually we will want to be able to create 1428 {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1619 a journal file here. For now, only allow the 1429 {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1620 user to specify an existing inode to be the 1430 MOPT_SET | MOPT_Q},
1621 journal file. */ 1431 {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1622 if (is_remount) { 1432 MOPT_SET | MOPT_Q},
1623 ext4_msg(sb, KERN_ERR, 1433 {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1624 "Cannot specify journal on remount"); 1434 EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
1625 return 0; 1435 {Opt_usrjquota, 0, MOPT_Q},
1626 } 1436 {Opt_grpjquota, 0, MOPT_Q},
1627 set_opt(sb, UPDATE_JOURNAL); 1437 {Opt_offusrjquota, 0, MOPT_Q},
1628 break; 1438 {Opt_offgrpjquota, 0, MOPT_Q},
1629 case Opt_journal_dev: 1439 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1630 if (is_remount) { 1440 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1441 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1442 {Opt_err, 0, 0}
1443};
1444
1445static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1446 substring_t *args, unsigned long *journal_devnum,
1447 unsigned int *journal_ioprio, int is_remount)
1448{
1449 struct ext4_sb_info *sbi = EXT4_SB(sb);
1450 const struct mount_opts *m;
1451 int arg = 0;
1452
1453 if (args->from && match_int(args, &arg))
1454 return -1;
1455 switch (token) {
1456 case Opt_noacl:
1457 case Opt_nouser_xattr:
1458 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1459 break;
1460 case Opt_sb:
1461 return 1; /* handled by get_sb_block() */
1462 case Opt_removed:
1463 ext4_msg(sb, KERN_WARNING,
1464 "Ignoring removed %s option", opt);
1465 return 1;
1466 case Opt_resuid:
1467 sbi->s_resuid = arg;
1468 return 1;
1469 case Opt_resgid:
1470 sbi->s_resgid = arg;
1471 return 1;
1472 case Opt_abort:
1473 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1474 return 1;
1475 case Opt_i_version:
1476 sb->s_flags |= MS_I_VERSION;
1477 return 1;
1478 case Opt_journal_dev:
1479 if (is_remount) {
1480 ext4_msg(sb, KERN_ERR,
1481 "Cannot specify journal on remount");
1482 return -1;
1483 }
1484 *journal_devnum = arg;
1485 return 1;
1486 case Opt_journal_ioprio:
1487 if (arg < 0 || arg > 7)
1488 return -1;
1489 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1490 return 1;
1491 }
1492
1493 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1494 if (token != m->token)
1495 continue;
1496 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1497 return -1;
1498 if (m->flags & MOPT_EXPLICIT)
1499 set_opt2(sb, EXPLICIT_DELALLOC);
1500 if (m->flags & MOPT_CLEAR_ERR)
1501 clear_opt(sb, ERRORS_MASK);
1502 if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1503 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1504 "options when quota turned on");
1505 return -1;
1506 }
1507
1508 if (m->flags & MOPT_NOSUPPORT) {
1509 ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1510 } else if (token == Opt_commit) {
1511 if (arg == 0)
1512 arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1513 sbi->s_commit_interval = HZ * arg;
1514 } else if (token == Opt_max_batch_time) {
1515 if (arg == 0)
1516 arg = EXT4_DEF_MAX_BATCH_TIME;
1517 sbi->s_max_batch_time = arg;
1518 } else if (token == Opt_min_batch_time) {
1519 sbi->s_min_batch_time = arg;
1520 } else if (token == Opt_inode_readahead_blks) {
1521 if (arg > (1 << 30))
1522 return -1;
1523 if (arg && !is_power_of_2(arg)) {
1631 ext4_msg(sb, KERN_ERR, 1524 ext4_msg(sb, KERN_ERR,
1632 "Cannot specify journal on remount"); 1525 "EXT4-fs: inode_readahead_blks"
1633 return 0; 1526 " must be a power of 2");
1527 return -1;
1634 } 1528 }
1635 if (match_int(&args[0], &option)) 1529 sbi->s_inode_readahead_blks = arg;
1636 return 0; 1530 } else if (token == Opt_init_itable) {
1637 *journal_devnum = option; 1531 set_opt(sb, INIT_INODE_TABLE);
1638 break; 1532 if (!args->from)
1639 case Opt_journal_checksum: 1533 arg = EXT4_DEF_LI_WAIT_MULT;
1640 set_opt(sb, JOURNAL_CHECKSUM); 1534 sbi->s_li_wait_mult = arg;
1641 break; 1535 } else if (token == Opt_stripe) {
1642 case Opt_journal_async_commit: 1536 sbi->s_stripe = arg;
1643 set_opt(sb, JOURNAL_ASYNC_COMMIT); 1537 } else if (m->flags & MOPT_DATAJ) {
1644 set_opt(sb, JOURNAL_CHECKSUM);
1645 break;
1646 case Opt_noload:
1647 set_opt(sb, NOLOAD);
1648 break;
1649 case Opt_commit:
1650 if (match_int(&args[0], &option))
1651 return 0;
1652 if (option < 0)
1653 return 0;
1654 if (option == 0)
1655 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1656 sbi->s_commit_interval = HZ * option;
1657 break;
1658 case Opt_max_batch_time:
1659 if (match_int(&args[0], &option))
1660 return 0;
1661 if (option < 0)
1662 return 0;
1663 if (option == 0)
1664 option = EXT4_DEF_MAX_BATCH_TIME;
1665 sbi->s_max_batch_time = option;
1666 break;
1667 case Opt_min_batch_time:
1668 if (match_int(&args[0], &option))
1669 return 0;
1670 if (option < 0)
1671 return 0;
1672 sbi->s_min_batch_time = option;
1673 break;
1674 case Opt_data_journal:
1675 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1676 goto datacheck;
1677 case Opt_data_ordered:
1678 data_opt = EXT4_MOUNT_ORDERED_DATA;
1679 goto datacheck;
1680 case Opt_data_writeback:
1681 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1682 datacheck:
1683 if (is_remount) { 1538 if (is_remount) {
1684 if (!sbi->s_journal) 1539 if (!sbi->s_journal)
1685 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); 1540 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1686 else if (test_opt(sb, DATA_FLAGS) != data_opt) { 1541 else if (test_opt(sb, DATA_FLAGS) !=
1542 m->mount_opt) {
1687 ext4_msg(sb, KERN_ERR, 1543 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1544 "Cannot change data mode on remount");
1689 return 0; 1545 return -1;
1690 } 1546 }
1691 } else { 1547 } else {
1692 clear_opt(sb, DATA_FLAGS); 1548 clear_opt(sb, DATA_FLAGS);
1693 sbi->s_mount_opt |= data_opt; 1549 sbi->s_mount_opt |= m->mount_opt;
1694 } 1550 }
1695 break;
1696 case Opt_data_err_abort:
1697 set_opt(sb, DATA_ERR_ABORT);
1698 break;
1699 case Opt_data_err_ignore:
1700 clear_opt(sb, DATA_ERR_ABORT);
1701 break;
1702#ifdef CONFIG_QUOTA 1551#ifdef CONFIG_QUOTA
1703 case Opt_usrjquota: 1552 } else if (token == Opt_usrjquota) {
1704 if (!set_qf_name(sb, USRQUOTA, &args[0])) 1553 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1705 return 0; 1554 return -1;
1706 break; 1555 } else if (token == Opt_grpjquota) {
1707 case Opt_grpjquota:
1708 if (!set_qf_name(sb, GRPQUOTA, &args[0])) 1556 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1709 return 0; 1557 return -1;
1710 break; 1558 } else if (token == Opt_offusrjquota) {
1711 case Opt_offusrjquota:
1712 if (!clear_qf_name(sb, USRQUOTA)) 1559 if (!clear_qf_name(sb, USRQUOTA))
1713 return 0; 1560 return -1;
1714 break; 1561 } else if (token == Opt_offgrpjquota) {
1715 case Opt_offgrpjquota:
1716 if (!clear_qf_name(sb, GRPQUOTA)) 1562 if (!clear_qf_name(sb, GRPQUOTA))
1717 return 0; 1563 return -1;
1718 break; 1564 } else if (m->flags & MOPT_QFMT) {
1719
1720 case Opt_jqfmt_vfsold:
1721 qfmt = QFMT_VFS_OLD;
1722 goto set_qf_format;
1723 case Opt_jqfmt_vfsv0:
1724 qfmt = QFMT_VFS_V0;
1725 goto set_qf_format;
1726 case Opt_jqfmt_vfsv1:
1727 qfmt = QFMT_VFS_V1;
1728set_qf_format:
1729 if (sb_any_quota_loaded(sb) && 1565 if (sb_any_quota_loaded(sb) &&
1730 sbi->s_jquota_fmt != qfmt) { 1566 sbi->s_jquota_fmt != m->mount_opt) {
1731 ext4_msg(sb, KERN_ERR, "Cannot change " 1567 ext4_msg(sb, KERN_ERR, "Cannot "
1732 "journaled quota options when " 1568 "change journaled quota options "
1733 "quota turned on"); 1569 "when quota turned on");
1734 return 0; 1570 return -1;
1735 }
1736 sbi->s_jquota_fmt = qfmt;
1737 break;
1738 case Opt_quota:
1739 case Opt_usrquota:
1740 set_opt(sb, QUOTA);
1741 set_opt(sb, USRQUOTA);
1742 break;
1743 case Opt_grpquota:
1744 set_opt(sb, QUOTA);
1745 set_opt(sb, GRPQUOTA);
1746 break;
1747 case Opt_noquota:
1748 if (sb_any_quota_loaded(sb)) {
1749 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1750 "options when quota turned on");
1751 return 0;
1752 } 1571 }
1753 clear_opt(sb, QUOTA); 1572 sbi->s_jquota_fmt = m->mount_opt;
1754 clear_opt(sb, USRQUOTA);
1755 clear_opt(sb, GRPQUOTA);
1756 break;
1757#else
1758 case Opt_quota:
1759 case Opt_usrquota:
1760 case Opt_grpquota:
1761 ext4_msg(sb, KERN_ERR,
1762 "quota options not supported");
1763 break;
1764 case Opt_usrjquota:
1765 case Opt_grpjquota:
1766 case Opt_offusrjquota:
1767 case Opt_offgrpjquota:
1768 case Opt_jqfmt_vfsold:
1769 case Opt_jqfmt_vfsv0:
1770 case Opt_jqfmt_vfsv1:
1771 ext4_msg(sb, KERN_ERR,
1772 "journaled quota options not supported");
1773 break;
1774 case Opt_noquota:
1775 break;
1776#endif 1573#endif
1777 case Opt_abort: 1574 } else {
1778 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1575 if (!args->from)
1779 break; 1576 arg = 1;
1780 case Opt_nobarrier: 1577 if (m->flags & MOPT_CLEAR)
1781 clear_opt(sb, BARRIER); 1578 arg = !arg;
1782 break; 1579 else if (unlikely(!(m->flags & MOPT_SET))) {
1783 case Opt_barrier: 1580 ext4_msg(sb, KERN_WARNING,
1784 if (args[0].from) { 1581 "buggy handling of option %s", opt);
1785 if (match_int(&args[0], &option)) 1582 WARN_ON(1);
1786 return 0; 1583 return -1;
1787 } else
1788 option = 1; /* No argument, default to 1 */
1789 if (option)
1790 set_opt(sb, BARRIER);
1791 else
1792 clear_opt(sb, BARRIER);
1793 break;
1794 case Opt_ignore:
1795 break;
1796 case Opt_resize:
1797 if (!is_remount) {
1798 ext4_msg(sb, KERN_ERR,
1799 "resize option only available "
1800 "for remount");
1801 return 0;
1802 }
1803 if (match_int(&args[0], &option) != 0)
1804 return 0;
1805 *n_blocks_count = option;
1806 break;
1807 case Opt_nobh:
1808 ext4_msg(sb, KERN_WARNING,
1809 "Ignoring deprecated nobh option");
1810 break;
1811 case Opt_bh:
1812 ext4_msg(sb, KERN_WARNING,
1813 "Ignoring deprecated bh option");
1814 break;
1815 case Opt_i_version:
1816 set_opt(sb, I_VERSION);
1817 sb->s_flags |= MS_I_VERSION;
1818 break;
1819 case Opt_nodelalloc:
1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1822 break;
1823 case Opt_mblk_io_submit:
1824 set_opt(sb, MBLK_IO_SUBMIT);
1825 break;
1826 case Opt_nomblk_io_submit:
1827 clear_opt(sb, MBLK_IO_SUBMIT);
1828 break;
1829 case Opt_stripe:
1830 if (match_int(&args[0], &option))
1831 return 0;
1832 if (option < 0)
1833 return 0;
1834 sbi->s_stripe = option;
1835 break;
1836 case Opt_delalloc:
1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1839 break;
1840 case Opt_block_validity:
1841 set_opt(sb, BLOCK_VALIDITY);
1842 break;
1843 case Opt_noblock_validity:
1844 clear_opt(sb, BLOCK_VALIDITY);
1845 break;
1846 case Opt_inode_readahead_blks:
1847 if (match_int(&args[0], &option))
1848 return 0;
1849 if (option < 0 || option > (1 << 30))
1850 return 0;
1851 if (option && !is_power_of_2(option)) {
1852 ext4_msg(sb, KERN_ERR,
1853 "EXT4-fs: inode_readahead_blks"
1854 " must be a power of 2");
1855 return 0;
1856 } 1584 }
1857 sbi->s_inode_readahead_blks = option; 1585 if (arg != 0)
1858 break; 1586 sbi->s_mount_opt |= m->mount_opt;
1859 case Opt_journal_ioprio:
1860 if (match_int(&args[0], &option))
1861 return 0;
1862 if (option < 0 || option > 7)
1863 break;
1864 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1865 option);
1866 break;
1867 case Opt_noauto_da_alloc:
1868 set_opt(sb, NO_AUTO_DA_ALLOC);
1869 break;
1870 case Opt_auto_da_alloc:
1871 if (args[0].from) {
1872 if (match_int(&args[0], &option))
1873 return 0;
1874 } else
1875 option = 1; /* No argument, default to 1 */
1876 if (option)
1877 clear_opt(sb, NO_AUTO_DA_ALLOC);
1878 else 1587 else
1879 set_opt(sb,NO_AUTO_DA_ALLOC); 1588 sbi->s_mount_opt &= ~m->mount_opt;
1880 break;
1881 case Opt_discard:
1882 set_opt(sb, DISCARD);
1883 break;
1884 case Opt_nodiscard:
1885 clear_opt(sb, DISCARD);
1886 break;
1887 case Opt_dioread_nolock:
1888 set_opt(sb, DIOREAD_NOLOCK);
1889 break;
1890 case Opt_dioread_lock:
1891 clear_opt(sb, DIOREAD_NOLOCK);
1892 break;
1893 case Opt_init_itable:
1894 set_opt(sb, INIT_INODE_TABLE);
1895 if (args[0].from) {
1896 if (match_int(&args[0], &option))
1897 return 0;
1898 } else
1899 option = EXT4_DEF_LI_WAIT_MULT;
1900 if (option < 0)
1901 return 0;
1902 sbi->s_li_wait_mult = option;
1903 break;
1904 case Opt_noinit_itable:
1905 clear_opt(sb, INIT_INODE_TABLE);
1906 break;
1907 default:
1908 ext4_msg(sb, KERN_ERR,
1909 "Unrecognized mount option \"%s\" "
1910 "or missing value", p);
1911 return 0;
1912 } 1589 }
1590 return 1;
1591 }
1592 ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1593 "or missing value", opt);
1594 return -1;
1595}
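
All the boolean entries in ext4_mount_opts[] funnel into the generic branch at the bottom of handle_mount_opt(). Condensed into a standalone helper to show why one table line now replaces an entire switch case, including barrier=0/1-style optional arguments (the helper name is invented; the logic mirrors the code above):

	static void apply_flag_opt(struct ext4_sb_info *sbi,
				   const struct mount_opts *m,
				   bool have_arg, int arg)
	{
		if (!have_arg)
			arg = 1;	/* bare "barrier" means barrier=1 */
		if (m->flags & MOPT_CLEAR)
			arg = !arg;	/* "nobarrier" inverts the sense  */
		if (arg)
			sbi->s_mount_opt |= m->mount_opt;
		else
			sbi->s_mount_opt &= ~m->mount_opt;
	}
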
1596
1597static int parse_options(char *options, struct super_block *sb,
1598 unsigned long *journal_devnum,
1599 unsigned int *journal_ioprio,
1600 int is_remount)
1601{
1602 struct ext4_sb_info *sbi = EXT4_SB(sb);
1603 char *p;
1604 substring_t args[MAX_OPT_ARGS];
1605 int token;
1606
1607 if (!options)
1608 return 1;
1609
1610 while ((p = strsep(&options, ",")) != NULL) {
1611 if (!*p)
1612 continue;
1613 /*
1614 * Initialize args struct so we know whether arg was
1615 * found; some options take optional arguments.
1616 */
1617 args[0].to = args[0].from = 0;
1618 token = match_token(p, tokens, args);
1619 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1620 journal_ioprio, is_remount) < 0)
1621 return 0;
1913 } 1622 }
1914#ifdef CONFIG_QUOTA 1623#ifdef CONFIG_QUOTA
1915 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1624 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
@@ -1942,6 +1651,160 @@ set_qf_format:
1942 return 1; 1651 return 1;
1943} 1652}
1944 1653
1654static inline void ext4_show_quota_options(struct seq_file *seq,
1655 struct super_block *sb)
1656{
1657#if defined(CONFIG_QUOTA)
1658 struct ext4_sb_info *sbi = EXT4_SB(sb);
1659
1660 if (sbi->s_jquota_fmt) {
1661 char *fmtname = "";
1662
1663 switch (sbi->s_jquota_fmt) {
1664 case QFMT_VFS_OLD:
1665 fmtname = "vfsold";
1666 break;
1667 case QFMT_VFS_V0:
1668 fmtname = "vfsv0";
1669 break;
1670 case QFMT_VFS_V1:
1671 fmtname = "vfsv1";
1672 break;
1673 }
1674 seq_printf(seq, ",jqfmt=%s", fmtname);
1675 }
1676
1677 if (sbi->s_qf_names[USRQUOTA])
1678 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1679
1680 if (sbi->s_qf_names[GRPQUOTA])
1681 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1682
1683 if (test_opt(sb, USRQUOTA))
1684 seq_puts(seq, ",usrquota");
1685
1686 if (test_opt(sb, GRPQUOTA))
1687 seq_puts(seq, ",grpquota");
1688#endif
1689}
1690
1691static const char *token2str(int token)
1692{
1693 static const struct match_token *t;
1694
1695 for (t = tokens; t->token != Opt_err; t++)
1696 if (t->token == token && !strchr(t->pattern, '='))
1697 break;
1698 return t->pattern;
1699}
1700
1701/*
1702 * Show an option if
1703 * - it's set to a non-default value OR
1704 * - if the per-sb default is different from the global default
1705 */
1706static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1707 int nodefs)
1708{
1709 struct ext4_sb_info *sbi = EXT4_SB(sb);
1710 struct ext4_super_block *es = sbi->s_es;
1711 int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
1712 const struct mount_opts *m;
1713 char sep = nodefs ? '\n' : ',';
1714
1715#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
1716#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
1717
1718 if (sbi->s_sb_block != 1)
1719 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
1720
1721 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1722 int want_set = m->flags & MOPT_SET;
1723 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
1724 (m->flags & MOPT_CLEAR_ERR))
1725 continue;
1726 if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
1727 continue; /* skip if same as the default */
1728 if ((want_set &&
1729 (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
1730 (!want_set && (sbi->s_mount_opt & m->mount_opt)))
1731 continue; /* select Opt_noFoo vs Opt_Foo */
1732 SEQ_OPTS_PRINT("%s", token2str(m->token));
1733 }
1734
1735 if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
1736 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
1737 SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
1738 if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
1739 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
1740 SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
1741 def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
1742 if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
1743 SEQ_OPTS_PUTS("errors=remount-ro");
1744 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1745 SEQ_OPTS_PUTS("errors=continue");
1746 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1747 SEQ_OPTS_PUTS("errors=panic");
1748 if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
1749 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
1750 if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
1751 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
1752 if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
1753 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
1754 if (sb->s_flags & MS_I_VERSION)
1755 SEQ_OPTS_PUTS("i_version");
1756 if (nodefs || sbi->s_stripe)
1757 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
1758 if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
1759 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1760 SEQ_OPTS_PUTS("data=journal");
1761 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1762 SEQ_OPTS_PUTS("data=ordered");
1763 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1764 SEQ_OPTS_PUTS("data=writeback");
1765 }
1766 if (nodefs ||
1767 sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1768 SEQ_OPTS_PRINT("inode_readahead_blks=%u",
1769 sbi->s_inode_readahead_blks);
1770
1771 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1772 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1773 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1774
1775 ext4_show_quota_options(seq, sb);
1776 return 0;
1777}
1778
1779static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1780{
1781 return _ext4_show_options(seq, root->d_sb, 0);
1782}
1783
1784static int options_seq_show(struct seq_file *seq, void *offset)
1785{
1786 struct super_block *sb = seq->private;
1787 int rc;
1788
1789 seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1790 rc = _ext4_show_options(seq, sb, 1);
1791 seq_puts(seq, "\n");
1792 return rc;
1793}
1794
1795static int options_open_fs(struct inode *inode, struct file *file)
1796{
1797 return single_open(file, options_seq_show, PDE(inode)->data);
1798}
1799
1800static const struct file_operations ext4_seq_options_fops = {
1801 .owner = THIS_MODULE,
1802 .open = options_open_fs,
1803 .read = seq_read,
1804 .llseek = seq_lseek,
1805 .release = single_release,
1806};
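
One printer now backs two views: ext4_show_options() feeds /proc/mounts with nodefs=0 (comma separators, defaults suppressed), while options_seq_show() feeds the per-filesystem "options" proc file created later in this patch with nodefs=1 (an "rw"/"ro" prefix, then every active option on its own line). A minimal userspace model of the separator trick, illustration only:

	#include <stdio.h>

	/* Mimics SEQ_OPTS_PRINT: each item is prefixed by sep, so the
	 * nodefs=0 output is ready to append after "rw" in /proc/mounts
	 * and the nodefs=1 output is one option per line. */
	static void show(int nodefs)
	{
		char sep = nodefs ? '\n' : ',';

		if (nodefs)
			fputs("rw", stdout);
		printf("%c%s", sep, "delalloc");
		printf("%c%s=%u", sep, "init_itable", 10);
		putchar('\n');
	}

	int main(void)
	{
		show(0);	/* prints ",delalloc,init_itable=10"  */
		show(1);	/* prints rw, delalloc, init_itable=10
				 * on three separate lines            */
		return 0;
	}
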
1807
1945static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 1808static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1946 int read_only) 1809 int read_only)
1947{ 1810{
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)
2945 ext4_clear_request_list(); 2808 ext4_clear_request_list();
2946 kfree(ext4_li_info); 2809 kfree(ext4_li_info);
2947 ext4_li_info = NULL; 2810 ext4_li_info = NULL;
2948 printk(KERN_CRIT "EXT4: error %d creating inode table " 2811 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
2949 "initialization thread\n", 2812 "initialization thread\n",
2950 err); 2813 err);
2951 return err; 2814 return err;
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3183 set_opt(sb, INIT_INODE_TABLE); 3046 set_opt(sb, INIT_INODE_TABLE);
3184 if (def_mount_opts & EXT4_DEFM_DEBUG) 3047 if (def_mount_opts & EXT4_DEFM_DEBUG)
3185 set_opt(sb, DEBUG); 3048 set_opt(sb, DEBUG);
3186 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3049 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3187 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3188 "2.6.38");
3189 set_opt(sb, GRPID); 3050 set_opt(sb, GRPID);
3190 }
3191 if (def_mount_opts & EXT4_DEFM_UID16) 3051 if (def_mount_opts & EXT4_DEFM_UID16)
3192 set_opt(sb, NO_UID32); 3052 set_opt(sb, NO_UID32);
3193 /* xattr user namespace & acls are now defaulted on */ 3053 /* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3240 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 3100 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3241 3101
3242 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3102 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3243 &journal_devnum, &journal_ioprio, NULL, 0)) { 3103 &journal_devnum, &journal_ioprio, 0)) {
3244 ext4_msg(sb, KERN_WARNING, 3104 ext4_msg(sb, KERN_WARNING,
3245 "failed to parse options in superblock: %s", 3105 "failed to parse options in superblock: %s",
3246 sbi->s_es->s_mount_opts); 3106 sbi->s_es->s_mount_opts);
3247 } 3107 }
3108 sbi->s_def_mount_opt = sbi->s_mount_opt;
3248 if (!parse_options((char *) data, sb, &journal_devnum, 3109 if (!parse_options((char *) data, sb, &journal_devnum,
3249 &journal_ioprio, NULL, 0)) 3110 &journal_ioprio, 0))
3250 goto failed_mount; 3111 goto failed_mount;
3251 3112
3252 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3113 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3416#else 3277#else
3417 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 3278 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3418#endif 3279#endif
3419 sb->s_dirt = 1;
3420 } 3280 }
3421 3281
3422 /* Handle clustersize */ 3282 /* Handle clustersize */
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3540 if (ext4_proc_root) 3400 if (ext4_proc_root)
3541 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3401 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3542 3402
3403 if (sbi->s_proc)
3404 proc_create_data("options", S_IRUGO, sbi->s_proc,
3405 &ext4_seq_options_fops, sb);
3406
3543 bgl_lock_init(sbi->s_blockgroup_lock); 3407 bgl_lock_init(sbi->s_blockgroup_lock);
3544 3408
3545 for (i = 0; i < db_count; i++) { 3409 for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3694 } 3558 }
3695 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3559 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3696 3560
3561 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3562
3697 /* 3563 /*
3698 * The journal may have updated the bg summary counts, so we 3564 * The journal may have updated the bg summary counts, so we
3699 * need to update the global counters. 3565 * need to update the global counters.
@@ -3861,6 +3727,7 @@ failed_mount2:
3861 ext4_kvfree(sbi->s_group_desc); 3727 ext4_kvfree(sbi->s_group_desc);
3862failed_mount: 3728failed_mount:
3863 if (sbi->s_proc) { 3729 if (sbi->s_proc) {
3730 remove_proc_entry("options", sbi->s_proc);
3864 remove_proc_entry(sb->s_id, ext4_proc_root); 3731 remove_proc_entry(sb->s_id, ext4_proc_root);
3865 } 3732 }
3866#ifdef CONFIG_QUOTA 3733#ifdef CONFIG_QUOTA
@@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,
4090 if (!(journal->j_flags & JBD2_BARRIER)) 3957 if (!(journal->j_flags & JBD2_BARRIER))
4091 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3958 ext4_msg(sb, KERN_INFO, "barriers disabled");
4092 3959
4093 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
4094 err = jbd2_journal_update_format(journal);
4095 if (err) {
4096 ext4_msg(sb, KERN_ERR, "error updating journal");
4097 jbd2_journal_destroy(journal);
4098 return err;
4099 }
4100 }
4101
4102 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3960 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4103 err = jbd2_journal_wipe(journal, !really_read_only); 3961 err = jbd2_journal_wipe(journal, !really_read_only);
4104 if (!err) { 3962 if (!err) {
@@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4385{ 4243{
4386 struct ext4_super_block *es; 4244 struct ext4_super_block *es;
4387 struct ext4_sb_info *sbi = EXT4_SB(sb); 4245 struct ext4_sb_info *sbi = EXT4_SB(sb);
4388 ext4_fsblk_t n_blocks_count = 0;
4389 unsigned long old_sb_flags; 4246 unsigned long old_sb_flags;
4390 struct ext4_mount_options old_opts; 4247 struct ext4_mount_options old_opts;
4391 int enable_quota = 0; 4248 int enable_quota = 0;
@@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4418 /* 4275 /*
4419 * Allow the "check" option to be passed as a remount option. 4276 * Allow the "check" option to be passed as a remount option.
4420 */ 4277 */
4421 if (!parse_options(data, sb, NULL, &journal_ioprio, 4278 if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4422 &n_blocks_count, 1)) {
4423 err = -EINVAL; 4279 err = -EINVAL;
4424 goto restore_opts; 4280 goto restore_opts;
4425 } 4281 }
@@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4437 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4293 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4438 } 4294 }
4439 4295
4440 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 4296 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4441 n_blocks_count > ext4_blocks_count(es)) {
4442 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { 4297 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4443 err = -EROFS; 4298 err = -EROFS;
4444 goto restore_opts; 4299 goto restore_opts;
@@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4513 if (sbi->s_journal) 4368 if (sbi->s_journal)
4514 ext4_clear_journal_err(sb, es); 4369 ext4_clear_journal_err(sb, es);
4515 sbi->s_mount_state = le16_to_cpu(es->s_state); 4370 sbi->s_mount_state = le16_to_cpu(es->s_state);
4516 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
4517 goto restore_opts;
4518 if (!ext4_setup_super(sb, es, 0)) 4371 if (!ext4_setup_super(sb, es, 0))
4519 sb->s_flags &= ~MS_RDONLY; 4372 sb->s_flags &= ~MS_RDONLY;
4520 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4373 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d89a220..e88748e55c0f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
82 printk("\n"); \ 82 printk("\n"); \
83 } while (0) 83 } while (0)
84#else 84#else
85# define ea_idebug(f...) 85# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
86# define ea_bdebug(f...) 86# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
87#endif 87#endif
88 88
89static void ext4_xattr_cache_insert(struct buffer_head *); 89static void ext4_xattr_cache_insert(struct buffer_head *);
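
Replacing the empty ea_idebug()/ea_bdebug() definitions with no_printk() costs nothing at run time but keeps the format string and arguments visible to the compiler, which is how the %u-versus-64-bit mismatches fixed in the hunks below were caught ("add no_printk argument validation, fix fallout" in this series). no_printk() is defined in include/linux/printk.h roughly as:

	/* Compiles to nothing, but __printf() makes gcc type-check
	 * the arguments against the format string anyway. */
	static inline __printf(1, 2)
	int no_printk(const char *fmt, ...)
	{
		return 0;
	}
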
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
158static inline int 158static inline int
159ext4_xattr_check_block(struct buffer_head *bh) 159ext4_xattr_check_block(struct buffer_head *bh)
160{ 160{
161 int error;
162
163 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 161 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
164 BHDR(bh)->h_blocks != cpu_to_le32(1)) 162 BHDR(bh)->h_blocks != cpu_to_le32(1))
165 return -EIO; 163 return -EIO;
166 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); 164 return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
167 return error;
168} 165}
169 166
170static inline int 167static inline int
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
220 error = -ENODATA; 217 error = -ENODATA;
221 if (!EXT4_I(inode)->i_file_acl) 218 if (!EXT4_I(inode)->i_file_acl)
222 goto cleanup; 219 goto cleanup;
223 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); 220 ea_idebug(inode, "reading block %llu",
221 (unsigned long long)EXT4_I(inode)->i_file_acl);
224 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 222 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
225 if (!bh) 223 if (!bh)
226 goto cleanup; 224 goto cleanup;
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
363 error = 0; 361 error = 0;
364 if (!EXT4_I(inode)->i_file_acl) 362 if (!EXT4_I(inode)->i_file_acl)
365 goto cleanup; 363 goto cleanup;
366 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); 364 ea_idebug(inode, "reading block %llu",
365 (unsigned long long)EXT4_I(inode)->i_file_acl);
367 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 366 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
368 error = -EIO; 367 error = -EIO;
369 if (!bh) 368 if (!bh)
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_free_blocks(handle, inode, bh, 0, 1, 486 ext4_free_blocks(handle, inode, bh, 0, 1,
488 EXT4_FREE_BLOCKS_METADATA | 487 EXT4_FREE_BLOCKS_METADATA |
489 EXT4_FREE_BLOCKS_FORGET); 488 EXT4_FREE_BLOCKS_FORGET);
489 unlock_buffer(bh);
490 } else { 490 } else {
491 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 491 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
492 if (ce)
493 mb_cache_entry_release(ce);
494 unlock_buffer(bh);
492 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
493 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
494 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
495 dquot_free_block(inode, 1); 498 dquot_free_block(inode, 1);
496 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
497 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
498 if (ce)
499 mb_cache_entry_release(ce);
500 } 501 }
501 unlock_buffer(bh);
502out: 502out:
503 ext4_std_error(inode->i_sb, error); 503 ext4_std_error(inode->i_sb, error);
504 return; 504 return;
@@ -834,7 +834,8 @@ inserted:
834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
836 836
837 ea_idebug(inode, "creating block %d", block); 837 ea_idebug(inode, "creating block %llu",
838 (unsigned long long)block);
838 839
839 new_bh = sb_getblk(sb, block); 840 new_bh = sb_getblk(sb, block);
840 if (!new_bh) { 841 if (!new_bh) {
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903fb..c78841ee81cf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
88 * whole transaction. 88 * whole transaction.
89 * 89 *
90 * Requires j_list_lock 90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */ 91 */
93static int __try_to_free_cp_buf(struct journal_head *jh) 92static int __try_to_free_cp_buf(struct journal_head *jh)
94{ 93{
95 int ret = 0; 94 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
97 96
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && 97 if (jh->b_transaction == NULL && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /* 99 /*
101 * Get our reference so that bh cannot be freed before 100 * Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
104 get_bh(bh); 103 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list"); 104 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 105 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release"); 106 BUFFER_TRACE(bh, "release");
109 __brelse(bh); 107 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 } 108 }
113 return ret; 109 return ret;
114} 110}
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
180} 176}
181 177
182/* 178/*
183 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
184 * The caller must restart a list walk. Wait for someone else to run
185 * jbd_unlock_bh_state().
186 */
187static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
188 __releases(journal->j_list_lock)
189{
190 get_bh(bh);
191 spin_unlock(&journal->j_list_lock);
192 jbd_lock_bh_state(bh);
193 jbd_unlock_bh_state(bh);
194 put_bh(bh);
195}
196
197/*
198 * Clean up transaction's list of buffers submitted for io. 179 * Clean up transaction's list of buffers submitted for io.
199 * We wait for any pending IO to complete and remove any clean 180 * We wait for any pending IO to complete and remove any clean
200 * buffers. Note that we take the buffers in the opposite ordering 181 * buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@ restart:
222 while (!released && transaction->t_checkpoint_io_list) { 203 while (!released && transaction->t_checkpoint_io_list) {
223 jh = transaction->t_checkpoint_io_list; 204 jh = transaction->t_checkpoint_io_list;
224 bh = jh2bh(jh); 205 bh = jh2bh(jh);
225 if (!jbd_trylock_bh_state(bh)) {
226 jbd_sync_bh(journal, bh);
227 spin_lock(&journal->j_list_lock);
228 goto restart;
229 }
230 get_bh(bh); 206 get_bh(bh);
231 if (buffer_locked(bh)) { 207 if (buffer_locked(bh)) {
232 spin_unlock(&journal->j_list_lock); 208 spin_unlock(&journal->j_list_lock);
233 jbd_unlock_bh_state(bh);
234 wait_on_buffer(bh); 209 wait_on_buffer(bh);
235 /* the journal_head may have gone by now */ 210 /* the journal_head may have gone by now */
236 BUFFER_TRACE(bh, "brelse"); 211 BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@ restart:
246 * it has been written out and so we can drop it from the list 221 * it has been written out and so we can drop it from the list
247 */ 222 */
248 released = __jbd2_journal_remove_checkpoint(jh); 223 released = __jbd2_journal_remove_checkpoint(jh);
249 jbd_unlock_bh_state(bh);
250 __brelse(bh); 224 __brelse(bh);
251 } 225 }
252 226
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)
266 240
267 for (i = 0; i < *batch_count; i++) { 241 for (i = 0; i < *batch_count; i++) {
268 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 242 struct buffer_head *bh = journal->j_chkpt_bhs[i];
269 clear_buffer_jwrite(bh);
270 BUFFER_TRACE(bh, "brelse"); 243 BUFFER_TRACE(bh, "brelse");
271 __brelse(bh); 244 __brelse(bh);
272 } 245 }
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)
281 * be written out. 254 * be written out.
282 * 255 *
283 * Called with j_list_lock held and drops it if 1 is returned 256 * Called with j_list_lock held and drops it if 1 is returned
284 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
285 */ 257 */
286static int __process_buffer(journal_t *journal, struct journal_head *jh, 258static int __process_buffer(journal_t *journal, struct journal_head *jh,
287 int *batch_count, transaction_t *transaction) 259 int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
292 if (buffer_locked(bh)) { 264 if (buffer_locked(bh)) {
293 get_bh(bh); 265 get_bh(bh);
294 spin_unlock(&journal->j_list_lock); 266 spin_unlock(&journal->j_list_lock);
295 jbd_unlock_bh_state(bh);
296 wait_on_buffer(bh); 267 wait_on_buffer(bh);
297 /* the journal_head may have gone by now */ 268 /* the journal_head may have gone by now */
298 BUFFER_TRACE(bh, "brelse"); 269 BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 275
305 transaction->t_chp_stats.cs_forced_to_close++; 276 transaction->t_chp_stats.cs_forced_to_close++;
306 spin_unlock(&journal->j_list_lock); 277 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 278 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
309 /* 279 /*
310 * The journal thread is dead; so starting and 280 * The journal thread is dead; so starting and
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
323 if (unlikely(buffer_write_io_error(bh))) 293 if (unlikely(buffer_write_io_error(bh)))
324 ret = -EIO; 294 ret = -EIO;
325 get_bh(bh); 295 get_bh(bh);
326 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
327 BUFFER_TRACE(bh, "remove from checkpoint"); 296 BUFFER_TRACE(bh, "remove from checkpoint");
328 __jbd2_journal_remove_checkpoint(jh); 297 __jbd2_journal_remove_checkpoint(jh);
329 spin_unlock(&journal->j_list_lock); 298 spin_unlock(&journal->j_list_lock);
330 jbd_unlock_bh_state(bh);
331 __brelse(bh); 299 __brelse(bh);
332 } else { 300 } else {
333 /* 301 /*
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
340 BUFFER_TRACE(bh, "queue"); 308 BUFFER_TRACE(bh, "queue");
341 get_bh(bh); 309 get_bh(bh);
342 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 310 J_ASSERT_BH(bh, !buffer_jwrite(bh));
343 set_buffer_jwrite(bh);
344 journal->j_chkpt_bhs[*batch_count] = bh; 311 journal->j_chkpt_bhs[*batch_count] = bh;
345 __buffer_relink_io(jh); 312 __buffer_relink_io(jh);
346 jbd_unlock_bh_state(bh);
347 transaction->t_chp_stats.cs_written++; 313 transaction->t_chp_stats.cs_written++;
348 (*batch_count)++; 314 (*batch_count)++;
349 if (*batch_count == JBD2_NR_BATCH) { 315 if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@ restart:
407 int retry = 0, err; 373 int retry = 0, err;
408 374
409 while (!retry && transaction->t_checkpoint_list) { 375 while (!retry && transaction->t_checkpoint_list) {
410 struct buffer_head *bh;
411
412 jh = transaction->t_checkpoint_list; 376 jh = transaction->t_checkpoint_list;
413 bh = jh2bh(jh);
414 if (!jbd_trylock_bh_state(bh)) {
415 jbd_sync_bh(journal, bh);
416 retry = 1;
417 break;
418 }
419 retry = __process_buffer(journal, jh, &batch_count, 377 retry = __process_buffer(journal, jh, &batch_count,
420 transaction); 378 transaction);
421 if (retry < 0 && !result) 379 if (retry < 0 && !result)
@@ -478,79 +436,28 @@ out:
478 436
479int jbd2_cleanup_journal_tail(journal_t *journal) 437int jbd2_cleanup_journal_tail(journal_t *journal)
480{ 438{
481 transaction_t * transaction;
482 tid_t first_tid; 439 tid_t first_tid;
483 unsigned long blocknr, freed; 440 unsigned long blocknr;
484 441
485 if (is_journal_aborted(journal)) 442 if (is_journal_aborted(journal))
486 return 1; 443 return 1;
487 444
488 /* OK, work out the oldest transaction remaining in the log, and 445 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
489 * the log block it starts at.
490 *
491 * If the log is now empty, we need to work out which is the
492 * next transaction ID we will write, and where it will
493 * start. */
494
495 write_lock(&journal->j_state_lock);
496 spin_lock(&journal->j_list_lock);
497 transaction = journal->j_checkpoint_transactions;
498 if (transaction) {
499 first_tid = transaction->t_tid;
500 blocknr = transaction->t_log_start;
501 } else if ((transaction = journal->j_committing_transaction) != NULL) {
502 first_tid = transaction->t_tid;
503 blocknr = transaction->t_log_start;
504 } else if ((transaction = journal->j_running_transaction) != NULL) {
505 first_tid = transaction->t_tid;
506 blocknr = journal->j_head;
507 } else {
508 first_tid = journal->j_transaction_sequence;
509 blocknr = journal->j_head;
510 }
511 spin_unlock(&journal->j_list_lock);
512 J_ASSERT(blocknr != 0);
513
514 /* If the oldest pinned transaction is at the tail of the log
515 already then there's not much we can do right now. */
516 if (journal->j_tail_sequence == first_tid) {
517 write_unlock(&journal->j_state_lock);
518 return 1; 446 return 1;
519 } 447 J_ASSERT(blocknr != 0);
520
521 /* OK, update the superblock to recover the freed space.
522 * Physical blocks come first: have we wrapped beyond the end of
523 * the log? */
524 freed = blocknr - journal->j_tail;
525 if (blocknr < journal->j_tail)
526 freed = freed + journal->j_last - journal->j_first;
527
528 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
529 jbd_debug(1,
530 "Cleaning journal tail from %d to %d (offset %lu), "
531 "freeing %lu\n",
532 journal->j_tail_sequence, first_tid, blocknr, freed);
533
534 journal->j_free += freed;
535 journal->j_tail_sequence = first_tid;
536 journal->j_tail = blocknr;
537 write_unlock(&journal->j_state_lock);
538 448
539 /* 449 /*
540 * If there is an external journal, we need to make sure that 450 * We need to make sure that any blocks that were recently written out
541 * any data blocks that were recently written out --- perhaps 451 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
542 * by jbd2_log_do_checkpoint() --- are flushed out before we 452 * we drop the transactions from the journal. It's unlikely this will
543 * drop the transactions from the external journal. It's 453 * be necessary, especially with an appropriately sized journal, but we
544 * unlikely this will be necessary, especially with a 454 * need this to guarantee correctness. Fortunately
545 * appropriately sized journal, but we need this to guarantee 455 * jbd2_cleanup_journal_tail() doesn't get called all that often.
546 * correctness. Fortunately jbd2_cleanup_journal_tail()
547 * doesn't get called all that often.
548 */ 456 */
549 if ((journal->j_fs_dev != journal->j_dev) && 457 if (journal->j_flags & JBD2_BARRIER)
550 (journal->j_flags & JBD2_BARRIER))
551 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 458 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
552 if (!(journal->j_flags & JBD2_ABORT)) 459
553 jbd2_journal_update_superblock(journal, 1); 460 __jbd2_update_log_tail(journal, first_tid, blocknr);
554 return 0; 461 return 0;
555} 462}
556 463
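The rewritten jbd2_cleanup_journal_tail() is now a thin composition of the two helpers added in fs/jbd2/journal.c below. A hedged, kernel-context sketch of the return contract a caller sees (example_checkpoint_and_trim is hypothetical, not standalone code):

    /* 1 means the tail cannot be pushed any further (which includes the
     * aborted-journal case); 0 means the new tail was published. */
    static void example_checkpoint_and_trim(journal_t *journal)
    {
            /* ... checkpoint some transactions first ... */
            if (jbd2_cleanup_journal_tail(journal))
                    return;         /* nothing reclaimable right now */
            /* log space freed; new transactions may reuse it */
    }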
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
582 do { 489 do {
583 jh = next_jh; 490 jh = next_jh;
584 next_jh = jh->b_cpnext; 491 next_jh = jh->b_cpnext;
585 /* Use trylock because of the ranking */ 492 ret = __try_to_free_cp_buf(jh);
586 if (jbd_trylock_bh_state(jh2bh(jh))) { 493 if (ret) {
587 ret = __try_to_free_cp_buf(jh); 494 freed++;
588 if (ret) { 495 if (ret == 2) {
589 freed++; 496 *released = 1;
590 if (ret == 2) { 497 return freed;
591 *released = 1;
592 return freed;
593 }
594 } 498 }
595 } 499 }
596 /* 500 /*
@@ -673,9 +577,7 @@ out:
673 * The function can free jh and bh. 577 * The function can free jh and bh.
674 * 578 *
675 * This function is called with j_list_lock held. 579 * This function is called with j_list_lock held.
676 * This function is called with jbd_lock_bh_state(jh2bh(jh))
677 */ 580 */
678
679int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
680{ 582{
681 struct transaction_chp_stats_s *stats; 583 struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
722 transaction->t_tid, stats); 624 transaction->t_tid, stats);
723 625
724 __jbd2_journal_drop_transaction(journal, transaction); 626 __jbd2_journal_drop_transaction(journal, transaction);
725 kfree(transaction); 627 jbd2_journal_free_transaction(transaction);
726 628
727 /* Just in case anybody was waiting for more transactions to be 629 /* Just in case anybody was waiting for more transactions to be
728 checkpointed... */ 630 checkpointed... */
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
797 J_ASSERT(journal->j_committing_transaction != transaction); 699 J_ASSERT(journal->j_committing_transaction != transaction);
798 J_ASSERT(journal->j_running_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction);
799 701
702 trace_jbd2_drop_transaction(journal, transaction);
703
800 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 704 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
801} 705}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c067a8cae63b..17f557f01cf0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
331 struct buffer_head *cbh = NULL; /* For transactional checksums */ 331 struct buffer_head *cbh = NULL; /* For transactional checksums */
332 __u32 crc32_sum = ~0; 332 __u32 crc32_sum = ~0;
333 struct blk_plug plug; 333 struct blk_plug plug;
334 /* Tail of the journal */
335 unsigned long first_block;
336 tid_t first_tid;
337 int update_tail;
334 338
335 /* 339 /*
336 * First job: lock down the current transaction and wait for 340 * First job: lock down the current transaction and wait for
@@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
340 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 344 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
341 if (journal->j_flags & JBD2_FLUSHED) { 345 if (journal->j_flags & JBD2_FLUSHED) {
342 jbd_debug(3, "super block updated\n"); 346 jbd_debug(3, "super block updated\n");
343 jbd2_journal_update_superblock(journal, 1); 347 mutex_lock(&journal->j_checkpoint_mutex);
348 /*
349 * We hold j_checkpoint_mutex so tail cannot change under us.
350 * We don't need any special data guarantees for writing sb
351 * since journal is empty and it is ok for write to be
352 * flushed only with transaction commit.
353 */
354 jbd2_journal_update_sb_log_tail(journal,
355 journal->j_tail_sequence,
356 journal->j_tail,
357 WRITE_SYNC);
358 mutex_unlock(&journal->j_checkpoint_mutex);
344 } else { 359 } else {
345 jbd_debug(3, "superblock not updated\n"); 360 jbd_debug(3, "superblock not updated\n");
346 } 361 }
@@ -677,10 +692,30 @@ start_journal_io:
677 err = 0; 692 err = 0;
678 } 693 }
679 694
695 /*
 696 * Get the current oldest transaction in the log before we issue a flush
 697 * to the filesystem device. After the flush we can be sure that the
 698 * blocks of all older transactions are checkpointed to persistent
 699 * storage, and it will be safe to update the journal start in the
 700 * superblock with the numbers we get here.
701 */
702 update_tail =
703 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
704
680 write_lock(&journal->j_state_lock); 705 write_lock(&journal->j_state_lock);
706 if (update_tail) {
707 long freed = first_block - journal->j_tail;
708
709 if (first_block < journal->j_tail)
710 freed += journal->j_last - journal->j_first;
711 /* Update tail only if we free significant amount of space */
712 if (freed < journal->j_maxlen / 4)
713 update_tail = 0;
714 }
681 J_ASSERT(commit_transaction->t_state == T_COMMIT); 715 J_ASSERT(commit_transaction->t_state == T_COMMIT);
682 commit_transaction->t_state = T_COMMIT_DFLUSH; 716 commit_transaction->t_state = T_COMMIT_DFLUSH;
683 write_unlock(&journal->j_state_lock); 717 write_unlock(&journal->j_state_lock);
718
684 /* 719 /*
685 * If the journal is not located on the file system device, 720 * If the journal is not located on the file system device,
686 * then we must flush the file system device before we issue 721 * then we must flush the file system device before we issue
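The freed computation above has to respect the circular layout of the log: when the new tail block has wrapped past j_last, the raw difference goes negative and is corrected by the usable log length, and the on-disk tail is only moved if at least a quarter of the journal would be reclaimed. A standalone C sketch of that arithmetic with made-up geometry (struct log_geom and its values are illustrative, not the journal_t layout):

    #include <stdio.h>

    /* Illustrative stand-in for the journal fields the tail check uses. */
    struct log_geom {
            unsigned long j_first, j_last;  /* usable block range */
            unsigned long j_tail;           /* current tail block */
            unsigned long j_maxlen;         /* journal size in blocks */
    };

    /* Same arithmetic as the commit path: distance from the old tail to
     * the new one, wrapping around the end of the circular log. */
    static long freed_by_moving_tail(const struct log_geom *g,
                                     unsigned long first_block)
    {
            long freed = first_block - g->j_tail;

            if (first_block < g->j_tail)
                    freed += g->j_last - g->j_first;
            return freed;
    }

    int main(void)
    {
            struct log_geom g = { .j_first = 1, .j_last = 8193,
                                  .j_tail = 8000, .j_maxlen = 8192 };
            /* New tail wrapped past the end: 100 - 8000 + 8192 = 292. */
            long freed = freed_by_moving_tail(&g, 100);

            /* 292 < 8192/4 = 2048, so this update would be skipped. */
            printf("freed=%ld update_tail=%d\n", freed,
                   freed >= (long)(g.j_maxlen / 4));
            return 0;
    }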
@@ -831,6 +866,14 @@ wait_for_iobuf:
831 if (err) 866 if (err)
832 jbd2_journal_abort(journal, err); 867 jbd2_journal_abort(journal, err);
833 868
869 /*
 870 * The disk caches for the filesystem device are now flushed, so it is
 871 * safe to erase checkpointed transactions from the log by updating the
 872 * journal superblock.
873 */
874 if (update_tail)
875 jbd2_update_log_tail(journal, first_tid, first_block);
876
834 /* End of a transaction! Finally, we can do checkpoint 877 /* End of a transaction! Finally, we can do checkpoint
835 processing: any buffers committed as a result of this 878 processing: any buffers committed as a result of this
836 transaction can be removed from any checkpoint list it was on 879 transaction can be removed from any checkpoint list it was on
@@ -1048,7 +1091,7 @@ restart_loop:
1048 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1091 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1049 journal->j_commit_sequence, journal->j_tail_sequence); 1092 journal->j_commit_sequence, journal->j_tail_sequence);
1050 if (to_free) 1093 if (to_free)
1051 kfree(commit_transaction); 1094 jbd2_journal_free_transaction(commit_transaction);
1052 1095
1053 wake_up(&journal->j_wait_done_commit); 1096 wake_up(&journal->j_wait_done_commit);
1054} 1097}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 839377e3d624..98ed6dbfe381 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
71 71
72EXPORT_SYMBOL(jbd2_journal_init_dev); 72EXPORT_SYMBOL(jbd2_journal_init_dev);
73EXPORT_SYMBOL(jbd2_journal_init_inode); 73EXPORT_SYMBOL(jbd2_journal_init_inode);
74EXPORT_SYMBOL(jbd2_journal_update_format);
75EXPORT_SYMBOL(jbd2_journal_check_used_features); 74EXPORT_SYMBOL(jbd2_journal_check_used_features);
76EXPORT_SYMBOL(jbd2_journal_check_available_features); 75EXPORT_SYMBOL(jbd2_journal_check_available_features);
77EXPORT_SYMBOL(jbd2_journal_set_features); 76EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache); 96EXPORT_SYMBOL(jbd2_inode_cache);
98 97
99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
100static void __journal_abort_soft (journal_t *journal, int errno); 98static void __journal_abort_soft (journal_t *journal, int errno);
101static int jbd2_journal_create_slab(size_t slab_size); 99static int jbd2_journal_create_slab(size_t slab_size);
102 100
@@ -746,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
746 return jbd2_journal_add_journal_head(bh); 744 return jbd2_journal_add_journal_head(bh);
747} 745}
748 746
747/*
 748 * Return the tid of the oldest transaction in the journal and the block in
 749 * the journal where that transaction starts.
 750 *
 751 * If the journal is now empty, return the next transaction ID we will
 752 * write and where that transaction will start.
 753 *
 754 * The return value is 0 if the journal tail cannot be pushed any further,
 755 * 1 if it can.
756 */
757int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
758 unsigned long *block)
759{
760 transaction_t *transaction;
761 int ret;
762
763 read_lock(&journal->j_state_lock);
764 spin_lock(&journal->j_list_lock);
765 transaction = journal->j_checkpoint_transactions;
766 if (transaction) {
767 *tid = transaction->t_tid;
768 *block = transaction->t_log_start;
769 } else if ((transaction = journal->j_committing_transaction) != NULL) {
770 *tid = transaction->t_tid;
771 *block = transaction->t_log_start;
772 } else if ((transaction = journal->j_running_transaction) != NULL) {
773 *tid = transaction->t_tid;
774 *block = journal->j_head;
775 } else {
776 *tid = journal->j_transaction_sequence;
777 *block = journal->j_head;
778 }
779 ret = tid_gt(*tid, journal->j_tail_sequence);
780 spin_unlock(&journal->j_list_lock);
781 read_unlock(&journal->j_state_lock);
782
783 return ret;
784}
785
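A hedged sketch of how this helper is meant to compose with the tail-update functions that follow; it mirrors the rewritten jbd2_cleanup_journal_tail() in fs/jbd2/checkpoint.c above (example_push_tail is hypothetical, kernel context, not standalone):

    static int example_push_tail(journal_t *journal)
    {
            tid_t first_tid;
            unsigned long blocknr;

            if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
                    return 1;  /* tail is already as far forward as it can go */

            /* Validates the tid and takes j_checkpoint_mutex for us. */
            jbd2_update_log_tail(journal, first_tid, blocknr);
            return 0;
    }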
786/*
 787 * Update information in the journal structure and in the on-disk journal
 788 * superblock about the log tail. This function does not check whether the
 789 * information passed in really pushes the log tail further. It is the
 790 * caller's responsibility to make sure the provided log tail information is
 791 * valid (e.g. by holding j_checkpoint_mutex the whole time between computing
 792 * the log tail and calling this function, as jbd2_cleanup_journal_tail() does).
793 *
794 * Requires j_checkpoint_mutex
795 */
796void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
797{
798 unsigned long freed;
799
800 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
801
802 /*
 803 * We cannot afford for the write to remain in the drive's caches, since
 804 * as soon as we update j_tail the next transaction can start reusing
 805 * journal space; if we lost the sb update across a power failure we
 806 * would replay an old transaction over possibly newly overwritten data.
807 */
808 jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
809 write_lock(&journal->j_state_lock);
810 freed = block - journal->j_tail;
811 if (block < journal->j_tail)
812 freed += journal->j_last - journal->j_first;
813
814 trace_jbd2_update_log_tail(journal, tid, block, freed);
815 jbd_debug(1,
816 "Cleaning journal tail from %d to %d (offset %lu), "
817 "freeing %lu\n",
818 journal->j_tail_sequence, tid, block, freed);
819
820 journal->j_free += freed;
821 journal->j_tail_sequence = tid;
822 journal->j_tail = block;
823 write_unlock(&journal->j_state_lock);
824}
825
826/*
 827 * This is a variation of __jbd2_update_log_tail() which checks the validity
 828 * of the provided log tail and locks j_checkpoint_mutex, so it is safe
 829 * against races with other threads updating the log tail.
830 */
831void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
832{
833 mutex_lock(&journal->j_checkpoint_mutex);
834 if (tid_gt(tid, journal->j_tail_sequence))
835 __jbd2_update_log_tail(journal, tid, block);
836 mutex_unlock(&journal->j_checkpoint_mutex);
837}
838
749struct jbd2_stats_proc_session { 839struct jbd2_stats_proc_session {
750 journal_t *journal; 840 journal_t *journal;
751 struct transaction_stats_s *stats; 841 struct transaction_stats_s *stats;
@@ -1114,40 +1204,45 @@ static int journal_reset(journal_t *journal)
1114 1204
1115 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 1205 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1116 1206
1117 /* Add the dynamic fields and write it to disk. */
1118 jbd2_journal_update_superblock(journal, 1);
1119 return jbd2_journal_start_thread(journal);
1120}
1121
1122/**
1123 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1124 * @journal: The journal to update.
1125 * @wait: Set to '0' if you don't want to wait for IO completion.
1126 *
1127 * Update a journal's dynamic superblock fields and write it to disk,
1128 * optionally waiting for the IO to complete.
1129 */
1130void jbd2_journal_update_superblock(journal_t *journal, int wait)
1131{
1132 journal_superblock_t *sb = journal->j_superblock;
1133 struct buffer_head *bh = journal->j_sb_buffer;
1134
1135 /* 1207 /*
1136 * As a special case, if the on-disk copy is already marked as needing 1208 * As a special case, if the on-disk copy is already marked as needing
1137 * no recovery (s_start == 0) and there are no outstanding transactions 1209 * no recovery (s_start == 0), then we can safely defer the superblock
1138 * in the filesystem, then we can safely defer the superblock update 1210 * update until the next commit by setting JBD2_FLUSHED. This avoids
1139 * until the next commit by setting JBD2_FLUSHED. This avoids
1140 * attempting a write to a potential-readonly device. 1211 * attempting a write to a potential-readonly device.
1141 */ 1212 */
1142 if (sb->s_start == 0 && journal->j_tail_sequence == 1213 if (sb->s_start == 0) {
1143 journal->j_transaction_sequence) {
1144 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " 1214 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1145 "(start %ld, seq %d, errno %d)\n", 1215 "(start %ld, seq %d, errno %d)\n",
1146 journal->j_tail, journal->j_tail_sequence, 1216 journal->j_tail, journal->j_tail_sequence,
1147 journal->j_errno); 1217 journal->j_errno);
1148 goto out; 1218 journal->j_flags |= JBD2_FLUSHED;
1219 } else {
1220 /* Lock here to make assertions happy... */
1221 mutex_lock(&journal->j_checkpoint_mutex);
1222 /*
 1223 * Update the log tail information. We use WRITE_FUA since a new
 1224 * transaction will start reusing journal space, so we must make
 1225 * sure the information about the current log tail is on disk
 1226 * before that.
1227 */
1228 jbd2_journal_update_sb_log_tail(journal,
1229 journal->j_tail_sequence,
1230 journal->j_tail,
1231 WRITE_FUA);
1232 mutex_unlock(&journal->j_checkpoint_mutex);
1149 } 1233 }
1234 return jbd2_journal_start_thread(journal);
1235}
1150 1236
1237static void jbd2_write_superblock(journal_t *journal, int write_op)
1238{
1239 struct buffer_head *bh = journal->j_sb_buffer;
1240 int ret;
1241
1242 trace_jbd2_write_superblock(journal, write_op);
1243 if (!(journal->j_flags & JBD2_BARRIER))
1244 write_op &= ~(REQ_FUA | REQ_FLUSH);
1245 lock_buffer(bh);
1151 if (buffer_write_io_error(bh)) { 1246 if (buffer_write_io_error(bh)) {
1152 /* 1247 /*
1153 * Oh, dear. A previous attempt to write the journal 1248 * Oh, dear. A previous attempt to write the journal
@@ -1163,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1163 clear_buffer_write_io_error(bh); 1258 clear_buffer_write_io_error(bh);
1164 set_buffer_uptodate(bh); 1259 set_buffer_uptodate(bh);
1165 } 1260 }
1261 get_bh(bh);
1262 bh->b_end_io = end_buffer_write_sync;
1263 ret = submit_bh(write_op, bh);
1264 wait_on_buffer(bh);
1265 if (buffer_write_io_error(bh)) {
1266 clear_buffer_write_io_error(bh);
1267 set_buffer_uptodate(bh);
1268 ret = -EIO;
1269 }
1270 if (ret) {
1271 printk(KERN_ERR "JBD2: Error %d detected when updating "
1272 "journal superblock for %s.\n", ret,
1273 journal->j_devname);
1274 }
1275}
1276
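jbd2_write_superblock() centralizes two details the old open-coded paths handled inconsistently: the write is always submitted and waited on synchronously, and when the journal runs without barriers the FUA/flush semantics are stripped from the request while the write itself still goes out. A tiny standalone sketch of the flag handling (the *_DEMO constants are stand-ins, not the block layer's values):

    #include <stdio.h>

    #define REQ_FLUSH_DEMO (1u << 0)   /* illustrative stand-ins for */
    #define REQ_FUA_DEMO   (1u << 1)   /* the block-layer flags      */
    #define WRITE_DEMO     (1u << 2)

    /* Same shape as jbd2_write_superblock(): with barriers disabled,
     * drop the cache-flush semantics but keep the write itself. */
    static unsigned int effective_write_op(unsigned int op, int barriers)
    {
            if (!barriers)
                    op &= ~(REQ_FUA_DEMO | REQ_FLUSH_DEMO);
            return op;
    }

    int main(void)
    {
            unsigned int fua_write = WRITE_DEMO | REQ_FUA_DEMO;

            printf("barriers on:  %#x\n", effective_write_op(fua_write, 1));
            printf("barriers off: %#x\n", effective_write_op(fua_write, 0));
            return 0;
    }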
1277/**
1278 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1279 * @journal: The journal to update.
1280 * @tail_tid: TID of the new transaction at the tail of the log
1281 * @tail_block: The first block of the transaction at the tail of the log
1282 * @write_op: With which operation should we write the journal sb
1283 *
1284 * Update a journal's superblock information about log tail and write it to
1285 * disk, waiting for the IO to complete.
1286 */
1287void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1288 unsigned long tail_block, int write_op)
1289{
1290 journal_superblock_t *sb = journal->j_superblock;
1291
1292 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1293 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1294 tail_block, tail_tid);
1295
1296 sb->s_sequence = cpu_to_be32(tail_tid);
1297 sb->s_start = cpu_to_be32(tail_block);
1298
1299 jbd2_write_superblock(journal, write_op);
1300
1301 /* Log is no longer empty */
1302 write_lock(&journal->j_state_lock);
1303 WARN_ON(!sb->s_sequence);
1304 journal->j_flags &= ~JBD2_FLUSHED;
1305 write_unlock(&journal->j_state_lock);
1306}
1307
1308/**
1309 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1310 * @journal: The journal to update.
1311 *
1312 * Update a journal's dynamic superblock fields to show that journal is empty.
1313 * Write updated superblock to disk waiting for IO to complete.
1314 */
1315static void jbd2_mark_journal_empty(journal_t *journal)
1316{
1317 journal_superblock_t *sb = journal->j_superblock;
1166 1318
1319 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1167 read_lock(&journal->j_state_lock); 1320 read_lock(&journal->j_state_lock);
1168 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", 1321 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
1169 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1322 journal->j_tail_sequence);
1170 1323
1171 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1324 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1172 sb->s_start = cpu_to_be32(journal->j_tail); 1325 sb->s_start = cpu_to_be32(0);
1173 sb->s_errno = cpu_to_be32(journal->j_errno);
1174 read_unlock(&journal->j_state_lock); 1326 read_unlock(&journal->j_state_lock);
1175 1327
1176 BUFFER_TRACE(bh, "marking dirty"); 1328 jbd2_write_superblock(journal, WRITE_FUA);
1177 mark_buffer_dirty(bh);
1178 if (wait) {
1179 sync_dirty_buffer(bh);
1180 if (buffer_write_io_error(bh)) {
1181 printk(KERN_ERR "JBD2: I/O error detected "
1182 "when updating journal superblock for %s.\n",
1183 journal->j_devname);
1184 clear_buffer_write_io_error(bh);
1185 set_buffer_uptodate(bh);
1186 }
1187 } else
1188 write_dirty_buffer(bh, WRITE);
1189
1190out:
1191 /* If we have just flushed the log (by marking s_start==0), then
1192 * any future commit will have to be careful to update the
1193 * superblock again to re-record the true start of the log. */
1194 1329
1330 /* Log is no longer empty */
1195 write_lock(&journal->j_state_lock); 1331 write_lock(&journal->j_state_lock);
1196 if (sb->s_start) 1332 journal->j_flags |= JBD2_FLUSHED;
1197 journal->j_flags &= ~JBD2_FLUSHED;
1198 else
1199 journal->j_flags |= JBD2_FLUSHED;
1200 write_unlock(&journal->j_state_lock); 1333 write_unlock(&journal->j_state_lock);
1201} 1334}
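jbd2_mark_journal_empty() leans on the long-standing on-disk convention that s_start == 0 means the journal is clean. A hedged kernel-context restatement of that convention (journal_needs_recovery_demo is hypothetical, not standalone code):

    /* Non-zero s_start points at the first log block of the oldest
     * transaction that would still need replay after a crash. */
    static int journal_needs_recovery_demo(const journal_superblock_t *sb)
    {
            return be32_to_cpu(sb->s_start) != 0;
    }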
1202 1335
1336
1337/**
1338 * jbd2_journal_update_sb_errno() - Update error in the journal.
1339 * @journal: The journal to update.
1340 *
1341 * Update a journal's errno. Write updated superblock to disk waiting for IO
1342 * to complete.
1343 */
1344static void jbd2_journal_update_sb_errno(journal_t *journal)
1345{
1346 journal_superblock_t *sb = journal->j_superblock;
1347
1348 read_lock(&journal->j_state_lock);
1349 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1350 journal->j_errno);
1351 sb->s_errno = cpu_to_be32(journal->j_errno);
1352 read_unlock(&journal->j_state_lock);
1353
1354 jbd2_write_superblock(journal, WRITE_SYNC);
1355}
1356
1203/* 1357/*
1204 * Read the superblock for a given journal, performing initial 1358 * Read the superblock for a given journal, performing initial
1205 * validation of the format. 1359 * validation of the format.
1206 */ 1360 */
1207
1208static int journal_get_superblock(journal_t *journal) 1361static int journal_get_superblock(journal_t *journal)
1209{ 1362{
1210 struct buffer_head *bh; 1363 struct buffer_head *bh;
@@ -1398,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal)
1398 1551
1399 if (journal->j_sb_buffer) { 1552 if (journal->j_sb_buffer) {
1400 if (!is_journal_aborted(journal)) { 1553 if (!is_journal_aborted(journal)) {
1401 /* We can now mark the journal as empty. */ 1554 mutex_lock(&journal->j_checkpoint_mutex);
1402 journal->j_tail = 0; 1555 jbd2_mark_journal_empty(journal);
1403 journal->j_tail_sequence = 1556 mutex_unlock(&journal->j_checkpoint_mutex);
1404 ++journal->j_transaction_sequence; 1557 } else
1405 jbd2_journal_update_superblock(journal, 1);
1406 } else {
1407 err = -EIO; 1558 err = -EIO;
1408 }
1409 brelse(journal->j_sb_buffer); 1559 brelse(journal->j_sb_buffer);
1410 } 1560 }
1411 1561
@@ -1552,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1552EXPORT_SYMBOL(jbd2_journal_clear_features); 1702EXPORT_SYMBOL(jbd2_journal_clear_features);
1553 1703
1554/** 1704/**
1555 * int jbd2_journal_update_format () - Update on-disk journal structure.
1556 * @journal: Journal to act on.
1557 *
1558 * Given an initialised but unloaded journal struct, poke about in the
1559 * on-disk structure to update it to the most recent supported version.
1560 */
1561int jbd2_journal_update_format (journal_t *journal)
1562{
1563 journal_superblock_t *sb;
1564 int err;
1565
1566 err = journal_get_superblock(journal);
1567 if (err)
1568 return err;
1569
1570 sb = journal->j_superblock;
1571
1572 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1573 case JBD2_SUPERBLOCK_V2:
1574 return 0;
1575 case JBD2_SUPERBLOCK_V1:
1576 return journal_convert_superblock_v1(journal, sb);
1577 default:
1578 break;
1579 }
1580 return -EINVAL;
1581}
1582
1583static int journal_convert_superblock_v1(journal_t *journal,
1584 journal_superblock_t *sb)
1585{
1586 int offset, blocksize;
1587 struct buffer_head *bh;
1588
1589 printk(KERN_WARNING
1590 "JBD2: Converting superblock from version 1 to 2.\n");
1591
1592 /* Pre-initialise new fields to zero */
1593 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1594 blocksize = be32_to_cpu(sb->s_blocksize);
1595 memset(&sb->s_feature_compat, 0, blocksize-offset);
1596
1597 sb->s_nr_users = cpu_to_be32(1);
1598 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1599 journal->j_format_version = 2;
1600
1601 bh = journal->j_sb_buffer;
1602 BUFFER_TRACE(bh, "marking dirty");
1603 mark_buffer_dirty(bh);
1604 sync_dirty_buffer(bh);
1605 return 0;
1606}
1607
1608
1609/**
1610 * int jbd2_journal_flush () - Flush journal 1705 * int jbd2_journal_flush () - Flush journal
1611 * @journal: Journal to act on. 1706 * @journal: Journal to act on.
1612 * 1707 *
@@ -1619,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal)
1619{ 1714{
1620 int err = 0; 1715 int err = 0;
1621 transaction_t *transaction = NULL; 1716 transaction_t *transaction = NULL;
1622 unsigned long old_tail;
1623 1717
1624 write_lock(&journal->j_state_lock); 1718 write_lock(&journal->j_state_lock);
1625 1719
@@ -1654,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal)
1654 if (is_journal_aborted(journal)) 1748 if (is_journal_aborted(journal))
1655 return -EIO; 1749 return -EIO;
1656 1750
1751 mutex_lock(&journal->j_checkpoint_mutex);
1657 jbd2_cleanup_journal_tail(journal); 1752 jbd2_cleanup_journal_tail(journal);
1658 1753
1659 /* Finally, mark the journal as really needing no recovery. 1754 /* Finally, mark the journal as really needing no recovery.
@@ -1661,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal)
1661 * the magic code for a fully-recovered superblock. Any future 1756 * the magic code for a fully-recovered superblock. Any future
1662 * commits of data to the journal will restore the current 1757 * commits of data to the journal will restore the current
1663 * s_start value. */ 1758 * s_start value. */
1759 jbd2_mark_journal_empty(journal);
1760 mutex_unlock(&journal->j_checkpoint_mutex);
1664 write_lock(&journal->j_state_lock); 1761 write_lock(&journal->j_state_lock);
1665 old_tail = journal->j_tail;
1666 journal->j_tail = 0;
1667 write_unlock(&journal->j_state_lock);
1668 jbd2_journal_update_superblock(journal, 1);
1669 write_lock(&journal->j_state_lock);
1670 journal->j_tail = old_tail;
1671
1672 J_ASSERT(!journal->j_running_transaction); 1762 J_ASSERT(!journal->j_running_transaction);
1673 J_ASSERT(!journal->j_committing_transaction); 1763 J_ASSERT(!journal->j_committing_transaction);
1674 J_ASSERT(!journal->j_checkpoint_transactions); 1764 J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1708,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1708 write ? "Clearing" : "Ignoring"); 1798 write ? "Clearing" : "Ignoring");
1709 1799
1710 err = jbd2_journal_skip_recovery(journal); 1800 err = jbd2_journal_skip_recovery(journal);
1711 if (write) 1801 if (write) {
1712 jbd2_journal_update_superblock(journal, 1); 1802 /* Lock to make assertions happy... */
1803 mutex_lock(&journal->j_checkpoint_mutex);
1804 jbd2_mark_journal_empty(journal);
1805 mutex_unlock(&journal->j_checkpoint_mutex);
1806 }
1713 1807
1714 no_recovery: 1808 no_recovery:
1715 return err; 1809 return err;
@@ -1759,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1759 __jbd2_journal_abort_hard(journal); 1853 __jbd2_journal_abort_hard(journal);
1760 1854
1761 if (errno) 1855 if (errno)
1762 jbd2_journal_update_superblock(journal, 1); 1856 jbd2_journal_update_sb_errno(journal);
1763} 1857}
1764 1858
1765/** 1859/**
@@ -2017,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
2017static atomic_t nr_journal_heads = ATOMIC_INIT(0); 2111static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2018#endif 2112#endif
2019 2113
2020static int journal_init_jbd2_journal_head_cache(void) 2114static int jbd2_journal_init_journal_head_cache(void)
2021{ 2115{
2022 int retval; 2116 int retval;
2023 2117
@@ -2035,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2035 return retval; 2129 return retval;
2036} 2130}
2037 2131
2038static void jbd2_journal_destroy_jbd2_journal_head_cache(void) 2132static void jbd2_journal_destroy_journal_head_cache(void)
2039{ 2133{
2040 if (jbd2_journal_head_cache) { 2134 if (jbd2_journal_head_cache) {
2041 kmem_cache_destroy(jbd2_journal_head_cache); 2135 kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2323,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2323 2417
2324struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; 2418struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2325 2419
2326static int __init journal_init_handle_cache(void) 2420static int __init jbd2_journal_init_handle_cache(void)
2327{ 2421{
2328 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); 2422 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2329 if (jbd2_handle_cache == NULL) { 2423 if (jbd2_handle_cache == NULL) {
@@ -2358,17 +2452,20 @@ static int __init journal_init_caches(void)
2358 2452
2359 ret = jbd2_journal_init_revoke_caches(); 2453 ret = jbd2_journal_init_revoke_caches();
2360 if (ret == 0) 2454 if (ret == 0)
2361 ret = journal_init_jbd2_journal_head_cache(); 2455 ret = jbd2_journal_init_journal_head_cache();
2456 if (ret == 0)
2457 ret = jbd2_journal_init_handle_cache();
2362 if (ret == 0) 2458 if (ret == 0)
2363 ret = journal_init_handle_cache(); 2459 ret = jbd2_journal_init_transaction_cache();
2364 return ret; 2460 return ret;
2365} 2461}
2366 2462
2367static void jbd2_journal_destroy_caches(void) 2463static void jbd2_journal_destroy_caches(void)
2368{ 2464{
2369 jbd2_journal_destroy_revoke_caches(); 2465 jbd2_journal_destroy_revoke_caches();
2370 jbd2_journal_destroy_jbd2_journal_head_cache(); 2466 jbd2_journal_destroy_journal_head_cache();
2371 jbd2_journal_destroy_handle_cache(); 2467 jbd2_journal_destroy_handle_cache();
2468 jbd2_journal_destroy_transaction_cache();
2372 jbd2_journal_destroy_slabs(); 2469 jbd2_journal_destroy_slabs();
2373} 2470}
2374 2471
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf1390..c1a03354a22f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/blkdev.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
265 err2 = sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
266 if (!err) 267 if (!err)
267 err = err2; 268 err = err2;
268 269 /* Make sure all replayed data is on permanent storage */
270 if (journal->j_flags & JBD2_BARRIER)
271 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
269 return err; 272 return err;
270} 273}
271 274
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc9..6973705d6a3d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)
208 J_ASSERT(!jbd2_revoke_record_cache); 208 J_ASSERT(!jbd2_revoke_record_cache);
209 J_ASSERT(!jbd2_revoke_table_cache); 209 J_ASSERT(!jbd2_revoke_table_cache);
210 210
211 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 211 jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
212 sizeof(struct jbd2_revoke_record_s), 212 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
213 0,
214 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
215 NULL);
216 if (!jbd2_revoke_record_cache) 213 if (!jbd2_revoke_record_cache)
217 goto record_cache_failure; 214 goto record_cache_failure;
218 215
219 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 216 jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
220 sizeof(struct jbd2_revoke_table_s), 217 SLAB_TEMPORARY);
221 0, SLAB_TEMPORARY, NULL);
222 if (!jbd2_revoke_table_cache) 218 if (!jbd2_revoke_table_cache)
223 goto table_cache_failure; 219 goto table_cache_failure;
224 return 0; 220 return 0;
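For reference, the KMEM_CACHE() helper used above derives the cache name, object size and alignment from the struct itself; its definition in this era of include/linux/slab.h is roughly:

    #define KMEM_CACHE(__struct, __flags)                                 \
            kmem_cache_create(#__struct, sizeof(struct __struct),         \
                              __alignof__(struct __struct), (__flags), NULL)

One visible consequence is that the caches are now named after their structs ("jbd2_revoke_record_s", "jbd2_revoke_table_s") in /proc/slabinfo rather than the hand-written "jbd2_revoke_record" and "jbd2_revoke_table".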
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5aba56e1fd5..ddcd3549c6c2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34static void __jbd2_journal_unfile_buffer(struct journal_head *jh); 34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35 35
36static struct kmem_cache *transaction_cache;
37int __init jbd2_journal_init_transaction_cache(void)
38{
39 J_ASSERT(!transaction_cache);
40 transaction_cache = kmem_cache_create("jbd2_transaction_s",
41 sizeof(transaction_t),
42 0,
43 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
44 NULL);
45 if (transaction_cache)
46 return 0;
47 return -ENOMEM;
48}
49
50void jbd2_journal_destroy_transaction_cache(void)
51{
52 if (transaction_cache) {
53 kmem_cache_destroy(transaction_cache);
54 transaction_cache = NULL;
55 }
56}
57
58void jbd2_journal_free_transaction(transaction_t *transaction)
59{
60 if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61 return;
62 kmem_cache_free(transaction_cache, transaction);
63}
64
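With transactions now carved from a dedicated slab, allocation and freeing are meant to pair up as below; this mirrors the start_this_handle() change later in this file. A hedged kernel-context sketch (the example_* names are hypothetical):

    static transaction_t *example_alloc_transaction(gfp_t gfp_mask)
    {
            /* __GFP_ZERO replaces the old kzalloc(): the slab object
             * comes back zeroed just as before. */
            return kmem_cache_alloc(transaction_cache, gfp_mask | __GFP_ZERO);
    }

    static void example_drop_transaction(transaction_t *t)
    {
            /* NULL-safe: ZERO_OR_NULL_PTR() makes freeing a NULL or
             * zero-size pointer a no-op. */
            jbd2_journal_free_transaction(t);
    }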
36/* 65/*
37 * jbd2_get_transaction: obtain a new transaction_t object. 66 * jbd2_get_transaction: obtain a new transaction_t object.
38 * 67 *
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
133 162
134alloc_transaction: 163alloc_transaction:
135 if (!journal->j_running_transaction) { 164 if (!journal->j_running_transaction) {
136 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); 165 new_transaction = kmem_cache_alloc(transaction_cache,
166 gfp_mask | __GFP_ZERO);
137 if (!new_transaction) { 167 if (!new_transaction) {
138 /* 168 /*
139 * If __GFP_FS is not present, then we may be 169 * If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@ repeat:
162 if (is_journal_aborted(journal) || 192 if (is_journal_aborted(journal) ||
163 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 193 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
164 read_unlock(&journal->j_state_lock); 194 read_unlock(&journal->j_state_lock);
165 kfree(new_transaction); 195 jbd2_journal_free_transaction(new_transaction);
166 return -EROFS; 196 return -EROFS;
167 } 197 }
168 198
@@ -284,7 +314,7 @@ repeat:
284 read_unlock(&journal->j_state_lock); 314 read_unlock(&journal->j_state_lock);
285 315
286 lock_map_acquire(&handle->h_lockdep_map); 316 lock_map_acquire(&handle->h_lockdep_map);
287 kfree(new_transaction); 317 jbd2_journal_free_transaction(new_transaction);
288 return 0; 318 return 0;
289} 319}
290 320
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1549 * of these pointers, it could go bad. Generally the caller needs to re-read 1579 * of these pointers, it could go bad. Generally the caller needs to re-read
1550 * the pointer from the transaction_t. 1580 * the pointer from the transaction_t.
1551 * 1581 *
1552 * Called under j_list_lock. The journal may not be locked. 1582 * Called under j_list_lock.
1553 */ 1583 */
1554void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 1584static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1555{ 1585{
1556 struct journal_head **list = NULL; 1586 struct journal_head **list = NULL;
1557 transaction_t *transaction; 1587 transaction_t *transaction;
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1646 spin_lock(&journal->j_list_lock); 1676 spin_lock(&journal->j_list_lock);
1647 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1677 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1648 /* written-back checkpointed metadata buffer */ 1678 /* written-back checkpointed metadata buffer */
1649 if (jh->b_jlist == BJ_None) { 1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1650 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1680 __jbd2_journal_remove_checkpoint(jh);
1651 __jbd2_journal_remove_checkpoint(jh);
1652 }
1653 } 1681 }
1654 spin_unlock(&journal->j_list_lock); 1682 spin_unlock(&journal->j_list_lock);
1655out: 1683out:
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked:
1949 clear_buffer_mapped(bh); 1977 clear_buffer_mapped(bh);
1950 clear_buffer_req(bh); 1978 clear_buffer_req(bh);
1951 clear_buffer_new(bh); 1979 clear_buffer_new(bh);
1980 clear_buffer_delay(bh);
1981 clear_buffer_unwritten(bh);
1952 bh->b_bdev = NULL; 1982 bh->b_bdev = NULL;
1953 return may_free; 1983 return may_free;
1954} 1984}