Diffstat (limited to 'fs')
-rw-r--r--  fs/ext4/balloc.c        |   63
-rw-r--r--  fs/ext4/dir.c           |   13
-rw-r--r--  fs/ext4/ext4.h          |   34
-rw-r--r--  fs/ext4/ext4_extents.h  |    4
-rw-r--r--  fs/ext4/ext4_jbd2.h     |  128
-rw-r--r--  fs/ext4/extents.c       |  330
-rw-r--r--  fs/ext4/fsync.c         |    2
-rw-r--r--  fs/ext4/ialloc.c        |  260
-rw-r--r--  fs/ext4/inode.c         |   95
-rw-r--r--  fs/ext4/mballoc.c       |  342
-rw-r--r--  fs/ext4/mballoc.h       |   20
-rw-r--r--  fs/ext4/migrate.c       |    2
-rw-r--r--  fs/ext4/mmp.c           |    4
-rw-r--r--  fs/ext4/namei.c         |    2
-rw-r--r--  fs/ext4/page-io.c       |   18
-rw-r--r--  fs/ext4/resize.c        |   37
-rw-r--r--  fs/ext4/super.c         | 1075
-rw-r--r--  fs/ext4/xattr.c         |   25
-rw-r--r--  fs/jbd2/checkpoint.c    |  140
-rw-r--r--  fs/jbd2/commit.c        |   47
-rw-r--r--  fs/jbd2/journal.c       |  361
-rw-r--r--  fs/jbd2/recovery.c      |    5
-rw-r--r--  fs/jbd2/revoke.c        |   12
-rw-r--r--  fs/jbd2/transaction.c   |   48
24 files changed, 1479 insertions(+), 1588 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8cf711..4bbd07a6fa18 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@ err_out:
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
 	}
 
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
+	set_buffer_new(bh);
 	trace_ext4_read_block_bitmap_load(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
-		put_bh(bh);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
-			   block_group, bitmap_blk);
-		return NULL;
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
 	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
 	ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error,
-	 * continue with corrupt bitmap
-	 */
+	return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
 	return bh;
 }
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c56092e58..ad56866d729a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		return 0;
 
 	if (filp)
-		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
-		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
-		       "null fname?!?\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004fc3d84..ded731ac8a32 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
 		printk(KERN_DEBUG f, ## a);	\
 	} while (0)
 #else
-#define ext4_debug(f, a...)	do {} while (0)
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -184,6 +184,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
+#define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_IN_FSYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -192,18 +194,25 @@ struct ext4_io_page {
 
 #define MAX_IO_PAGES 128
 
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* page struct for buffer write */
+	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;
-	struct ext4_io_page	*pages[MAX_IO_PAGES];
+	int			num_io_pages;	/* for writepages() */
+	struct ext4_io_page	*pages[MAX_IO_PAGES];	/* for writepages() */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -923,6 +932,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
 #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
@@ -941,7 +951,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						   ext4_group_t block_group);
 extern void ext4_init_block_bitmap(struct super_block *sb,
 				   struct buffer_head *bh,
 				   ext4_group_t group,
@@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a69a30..0f58b86e3a02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
-#define ext_debug(a...)		printk(a)
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
 #else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1dab18..83b20fcf9400 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb)  (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for use with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as its first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: journal callback data (internal and function private data struct)
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: returned journal callback data
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
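
A hedged usage sketch of the callback API above: embed the
ext4_journal_cb_entry as the first member of your own struct, register it
on an active handle, and free it from the callback. The struct and
function names below (pending_free, track_freed_blocks) are invented for
illustration; only ext4_journal_callback_add() comes from this patch:

struct pending_free {
	struct ext4_journal_cb_entry pf_jce;	/* must be the first member */
	ext4_fsblk_t pf_block;
	unsigned long pf_count;
};

static void pending_free_commit_cb(struct super_block *sb,
				   struct ext4_journal_cb_entry *jce, int rc)
{
	struct pending_free *pf = container_of(jce, struct pending_free,
					       pf_jce);

	/* called from the journal thread once the transaction is on disk */
	if (!rc)
		pr_debug("blocks %llu..%llu committed as free\n",
			 (unsigned long long)pf->pf_block,
			 (unsigned long long)(pf->pf_block + pf->pf_count - 1));
	kfree(pf);
}

static void track_freed_blocks(handle_t *handle, ext4_fsblk_t block,
			       unsigned long count)
{
	struct pending_free *pf = kmalloc(sizeof(*pf), GFP_NOFS);

	if (!pf)
		return;		/* sketch: a real caller would handle ENOMEM */
	pf->pf_block = block;
	pf->pf_count = count;
	ext4_journal_callback_add(handle, pending_free_commit_cb, &pf->pf_jce);
}
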
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
 {
 	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 1;
-	return 0;
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
 }
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
 }
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 1;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
 /*
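
With the consolidation above, ext4_inode_journal_mode() returns exactly one
of the three mode bits, so the three ext4_should_*_data() predicates are
mutually exclusive. A small sketch of dispatching on the classification
directly (the function itself is illustrative, not part of the patch):

static void pick_write_path(struct inode *inode)
{
	switch (ext4_inode_journal_mode(inode)) {
	case EXT4_INODE_JOURNAL_DATA_MODE:
		/* file data passes through the journal like metadata */
		break;
	case EXT4_INODE_ORDERED_DATA_MODE:
		/* data is written out before the metadata commits */
		break;
	case EXT4_INODE_WRITEBACK_DATA_MODE:
		/* no ordering guaranteed between data and metadata */
		break;
	}
}
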
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c292e1b..1421938e6792 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
 
 #include <trace/events/ext4.h>
 
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
+					due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
+
 static int ext4_split_extent(handle_t *handle,
 				struct inode *inode,
 				struct ext4_ext_path *path,
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,
 				int split_flag,
 				int flags);
 
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path *path,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
+	if (len == 0)
+		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	struct ext4_extent *ex;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
-	ext_debug("truncate since %u in leaf\n", start);
+	ext_debug("truncate since %u in leaf to %u\n", start, end);
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		ext_debug("  border %u:%u\n", a, b);
 
 		/* If this extent is beyond the end of the hole, skip it */
-		if (end <= ex_ee_block) {
+		if (end < ex_ee_block) {
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
 			continue;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
-			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
-					 start, end);
+			EXT4_ERROR_INODE(inode,
+					 "can not handle truncate %u:%u "
+					 "on extent %u:%u",
+					 start, end, ex_ee_block,
+					 ex_ee_block + ex_ee_len - 1);
 			err = -EIO;
 			goto out;
 		} else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	handle_t *handle;
 	int i, err;
 
-	ext_debug("truncate since %u\n", start);
+	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
 	handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@ again:
 	trace_ext4_ext_remove_space(inode, start, depth);
 
 	/*
+	 * Check if we are removing extents inside the extent tree. If that
+	 * is the case, we are going to punch a hole inside the extent tree
+	 * so we have to check whether we need to split the extent covering
+	 * the last block to remove so we can easily remove the part of it
+	 * in ext4_ext_rm_leaf().
+	 */
+	if (end < EXT_MAX_BLOCKS - 1) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block;
+
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, end, NULL);
+		if (IS_ERR(path)) {
+			ext4_journal_stop(handle);
+			return PTR_ERR(path);
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex)
+			goto cont;
+
+		ee_block = le32_to_cpu(ex->ee_block);
+
+		/*
+		 * See if the last block is inside the extent, if so split
+		 * the extent at 'end' block so we can easily remove the
+		 * tail of the first part of the split extent in
+		 * ext4_ext_rm_leaf().
+		 */
+		if (end >= ee_block &&
+		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+			int split_flag = 0;
+
+			if (ext4_ext_is_uninitialized(ex))
+				split_flag = EXT4_EXT_MARK_UNINIT1 |
+					     EXT4_EXT_MARK_UNINIT2;
+
+			/*
+			 * Split the extent in two so that 'end' is the last
+			 * block in the first new extent
+			 */
+			err = ext4_split_extent_at(handle, inode, path,
+						end + 1, split_flag,
+						EXT4_GET_BLOCKS_PRE_IO |
+						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+			if (err < 0)
+				goto out;
+		}
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+cont:
+
+	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
@@ -2515,6 +2591,7 @@ again:
 	}
 	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
+
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
@@ -2526,7 +2603,7 @@ again:
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
 					       &partial_cluster, start,
-					       EXT_MAX_BLOCKS - 1);
+					       end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
-		printk(KERN_INFO "EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled"
 #ifdef AGGRESSIVE_TEST
-		printk(", aggressive tests");
+		       ", aggressive tests"
 #endif
 #ifdef CHECK_BINSEARCH
-		printk(", check binsearch");
+		       ", check binsearch"
 #endif
 #ifdef EXTENTS_STATS
-		printk(", stats");
+		       ", stats"
 #endif
-		printk("\n");
+		       "\n");
 #endif
 #ifdef EXTENTS_STATS
 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 }
 
 /*
- * used by extent splitting.
- */
-#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
-					due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
-
-/*
  * ext4_split_extent_at() splits an extent at given block.
  *
  * @handle: the journal handle
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 
-	if (unlikely(!eh->eh_entries)) {
-		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
-				 "EOFBLOCKS_FL set");
-		return -EIO;
-	}
+	/*
+	 * We're going to remove EOFBLOCKS_FL entirely in future so we
+	 * do not care for this case anymore. Simply remove the flag
+	 * if there are no extents.
+	 */
+	if (unlikely(!eh->eh_entries))
+		goto out;
 	last_ex = EXT_LAST_EXTENT(eh);
 	/*
 	 * We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	for (i = depth-1; i >= 0; i--)
 		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
 			return 0;
+out:
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	return ext4_mark_inode_dirty(handle, inode);
 }
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
-	unsigned int punched_out = 0;
-	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 	ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
-	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
-		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
 			if ((sbi->s_cluster_ratio > 1) &&
 			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	/* if found extent covers block, simply return it */
 	if (in_range(map->m_lblk, ee_block, ee_len)) {
-			struct ext4_map_blocks punch_map;
-			ext4_fsblk_t partial_cluster = 0;
-
 			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (map->m_lblk - ee_block);
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
-				/*
-				 * Do not put uninitialized extent
-				 * in the cache
-				 */
-				if (!ext4_ext_is_uninitialized(ex)) {
-					ext4_ext_put_in_cache(inode, ee_block,
-						ee_len, ee_start);
-					goto out;
-				}
-				ret = ext4_ext_handle_uninitialized_extents(
-					handle, inode, map, path, flags,
-					allocated, newblock);
-				return ret;
-			}
-
-			/*
-			 * Punch out the map length, but only to the
-			 * end of the extent
-			 */
-			punched_out = allocated < map->m_len ?
-				allocated : map->m_len;
-
 			/*
-			 * Sense extents need to be converted to
-			 * uninitialized, they must fit in an
-			 * uninitialized extent
+			 * Do not put uninitialized extent
+			 * in the cache
 			 */
-			if (punched_out > EXT_UNINIT_MAX_LEN)
-				punched_out = EXT_UNINIT_MAX_LEN;
-
-			punch_map.m_lblk = map->m_lblk;
-			punch_map.m_pblk = newblock;
-			punch_map.m_len = punched_out;
-			punch_map.m_flags = 0;
-
-			/* Check to see if the extent needs to be split */
-			if (punch_map.m_len != ee_len ||
-			    punch_map.m_lblk != ee_block) {
-
-				ret = ext4_split_extent(handle, inode,
-						path, &punch_map, 0,
-						EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-						EXT4_GET_BLOCKS_PRE_IO);
-
-				if (ret < 0) {
-					err = ret;
-					goto out2;
-				}
-				/*
-				 * find extent for the block at
-				 * the start of the hole
-				 */
-				ext4_ext_drop_refs(path);
-				kfree(path);
-
-				path = ext4_ext_find_extent(inode,
-				map->m_lblk, NULL);
-				if (IS_ERR(path)) {
-					err = PTR_ERR(path);
-					path = NULL;
-					goto out2;
-				}
-
-				depth = ext_depth(inode);
-				ex = path[depth].p_ext;
-				ee_len = ext4_ext_get_actual_len(ex);
-				ee_block = le32_to_cpu(ex->ee_block);
-				ee_start = ext4_ext_pblock(ex);
-
-			}
-
-			ext4_ext_mark_uninitialized(ex);
-
-			ext4_ext_invalidate_cache(inode);
-
-			err = ext4_ext_rm_leaf(handle, inode, path,
-					       &partial_cluster, map->m_lblk,
-					       map->m_lblk + punched_out);
-
-			if (!err && path->p_hdr->eh_entries == 0) {
-				/*
-				 * Punch hole freed all of this sub tree,
-				 * so we need to correct eh_depth
-				 */
-				err = ext4_ext_get_access(handle, inode, path);
-				if (err == 0) {
-					ext_inode_hdr(inode)->eh_depth = 0;
-					ext_inode_hdr(inode)->eh_max =
-					cpu_to_le16(ext4_ext_space_root(
-						inode, 0));
-
-					err = ext4_ext_dirty(
-						handle, inode, path);
-				}
-			}
-
-			goto out2;
+			if (!ext4_ext_is_uninitialized(ex)) {
+				ext4_ext_put_in_cache(inode, ee_block,
+					ee_len, ee_start);
+				goto out;
+			}
+			ret = ext4_ext_handle_uninitialized_extents(
+				handle, inode, map, path, flags,
+				allocated, newblock);
+			return ret;
 		}
 	}
 
@@ -4165,13 +4146,11 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
-	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
-			punched_out : allocated;
 
 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-		newblock, map->m_len, err ? err : result);
+		newblock, map->m_len, err ? err : allocated);
 
-	return err ? err : result;
+	return err ? err : allocated;
 }
 
 void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block);
+	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 
 	/* In a multi-transaction truncate, we only make the final
 	 * transaction synchronous.
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 					  EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_map_blocks "
-				    "returned error inode#%lu, block=%u, "
-				    "max_blocks=%u", __func__,
-				    inode->i_ino, map.m_lblk, map.m_len);
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "%s:%d: inode #%lu: block %u: len %u: "
+				 "ext4_ext_map_blocks returned %d",
+				 __func__, __LINE__, inode->i_ino, map.m_lblk,
+				 map.m_len, ret);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_ext_cache cache_ex;
-	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	struct ext4_map_blocks map;
 	handle_t *handle;
 	loff_t first_page, last_page, page_len;
 	loff_t first_page_offset, last_page_offset;
-	int ret, credits, blocks_released, err = 0;
+	int credits, err = 0;
 
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 			   offset;
 	}
 
-	first_block = (offset + sb->s_blocksize - 1) >>
-		EXT4_BLOCK_SIZE_BITS(sb);
-	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
 	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
 
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		}
 	}
 
-
 	/*
 	 * If i_size is contained in the last page, we need to
 	 * unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 		}
 	}
 
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
 	/* If there are no blocks to remove, return now */
-	if (first_block >= last_block)
+	if (first_block >= stop_block)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 	ext4_discard_preallocations(inode);
 
-	/*
-	 * Loop over all the blocks and identify blocks
-	 * that need to be punched out
-	 */
-	iblock = first_block;
-	blocks_released = 0;
-	while (iblock < last_block) {
-		max_blocks = last_block - iblock;
-		num_blocks = 1;
-		memset(&map, 0, sizeof(map));
-		map.m_lblk = iblock;
-		map.m_len = max_blocks;
-		ret = ext4_ext_map_blocks(handle, inode, &map,
-			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
-
-		if (ret > 0) {
-			blocks_released += ret;
-			num_blocks = ret;
-		} else if (ret == 0) {
-			/*
-			 * If map blocks could not find the block,
-			 * then it is in a hole. If the hole was
-			 * not already cached, then map blocks should
-			 * put it in the cache. So we can get the hole
-			 * out of the cache
-			 */
-			memset(&cache_ex, 0, sizeof(cache_ex));
-			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
-				!cache_ex.ec_start) {
-
-				/* The hole is cached */
-				num_blocks = cache_ex.ec_block +
-				cache_ex.ec_len - iblock;
-
-			} else {
-				/* The block could not be identified */
-				err = -EIO;
-				break;
-			}
-		} else {
-			/* Map blocks error */
-			err = ret;
-			break;
-		}
-
-		if (num_blocks == 0) {
-			/* This condition should never happen */
-			ext_debug("Block lookup failed");
-			err = -EIO;
-			break;
-		}
-
-		iblock += num_blocks;
-	}
+	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 
-	if (blocks_released > 0) {
-		ext4_ext_invalidate_cache(inode);
-		ext4_discard_preallocations(inode);
-	}
+	ext4_ext_invalidate_cache(inode);
+	ext4_discard_preallocations(inode);
 
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753efd..bb6c7d811313 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
 		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		list_del_init(&io->list);
+		io->flag |= EXT4_IO_END_IN_FSYNC;
 		/*
 		 * Calling ext4_end_io_nolock() to convert completed
 		 * IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
 		if (ret < 0)
 			ret2 = ret;
 		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+		io->flag &= ~EXT4_IO_END_IN_FSYNC;
 	}
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c9781ad9..409c2ee7750a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 	return EXT4_INODES_PER_GROUP(sb);
 }
 
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		set_bitmap_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
 /*
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
 	trace_ext4_load_inode_bitmap(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_error(sb, "Cannot read inode bitmap - "
 			   "block_group = %u, inode_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
 	}
 	return bh;
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
 
-	if (atomic_read(&inode->i_count) > 1) {
-		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
-		       atomic_read(&inode->i_count));
+	if (!sb) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+		       "nonexistent device\n", __func__, __LINE__);
 		return;
 	}
-	if (inode->i_nlink) {
-		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
-		       inode->i_nlink);
+	if (atomic_read(&inode->i_count) > 1) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+			 __func__, __LINE__, inode->i_ino,
+			 atomic_read(&inode->i_count));
 		return;
 	}
-	if (!sb) {
-		printk(KERN_ERR "ext4_free_inode: inode on "
-		       "nonexistent device\n");
+	if (inode->i_nlink) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
 		return;
 	}
 	sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
-			struct buffer_head *inode_bitmap_bh,
-			unsigned long ino, ext4_group_t group, umode_t mode)
-{
-	int free = 0, retval = 0, count;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
-	/*
-	 * We have to be sure that new inode allocation does not race with
-	 * inode table initialization, because otherwise we may end up
-	 * allocating and writing new inode right before sb_issue_zeroout
-	 * takes place and overwriting our new inode with zeroes. So we
-	 * take alloc_sem to prevent it.
-	 */
-	down_read(&grp->alloc_sem);
-	ext4_lock_group(sb, group);
-	if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
-		/* not a free inode */
-		retval = 1;
-		goto err_ret;
-	}
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-			ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_unlock_group(sb, group);
-		up_read(&grp->alloc_sem);
-		ext4_error(sb, "reserved inode or inode > inodes count - "
-			   "block_group = %u, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		return 1;
-	}
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				ext4_itable_unused_count(sb, gdp);
-		}
-
-		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to update the bg_itable_unused count
-		 *
-		 */
-		if (ino > free)
-			ext4_itable_unused_set(sb, gdp,
-					(EXT4_INODES_PER_GROUP(sb) - ino));
-	}
-	count = ext4_free_inodes_count(sb, gdp) - 1;
-	ext4_free_inodes_set(sb, gdp, count);
-	if (S_ISDIR(mode)) {
-		count = ext4_used_dirs_count(sb, gdp) + 1;
-		ext4_used_dirs_set(sb, gdp, count);
-		if (sbi->s_log_groups_per_flex) {
-			ext4_group_t f = ext4_flex_group(sbi, group);
-
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
-		}
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
-	ext4_unlock_group(sb, group);
-	up_read(&grp->alloc_sem);
-	return retval;
-}
-
-/*
  * There are two policies for allocating an inode. If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@ got_group:
 	if (ret2 == -1)
 		goto out;
 
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
 	for (i = 0; i < ngroups; i++, ino = 0) {
 		err = -EIO;
 
@@ -757,51 +685,24 @@ repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-
-		if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
-			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-							    inode_bitmap_bh);
-			if (err)
-				goto fail;
-
-			BUFFER_TRACE(group_desc_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-								group_desc_bh);
-			if (err)
-				goto fail;
-			if (!ext4_claim_inode(sb, inode_bitmap_bh,
-						ino, group, mode)) {
-				/* we won it */
-				BUFFER_TRACE(inode_bitmap_bh,
-					"call ext4_handle_dirty_metadata");
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-							inode_bitmap_bh);
-				if (err)
-					goto fail;
-				/* zero bit is inode number 1*/
-				ino++;
-				goto got;
-			}
-			/* we lost it */
-			ext4_handle_release_buffer(handle, inode_bitmap_bh);
-			ext4_handle_release_buffer(handle, group_desc_bh);
-
-			if (++ino < EXT4_INODES_PER_GROUP(sb))
-				goto repeat_in_this_group;
+		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
 		}
-
-		/*
-		 * This case is possible in concurrent environment.  It is very
-		 * rare. We cannot repeat the find_group_xxx() call because
-		 * that will simply return the same blockgroup, because the
-		 * group descriptor metadata has not yet been updated.
-		 * So we just go onto the next blockgroup.
-		 */
-		if (++group == ngroups)
-			group = 0;
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got;	/* we grabbed the inode! */
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
 	}
 	err = -ENOSPC;
 	goto out;
@@ -838,6 +739,59 @@ got:
 		if (err)
 			goto fail;
 	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err)
+		goto fail;
+
+	/* Update the relevant bg descriptor fields */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to update the bg_itable_unused count
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	}
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+		ext4_unlock_group(sb, group);
+	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err)
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
  * where it is called from on active part of filesystem is ext4lazyinit
  * thread, so we do not need any special locks, however we have to prevent
  * inode allocation from the current group, so we take alloc_sem lock, to
- * block ext4_claim_inode until we are finished.
+ * block ext4_new_inode() until we are finished.
  */
 int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 			  int barrier)
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 			       sbi->s_inodes_per_block);
 
 	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
-		ext4_error(sb, "Something is wrong with group %u\n"
-			   "Used itable blocks: %d"
-			   "itable unused count: %u\n",
+		ext4_error(sb, "Something is wrong with group %u: "
+			   "used itable blocks: %d; "
+			   "itable unused count: %u",
 			   group, used_blks,
 			   ext4_itable_unused_count(sb, gdp));
 		ret = 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82fe629d..c77b0bd2c711 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
272 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 272 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
273 if (unlikely(used > ei->i_reserved_data_blocks)) { 273 if (unlikely(used > ei->i_reserved_data_blocks)) {
274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
275 "with only %d reserved data blocks\n", 275 "with only %d reserved data blocks",
276 __func__, inode->i_ino, used, 276 __func__, inode->i_ino, used,
277 ei->i_reserved_data_blocks); 277 ei->i_reserved_data_blocks);
278 WARN_ON(1); 278 WARN_ON(1);
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1165 */ 1165 */
1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1167 "ino %lu, to_free %d with only %d reserved " 1167 "ino %lu, to_free %d with only %d reserved "
1168 "data blocks\n", inode->i_ino, to_free, 1168 "data blocks", inode->i_ino, to_free,
1169 ei->i_reserved_data_blocks); 1169 ei->i_reserved_data_blocks);
1170 WARN_ON(1); 1170 WARN_ON(1);
1171 to_free = ei->i_reserved_data_blocks; 1171 to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1428static void ext4_print_free_blocks(struct inode *inode) 1428static void ext4_print_free_blocks(struct inode *inode)
1429{ 1429{
1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1431 printk(KERN_CRIT "Total free blocks count %lld\n", 1431 struct super_block *sb = inode->i_sb;
1432
1433 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1432 EXT4_C2B(EXT4_SB(inode->i_sb), 1434 EXT4_C2B(EXT4_SB(inode->i_sb),
1433 ext4_count_free_clusters(inode->i_sb))); 1435 ext4_count_free_clusters(inode->i_sb)));
1434 printk(KERN_CRIT "Free/Dirty block details\n"); 1436 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1435 printk(KERN_CRIT "free_blocks=%lld\n", 1437 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1436 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1438 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1437 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1439 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1438 printk(KERN_CRIT "dirty_blocks=%lld\n", 1440 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1439 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1441 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1440 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1442 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1441 printk(KERN_CRIT "Block reservation details\n"); 1443 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1442 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1444 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1443 EXT4_I(inode)->i_reserved_data_blocks); 1445 EXT4_I(inode)->i_reserved_data_blocks);
1444 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 1446 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1445 EXT4_I(inode)->i_reserved_meta_blocks); 1447 EXT4_I(inode)->i_reserved_meta_blocks);
1446 return; 1448 return;
1447} 1449}
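
The printk()-to-ext4_msg() conversions above make every line carry the "EXT4-fs (device):" prefix automatically. As a rough sketch of how such a wrapper is typically built (not necessarily the exact in-tree __ext4_msg(); my_fs_msg is a made-up name), the kernel's %pV extension lets the prefix and the caller's format string go out in a single printk:

#include <linux/kernel.h>
#include <linux/fs.h>

static __printf(3, 4)
void my_fs_msg(struct super_block *sb, const char *prefix,
	       const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* one atomic printk: prefix, device name, then the caller's text */
	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
	va_end(args);
}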
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,
2482 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2483 2485
2484 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2486 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2485 if (ext4_should_order_data(inode)) { 2487 switch (ext4_inode_journal_mode(inode)) {
2488 case EXT4_INODE_ORDERED_DATA_MODE:
2486 return ext4_ordered_write_end(file, mapping, pos, 2489 return ext4_ordered_write_end(file, mapping, pos,
2487 len, copied, page, fsdata); 2490 len, copied, page, fsdata);
2488 } else if (ext4_should_writeback_data(inode)) { 2491 case EXT4_INODE_WRITEBACK_DATA_MODE:
2489 return ext4_writeback_write_end(file, mapping, pos, 2492 return ext4_writeback_write_end(file, mapping, pos,
2490 len, copied, page, fsdata); 2493 len, copied, page, fsdata);
2491 } else { 2494 default:
2492 BUG(); 2495 BUG();
2493 } 2496 }
2494 } 2497 }
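
This switch (and the one in ext4_set_aops() further down) depends on ext4_inode_journal_mode(), a helper added to ext4.h elsewhere in this series. A hedged reconstruction of its likely shape, assuming it simply consolidates the existing ordered/writeback/journalled predicates into one decision point:

static inline int ext4_inode_journal_mode(struct inode *inode)
{
	if (EXT4_JOURNAL(inode) == NULL)
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* no journal */
	if (!S_ISREG(inode->i_mode) ||
	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
	BUG();
}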
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2763 goto out; 2766 goto out;
2764 2767
2765 ext_debug("ext4_end_io_dio(): io_end 0x%p " 2768 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2769 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
2767 iocb->private, io_end->inode->i_ino, iocb, offset, 2770 iocb->private, io_end->inode->i_ino, iocb, offset,
2768 size); 2771 size);
2769 2772
@@ -2795,9 +2798,6 @@ out:
2795 2798
2796 /* queue the work to convert unwritten extents to written */ 2799 /* queue the work to convert unwritten extents to written */
2797 queue_work(wq, &io_end->work); 2800 queue_work(wq, &io_end->work);
2798
2799 /* XXX: probably should move into the real I/O completion handler */
2800 inode_dio_done(inode);
2801} 2801}
2802 2802
2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2811 goto out; 2811 goto out;
2812 2812
2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2814 printk("sb umounted, discard end_io request for inode %lu\n", 2814 ext4_msg(io_end->inode->i_sb, KERN_INFO,
2815 io_end->inode->i_ino); 2815 "sb umounted, discard end_io request for inode %lu",
2816 io_end->inode->i_ino);
2816 ext4_free_io_end(io_end); 2817 ext4_free_io_end(io_end);
2817 goto out; 2818 goto out;
2818 } 2819 }
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2921 iocb->private = NULL; 2922 iocb->private = NULL;
2922 EXT4_I(inode)->cur_aio_dio = NULL; 2923 EXT4_I(inode)->cur_aio_dio = NULL;
2923 if (!is_sync_kiocb(iocb)) { 2924 if (!is_sync_kiocb(iocb)) {
2924 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 2925 ext4_io_end_t *io_end =
2925 if (!iocb->private) 2926 ext4_init_io_end(inode, GFP_NOFS);
2927 if (!io_end)
2926 return -ENOMEM; 2928 return -ENOMEM;
2929 io_end->flag |= EXT4_IO_END_DIRECT;
2930 iocb->private = io_end;
2927 /* 2931 /*
2928 * we save the io structure for current async 2932 * we save the io structure for current async
2929 * direct IO, so that later ext4_map_blocks() 2933 * direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2940 ext4_get_block_write, 2944 ext4_get_block_write,
2941 ext4_end_io_dio, 2945 ext4_end_io_dio,
2942 NULL, 2946 NULL,
2943 DIO_LOCKING | DIO_SKIP_HOLES); 2947 DIO_LOCKING);
2944 if (iocb->private) 2948 if (iocb->private)
2945 EXT4_I(inode)->cur_aio_dio = NULL; 2949 EXT4_I(inode)->cur_aio_dio = NULL;
2946 /* 2950 /*
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {
3086 3090
3087void ext4_set_aops(struct inode *inode) 3091void ext4_set_aops(struct inode *inode)
3088{ 3092{
3089 if (ext4_should_order_data(inode) && 3093 switch (ext4_inode_journal_mode(inode)) {
3090 test_opt(inode->i_sb, DELALLOC)) 3094 case EXT4_INODE_ORDERED_DATA_MODE:
3091 inode->i_mapping->a_ops = &ext4_da_aops; 3095 if (test_opt(inode->i_sb, DELALLOC))
3092 else if (ext4_should_order_data(inode)) 3096 inode->i_mapping->a_ops = &ext4_da_aops;
3093 inode->i_mapping->a_ops = &ext4_ordered_aops; 3097 else
3094 else if (ext4_should_writeback_data(inode) && 3098 inode->i_mapping->a_ops = &ext4_ordered_aops;
3095 test_opt(inode->i_sb, DELALLOC)) 3099 break;
3096 inode->i_mapping->a_ops = &ext4_da_aops; 3100 case EXT4_INODE_WRITEBACK_DATA_MODE:
3097 else if (ext4_should_writeback_data(inode)) 3101 if (test_opt(inode->i_sb, DELALLOC))
3098 inode->i_mapping->a_ops = &ext4_writeback_aops; 3102 inode->i_mapping->a_ops = &ext4_da_aops;
3099 else 3103 else
3104 inode->i_mapping->a_ops = &ext4_writeback_aops;
3105 break;
3106 case EXT4_INODE_JOURNAL_DATA_MODE:
3100 inode->i_mapping->a_ops = &ext4_journalled_aops; 3107 inode->i_mapping->a_ops = &ext4_journalled_aops;
3108 break;
3109 default:
3110 BUG();
3111 }
3101} 3112}
3102 3113
3103 3114
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3329{ 3340{
3330 struct inode *inode = file->f_path.dentry->d_inode; 3341 struct inode *inode = file->f_path.dentry->d_inode;
3331 if (!S_ISREG(inode->i_mode)) 3342 if (!S_ISREG(inode->i_mode))
3332 return -ENOTSUPP; 3343 return -EOPNOTSUPP;
3333 3344
3334 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3345 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3335 /* TODO: Add support for non extent hole punching */ 3346 /* TODO: Add support for non extent hole punching */
3336 return -ENOTSUPP; 3347 return -EOPNOTSUPP;
3337 } 3348 }
3338 3349
3339 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { 3350 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3340 /* TODO: Add support for bigalloc file systems */ 3351 /* TODO: Add support for bigalloc file systems */
3341 return -ENOTSUPP; 3352 return -EOPNOTSUPP;
3342 } 3353 }
3343 3354
3344 return ext4_ext_punch_hole(file, offset, length); 3355 return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,
3924 ext4_update_dynamic_rev(sb); 3935 ext4_update_dynamic_rev(sb);
3925 EXT4_SET_RO_COMPAT_FEATURE(sb, 3936 EXT4_SET_RO_COMPAT_FEATURE(sb,
3926 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 3937 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3927 sb->s_dirt = 1;
3928 ext4_handle_sync(handle); 3938 ext4_handle_sync(handle);
3929 err = ext4_handle_dirty_metadata(handle, NULL, 3939 err = ext4_handle_dirty_super(handle, sb);
3930 EXT4_SB(sb)->s_sbh);
3931 } 3940 }
3932 } 3941 }
3933 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3942 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4152 } 4161 }
4153 4162
4154 if (attr->ia_valid & ATTR_SIZE) { 4163 if (attr->ia_valid & ATTR_SIZE) {
4155 if (attr->ia_size != i_size_read(inode)) { 4164 if (attr->ia_size != i_size_read(inode))
4156 truncate_setsize(inode, attr->ia_size); 4165 truncate_setsize(inode, attr->ia_size);
4157 ext4_truncate(inode); 4166 ext4_truncate(inode);
4158 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
4159 ext4_truncate(inode);
4160 } 4167 }
4161 4168
4162 if (!rc) { 4169 if (!rc) {
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4314{ 4321{
4315 int err = 0; 4322 int err = 0;
4316 4323
4317 if (test_opt(inode->i_sb, I_VERSION)) 4324 if (IS_I_VERSION(inode))
4318 inode_inc_iversion(inode); 4325 inode_inc_iversion(inode);
4319 4326
4320 /* the do_update_inode consumes one bh->b_count */ 4327 /* the do_update_inode consumes one bh->b_count */
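
IS_I_VERSION() tests the VFS-level MS_I_VERSION superblock flag rather than ext4's private copy of the mount option, so remounts handled by the VFS stay in sync with this check. In rough form (a reconstruction of the generic VFS macro, not ext4 code):

#define IS_I_VERSION(inode)	((inode)->i_sb->s_flags & MS_I_VERSION)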
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b21c698..99ab428bcfa0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
21 * mballoc.c contains the multiblocks allocation routines 21 * mballoc.c contains the multiblocks allocation routines
22 */ 22 */
23 23
24#include "ext4_jbd2.h"
24#include "mballoc.h" 25#include "mballoc.h"
25#include <linux/debugfs.h> 26#include <linux/debugfs.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -339,7 +340,7 @@
339 */ 340 */
340static struct kmem_cache *ext4_pspace_cachep; 341static struct kmem_cache *ext4_pspace_cachep;
341static struct kmem_cache *ext4_ac_cachep; 342static struct kmem_cache *ext4_ac_cachep;
342static struct kmem_cache *ext4_free_ext_cachep; 343static struct kmem_cache *ext4_free_data_cachep;
343 344
344/* We create slab caches for groupinfo data structures based on the 345/* We create slab caches for groupinfo data structures based on the
345 * superblock block size. There will be one per mounted filesystem for 346 * superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
357 ext4_group_t group); 358 ext4_group_t group);
358static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 359static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
359 ext4_group_t group); 360 ext4_group_t group);
360static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 361static void ext4_free_data_callback(struct super_block *sb,
362 struct ext4_journal_cb_entry *jce, int rc);
361 363
362static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 364static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
363{ 365{
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
425{ 427{
426 char *bb; 428 char *bb;
427 429
428 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 430 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
429 BUG_ON(max == NULL); 431 BUG_ON(max == NULL);
430 432
431 if (order > e4b->bd_blkbits + 1) { 433 if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
436 /* at order 0 we see each particular block */ 438 /* at order 0 we see each particular block */
437 if (order == 0) { 439 if (order == 0) {
438 *max = 1 << (e4b->bd_blkbits + 3); 440 *max = 1 << (e4b->bd_blkbits + 3);
439 return EXT4_MB_BITMAP(e4b); 441 return e4b->bd_bitmap;
440 } 442 }
441 443
442 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 444 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
443 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 445 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
444 446
445 return bb; 447 return bb;
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
588 for (j = 0; j < (1 << order); j++) { 590 for (j = 0; j < (1 << order); j++) {
589 k = (i * (1 << order)) + j; 591 k = (i * (1 << order)) + j;
590 MB_CHECK_ASSERT( 592 MB_CHECK_ASSERT(
591 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 593 !mb_test_bit(k, e4b->bd_bitmap));
592 } 594 }
593 count++; 595 count++;
594 } 596 }
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 int groups_per_page; 784 int groups_per_page;
783 int err = 0; 785 int err = 0;
784 int i; 786 int i;
785 ext4_group_t first_group; 787 ext4_group_t first_group, group;
786 int first_block; 788 int first_block;
787 struct super_block *sb; 789 struct super_block *sb;
788 struct buffer_head *bhs; 790 struct buffer_head *bhs;
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
806 808
807 /* allocate buffer_heads to read bitmaps */ 809 /* allocate buffer_heads to read bitmaps */
808 if (groups_per_page > 1) { 810 if (groups_per_page > 1) {
809 err = -ENOMEM;
810 i = sizeof(struct buffer_head *) * groups_per_page; 811 i = sizeof(struct buffer_head *) * groups_per_page;
811 bh = kzalloc(i, GFP_NOFS); 812 bh = kzalloc(i, GFP_NOFS);
812 if (bh == NULL) 813 if (bh == NULL) {
814 err = -ENOMEM;
813 goto out; 815 goto out;
816 }
814 } else 817 } else
815 bh = &bhs; 818 bh = &bhs;
816 819
817 first_group = page->index * blocks_per_page / 2; 820 first_group = page->index * blocks_per_page / 2;
818 821
819 /* read all groups the page covers into the cache */ 822 /* read all groups the page covers into the cache */
820 for (i = 0; i < groups_per_page; i++) { 823 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
821 struct ext4_group_desc *desc; 824 if (group >= ngroups)
822
823 if (first_group + i >= ngroups)
824 break; 825 break;
825 826
826 grinfo = ext4_get_group_info(sb, first_group + i); 827 grinfo = ext4_get_group_info(sb, group);
827 /* 828 /*
828 * If page is uptodate then we came here after online resize 829 * If page is uptodate then we came here after online resize
829 * which added some new uninitialized group info structs, so 830 * which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
834 bh[i] = NULL; 835 bh[i] = NULL;
835 continue; 836 continue;
836 } 837 }
837 838 if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
838 err = -EIO; 839 err = -ENOMEM;
839 desc = ext4_get_group_desc(sb, first_group + i, NULL);
840 if (desc == NULL)
841 goto out;
842
843 err = -ENOMEM;
844 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
845 if (bh[i] == NULL)
846 goto out; 840 goto out;
847
848 if (bitmap_uptodate(bh[i]))
849 continue;
850
851 lock_buffer(bh[i]);
852 if (bitmap_uptodate(bh[i])) {
853 unlock_buffer(bh[i]);
854 continue;
855 }
856 ext4_lock_group(sb, first_group + i);
857 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
858 ext4_init_block_bitmap(sb, bh[i],
859 first_group + i, desc);
860 set_bitmap_uptodate(bh[i]);
861 set_buffer_uptodate(bh[i]);
862 ext4_unlock_group(sb, first_group + i);
863 unlock_buffer(bh[i]);
864 continue;
865 } 841 }
866 ext4_unlock_group(sb, first_group + i); 842 mb_debug(1, "read bitmap for group %u\n", group);
867 if (buffer_uptodate(bh[i])) {
868 /*
869 * if not uninit if bh is uptodate,
870 * bitmap is also uptodate
871 */
872 set_bitmap_uptodate(bh[i]);
873 unlock_buffer(bh[i]);
874 continue;
875 }
876 get_bh(bh[i]);
877 /*
878 * submit the buffer_head for read. We can
879 * safely mark the bitmap as uptodate now.
880 * We do it here so the bitmap uptodate bit
881 * get set with buffer lock held.
882 */
883 set_bitmap_uptodate(bh[i]);
884 bh[i]->b_end_io = end_buffer_read_sync;
885 submit_bh(READ, bh[i]);
886 mb_debug(1, "read bitmap for group %u\n", first_group + i);
887 } 843 }
888 844
889 /* wait for I/O completion */ 845 /* wait for I/O completion */
890 for (i = 0; i < groups_per_page; i++) 846 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
891 if (bh[i]) 847 if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
892 wait_on_buffer(bh[i]); 848 err = -EIO;
893
894 err = -EIO;
895 for (i = 0; i < groups_per_page; i++)
896 if (bh[i] && !buffer_uptodate(bh[i]))
897 goto out; 849 goto out;
850 }
851 }
898 852
899 err = 0;
900 first_block = page->index * blocks_per_page; 853 first_block = page->index * blocks_per_page;
901 for (i = 0; i < blocks_per_page; i++) { 854 for (i = 0; i < blocks_per_page; i++) {
902 int group; 855 int group;
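
The rewritten loops split bitmap reading into a submit phase and a wait phase, so the reads for all groups on the page overlap instead of being issued and awaited one group at a time. Stripped of the uptodate short-circuits and the ngroups bound kept in the real code, the shape is:

for (i = 0, group = first_group; i < groups_per_page; i++, group++)
	bh[i] = ext4_read_block_bitmap_nowait(sb, group);	/* submit */

for (i = 0, group = first_group; i < groups_per_page; i++, group++)
	if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
		err = -EIO;					/* wait */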
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1250 int order = 1; 1203 int order = 1;
1251 void *bb; 1204 void *bb;
1252 1205
1253 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1206 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1254 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1207 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1255 1208
1256 bb = EXT4_MB_BUDDY(e4b); 1209 bb = e4b->bd_buddy;
1257 while (order <= e4b->bd_blkbits + 1) { 1210 while (order <= e4b->bd_blkbits + 1) {
1258 block = block >> 1; 1211 block = block >> 1;
1259 if (!mb_test_bit(block, bb)) { 1212 if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1323 1276
1324 /* let's maintain fragments counter */ 1277 /* let's maintain fragments counter */
1325 if (first != 0) 1278 if (first != 0)
1326 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1279 block = !mb_test_bit(first - 1, e4b->bd_bitmap);
1327 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1280 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1328 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1281 max = !mb_test_bit(first + count, e4b->bd_bitmap);
1329 if (block && max) 1282 if (block && max)
1330 e4b->bd_info->bb_fragments--; 1283 e4b->bd_info->bb_fragments--;
1331 else if (!block && !max) 1284 else if (!block && !max)
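
The bb_fragments arithmetic is easiest to read as a neighbor test. A simplified sketch (omitting the first != 0 and group-boundary guards that the real code keeps):

int left_free  = !mb_test_bit(first - 1, e4b->bd_bitmap);
int right_free = !mb_test_bit(first + count, e4b->bd_bitmap);

if (left_free && right_free)
	e4b->bd_info->bb_fragments--;	/* freed run joins two fragments */
else if (!left_free && !right_free)
	e4b->bd_info->bb_fragments++;	/* freed run is a new fragment */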
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1336 block = first++; 1289 block = first++;
1337 order = 0; 1290 order = 0;
1338 1291
1339 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1292 if (!mb_test_bit(block, e4b->bd_bitmap)) {
1340 ext4_fsblk_t blocknr; 1293 ext4_fsblk_t blocknr;
1341 1294
1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1295 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1347 "freeing already freed block " 1300 "freeing already freed block "
1348 "(bit %u)", block); 1301 "(bit %u)", block);
1349 } 1302 }
1350 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1303 mb_clear_bit(block, e4b->bd_bitmap);
1351 e4b->bd_info->bb_counters[order]++; 1304 e4b->bd_info->bb_counters[order]++;
1352 1305
1353 /* start of the buddy */ 1306 /* start of the buddy */
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1429 break; 1382 break;
1430 1383
1431 next = (block + 1) * (1 << order); 1384 next = (block + 1) * (1 << order);
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1385 if (mb_test_bit(next, e4b->bd_bitmap))
1433 break; 1386 break;
1434 1387
1435 order = mb_find_order_for_block(e4b, next); 1388 order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1466 1419
1467 /* let's maintain fragments counter */ 1420 /* let's maintain fragments counter */
1468 if (start != 0) 1421 if (start != 0)
1469 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1422 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1470 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1423 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1471 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1424 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1472 if (mlen && max) 1425 if (mlen && max)
1473 e4b->bd_info->bb_fragments++; 1426 e4b->bd_info->bb_fragments++;
1474 else if (!mlen && !max) 1427 else if (!mlen && !max)
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1511 } 1464 }
1512 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1465 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1513 1466
1514 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1467 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1515 mb_check_buddy(e4b); 1468 mb_check_buddy(e4b);
1516 1469
1517 return ret; 1470 return ret;
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1810 struct ext4_buddy *e4b) 1763 struct ext4_buddy *e4b)
1811{ 1764{
1812 struct super_block *sb = ac->ac_sb; 1765 struct super_block *sb = ac->ac_sb;
1813 void *bitmap = EXT4_MB_BITMAP(e4b); 1766 void *bitmap = e4b->bd_bitmap;
1814 struct ext4_free_extent ex; 1767 struct ext4_free_extent ex;
1815 int i; 1768 int i;
1816 int free; 1769 int free;
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1870{ 1823{
1871 struct super_block *sb = ac->ac_sb; 1824 struct super_block *sb = ac->ac_sb;
1872 struct ext4_sb_info *sbi = EXT4_SB(sb); 1825 struct ext4_sb_info *sbi = EXT4_SB(sb);
1873 void *bitmap = EXT4_MB_BITMAP(e4b); 1826 void *bitmap = e4b->bd_bitmap;
1874 struct ext4_free_extent ex; 1827 struct ext4_free_extent ex;
1875 ext4_fsblk_t first_group_block; 1828 ext4_fsblk_t first_group_block;
1876 ext4_fsblk_t a; 1829 ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2224 EXT4_DESC_PER_BLOCK_BITS(sb); 2177 EXT4_DESC_PER_BLOCK_BITS(sb);
2225 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2178 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2226 if (meta_group_info == NULL) { 2179 if (meta_group_info == NULL) {
2227 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " 2180 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2228 "for a buddy group"); 2181 "for a buddy group");
2229 goto exit_meta_group_info; 2182 goto exit_meta_group_info;
2230 } 2183 }
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2238 2191
2239 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2192 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2240 if (meta_group_info[i] == NULL) { 2193 if (meta_group_info[i] == NULL) {
2241 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); 2194 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2242 goto exit_group_info; 2195 goto exit_group_info;
2243 } 2196 }
2244 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2197 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2475 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2523 &ext4_mb_seq_groups_fops, sb); 2476 &ext4_mb_seq_groups_fops, sb);
2524 2477
2525 if (sbi->s_journal)
2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0; 2478 return 0;
2529 2479
2530out_free_locality_groups: 2480out_free_locality_groups:
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
2637 * This function is called by the jbd2 layer once the commit has finished, 2587 * This function is called by the jbd2 layer once the commit has finished,
2638 * so we know we can free the blocks that were released with that commit. 2588 * so we know we can free the blocks that were released with that commit.
2639 */ 2589 */
2640static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2590static void ext4_free_data_callback(struct super_block *sb,
2591 struct ext4_journal_cb_entry *jce,
2592 int rc)
2641{ 2593{
2642 struct super_block *sb = journal->j_private; 2594 struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2643 struct ext4_buddy e4b; 2595 struct ext4_buddy e4b;
2644 struct ext4_group_info *db; 2596 struct ext4_group_info *db;
2645 int err, count = 0, count2 = 0; 2597 int err, count = 0, count2 = 0;
2646 struct ext4_free_data *entry;
2647 struct list_head *l, *ltmp;
2648 2598
2649 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2599 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2650 entry = list_entry(l, struct ext4_free_data, list); 2600 entry->efd_count, entry->efd_group, entry);
2651 2601
2652 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2602 if (test_opt(sb, DISCARD))
2653 entry->count, entry->group, entry); 2603 ext4_issue_discard(sb, entry->efd_group,
2604 entry->efd_start_cluster, entry->efd_count);
2654 2605
2655 if (test_opt(sb, DISCARD)) 2606 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2656 ext4_issue_discard(sb, entry->group, 2607 /* we expect to find existing buddy because it's pinned */
2657 entry->start_cluster, entry->count); 2608 BUG_ON(err != 0);
2658 2609
2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2660 /* we expect to find existing buddy because it's pinned */
2661 BUG_ON(err != 0);
2662 2610
2663 db = e4b.bd_info; 2611 db = e4b.bd_info;
2664 /* there are blocks to put in buddy to make them really free */ 2612 /* there are blocks to put in buddy to make them really free */
2665 count += entry->count; 2613 count += entry->efd_count;
2666 count2++; 2614 count2++;
2667 ext4_lock_group(sb, entry->group); 2615 ext4_lock_group(sb, entry->efd_group);
2668 /* Take it out of per group rb tree */ 2616 /* Take it out of per group rb tree */
2669 rb_erase(&entry->node, &(db->bb_free_root)); 2617 rb_erase(&entry->efd_node, &(db->bb_free_root));
2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); 2618 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2671 2619
2672 /* 2620 /*
2673 * Clear the trimmed flag for the group so that the next 2621 * Clear the trimmed flag for the group so that the next
2674 * ext4_trim_fs can trim it. 2622 * ext4_trim_fs can trim it.
2675 * If the volume is mounted with -o discard, online discard 2623 * If the volume is mounted with -o discard, online discard
2676 * is supported and the free blocks will be trimmed online. 2624 * is supported and the free blocks will be trimmed online.
2677 */ 2625 */
2678 if (!test_opt(sb, DISCARD)) 2626 if (!test_opt(sb, DISCARD))
2679 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2627 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2680 2628
2681 if (!db->bb_free_root.rb_node) { 2629 if (!db->bb_free_root.rb_node) {
2682 /* No more items in the per group rb tree 2630 /* No more items in the per group rb tree
2683 * balance refcounts from ext4_mb_free_metadata() 2631 * balance refcounts from ext4_mb_free_metadata()
2684 */ 2632 */
2685 page_cache_release(e4b.bd_buddy_page); 2633 page_cache_release(e4b.bd_buddy_page);
2686 page_cache_release(e4b.bd_bitmap_page); 2634 page_cache_release(e4b.bd_bitmap_page);
2687 }
2688 ext4_unlock_group(sb, entry->group);
2689 kmem_cache_free(ext4_free_ext_cachep, entry);
2690 ext4_mb_unload_buddy(&e4b);
2691 } 2635 }
2636 ext4_unlock_group(sb, entry->efd_group);
2637 kmem_cache_free(ext4_free_data_cachep, entry);
2638 ext4_mb_unload_buddy(&e4b);
2692 2639
2693 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2640 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2694} 2641}
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)
2741 return -ENOMEM; 2688 return -ENOMEM;
2742 } 2689 }
2743 2690
2744 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2691 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2745 SLAB_RECLAIM_ACCOUNT); 2692 SLAB_RECLAIM_ACCOUNT);
2746 if (ext4_free_ext_cachep == NULL) { 2693 if (ext4_free_data_cachep == NULL) {
2747 kmem_cache_destroy(ext4_pspace_cachep); 2694 kmem_cache_destroy(ext4_pspace_cachep);
2748 kmem_cache_destroy(ext4_ac_cachep); 2695 kmem_cache_destroy(ext4_ac_cachep);
2749 return -ENOMEM; 2696 return -ENOMEM;
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)
2761 rcu_barrier(); 2708 rcu_barrier();
2762 kmem_cache_destroy(ext4_pspace_cachep); 2709 kmem_cache_destroy(ext4_pspace_cachep);
2763 kmem_cache_destroy(ext4_ac_cachep); 2710 kmem_cache_destroy(ext4_ac_cachep);
2764 kmem_cache_destroy(ext4_free_ext_cachep); 2711 kmem_cache_destroy(ext4_free_data_cachep);
2765 ext4_groupinfo_destroy_slabs(); 2712 ext4_groupinfo_destroy_slabs();
2766 ext4_remove_debugfs_entry(); 2713 ext4_remove_debugfs_entry();
2767} 2714}
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2762 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2816 if (!ext4_data_block_valid(sbi, block, len)) { 2763 if (!ext4_data_block_valid(sbi, block, len)) {
2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2764 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2818 "fs metadata\n", block, block+len); 2765 "fs metadata", block, block+len);
2819 /* File system mounted not to panic on error 2766 /* File system mounted not to panic on error
2820 * Fix the bitmap and repeat the block allocation 2767 * Fix the bitmap and repeat the block allocation
2821 * We leak some of the blocks here. 2768 * We leak some of the blocks here.
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2858 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2912 int bsbits, max; 2859 int bsbits, max;
2913 ext4_lblk_t end; 2860 ext4_lblk_t end;
2914 loff_t size, orig_size, start_off; 2861 loff_t size, start_off;
2862 loff_t orig_size __maybe_unused;
2915 ext4_lblk_t start; 2863 ext4_lblk_t start;
2916 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2864 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2917 struct ext4_prealloc_space *pa; 2865 struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3321 n = rb_first(&(grp->bb_free_root)); 3269 n = rb_first(&(grp->bb_free_root));
3322 3270
3323 while (n) { 3271 while (n) {
3324 entry = rb_entry(n, struct ext4_free_data, node); 3272 entry = rb_entry(n, struct ext4_free_data, efd_node);
3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count); 3273 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3326 n = rb_next(n); 3274 n = rb_next(n);
3327 } 3275 }
3328 return; 3276 return;
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3864 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3917 return; 3865 return;
3918 3866
3919 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" 3867 ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3920 " Allocation context details:"); 3868 " Allocation context details:");
3921 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", 3869 ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3922 ac->ac_status, ac->ac_flags); 3870 ac->ac_status, ac->ac_flags);
3923 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " 3871 ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3924 "goal %lu/%lu/%lu@%lu, " 3872 "goal %lu/%lu/%lu@%lu, "
3925 "best %lu/%lu/%lu@%lu cr %d", 3873 "best %lu/%lu/%lu@%lu cr %d",
3926 (unsigned long)ac->ac_o_ex.fe_group, 3874 (unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3936 (unsigned long)ac->ac_b_ex.fe_len, 3884 (unsigned long)ac->ac_b_ex.fe_len,
3937 (unsigned long)ac->ac_b_ex.fe_logical, 3885 (unsigned long)ac->ac_b_ex.fe_logical,
3938 (int)ac->ac_criteria); 3886 (int)ac->ac_criteria);
3939 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", 3887 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
3940 ac->ac_ex_scanned, ac->ac_found); 3888 ac->ac_ex_scanned, ac->ac_found);
3941 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); 3889 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
3942 ngroups = ext4_get_groups_count(sb); 3890 ngroups = ext4_get_groups_count(sb);
3943 for (i = 0; i < ngroups; i++) { 3891 for (i = 0; i < ngroups; i++) {
3944 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3892 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@ out:
4428static int can_merge(struct ext4_free_data *entry1, 4376static int can_merge(struct ext4_free_data *entry1,
4429 struct ext4_free_data *entry2) 4377 struct ext4_free_data *entry2)
4430{ 4378{
4431 if ((entry1->t_tid == entry2->t_tid) && 4379 if ((entry1->efd_tid == entry2->efd_tid) &&
4432 (entry1->group == entry2->group) && 4380 (entry1->efd_group == entry2->efd_group) &&
4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) 4381 ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4434 return 1; 4382 return 1;
4435 return 0; 4383 return 0;
4436} 4384}
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4452 BUG_ON(e4b->bd_bitmap_page == NULL); 4400 BUG_ON(e4b->bd_bitmap_page == NULL);
4453 BUG_ON(e4b->bd_buddy_page == NULL); 4401 BUG_ON(e4b->bd_buddy_page == NULL);
4454 4402
4455 new_node = &new_entry->node; 4403 new_node = &new_entry->efd_node;
4456 cluster = new_entry->start_cluster; 4404 cluster = new_entry->efd_start_cluster;
4457 4405
4458 if (!*n) { 4406 if (!*n) {
4459 /* first free block extent. We need to 4407
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4466 } 4414 }
4467 while (*n) { 4415 while (*n) {
4468 parent = *n; 4416 parent = *n;
4469 entry = rb_entry(parent, struct ext4_free_data, node); 4417 entry = rb_entry(parent, struct ext4_free_data, efd_node);
4470 if (cluster < entry->start_cluster) 4418 if (cluster < entry->efd_start_cluster)
4471 n = &(*n)->rb_left; 4419 n = &(*n)->rb_left;
4472 else if (cluster >= (entry->start_cluster + entry->count)) 4420 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4473 n = &(*n)->rb_right; 4421 n = &(*n)->rb_right;
4474 else { 4422 else {
4475 ext4_grp_locked_error(sb, group, 0, 4423 ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4486 /* Now try to see the extent can be merged to left and right */ 4434 /* Now try to see the extent can be merged to left and right */
4487 node = rb_prev(new_node); 4435 node = rb_prev(new_node);
4488 if (node) { 4436 if (node) {
4489 entry = rb_entry(node, struct ext4_free_data, node); 4437 entry = rb_entry(node, struct ext4_free_data, efd_node);
4490 if (can_merge(entry, new_entry)) { 4438 if (can_merge(entry, new_entry)) {
4491 new_entry->start_cluster = entry->start_cluster; 4439 new_entry->efd_start_cluster = entry->efd_start_cluster;
4492 new_entry->count += entry->count; 4440 new_entry->efd_count += entry->efd_count;
4493 rb_erase(node, &(db->bb_free_root)); 4441 rb_erase(node, &(db->bb_free_root));
4494 spin_lock(&sbi->s_md_lock); 4442 ext4_journal_callback_del(handle, &entry->efd_jce);
4495 list_del(&entry->list); 4443 kmem_cache_free(ext4_free_data_cachep, entry);
4496 spin_unlock(&sbi->s_md_lock);
4497 kmem_cache_free(ext4_free_ext_cachep, entry);
4498 } 4444 }
4499 } 4445 }
4500 4446
4501 node = rb_next(new_node); 4447 node = rb_next(new_node);
4502 if (node) { 4448 if (node) {
4503 entry = rb_entry(node, struct ext4_free_data, node); 4449 entry = rb_entry(node, struct ext4_free_data, efd_node);
4504 if (can_merge(new_entry, entry)) { 4450 if (can_merge(new_entry, entry)) {
4505 new_entry->count += entry->count; 4451 new_entry->efd_count += entry->efd_count;
4506 rb_erase(node, &(db->bb_free_root)); 4452 rb_erase(node, &(db->bb_free_root));
4507 spin_lock(&sbi->s_md_lock); 4453 ext4_journal_callback_del(handle, &entry->efd_jce);
4508 list_del(&entry->list); 4454 kmem_cache_free(ext4_free_data_cachep, entry);
4509 spin_unlock(&sbi->s_md_lock);
4510 kmem_cache_free(ext4_free_ext_cachep, entry);
4511 } 4455 }
4512 } 4456 }
4513 /* Add the extent to transaction's private list */ 4457 /* Add the extent to transaction's private list */
4514 spin_lock(&sbi->s_md_lock); 4458 ext4_journal_callback_add(handle, ext4_free_data_callback,
4515 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4459 &new_entry->efd_jce);
4516 spin_unlock(&sbi->s_md_lock);
4517 return 0; 4460 return 0;
4518} 4461}
4519 4462
@@ -4691,15 +4634,15 @@ do_more:
4691 * blocks being freed are metadata. these blocks shouldn't 4634 * blocks being freed are metadata. these blocks shouldn't
4692 * be used until this transaction is committed 4635 * be used until this transaction is committed
4693 */ 4636 */
4694 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4637 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4695 if (!new_entry) { 4638 if (!new_entry) {
4696 err = -ENOMEM; 4639 err = -ENOMEM;
4697 goto error_return; 4640 goto error_return;
4698 } 4641 }
4699 new_entry->start_cluster = bit; 4642 new_entry->efd_start_cluster = bit;
4700 new_entry->group = block_group; 4643 new_entry->efd_group = block_group;
4701 new_entry->count = count_clusters; 4644 new_entry->efd_count = count_clusters;
4702 new_entry->t_tid = handle->h_transaction->t_tid; 4645 new_entry->efd_tid = handle->h_transaction->t_tid;
4703 4646
4704 ext4_lock_group(sb, block_group); 4647 ext4_lock_group(sb, block_group);
4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4648 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4971 start = (e4b.bd_info->bb_first_free > start) ? 4914 start = (e4b.bd_info->bb_first_free > start) ?
4972 e4b.bd_info->bb_first_free : start; 4915 e4b.bd_info->bb_first_free : start;
4973 4916
4974 while (start < max) { 4917 while (start <= max) {
4975 start = mb_find_next_zero_bit(bitmap, max, start); 4918 start = mb_find_next_zero_bit(bitmap, max + 1, start);
4976 if (start >= max) 4919 if (start > max)
4977 break; 4920 break;
4978 next = mb_find_next_bit(bitmap, max, start); 4921 next = mb_find_next_bit(bitmap, max + 1, start);
4979 4922
4980 if ((next - start) >= minblocks) { 4923 if ((next - start) >= minblocks) {
4981 ext4_trim_extent(sb, start, 4924 ext4_trim_extent(sb, start,
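
Note the bound change here: 'max' is now the last cluster to examine rather than one past it, so both bit-search helpers receive 'max + 1' as their size argument. A condensed loop skeleton (trailing bookkeeping elided):

while (start <= max) {
	start = mb_find_next_zero_bit(bitmap, max + 1, start);
	if (start > max)
		break;
	next = mb_find_next_bit(bitmap, max + 1, start);

	if ((next - start) >= minblocks)
		/* trim the free run [start, next) */
		ext4_trim_extent(sb, start, next - start, group, &e4b);

	start = next + 1;	/* continue after this run */
}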
@@ -5027,37 +4970,36 @@ out:
5027int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4970int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5028{ 4971{
5029 struct ext4_group_info *grp; 4972 struct ext4_group_info *grp;
5030 ext4_group_t first_group, last_group; 4973 ext4_group_t group, first_group, last_group;
5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 4974 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5033 uint64_t start, len, minlen, trimmed = 0; 4975 uint64_t start, end, minlen, trimmed = 0;
5034 ext4_fsblk_t first_data_blk = 4976 ext4_fsblk_t first_data_blk =
5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4977 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4978 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5036 int ret = 0; 4979 int ret = 0;
5037 4980
5038 start = range->start >> sb->s_blocksize_bits; 4981 start = range->start >> sb->s_blocksize_bits;
5039 len = range->len >> sb->s_blocksize_bits; 4982 end = start + (range->len >> sb->s_blocksize_bits) - 1;
5040 minlen = range->minlen >> sb->s_blocksize_bits; 4983 minlen = range->minlen >> sb->s_blocksize_bits;
5041 4984
5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) 4985 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4986 unlikely(start >= max_blks))
5043 return -EINVAL; 4987 return -EINVAL;
5044 if (start + len <= first_data_blk) 4988 if (end >= max_blks)
4989 end = max_blks - 1;
4990 if (end <= first_data_blk)
5045 goto out; 4991 goto out;
5046 if (start < first_data_blk) { 4992 if (start < first_data_blk)
5047 len -= first_data_blk - start;
5048 start = first_data_blk; 4993 start = first_data_blk;
5049 }
5050 4994
5051 /* Determine first and last group to examine based on start and len */ 4995 /* Determine first and last group to examine based on start and end */
5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4996 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5053 &first_group, &first_cluster); 4997 &first_group, &first_cluster);
5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 4998 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5055 &last_group, &last_cluster); 4999 &last_group, &last_cluster);
5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
5058 5000
5059 if (first_group > last_group) 5001 /* end now represents the last cluster to discard in this group */
5060 return -EINVAL; 5002 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5061 5003
5062 for (group = first_group; group <= last_group; group++) { 5004 for (group = first_group; group <= last_group; group++) {
5063 grp = ext4_get_group_info(sb, group); 5005 grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5069 } 5011 }
5070 5012
5071 /* 5013 /*
5072 * For all the groups except the last one, last block will 5014 * For all the groups except the last one, last cluster will
5073 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to 5015 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5074 * change it for the last group in which case start + 5016 * change it for the last group, note that last_cluster is
5075 * len < EXT4_BLOCKS_PER_GROUP(sb). 5017 * already computed earlier by ext4_get_group_no_and_offset()
5076 */ 5018 */
5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) 5019 if (group == last_group)
5078 last_cluster = first_cluster + len; 5020 end = last_cluster;
5079 len -= last_cluster - first_cluster;
5080 5021
5081 if (grp->bb_free >= minlen) { 5022 if (grp->bb_free >= minlen) {
5082 cnt = ext4_trim_all_free(sb, group, first_cluster, 5023 cnt = ext4_trim_all_free(sb, group, first_cluster,
5083 last_cluster, minlen); 5024 end, minlen);
5084 if (cnt < 0) { 5025 if (cnt < 0) {
5085 ret = cnt; 5026 ret = cnt;
5086 break; 5027 break;
5087 } 5028 }
5029 trimmed += cnt;
5088 } 5030 }
5089 trimmed += cnt; 5031
5032 /*
5033 * For every group except the first one, we are sure
5034 * that the first cluster to discard will be cluster #0.
5035 */
5090 first_cluster = 0; 5036 first_cluster = 0;
5091 } 5037 }
5092 range->len = trimmed * sb->s_blocksize;
5093 5038
5094 if (!ret) 5039 if (!ret)
5095 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5040 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5096 5041
5097out: 5042out:
5043 range->len = trimmed * sb->s_blocksize;
5098 return ret; 5044 return ret;
5099} 5045}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3285e3..c070618c21ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
96 96
97 97
98struct ext4_free_data { 98struct ext4_free_data {
99 /* this links the free block information from group_info */ 99 /* MUST be the first member */
100 struct rb_node node; 100 struct ext4_journal_cb_entry efd_jce;
101
102 /* ext4_free_data private data starts from here */
101 103
102 /* this links the free block information from ext4_sb_info */ 104 /* this links the free block information from group_info */
103 struct list_head list; 105 struct rb_node efd_node;
104 106
105 /* group which free block extent belongs */ 107 /* group which free block extent belongs */
106 ext4_group_t group; 108 ext4_group_t efd_group;
107 109
108 /* free block extent */ 110 /* free block extent */
109 ext4_grpblk_t start_cluster; 111 ext4_grpblk_t efd_start_cluster;
110 ext4_grpblk_t count; 112 ext4_grpblk_t efd_count;
111 113
112 /* transaction which freed this extent */ 114 /* transaction which freed this extent */
113 tid_t t_tid; 115 tid_t efd_tid;
114}; 116};
115 117
116struct ext4_prealloc_space { 118struct ext4_prealloc_space {
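
The "MUST be the first member" comment is what lets ext4_free_data_callback() recover the entry with a plain cast from the generic ext4_journal_cb_entry pointer, as seen in the mballoc.c hunk above. container_of() expresses the same conversion without relying on member order:

struct ext4_free_data *entry =
	container_of(jce, struct ext4_free_data, efd_jce);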
@@ -210,8 +212,6 @@ struct ext4_buddy {
210 __u16 bd_blkbits; 212 __u16 bd_blkbits;
211 ext4_group_t bd_group; 213 ext4_group_t bd_group;
212}; 214};
213#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
214#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
215 215
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0acfa6..f39f80f8f2c5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
472 S_IFREG, NULL, goal, owner); 472 S_IFREG, NULL, goal, owner);
473 if (IS_ERR(tmp_inode)) { 473 if (IS_ERR(tmp_inode)) {
474 retval = PTR_ERR(inode); 474 retval = PTR_ERR(tmp_inode);
475 ext4_journal_stop(handle); 475 ext4_journal_stop(handle);
476 return retval; 476 return retval;
477 } 477 }
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4eff2a..ed6548d89165 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,
257 * If check_interval in MMP block is larger, use that instead of 257 * If check_interval in MMP block is larger, use that instead of
258 * update_interval from the superblock. 258 * update_interval from the superblock.
259 */ 259 */
260 if (mmp->mmp_check_interval > mmp_check_interval) 260 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
261 mmp_check_interval = mmp->mmp_check_interval; 261 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
262 262
263 seq = le32_to_cpu(mmp->mmp_seq); 263 seq = le32_to_cpu(mmp->mmp_seq);
264 if (seq == EXT4_MMP_SEQ_CLEAN) 264 if (seq == EXT4_MMP_SEQ_CLEAN)
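
mmp_check_interval is an on-disk little-endian field (__le16), so comparing it raw against a host-order value was wrong on big-endian machines; the fix converts at the point the field is read. In isolation:

u16 disk_interval = le16_to_cpu(mmp->mmp_check_interval);

if (disk_interval > mmp_check_interval)
	mmp_check_interval = disk_interval;	/* honor the larger value */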
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f482375d..349d7b3671c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@ fail2:
468fail: 468fail:
469 if (*err == ERR_BAD_DX_DIR) 469 if (*err == ERR_BAD_DX_DIR)
470 ext4_warning(dir->i_sb, 470 ext4_warning(dir->i_sb,
471 "Corrupt dir inode %ld, running e2fsck is " 471 "Corrupt dir inode %lu, running e2fsck is "
472 "recommended.", dir->i_ino); 472 "recommended.", dir->i_ino);
473 return NULL; 473 return NULL;
474} 474}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 475851896518..74cd1f7f1f88 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode)
60static void put_io_page(struct ext4_io_page *io_page) 60static void put_io_page(struct ext4_io_page *io_page)
61{ 61{
62 if (atomic_dec_and_test(&io_page->p_count)) { 62 if (atomic_dec_and_test(&io_page->p_count)) {
63 end_page_writeback(io_page->p_page);
64 put_page(io_page->p_page); 63 put_page(io_page->p_page);
65 kmem_cache_free(io_page_cachep, io_page); 64 kmem_cache_free(io_page_cachep, io_page);
66 } 65 }
@@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
110 if (io->iocb) 109 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 110 aio_complete(io->iocb, io->result, 0);
112 111
112 if (io->flag & EXT4_IO_END_DIRECT)
113 inode_dio_done(inode);
113 /* Wake up anyone waiting on unwritten extent conversion */ 114 /* Wake up anyone waiting on unwritten extent conversion */
114 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
115 wake_up_all(ext4_ioend_wq(io->inode)); 116 wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work)
127 unsigned long flags; 128 unsigned long flags;
128 129
129 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (io->flag & EXT4_IO_END_IN_FSYNC)
132 goto requeue;
130 if (list_empty(&io->list)) { 133 if (list_empty(&io->list)) {
131 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 134 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
132 goto free; 135 goto free;
133 } 136 }
134 137
135 if (!mutex_trylock(&inode->i_mutex)) { 138 if (!mutex_trylock(&inode->i_mutex)) {
139 bool was_queued;
140requeue:
141 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
142 io->flag |= EXT4_IO_END_QUEUED;
136 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 143 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
137 /* 144 /*
138 * Requeue the work instead of waiting so that the work 145 * Requeue the work instead of waiting so that the work
@@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work)
145 * yield the cpu if it sees an end_io request that has already 152 * yield the cpu if it sees an end_io request that has already
146 * been requeued. 153 * been requeued.
147 */ 154 */
148 if (io->flag & EXT4_IO_END_QUEUED) 155 if (was_queued)
149 yield(); 156 yield();
150 io->flag |= EXT4_IO_END_QUEUED;
151 return; 157 return;
152 } 158 }
153 list_del_init(&io->list); 159 list_del_init(&io->list);
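
The requeue path now snapshots EXT4_IO_END_QUEUED before setting it, so only a work item that has already been through the queue once yields the CPU; a freshly deferred item retries immediately. A condensed sketch of the pattern (the actual re-arming of the work item is in lines elided from this hunk):

if (!mutex_trylock(&inode->i_mutex)) {
	bool was_queued = !!(io->flag & EXT4_IO_END_QUEUED);

	io->flag |= EXT4_IO_END_QUEUED;
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
	/* requeue the work item here, then back off if it looped */
	if (was_queued)
		yield();
	return;
}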
@@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error)
227 } while (bh != head); 233 } while (bh != head);
228 } 234 }
229 235
230 put_io_page(io_end->pages[i]); 236 if (atomic_read(&io_end->pages[i]->p_count) == 1)
237 end_page_writeback(io_end->pages[i]->p_page);
231 } 238 }
232 io_end->num_io_pages = 0;
233 inode = io_end->inode; 239 inode = io_end->inode;
234 240
235 if (error) { 241 if (error) {
@@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 427 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 428 * wedging later on.
423 */ 429 */
430 if (atomic_read(&io_page->p_count) == 1)
431 end_page_writeback(page);
424 put_io_page(io_page); 432 put_io_page(io_page);
425 return ret; 433 return ret;
426} 434}
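
With end_page_writeback() removed from put_io_page(), the writeback bit is cleared explicitly by whichever completion path observes the final reference, rather than as a side effect of the refcount hitting zero. The recurring idiom in this file is now:

if (atomic_read(&io_page->p_count) == 1)
	end_page_writeback(io_page->p_page);
put_io_page(io_page);	/* drops the reference and possibly frees */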
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f0eb86..59fa0be27251 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,
1163 do_div(reserved_blocks, 100); 1163 do_div(reserved_blocks, 100);
1164 1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); 1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1167 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count); 1168 flex_gd->count);
1169 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1170 flex_gd->count);
1168 1171
1169 /* 1172 /*
1170 * We need to protect s_groups_count against other CPUs seeing 1173 * We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
1465 } 1468 }
1466 1469
1467 ext4_blocks_count_set(es, o_blocks_count + add); 1470 ext4_blocks_count_set(es, o_blocks_count + add);
1471 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1472 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1469 o_blocks_count + add); 1473 o_blocks_count + add);
1470 /* We add the blocks to the bitmap and set the group need init bit */ 1474 /* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1512 o_blocks_count = ext4_blocks_count(es); 1516 o_blocks_count = ext4_blocks_count(es);
1513 1517
1514 if (test_opt(sb, DEBUG)) 1518 if (test_opt(sb, DEBUG))
1515 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 1519 ext4_msg(sb, KERN_DEBUG,
1516 o_blocks_count, n_blocks_count); 1520 "extending last group from %llu to %llu blocks",
1521 o_blocks_count, n_blocks_count);
1517 1522
1518 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1523 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1519 return 0; 1524 return 0;
1520 1525
1521 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1526 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1522 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 1527 ext4_msg(sb, KERN_ERR,
1523 " too large to resize to %llu blocks safely\n", 1528 "filesystem too large to resize to %llu blocks safely",
1524 sb->s_id, n_blocks_count); 1529 n_blocks_count);
1525 if (sizeof(sector_t) < 8) 1530 if (sizeof(sector_t) < 8)
1526 ext4_warning(sb, "CONFIG_LBDAF not enabled"); 1531 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1527 return -EINVAL; 1532 return -EINVAL;
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1582 ext4_fsblk_t o_blocks_count; 1587 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group; 1588 ext4_group_t o_group;
1584 ext4_group_t n_group; 1589 ext4_group_t n_group;
1585 ext4_grpblk_t offset; 1590 ext4_grpblk_t offset, add;
1586 unsigned long n_desc_blocks; 1591 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks; 1592 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks; 1593 unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1591 o_blocks_count = ext4_blocks_count(es); 1596 o_blocks_count = ext4_blocks_count(es);
1592 1597
1593 if (test_opt(sb, DEBUG)) 1598 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " 1599 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1600 "to %llu blocks", o_blocks_count, n_blocks_count);
1596 1601
1597 if (n_blocks_count < o_blocks_count) { 1602 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */ 1603 /* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1605 return 0; 1610 return 0;
1606 1611
1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1612 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); 1613 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1609 1614
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1615 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb); 1616 EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1634 } 1639 }
1635 brelse(bh); 1640 brelse(bh);
1636 1641
1637 if (offset != 0) { 1642 /* extend the last group */
1638 /* extend the last group */ 1643 if (n_group == o_group)
1639 ext4_grpblk_t add; 1644 add = n_blocks_count - o_blocks_count;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset; 1645 else
1646 add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
1647 if (add > 0) {
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add); 1648 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err) 1649 if (err)
1643 goto out; 1650 goto out;
@@ -1674,7 +1681,7 @@ out:
1674 1681
1675 iput(resize_inode); 1682 iput(resize_inode);
1676 if (test_opt(sb, DEBUG)) 1683 if (test_opt(sb, DEBUG))
1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " 1684 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1678 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1685 "upto %llu blocks", o_blocks_count, n_blocks_count);
1679 return err; 1686 return err;
1680} 1687}
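
The last-group arithmetic above is the substance of this hunk: the offset is now computed from the last existing block (o_blocks_count - 1) rather than the first new one, and the two branches either add exactly the requested blocks (when the new size still ends inside the old last group) or top the group up to its boundary. A worked example with hypothetical numbers, assuming EXT4_BLOCKS_PER_GROUP(sb) == 32768 and o_blocks_count == 40000, so block 39999 sits in group 1 at offset 7231:

	/* illustrative sketch only; the numbers are not from the patch */
	if (n_group == o_group)
		/* e.g. n_blocks_count == 50000, still inside group 1 */
		add = n_blocks_count - o_blocks_count;		/* 10000 */
	else
		/* fill group 1 up to block 65535 */
		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);	/* 32768 - 7232 == 25536 */

40000 + 25536 lands exactly on the group 1/group 2 boundary at 65536, which is why the offset + 1 form is the correct companion to the o_blocks_count - 1 lookup.
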
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 933900909ed0..ceebaf853beb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;
62 62
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 63static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 64 unsigned long journal_devnum);
65static int ext4_show_options(struct seq_file *seq, struct dentry *root);
65static int ext4_commit_super(struct super_block *sb, int sync); 66static int ext4_commit_super(struct super_block *sb, int sync);
66static void ext4_mark_recovery_complete(struct super_block *sb, 67static void ext4_mark_recovery_complete(struct super_block *sb,
67 struct ext4_super_block *es); 68 struct ext4_super_block *es);
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
375 if (is_handle_aborted(handle)) 376 if (is_handle_aborted(handle))
376 return; 377 return;
377 378
378 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", 379 printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
379 caller, line, errstr, err_fn); 380 caller, line, errstr, err_fn);
380 381
381 jbd2_journal_abort_handle(handle); 382 jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)
431 return bdi->dev == NULL; 432 return bdi->dev == NULL;
432} 433}
433 434
435static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
436{
437 struct super_block *sb = journal->j_private;
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 int error = is_journal_aborted(journal);
440 struct ext4_journal_cb_entry *jce, *tmp;
441
442 spin_lock(&sbi->s_md_lock);
443 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
444 list_del_init(&jce->jce_list);
445 spin_unlock(&sbi->s_md_lock);
446 jce->jce_func(sb, jce, error);
447 spin_lock(&sbi->s_md_lock);
448 }
449 spin_unlock(&sbi->s_md_lock);
450}
434 451
435/* Deal with the reporting of failure conditions on a filesystem such as 452/* Deal with the reporting of failure conditions on a filesystem such as
436 * inconsistencies detected or read IO failures. 453 * inconsistencies detected or read IO failures.
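
The new commit callback walks t_private_list under s_md_lock but drops the lock around each jce_func() invocation, so the callbacks may sleep or re-take ext4 locks; list_del_init() before the unlock keeps the walk safe against concurrent list manipulation. A minimal sketch of how a caller could queue an entry onto the committing transaction's private list (the helper name and shape are assumptions for illustration; the real helper in this patch series lives in ext4_jbd2.h and may differ):

	/* hypothetical sketch, not the patch's own helper */
	static void my_post_commit(struct super_block *sb,
				   struct ext4_journal_cb_entry *jce, int error)
	{
		/* called from ext4_journal_commit_callback() without
		 * s_md_lock held; releasing jce here is safe */
		kfree(jce);
	}

	static void queue_post_commit(handle_t *handle, struct super_block *sb,
				      struct ext4_journal_cb_entry *jce)
	{
		struct ext4_sb_info *sbi = EXT4_SB(sb);

		jce->jce_func = my_post_commit;
		spin_lock(&sbi->s_md_lock);
		list_add_tail(&jce->jce_list,
			      &handle->h_transaction->t_private_list);
		spin_unlock(&sbi->s_md_lock);
	}
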
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,
498 va_start(args, fmt); 515 va_start(args, fmt);
499 vaf.fmt = fmt; 516 vaf.fmt = fmt;
500 vaf.va = &args; 517 vaf.va = &args;
501 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
502 inode->i_sb->s_id, function, line, inode->i_ino);
503 if (block) 518 if (block)
504 printk(KERN_CONT "block %llu: ", block); 519 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
505 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); 520 "inode #%lu: block %llu: comm %s: %pV\n",
521 inode->i_sb->s_id, function, line, inode->i_ino,
522 block, current->comm, &vaf);
523 else
524 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
525 "inode #%lu: comm %s: %pV\n",
526 inode->i_sb->s_id, function, line, inode->i_ino,
527 current->comm, &vaf);
506 va_end(args); 528 va_end(args);
507 529
508 ext4_handle_error(inode->i_sb); 530 ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,
524 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 546 path = d_path(&(file->f_path), pathname, sizeof(pathname));
525 if (IS_ERR(path)) 547 if (IS_ERR(path))
526 path = "(unknown)"; 548 path = "(unknown)";
527 printk(KERN_CRIT
528 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
529 inode->i_sb->s_id, function, line, inode->i_ino);
530 if (block)
531 printk(KERN_CONT "block %llu: ", block);
532 va_start(args, fmt); 549 va_start(args, fmt);
533 vaf.fmt = fmt; 550 vaf.fmt = fmt;
534 vaf.va = &args; 551 vaf.va = &args;
535 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); 552 if (block)
553 printk(KERN_CRIT
554 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
555 "block %llu: comm %s: path %s: %pV\n",
556 inode->i_sb->s_id, function, line, inode->i_ino,
557 block, current->comm, path, &vaf);
558 else
559 printk(KERN_CRIT
560 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
561 "comm %s: path %s: %pV\n",
562 inode->i_sb->s_id, function, line, inode->i_ino,
563 current->comm, path, &vaf);
536 va_end(args); 564 va_end(args);
537 565
538 ext4_handle_error(inode->i_sb); 566 ext4_handle_error(inode->i_sb);
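
Both ext4_error_inode() and ext4_error_file() now emit the whole report in a single printk() instead of gluing fragments together with KERN_CONT, so messages from other CPUs can no longer interleave mid-line. The %pV / struct va_format mechanism they lean on works like this in general form (generic sketch, not ext4-specific):

	/* %pV expands a nested format + va_list inside another printk(),
	 * letting a fixed prefix and the caller's payload share one call */
	static void report(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_CRIT "EXT4-fs error: %pV\n", &vaf);
		va_end(args);
	}
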
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)
808 destroy_workqueue(sbi->dio_unwritten_wq); 836 destroy_workqueue(sbi->dio_unwritten_wq);
809 837
810 lock_super(sb); 838 lock_super(sb);
811 if (sb->s_dirt)
812 ext4_commit_super(sb, 1);
813
814 if (sbi->s_journal) { 839 if (sbi->s_journal) {
815 err = jbd2_journal_destroy(sbi->s_journal); 840 err = jbd2_journal_destroy(sbi->s_journal);
816 sbi->s_journal = NULL; 841 sbi->s_journal = NULL;
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)
827 if (!(sb->s_flags & MS_RDONLY)) { 852 if (!(sb->s_flags & MS_RDONLY)) {
828 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 853 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
829 es->s_state = cpu_to_le16(sbi->s_mount_state); 854 es->s_state = cpu_to_le16(sbi->s_mount_state);
830 ext4_commit_super(sb, 1);
831 } 855 }
856 if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
857 ext4_commit_super(sb, 1);
858
832 if (sbi->s_proc) { 859 if (sbi->s_proc) {
860 remove_proc_entry("options", sbi->s_proc);
833 remove_proc_entry(sb->s_id, ext4_proc_root); 861 remove_proc_entry(sb->s_id, ext4_proc_root);
834 } 862 }
835 kobject_del(&sbi->s_kobj); 863 kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)
990 } 1018 }
991} 1019}
992 1020
993static inline void ext4_show_quota_options(struct seq_file *seq,
994 struct super_block *sb)
995{
996#if defined(CONFIG_QUOTA)
997 struct ext4_sb_info *sbi = EXT4_SB(sb);
998
999 if (sbi->s_jquota_fmt) {
1000 char *fmtname = "";
1001
1002 switch (sbi->s_jquota_fmt) {
1003 case QFMT_VFS_OLD:
1004 fmtname = "vfsold";
1005 break;
1006 case QFMT_VFS_V0:
1007 fmtname = "vfsv0";
1008 break;
1009 case QFMT_VFS_V1:
1010 fmtname = "vfsv1";
1011 break;
1012 }
1013 seq_printf(seq, ",jqfmt=%s", fmtname);
1014 }
1015
1016 if (sbi->s_qf_names[USRQUOTA])
1017 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1018
1019 if (sbi->s_qf_names[GRPQUOTA])
1020 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1021
1022 if (test_opt(sb, USRQUOTA))
1023 seq_puts(seq, ",usrquota");
1024
1025 if (test_opt(sb, GRPQUOTA))
1026 seq_puts(seq, ",grpquota");
1027#endif
1028}
1029
1030/*
1031 * Show an option if
1032 * - it's set to a non-default value OR
1033 * - if the per-sb default is different from the global default
1034 */
1035static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1036{
1037 int def_errors;
1038 unsigned long def_mount_opts;
1039 struct super_block *sb = root->d_sb;
1040 struct ext4_sb_info *sbi = EXT4_SB(sb);
1041 struct ext4_super_block *es = sbi->s_es;
1042
1043 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1044 def_errors = le16_to_cpu(es->s_errors);
1045
1046 if (sbi->s_sb_block != 1)
1047 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
1048 if (test_opt(sb, MINIX_DF))
1049 seq_puts(seq, ",minixdf");
1050 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
1051 seq_puts(seq, ",grpid");
1052 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
1053 seq_puts(seq, ",nogrpid");
1054 if (sbi->s_resuid != EXT4_DEF_RESUID ||
1055 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
1056 seq_printf(seq, ",resuid=%u", sbi->s_resuid);
1057 }
1058 if (sbi->s_resgid != EXT4_DEF_RESGID ||
1059 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
1060 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
1061 }
1062 if (test_opt(sb, ERRORS_RO)) {
1063 if (def_errors == EXT4_ERRORS_PANIC ||
1064 def_errors == EXT4_ERRORS_CONTINUE) {
1065 seq_puts(seq, ",errors=remount-ro");
1066 }
1067 }
1068 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1069 seq_puts(seq, ",errors=continue");
1070 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1071 seq_puts(seq, ",errors=panic");
1072 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
1073 seq_puts(seq, ",nouid32");
1074 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1075 seq_puts(seq, ",debug");
1076#ifdef CONFIG_EXT4_FS_XATTR
1077 if (test_opt(sb, XATTR_USER))
1078 seq_puts(seq, ",user_xattr");
1079 if (!test_opt(sb, XATTR_USER))
1080 seq_puts(seq, ",nouser_xattr");
1081#endif
1082#ifdef CONFIG_EXT4_FS_POSIX_ACL
1083 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
1084 seq_puts(seq, ",acl");
1085 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
1086 seq_puts(seq, ",noacl");
1087#endif
1088 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
1089 seq_printf(seq, ",commit=%u",
1090 (unsigned) (sbi->s_commit_interval / HZ));
1091 }
1092 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
1093 seq_printf(seq, ",min_batch_time=%u",
1094 (unsigned) sbi->s_min_batch_time);
1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_max_batch_time);
1099 }
1100
1101 /*
1102 * We're changing the default of barrier mount option, so
1103 * let's always display its mount state so it's clear what its
1104 * status is.
1105 */
1106 seq_puts(seq, ",barrier=");
1107 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
1108 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
1109 seq_puts(seq, ",journal_async_commit");
1110 else if (test_opt(sb, JOURNAL_CHECKSUM))
1111 seq_puts(seq, ",journal_checksum");
1112 if (test_opt(sb, I_VERSION))
1113 seq_puts(seq, ",i_version");
1114 if (!test_opt(sb, DELALLOC) &&
1115 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1116 seq_puts(seq, ",nodelalloc");
1117
1118 if (!test_opt(sb, MBLK_IO_SUBMIT))
1119 seq_puts(seq, ",nomblk_io_submit");
1120 if (sbi->s_stripe)
1121 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1122 /*
1123 * journal mode get enabled in different ways
1124 * So just print the value even if we didn't specify it
1125 */
1126 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1127 seq_puts(seq, ",data=journal");
1128 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1129 seq_puts(seq, ",data=ordered");
1130 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1131 seq_puts(seq, ",data=writeback");
1132
1133 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1134 seq_printf(seq, ",inode_readahead_blks=%u",
1135 sbi->s_inode_readahead_blks);
1136
1137 if (test_opt(sb, DATA_ERR_ABORT))
1138 seq_puts(seq, ",data_err=abort");
1139
1140 if (test_opt(sb, NO_AUTO_DA_ALLOC))
1141 seq_puts(seq, ",noauto_da_alloc");
1142
1143 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
1144 seq_puts(seq, ",discard");
1145
1146 if (test_opt(sb, NOLOAD))
1147 seq_puts(seq, ",norecovery");
1148
1149 if (test_opt(sb, DIOREAD_NOLOCK))
1150 seq_puts(seq, ",dioread_nolock");
1151
1152 if (test_opt(sb, BLOCK_VALIDITY) &&
1153 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1154 seq_puts(seq, ",block_validity");
1155
1156 if (!test_opt(sb, INIT_INODE_TABLE))
1157 seq_puts(seq, ",noinit_itable");
1158 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1159 seq_printf(seq, ",init_itable=%u",
1160 (unsigned) sbi->s_li_wait_mult);
1161
1162 ext4_show_quota_options(seq, sb);
1163
1164 return 0;
1165}
1166
1167static struct inode *ext4_nfs_get_inode(struct super_block *sb, 1021static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1168 u64 ino, u32 generation) 1022 u64 ino, u32 generation)
1169{ 1023{
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {
1316enum { 1170enum {
1317 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 1171 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
1318 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1172 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1319 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1173 Opt_nouid32, Opt_debug, Opt_removed,
1320 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1174 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1321 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, 1175 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1322 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1176 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1323 Opt_journal_update, Opt_journal_dev, 1177 Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
1324 Opt_journal_checksum, Opt_journal_async_commit,
1325 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1178 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1326 Opt_data_err_abort, Opt_data_err_ignore, 1179 Opt_data_err_abort, Opt_data_err_ignore,
1327 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1180 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1328 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1181 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1329 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1182 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1330 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1183 Opt_usrquota, Opt_grpquota, Opt_i_version,
1331 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1184 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1332 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1185 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1333 Opt_inode_readahead_blks, Opt_journal_ioprio, 1186 Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = {
1350 {Opt_err_ro, "errors=remount-ro"}, 1203 {Opt_err_ro, "errors=remount-ro"},
1351 {Opt_nouid32, "nouid32"}, 1204 {Opt_nouid32, "nouid32"},
1352 {Opt_debug, "debug"}, 1205 {Opt_debug, "debug"},
1353 {Opt_oldalloc, "oldalloc"}, 1206 {Opt_removed, "oldalloc"},
1354 {Opt_orlov, "orlov"}, 1207 {Opt_removed, "orlov"},
1355 {Opt_user_xattr, "user_xattr"}, 1208 {Opt_user_xattr, "user_xattr"},
1356 {Opt_nouser_xattr, "nouser_xattr"}, 1209 {Opt_nouser_xattr, "nouser_xattr"},
1357 {Opt_acl, "acl"}, 1210 {Opt_acl, "acl"},
1358 {Opt_noacl, "noacl"}, 1211 {Opt_noacl, "noacl"},
1359 {Opt_noload, "noload"},
1360 {Opt_noload, "norecovery"}, 1212 {Opt_noload, "norecovery"},
1361 {Opt_nobh, "nobh"}, 1213 {Opt_noload, "noload"},
1362 {Opt_bh, "bh"}, 1214 {Opt_removed, "nobh"},
1215 {Opt_removed, "bh"},
1363 {Opt_commit, "commit=%u"}, 1216 {Opt_commit, "commit=%u"},
1364 {Opt_min_batch_time, "min_batch_time=%u"}, 1217 {Opt_min_batch_time, "min_batch_time=%u"},
1365 {Opt_max_batch_time, "max_batch_time=%u"}, 1218 {Opt_max_batch_time, "max_batch_time=%u"},
1366 {Opt_journal_update, "journal=update"},
1367 {Opt_journal_dev, "journal_dev=%u"}, 1219 {Opt_journal_dev, "journal_dev=%u"},
1368 {Opt_journal_checksum, "journal_checksum"}, 1220 {Opt_journal_checksum, "journal_checksum"},
1369 {Opt_journal_async_commit, "journal_async_commit"}, 1221 {Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = {
1389 {Opt_nobarrier, "nobarrier"}, 1241 {Opt_nobarrier, "nobarrier"},
1390 {Opt_i_version, "i_version"}, 1242 {Opt_i_version, "i_version"},
1391 {Opt_stripe, "stripe=%u"}, 1243 {Opt_stripe, "stripe=%u"},
1392 {Opt_resize, "resize"},
1393 {Opt_delalloc, "delalloc"}, 1244 {Opt_delalloc, "delalloc"},
1394 {Opt_nodelalloc, "nodelalloc"}, 1245 {Opt_nodelalloc, "nodelalloc"},
1395 {Opt_mblk_io_submit, "mblk_io_submit"}, 1246 {Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = {
1408 {Opt_init_itable, "init_itable=%u"}, 1259 {Opt_init_itable, "init_itable=%u"},
1409 {Opt_init_itable, "init_itable"}, 1260 {Opt_init_itable, "init_itable"},
1410 {Opt_noinit_itable, "noinit_itable"}, 1261 {Opt_noinit_itable, "noinit_itable"},
1262 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1263 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1264 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
1265 {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1266 {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
1411 {Opt_err, NULL}, 1267 {Opt_err, NULL},
1412}; 1268};
1413 1269
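
The tokens[] table is consumed by the generic match_token() parser from <linux/parser.h>; a pattern such as "commit=%u" both selects the token and captures the numeric argument into the substring_t array. A small sketch of the calling convention (handle_mount_opt() below does the same dance):

	char opt[] = "commit=15";		/* illustrative input */
	substring_t args[MAX_OPT_ARGS];
	int token, arg;

	args[0].to = args[0].from = NULL;	/* detect a missing %u */
	token = match_token(opt, tokens, args);	/* token == Opt_commit */
	if (args[0].from && match_int(&args[0], &arg) == 0)
		pr_debug("commit interval: %d seconds\n", arg);	/* 15 */
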
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1496} 1352}
1497#endif 1353#endif
1498 1354
1499static int parse_options(char *options, struct super_block *sb, 1355#define MOPT_SET 0x0001
1500 unsigned long *journal_devnum, 1356#define MOPT_CLEAR 0x0002
1501 unsigned int *journal_ioprio, 1357#define MOPT_NOSUPPORT 0x0004
1502 ext4_fsblk_t *n_blocks_count, int is_remount) 1358#define MOPT_EXPLICIT 0x0008
1503{ 1359#define MOPT_CLEAR_ERR 0x0010
1504 struct ext4_sb_info *sbi = EXT4_SB(sb); 1360#define MOPT_GTE0 0x0020
1505 char *p;
1506 substring_t args[MAX_OPT_ARGS];
1507 int data_opt = 0;
1508 int option;
1509#ifdef CONFIG_QUOTA 1361#ifdef CONFIG_QUOTA
1510 int qfmt; 1362#define MOPT_Q 0
1363#define MOPT_QFMT 0x0040
1364#else
1365#define MOPT_Q MOPT_NOSUPPORT
1366#define MOPT_QFMT MOPT_NOSUPPORT
1511#endif 1367#endif
1512 1368#define MOPT_DATAJ 0x0080
1513 if (!options) 1369
1514 return 1; 1370static const struct mount_opts {
1515 1371 int token;
1516 while ((p = strsep(&options, ",")) != NULL) { 1372 int mount_opt;
1517 int token; 1373 int flags;
1518 if (!*p) 1374} ext4_mount_opts[] = {
1519 continue; 1375 {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1520 1376 {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1521 /* 1377 {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1522 * Initialize args struct so we know whether arg was 1378 {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1523 * found; some options take optional arguments. 1379 {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
1524 */ 1380 {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
1525 args[0].to = args[0].from = NULL; 1381 {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1526 token = match_token(p, tokens, args); 1382 {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1527 switch (token) { 1383 {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
1528 case Opt_bsd_df: 1384 {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
1529 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1385 {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1530 clear_opt(sb, MINIX_DF); 1386 {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1531 break; 1387 {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
1532 case Opt_minix_df: 1388 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
1533 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1389 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
1534 set_opt(sb, MINIX_DF); 1390 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1535 1391 EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
1536 break; 1392 {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
1537 case Opt_grpid: 1393 {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1538 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1394 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1539 set_opt(sb, GRPID); 1395 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1540 1396 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
1541 break; 1397 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
1542 case Opt_nogrpid: 1398 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1543 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1399 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1544 clear_opt(sb, GRPID); 1400 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1545 1401 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1546 break; 1402 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1547 case Opt_resuid: 1403 {Opt_commit, 0, MOPT_GTE0},
1548 if (match_int(&args[0], &option)) 1404 {Opt_max_batch_time, 0, MOPT_GTE0},
1549 return 0; 1405 {Opt_min_batch_time, 0, MOPT_GTE0},
1550 sbi->s_resuid = option; 1406 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1551 break; 1407 {Opt_init_itable, 0, MOPT_GTE0},
1552 case Opt_resgid: 1408 {Opt_stripe, 0, MOPT_GTE0},
1553 if (match_int(&args[0], &option)) 1409 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1554 return 0; 1410 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1555 sbi->s_resgid = option; 1411 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1556 break;
1557 case Opt_sb:
1558 /* handled by get_sb_block() instead of here */
1559 /* *sb_block = match_int(&args[0]); */
1560 break;
1561 case Opt_err_panic:
1562 clear_opt(sb, ERRORS_CONT);
1563 clear_opt(sb, ERRORS_RO);
1564 set_opt(sb, ERRORS_PANIC);
1565 break;
1566 case Opt_err_ro:
1567 clear_opt(sb, ERRORS_CONT);
1568 clear_opt(sb, ERRORS_PANIC);
1569 set_opt(sb, ERRORS_RO);
1570 break;
1571 case Opt_err_cont:
1572 clear_opt(sb, ERRORS_RO);
1573 clear_opt(sb, ERRORS_PANIC);
1574 set_opt(sb, ERRORS_CONT);
1575 break;
1576 case Opt_nouid32:
1577 set_opt(sb, NO_UID32);
1578 break;
1579 case Opt_debug:
1580 set_opt(sb, DEBUG);
1581 break;
1582 case Opt_oldalloc:
1583 ext4_msg(sb, KERN_WARNING,
1584 "Ignoring deprecated oldalloc option");
1585 break;
1586 case Opt_orlov:
1587 ext4_msg(sb, KERN_WARNING,
1588 "Ignoring deprecated orlov option");
1589 break;
1590#ifdef CONFIG_EXT4_FS_XATTR 1412#ifdef CONFIG_EXT4_FS_XATTR
1591 case Opt_user_xattr: 1413 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1592 set_opt(sb, XATTR_USER); 1414 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1593 break;
1594 case Opt_nouser_xattr:
1595 clear_opt(sb, XATTR_USER);
1596 break;
1597#else 1415#else
1598 case Opt_user_xattr: 1416 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1599 case Opt_nouser_xattr: 1417 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1600 ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
1601 break;
1602#endif 1418#endif
1603#ifdef CONFIG_EXT4_FS_POSIX_ACL 1419#ifdef CONFIG_EXT4_FS_POSIX_ACL
1604 case Opt_acl: 1420 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1605 set_opt(sb, POSIX_ACL); 1421 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1606 break;
1607 case Opt_noacl:
1608 clear_opt(sb, POSIX_ACL);
1609 break;
1610#else 1422#else
1611 case Opt_acl: 1423 {Opt_acl, 0, MOPT_NOSUPPORT},
1612 case Opt_noacl: 1424 {Opt_noacl, 0, MOPT_NOSUPPORT},
1613 ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
1614 break;
1615#endif 1425#endif
1616 case Opt_journal_update: 1426 {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1617 /* @@@ FIXME */ 1427 {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1618 /* Eventually we will want to be able to create 1428 {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1619 a journal file here. For now, only allow the 1429 {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1620 user to specify an existing inode to be the 1430 MOPT_SET | MOPT_Q},
1621 journal file. */ 1431 {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1622 if (is_remount) { 1432 MOPT_SET | MOPT_Q},
1623 ext4_msg(sb, KERN_ERR, 1433 {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1624 "Cannot specify journal on remount"); 1434 EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
1625 return 0; 1435 {Opt_usrjquota, 0, MOPT_Q},
1626 } 1436 {Opt_grpjquota, 0, MOPT_Q},
1627 set_opt(sb, UPDATE_JOURNAL); 1437 {Opt_offusrjquota, 0, MOPT_Q},
1628 break; 1438 {Opt_offgrpjquota, 0, MOPT_Q},
1629 case Opt_journal_dev: 1439 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1630 if (is_remount) { 1440 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1441 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1442 {Opt_err, 0, 0}
1443};
1444
1445static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1446 substring_t *args, unsigned long *journal_devnum,
1447 unsigned int *journal_ioprio, int is_remount)
1448{
1449 struct ext4_sb_info *sbi = EXT4_SB(sb);
1450 const struct mount_opts *m;
1451 int arg = 0;
1452
1453 if (args->from && match_int(args, &arg))
1454 return -1;
1455 switch (token) {
1456 case Opt_noacl:
1457 case Opt_nouser_xattr:
1458 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1459 break;
1460 case Opt_sb:
1461 return 1; /* handled by get_sb_block() */
1462 case Opt_removed:
1463 ext4_msg(sb, KERN_WARNING,
1464 "Ignoring removed %s option", opt);
1465 return 1;
1466 case Opt_resuid:
1467 sbi->s_resuid = arg;
1468 return 1;
1469 case Opt_resgid:
1470 sbi->s_resgid = arg;
1471 return 1;
1472 case Opt_abort:
1473 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1474 return 1;
1475 case Opt_i_version:
1476 sb->s_flags |= MS_I_VERSION;
1477 return 1;
1478 case Opt_journal_dev:
1479 if (is_remount) {
1480 ext4_msg(sb, KERN_ERR,
1481 "Cannot specify journal on remount");
1482 return -1;
1483 }
1484 *journal_devnum = arg;
1485 return 1;
1486 case Opt_journal_ioprio:
1487 if (arg < 0 || arg > 7)
1488 return -1;
1489 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1490 return 1;
1491 }
1492
1493 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1494 if (token != m->token)
1495 continue;
1496 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1497 return -1;
1498 if (m->flags & MOPT_EXPLICIT)
1499 set_opt2(sb, EXPLICIT_DELALLOC);
1500 if (m->flags & MOPT_CLEAR_ERR)
1501 clear_opt(sb, ERRORS_MASK);
1502 if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1503 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1504 "options when quota turned on");
1505 return -1;
1506 }
1507
1508 if (m->flags & MOPT_NOSUPPORT) {
1509 ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1510 } else if (token == Opt_commit) {
1511 if (arg == 0)
1512 arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1513 sbi->s_commit_interval = HZ * arg;
1514 } else if (token == Opt_max_batch_time) {
1515 if (arg == 0)
1516 arg = EXT4_DEF_MAX_BATCH_TIME;
1517 sbi->s_max_batch_time = arg;
1518 } else if (token == Opt_min_batch_time) {
1519 sbi->s_min_batch_time = arg;
1520 } else if (token == Opt_inode_readahead_blks) {
1521 if (arg > (1 << 30))
1522 return -1;
1523 if (arg && !is_power_of_2(arg)) {
1631 ext4_msg(sb, KERN_ERR, 1524 ext4_msg(sb, KERN_ERR,
1632 "Cannot specify journal on remount"); 1525 "EXT4-fs: inode_readahead_blks"
1633 return 0; 1526 " must be a power of 2");
1527 return -1;
1634 } 1528 }
1635 if (match_int(&args[0], &option)) 1529 sbi->s_inode_readahead_blks = arg;
1636 return 0; 1530 } else if (token == Opt_init_itable) {
1637 *journal_devnum = option; 1531 set_opt(sb, INIT_INODE_TABLE);
1638 break; 1532 if (!args->from)
1639 case Opt_journal_checksum: 1533 arg = EXT4_DEF_LI_WAIT_MULT;
1640 set_opt(sb, JOURNAL_CHECKSUM); 1534 sbi->s_li_wait_mult = arg;
1641 break; 1535 } else if (token == Opt_stripe) {
1642 case Opt_journal_async_commit: 1536 sbi->s_stripe = arg;
1643 set_opt(sb, JOURNAL_ASYNC_COMMIT); 1537 } else if (m->flags & MOPT_DATAJ) {
1644 set_opt(sb, JOURNAL_CHECKSUM);
1645 break;
1646 case Opt_noload:
1647 set_opt(sb, NOLOAD);
1648 break;
1649 case Opt_commit:
1650 if (match_int(&args[0], &option))
1651 return 0;
1652 if (option < 0)
1653 return 0;
1654 if (option == 0)
1655 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1656 sbi->s_commit_interval = HZ * option;
1657 break;
1658 case Opt_max_batch_time:
1659 if (match_int(&args[0], &option))
1660 return 0;
1661 if (option < 0)
1662 return 0;
1663 if (option == 0)
1664 option = EXT4_DEF_MAX_BATCH_TIME;
1665 sbi->s_max_batch_time = option;
1666 break;
1667 case Opt_min_batch_time:
1668 if (match_int(&args[0], &option))
1669 return 0;
1670 if (option < 0)
1671 return 0;
1672 sbi->s_min_batch_time = option;
1673 break;
1674 case Opt_data_journal:
1675 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1676 goto datacheck;
1677 case Opt_data_ordered:
1678 data_opt = EXT4_MOUNT_ORDERED_DATA;
1679 goto datacheck;
1680 case Opt_data_writeback:
1681 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1682 datacheck:
1683 if (is_remount) { 1538 if (is_remount) {
1684 if (!sbi->s_journal) 1539 if (!sbi->s_journal)
1685 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); 1540 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1686 else if (test_opt(sb, DATA_FLAGS) != data_opt) { 1541 else if (test_opt(sb, DATA_FLAGS) !=
1542 m->mount_opt) {
1687 ext4_msg(sb, KERN_ERR, 1543 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1544 "Cannot change data mode on remount");
1689 return 0; 1545 return -1;
1690 } 1546 }
1691 } else { 1547 } else {
1692 clear_opt(sb, DATA_FLAGS); 1548 clear_opt(sb, DATA_FLAGS);
1693 sbi->s_mount_opt |= data_opt; 1549 sbi->s_mount_opt |= m->mount_opt;
1694 } 1550 }
1695 break;
1696 case Opt_data_err_abort:
1697 set_opt(sb, DATA_ERR_ABORT);
1698 break;
1699 case Opt_data_err_ignore:
1700 clear_opt(sb, DATA_ERR_ABORT);
1701 break;
1702#ifdef CONFIG_QUOTA 1551#ifdef CONFIG_QUOTA
1703 case Opt_usrjquota: 1552 } else if (token == Opt_usrjquota) {
1704 if (!set_qf_name(sb, USRQUOTA, &args[0])) 1553 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1705 return 0; 1554 return -1;
1706 break; 1555 } else if (token == Opt_grpjquota) {
1707 case Opt_grpjquota:
1708 if (!set_qf_name(sb, GRPQUOTA, &args[0])) 1556 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1709 return 0; 1557 return -1;
1710 break; 1558 } else if (token == Opt_offusrjquota) {
1711 case Opt_offusrjquota:
1712 if (!clear_qf_name(sb, USRQUOTA)) 1559 if (!clear_qf_name(sb, USRQUOTA))
1713 return 0; 1560 return -1;
1714 break; 1561 } else if (token == Opt_offgrpjquota) {
1715 case Opt_offgrpjquota:
1716 if (!clear_qf_name(sb, GRPQUOTA)) 1562 if (!clear_qf_name(sb, GRPQUOTA))
1717 return 0; 1563 return -1;
1718 break; 1564 } else if (m->flags & MOPT_QFMT) {
1719
1720 case Opt_jqfmt_vfsold:
1721 qfmt = QFMT_VFS_OLD;
1722 goto set_qf_format;
1723 case Opt_jqfmt_vfsv0:
1724 qfmt = QFMT_VFS_V0;
1725 goto set_qf_format;
1726 case Opt_jqfmt_vfsv1:
1727 qfmt = QFMT_VFS_V1;
1728set_qf_format:
1729 if (sb_any_quota_loaded(sb) && 1565 if (sb_any_quota_loaded(sb) &&
1730 sbi->s_jquota_fmt != qfmt) { 1566 sbi->s_jquota_fmt != m->mount_opt) {
1731 ext4_msg(sb, KERN_ERR, "Cannot change " 1567 ext4_msg(sb, KERN_ERR, "Cannot "
1732 "journaled quota options when " 1568 "change journaled quota options "
1733 "quota turned on"); 1569 "when quota turned on");
1734 return 0; 1570 return -1;
1735 }
1736 sbi->s_jquota_fmt = qfmt;
1737 break;
1738 case Opt_quota:
1739 case Opt_usrquota:
1740 set_opt(sb, QUOTA);
1741 set_opt(sb, USRQUOTA);
1742 break;
1743 case Opt_grpquota:
1744 set_opt(sb, QUOTA);
1745 set_opt(sb, GRPQUOTA);
1746 break;
1747 case Opt_noquota:
1748 if (sb_any_quota_loaded(sb)) {
1749 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1750 "options when quota turned on");
1751 return 0;
1752 } 1571 }
1753 clear_opt(sb, QUOTA); 1572 sbi->s_jquota_fmt = m->mount_opt;
1754 clear_opt(sb, USRQUOTA);
1755 clear_opt(sb, GRPQUOTA);
1756 break;
1757#else
1758 case Opt_quota:
1759 case Opt_usrquota:
1760 case Opt_grpquota:
1761 ext4_msg(sb, KERN_ERR,
1762 "quota options not supported");
1763 break;
1764 case Opt_usrjquota:
1765 case Opt_grpjquota:
1766 case Opt_offusrjquota:
1767 case Opt_offgrpjquota:
1768 case Opt_jqfmt_vfsold:
1769 case Opt_jqfmt_vfsv0:
1770 case Opt_jqfmt_vfsv1:
1771 ext4_msg(sb, KERN_ERR,
1772 "journaled quota options not supported");
1773 break;
1774 case Opt_noquota:
1775 break;
1776#endif 1573#endif
1777 case Opt_abort: 1574 } else {
1778 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1575 if (!args->from)
1779 break; 1576 arg = 1;
1780 case Opt_nobarrier: 1577 if (m->flags & MOPT_CLEAR)
1781 clear_opt(sb, BARRIER); 1578 arg = !arg;
1782 break; 1579 else if (unlikely(!(m->flags & MOPT_SET))) {
1783 case Opt_barrier: 1580 ext4_msg(sb, KERN_WARNING,
1784 if (args[0].from) { 1581 "buggy handling of option %s", opt);
1785 if (match_int(&args[0], &option)) 1582 WARN_ON(1);
1786 return 0; 1583 return -1;
1787 } else
1788 option = 1; /* No argument, default to 1 */
1789 if (option)
1790 set_opt(sb, BARRIER);
1791 else
1792 clear_opt(sb, BARRIER);
1793 break;
1794 case Opt_ignore:
1795 break;
1796 case Opt_resize:
1797 if (!is_remount) {
1798 ext4_msg(sb, KERN_ERR,
1799 "resize option only available "
1800 "for remount");
1801 return 0;
1802 }
1803 if (match_int(&args[0], &option) != 0)
1804 return 0;
1805 *n_blocks_count = option;
1806 break;
1807 case Opt_nobh:
1808 ext4_msg(sb, KERN_WARNING,
1809 "Ignoring deprecated nobh option");
1810 break;
1811 case Opt_bh:
1812 ext4_msg(sb, KERN_WARNING,
1813 "Ignoring deprecated bh option");
1814 break;
1815 case Opt_i_version:
1816 set_opt(sb, I_VERSION);
1817 sb->s_flags |= MS_I_VERSION;
1818 break;
1819 case Opt_nodelalloc:
1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1822 break;
1823 case Opt_mblk_io_submit:
1824 set_opt(sb, MBLK_IO_SUBMIT);
1825 break;
1826 case Opt_nomblk_io_submit:
1827 clear_opt(sb, MBLK_IO_SUBMIT);
1828 break;
1829 case Opt_stripe:
1830 if (match_int(&args[0], &option))
1831 return 0;
1832 if (option < 0)
1833 return 0;
1834 sbi->s_stripe = option;
1835 break;
1836 case Opt_delalloc:
1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1839 break;
1840 case Opt_block_validity:
1841 set_opt(sb, BLOCK_VALIDITY);
1842 break;
1843 case Opt_noblock_validity:
1844 clear_opt(sb, BLOCK_VALIDITY);
1845 break;
1846 case Opt_inode_readahead_blks:
1847 if (match_int(&args[0], &option))
1848 return 0;
1849 if (option < 0 || option > (1 << 30))
1850 return 0;
1851 if (option && !is_power_of_2(option)) {
1852 ext4_msg(sb, KERN_ERR,
1853 "EXT4-fs: inode_readahead_blks"
1854 " must be a power of 2");
1855 return 0;
1856 } 1584 }
1857 sbi->s_inode_readahead_blks = option; 1585 if (arg != 0)
1858 break; 1586 sbi->s_mount_opt |= m->mount_opt;
1859 case Opt_journal_ioprio:
1860 if (match_int(&args[0], &option))
1861 return 0;
1862 if (option < 0 || option > 7)
1863 break;
1864 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1865 option);
1866 break;
1867 case Opt_noauto_da_alloc:
1868 set_opt(sb, NO_AUTO_DA_ALLOC);
1869 break;
1870 case Opt_auto_da_alloc:
1871 if (args[0].from) {
1872 if (match_int(&args[0], &option))
1873 return 0;
1874 } else
1875 option = 1; /* No argument, default to 1 */
1876 if (option)
1877 clear_opt(sb, NO_AUTO_DA_ALLOC);
1878 else 1587 else
1879 set_opt(sb,NO_AUTO_DA_ALLOC); 1588 sbi->s_mount_opt &= ~m->mount_opt;
1880 break;
1881 case Opt_discard:
1882 set_opt(sb, DISCARD);
1883 break;
1884 case Opt_nodiscard:
1885 clear_opt(sb, DISCARD);
1886 break;
1887 case Opt_dioread_nolock:
1888 set_opt(sb, DIOREAD_NOLOCK);
1889 break;
1890 case Opt_dioread_lock:
1891 clear_opt(sb, DIOREAD_NOLOCK);
1892 break;
1893 case Opt_init_itable:
1894 set_opt(sb, INIT_INODE_TABLE);
1895 if (args[0].from) {
1896 if (match_int(&args[0], &option))
1897 return 0;
1898 } else
1899 option = EXT4_DEF_LI_WAIT_MULT;
1900 if (option < 0)
1901 return 0;
1902 sbi->s_li_wait_mult = option;
1903 break;
1904 case Opt_noinit_itable:
1905 clear_opt(sb, INIT_INODE_TABLE);
1906 break;
1907 default:
1908 ext4_msg(sb, KERN_ERR,
1909 "Unrecognized mount option \"%s\" "
1910 "or missing value", p);
1911 return 0;
1912 } 1589 }
1590 return 1;
1591 }
1592 ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1593 "or missing value", opt);
1594 return -1;
1595}
1596
1597static int parse_options(char *options, struct super_block *sb,
1598 unsigned long *journal_devnum,
1599 unsigned int *journal_ioprio,
1600 int is_remount)
1601{
1602 struct ext4_sb_info *sbi = EXT4_SB(sb);
1603 char *p;
1604 substring_t args[MAX_OPT_ARGS];
1605 int token;
1606
1607 if (!options)
1608 return 1;
1609
1610 while ((p = strsep(&options, ",")) != NULL) {
1611 if (!*p)
1612 continue;
1613 /*
1614 * Initialize args struct so we know whether arg was
1615 * found; some options take optional arguments.
1616 */
1617 args[0].to = args[0].from = 0;
1618 token = match_token(p, tokens, args);
1619 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1620 journal_ioprio, is_remount) < 0)
1621 return 0;
1913 } 1622 }
1914#ifdef CONFIG_QUOTA 1623#ifdef CONFIG_QUOTA
1915 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1624 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
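
The payoff of the table-driven rewrite is that a simple boolean option needs no handler code at all: MOPT_SET applies the bit, MOPT_CLEAR inverts the (optional, default-1) argument first, and the same table later drives option display. As a hedged illustration, wiring up an invented "foo"/"nofoo" pair (Opt_foo and EXT4_MOUNT_FOO are hypothetical names, not part of the patch) would take only a few table entries:

	/* 1. add Opt_foo, Opt_nofoo to the token enum */
	/* 2. add to tokens[]: {Opt_foo, "foo"}, {Opt_nofoo, "nofoo"}, */
	/* 3. add to ext4_mount_opts[]: */
	{Opt_foo,   EXT4_MOUNT_FOO, MOPT_SET},
	{Opt_nofoo, EXT4_MOUNT_FOO, MOPT_CLEAR},

Parsing and display then come for free from handle_mount_opt() and _ext4_show_options().
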
@@ -1942,6 +1651,160 @@ set_qf_format:
1942 return 1; 1651 return 1;
1943} 1652}
1944 1653
1654static inline void ext4_show_quota_options(struct seq_file *seq,
1655 struct super_block *sb)
1656{
1657#if defined(CONFIG_QUOTA)
1658 struct ext4_sb_info *sbi = EXT4_SB(sb);
1659
1660 if (sbi->s_jquota_fmt) {
1661 char *fmtname = "";
1662
1663 switch (sbi->s_jquota_fmt) {
1664 case QFMT_VFS_OLD:
1665 fmtname = "vfsold";
1666 break;
1667 case QFMT_VFS_V0:
1668 fmtname = "vfsv0";
1669 break;
1670 case QFMT_VFS_V1:
1671 fmtname = "vfsv1";
1672 break;
1673 }
1674 seq_printf(seq, ",jqfmt=%s", fmtname);
1675 }
1676
1677 if (sbi->s_qf_names[USRQUOTA])
1678 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1679
1680 if (sbi->s_qf_names[GRPQUOTA])
1681 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1682
1683 if (test_opt(sb, USRQUOTA))
1684 seq_puts(seq, ",usrquota");
1685
1686 if (test_opt(sb, GRPQUOTA))
1687 seq_puts(seq, ",grpquota");
1688#endif
1689}
1690
1691static const char *token2str(int token)
1692{
1693 static const struct match_token *t;
1694
1695 for (t = tokens; t->token != Opt_err; t++)
1696 if (t->token == token && !strchr(t->pattern, '='))
1697 break;
1698 return t->pattern;
1699}
1700
1701/*
1702 * Show an option if
1703 * - it's set to a non-default value OR
1704 * - if the per-sb default is different from the global default
1705 */
1706static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1707 int nodefs)
1708{
1709 struct ext4_sb_info *sbi = EXT4_SB(sb);
1710 struct ext4_super_block *es = sbi->s_es;
1711 int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
1712 const struct mount_opts *m;
1713 char sep = nodefs ? '\n' : ',';
1714
1715#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
1716#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
1717
1718 if (sbi->s_sb_block != 1)
1719 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
1720
1721 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1722 int want_set = m->flags & MOPT_SET;
1723 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
1724 (m->flags & MOPT_CLEAR_ERR))
1725 continue;
1726 if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
1727 continue; /* skip if same as the default */
1728 if ((want_set &&
1729 (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
1730 (!want_set && (sbi->s_mount_opt & m->mount_opt)))
1731 continue; /* select Opt_noFoo vs Opt_Foo */
1732 SEQ_OPTS_PRINT("%s", token2str(m->token));
1733 }
1734
1735 if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
1736 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
1737 SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
1738 if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
1739 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
1740 SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
1741 def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
1742 if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
1743 SEQ_OPTS_PUTS("errors=remount-ro");
1744 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1745 SEQ_OPTS_PUTS("errors=continue");
1746 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1747 SEQ_OPTS_PUTS("errors=panic");
1748 if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
1749 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
1750 if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
1751 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
1752 if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
1753 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
1754 if (sb->s_flags & MS_I_VERSION)
1755 SEQ_OPTS_PUTS("i_version");
1756 if (nodefs || sbi->s_stripe)
1757 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
1758 if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
1759 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1760 SEQ_OPTS_PUTS("data=journal");
1761 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1762 SEQ_OPTS_PUTS("data=ordered");
1763 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1764 SEQ_OPTS_PUTS("data=writeback");
1765 }
1766 if (nodefs ||
1767 sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1768 SEQ_OPTS_PRINT("inode_readahead_blks=%u",
1769 sbi->s_inode_readahead_blks);
1770
1771 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1772 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1773 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1774
1775 ext4_show_quota_options(seq, sb);
1776 return 0;
1777}
1778
1779static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1780{
1781 return _ext4_show_options(seq, root->d_sb, 0);
1782}
1783
1784static int options_seq_show(struct seq_file *seq, void *offset)
1785{
1786 struct super_block *sb = seq->private;
1787 int rc;
1788
1789 seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1790 rc = _ext4_show_options(seq, sb, 1);
1791 seq_puts(seq, "\n");
1792 return rc;
1793}
1794
1795static int options_open_fs(struct inode *inode, struct file *file)
1796{
1797 return single_open(file, options_seq_show, PDE(inode)->data);
1798}
1799
1800static const struct file_operations ext4_seq_options_fops = {
1801 .owner = THIS_MODULE,
1802 .open = options_open_fs,
1803 .read = seq_read,
1804 .llseek = seq_lseek,
1805 .release = single_release,
1806};
1807
1945static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 1808static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1946 int read_only) 1809 int read_only)
1947{ 1810{
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)
2945 ext4_clear_request_list(); 2808 ext4_clear_request_list();
2946 kfree(ext4_li_info); 2809 kfree(ext4_li_info);
2947 ext4_li_info = NULL; 2810 ext4_li_info = NULL;
2948 printk(KERN_CRIT "EXT4: error %d creating inode table " 2811 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
2949 "initialization thread\n", 2812 "initialization thread\n",
2950 err); 2813 err);
2951 return err; 2814 return err;
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3183 set_opt(sb, INIT_INODE_TABLE); 3046 set_opt(sb, INIT_INODE_TABLE);
3184 if (def_mount_opts & EXT4_DEFM_DEBUG) 3047 if (def_mount_opts & EXT4_DEFM_DEBUG)
3185 set_opt(sb, DEBUG); 3048 set_opt(sb, DEBUG);
3186 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3049 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3187 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3188 "2.6.38");
3189 set_opt(sb, GRPID); 3050 set_opt(sb, GRPID);
3190 }
3191 if (def_mount_opts & EXT4_DEFM_UID16) 3051 if (def_mount_opts & EXT4_DEFM_UID16)
3192 set_opt(sb, NO_UID32); 3052 set_opt(sb, NO_UID32);
3193 /* xattr user namespace & acls are now defaulted on */ 3053 /* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3240 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 3100 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3241 3101
3242 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3102 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3243 &journal_devnum, &journal_ioprio, NULL, 0)) { 3103 &journal_devnum, &journal_ioprio, 0)) {
3244 ext4_msg(sb, KERN_WARNING, 3104 ext4_msg(sb, KERN_WARNING,
3245 "failed to parse options in superblock: %s", 3105 "failed to parse options in superblock: %s",
3246 sbi->s_es->s_mount_opts); 3106 sbi->s_es->s_mount_opts);
3247 } 3107 }
3108 sbi->s_def_mount_opt = sbi->s_mount_opt;
3248 if (!parse_options((char *) data, sb, &journal_devnum, 3109 if (!parse_options((char *) data, sb, &journal_devnum,
3249 &journal_ioprio, NULL, 0)) 3110 &journal_ioprio, 0))
3250 goto failed_mount; 3111 goto failed_mount;
3251 3112
3252 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3113 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3416#else 3277#else
3417 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 3278 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3418#endif 3279#endif
3419 sb->s_dirt = 1;
3420 } 3280 }
3421 3281
3422 /* Handle clustersize */ 3282 /* Handle clustersize */
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3540 if (ext4_proc_root) 3400 if (ext4_proc_root)
3541 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3401 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3542 3402
3403 if (sbi->s_proc)
3404 proc_create_data("options", S_IRUGO, sbi->s_proc,
3405 &ext4_seq_options_fops, sb);
3406
3543 bgl_lock_init(sbi->s_blockgroup_lock); 3407 bgl_lock_init(sbi->s_blockgroup_lock);
3544 3408
3545 for (i = 0; i < db_count; i++) { 3409 for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3694 } 3558 }
3695 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3559 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3696 3560
3561 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3562
3697 /* 3563 /*
3698 * The journal may have updated the bg summary counts, so we 3564 * The journal may have updated the bg summary counts, so we
3699 * need to update the global counters. 3565 * need to update the global counters.
@@ -3861,6 +3727,7 @@ failed_mount2:
3861 ext4_kvfree(sbi->s_group_desc); 3727 ext4_kvfree(sbi->s_group_desc);
3862failed_mount: 3728failed_mount:
3863 if (sbi->s_proc) { 3729 if (sbi->s_proc) {
3730 remove_proc_entry("options", sbi->s_proc);
3864 remove_proc_entry(sb->s_id, ext4_proc_root); 3731 remove_proc_entry(sb->s_id, ext4_proc_root);
3865 } 3732 }
3866#ifdef CONFIG_QUOTA 3733#ifdef CONFIG_QUOTA
@@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,
4090 if (!(journal->j_flags & JBD2_BARRIER)) 3957 if (!(journal->j_flags & JBD2_BARRIER))
4091 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3958 ext4_msg(sb, KERN_INFO, "barriers disabled");
4092 3959
4093 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
4094 err = jbd2_journal_update_format(journal);
4095 if (err) {
4096 ext4_msg(sb, KERN_ERR, "error updating journal");
4097 jbd2_journal_destroy(journal);
4098 return err;
4099 }
4100 }
4101
4102 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3960 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4103 err = jbd2_journal_wipe(journal, !really_read_only); 3961 err = jbd2_journal_wipe(journal, !really_read_only);
4104 if (!err) { 3962 if (!err) {
@@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4385{ 4243{
4386 struct ext4_super_block *es; 4244 struct ext4_super_block *es;
4387 struct ext4_sb_info *sbi = EXT4_SB(sb); 4245 struct ext4_sb_info *sbi = EXT4_SB(sb);
4388 ext4_fsblk_t n_blocks_count = 0;
4389 unsigned long old_sb_flags; 4246 unsigned long old_sb_flags;
4390 struct ext4_mount_options old_opts; 4247 struct ext4_mount_options old_opts;
4391 int enable_quota = 0; 4248 int enable_quota = 0;
@@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4418 /* 4275 /*
4419 * Allow the "check" option to be passed as a remount option. 4276 * Allow the "check" option to be passed as a remount option.
4420 */ 4277 */
4421 if (!parse_options(data, sb, NULL, &journal_ioprio, 4278 if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4422 &n_blocks_count, 1)) {
4423 err = -EINVAL; 4279 err = -EINVAL;
4424 goto restore_opts; 4280 goto restore_opts;
4425 } 4281 }
@@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4437 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4293 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4438 } 4294 }
4439 4295
4440 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 4296 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4441 n_blocks_count > ext4_blocks_count(es)) {
4442 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { 4297 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4443 err = -EROFS; 4298 err = -EROFS;
4444 goto restore_opts; 4299 goto restore_opts;
@@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4513 if (sbi->s_journal) 4368 if (sbi->s_journal)
4514 ext4_clear_journal_err(sb, es); 4369 ext4_clear_journal_err(sb, es);
4515 sbi->s_mount_state = le16_to_cpu(es->s_state); 4370 sbi->s_mount_state = le16_to_cpu(es->s_state);
4516 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
4517 goto restore_opts;
4518 if (!ext4_setup_super(sb, es, 0)) 4371 if (!ext4_setup_super(sb, es, 0))
4519 sb->s_flags &= ~MS_RDONLY; 4372 sb->s_flags &= ~MS_RDONLY;
4520 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4373 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d89a220..e88748e55c0f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
82 printk("\n"); \ 82 printk("\n"); \
83 } while (0) 83 } while (0)
84#else 84#else
85# define ea_idebug(f...) 85# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
86# define ea_bdebug(f...) 86# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
87#endif 87#endif
88 88
89static void ext4_xattr_cache_insert(struct buffer_head *); 89static void ext4_xattr_cache_insert(struct buffer_head *);
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
158static inline int 158static inline int
159ext4_xattr_check_block(struct buffer_head *bh) 159ext4_xattr_check_block(struct buffer_head *bh)
160{ 160{
161 int error;
162
163 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 161 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
164 BHDR(bh)->h_blocks != cpu_to_le32(1)) 162 BHDR(bh)->h_blocks != cpu_to_le32(1))
165 return -EIO; 163 return -EIO;
166 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); 164 return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
167 return error;
168} 165}
169 166
170static inline int 167static inline int
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
220 error = -ENODATA; 217 error = -ENODATA;
221 if (!EXT4_I(inode)->i_file_acl) 218 if (!EXT4_I(inode)->i_file_acl)
222 goto cleanup; 219 goto cleanup;
223 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); 220 ea_idebug(inode, "reading block %llu",
221 (unsigned long long)EXT4_I(inode)->i_file_acl);
224 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 222 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
225 if (!bh) 223 if (!bh)
226 goto cleanup; 224 goto cleanup;
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
363 error = 0; 361 error = 0;
364 if (!EXT4_I(inode)->i_file_acl) 362 if (!EXT4_I(inode)->i_file_acl)
365 goto cleanup; 363 goto cleanup;
366 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); 364 ea_idebug(inode, "reading block %llu",
365 (unsigned long long)EXT4_I(inode)->i_file_acl);
367 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 366 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
368 error = -EIO; 367 error = -EIO;
369 if (!bh) 368 if (!bh)
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_free_blocks(handle, inode, bh, 0, 1, 486 ext4_free_blocks(handle, inode, bh, 0, 1,
488 EXT4_FREE_BLOCKS_METADATA | 487 EXT4_FREE_BLOCKS_METADATA |
489 EXT4_FREE_BLOCKS_FORGET); 488 EXT4_FREE_BLOCKS_FORGET);
489 unlock_buffer(bh);
490 } else { 490 } else {
491 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 491 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
492 if (ce)
493 mb_cache_entry_release(ce);
494 unlock_buffer(bh);
492 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
493 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
494 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
495 dquot_free_block(inode, 1); 498 dquot_free_block(inode, 1);
496 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
497 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
498 if (ce)
499 mb_cache_entry_release(ce);
500 } 501 }
501 unlock_buffer(bh);
502out: 502out:
503 ext4_std_error(inode->i_sb, error); 503 ext4_std_error(inode->i_sb, error);
504 return; 504 return;
@@ -834,7 +834,8 @@ inserted:
834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
836 836
837 ea_idebug(inode, "creating block %d", block); 837 ea_idebug(inode, "creating block %llu",
838 (unsigned long long)block);
838 839
839 new_bh = sb_getblk(sb, block); 840 new_bh = sb_getblk(sb, block);
840 if (!new_bh) { 841 if (!new_bh) {
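
Swapping the empty ea_idebug()/ea_bdebug() stubs for no_printk() keeps the format strings and arguments visible to the compiler even when debugging is off, which is exactly what catches mismatches like the %u previously used for the 64-bit i_file_acl values fixed in this hunk. The pattern in general form:

	#ifdef DEBUG
	# define dbg(fmt, ...) printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__)
	#else
	/* compiles away to nothing, but still type-checks the arguments */
	# define dbg(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
	#endif
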
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903fb..c78841ee81cf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
88 * whole transaction. 88 * whole transaction.
89 * 89 *
90 * Requires j_list_lock 90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */ 91 */
93static int __try_to_free_cp_buf(struct journal_head *jh) 92static int __try_to_free_cp_buf(struct journal_head *jh)
94{ 93{
95 int ret = 0; 94 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
97 96
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && 97 if (jh->b_transaction == NULL && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /* 99 /*
101 * Get our reference so that bh cannot be freed before 100 * Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
104 get_bh(bh); 103 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list"); 104 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 105 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release"); 106 BUFFER_TRACE(bh, "release");
109 __brelse(bh); 107 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 } 108 }
113 return ret; 109 return ret;
114} 110}
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
180} 176}
181 177
182/* 178/*
183 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
184 * The caller must restart a list walk. Wait for someone else to run
185 * jbd_unlock_bh_state().
186 */
187static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
188 __releases(journal->j_list_lock)
189{
190 get_bh(bh);
191 spin_unlock(&journal->j_list_lock);
192 jbd_lock_bh_state(bh);
193 jbd_unlock_bh_state(bh);
194 put_bh(bh);
195}
196
197/*
198 * Clean up transaction's list of buffers submitted for io. 179 * Clean up transaction's list of buffers submitted for io.
199 * We wait for any pending IO to complete and remove any clean 180 * We wait for any pending IO to complete and remove any clean
200 * buffers. Note that we take the buffers in the opposite ordering 181 * buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@ restart:
222 while (!released && transaction->t_checkpoint_io_list) { 203 while (!released && transaction->t_checkpoint_io_list) {
223 jh = transaction->t_checkpoint_io_list; 204 jh = transaction->t_checkpoint_io_list;
224 bh = jh2bh(jh); 205 bh = jh2bh(jh);
225 if (!jbd_trylock_bh_state(bh)) {
226 jbd_sync_bh(journal, bh);
227 spin_lock(&journal->j_list_lock);
228 goto restart;
229 }
230 get_bh(bh); 206 get_bh(bh);
231 if (buffer_locked(bh)) { 207 if (buffer_locked(bh)) {
232 spin_unlock(&journal->j_list_lock); 208 spin_unlock(&journal->j_list_lock);
233 jbd_unlock_bh_state(bh);
234 wait_on_buffer(bh); 209 wait_on_buffer(bh);
235 /* the journal_head may have gone by now */ 210 /* the journal_head may have gone by now */
236 BUFFER_TRACE(bh, "brelse"); 211 BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@ restart:
246 * it has been written out and so we can drop it from the list 221 * it has been written out and so we can drop it from the list
247 */ 222 */
248 released = __jbd2_journal_remove_checkpoint(jh); 223 released = __jbd2_journal_remove_checkpoint(jh);
249 jbd_unlock_bh_state(bh);
250 __brelse(bh); 224 __brelse(bh);
251 } 225 }
252 226
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)
266 240
267 for (i = 0; i < *batch_count; i++) { 241 for (i = 0; i < *batch_count; i++) {
268 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 242 struct buffer_head *bh = journal->j_chkpt_bhs[i];
269 clear_buffer_jwrite(bh);
270 BUFFER_TRACE(bh, "brelse"); 243 BUFFER_TRACE(bh, "brelse");
271 __brelse(bh); 244 __brelse(bh);
272 } 245 }
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)
281 * be written out. 254 * be written out.
282 * 255 *
283 * Called with j_list_lock held and drops it if 1 is returned 256 * Called with j_list_lock held and drops it if 1 is returned
284 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
285 */ 257 */
286static int __process_buffer(journal_t *journal, struct journal_head *jh, 258static int __process_buffer(journal_t *journal, struct journal_head *jh,
287 int *batch_count, transaction_t *transaction) 259 int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
292 if (buffer_locked(bh)) { 264 if (buffer_locked(bh)) {
293 get_bh(bh); 265 get_bh(bh);
294 spin_unlock(&journal->j_list_lock); 266 spin_unlock(&journal->j_list_lock);
295 jbd_unlock_bh_state(bh);
296 wait_on_buffer(bh); 267 wait_on_buffer(bh);
297 /* the journal_head may have gone by now */ 268 /* the journal_head may have gone by now */
298 BUFFER_TRACE(bh, "brelse"); 269 BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 275
305 transaction->t_chp_stats.cs_forced_to_close++; 276 transaction->t_chp_stats.cs_forced_to_close++;
306 spin_unlock(&journal->j_list_lock); 277 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 278 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
309 /* 279 /*
310 * The journal thread is dead; so starting and 280 * The journal thread is dead; so starting and
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
323 if (unlikely(buffer_write_io_error(bh))) 293 if (unlikely(buffer_write_io_error(bh)))
324 ret = -EIO; 294 ret = -EIO;
325 get_bh(bh); 295 get_bh(bh);
326 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
327 BUFFER_TRACE(bh, "remove from checkpoint"); 296 BUFFER_TRACE(bh, "remove from checkpoint");
328 __jbd2_journal_remove_checkpoint(jh); 297 __jbd2_journal_remove_checkpoint(jh);
329 spin_unlock(&journal->j_list_lock); 298 spin_unlock(&journal->j_list_lock);
330 jbd_unlock_bh_state(bh);
331 __brelse(bh); 299 __brelse(bh);
332 } else { 300 } else {
333 /* 301 /*
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
340 BUFFER_TRACE(bh, "queue"); 308 BUFFER_TRACE(bh, "queue");
341 get_bh(bh); 309 get_bh(bh);
342 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 310 J_ASSERT_BH(bh, !buffer_jwrite(bh));
343 set_buffer_jwrite(bh);
344 journal->j_chkpt_bhs[*batch_count] = bh; 311 journal->j_chkpt_bhs[*batch_count] = bh;
345 __buffer_relink_io(jh); 312 __buffer_relink_io(jh);
346 jbd_unlock_bh_state(bh);
347 transaction->t_chp_stats.cs_written++; 313 transaction->t_chp_stats.cs_written++;
348 (*batch_count)++; 314 (*batch_count)++;
349 if (*batch_count == JBD2_NR_BATCH) { 315 if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@ restart:
407 int retry = 0, err; 373 int retry = 0, err;
408 374
409 while (!retry && transaction->t_checkpoint_list) { 375 while (!retry && transaction->t_checkpoint_list) {
410 struct buffer_head *bh;
411
412 jh = transaction->t_checkpoint_list; 376 jh = transaction->t_checkpoint_list;
413 bh = jh2bh(jh);
414 if (!jbd_trylock_bh_state(bh)) {
415 jbd_sync_bh(journal, bh);
416 retry = 1;
417 break;
418 }
419 retry = __process_buffer(journal, jh, &batch_count, 377 retry = __process_buffer(journal, jh, &batch_count,
420 transaction); 378 transaction);
421 if (retry < 0 && !result) 379 if (retry < 0 && !result)
@@ -478,79 +436,28 @@ out:
478 436
479int jbd2_cleanup_journal_tail(journal_t *journal) 437int jbd2_cleanup_journal_tail(journal_t *journal)
480{ 438{
481 transaction_t * transaction;
482 tid_t first_tid; 439 tid_t first_tid;
483 unsigned long blocknr, freed; 440 unsigned long blocknr;
484 441
485 if (is_journal_aborted(journal)) 442 if (is_journal_aborted(journal))
486 return 1; 443 return 1;
487 444
488 /* OK, work out the oldest transaction remaining in the log, and 445 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
489 * the log block it starts at.
490 *
491 * If the log is now empty, we need to work out which is the
492 * next transaction ID we will write, and where it will
493 * start. */
494
495 write_lock(&journal->j_state_lock);
496 spin_lock(&journal->j_list_lock);
497 transaction = journal->j_checkpoint_transactions;
498 if (transaction) {
499 first_tid = transaction->t_tid;
500 blocknr = transaction->t_log_start;
501 } else if ((transaction = journal->j_committing_transaction) != NULL) {
502 first_tid = transaction->t_tid;
503 blocknr = transaction->t_log_start;
504 } else if ((transaction = journal->j_running_transaction) != NULL) {
505 first_tid = transaction->t_tid;
506 blocknr = journal->j_head;
507 } else {
508 first_tid = journal->j_transaction_sequence;
509 blocknr = journal->j_head;
510 }
511 spin_unlock(&journal->j_list_lock);
512 J_ASSERT(blocknr != 0);
513
514 /* If the oldest pinned transaction is at the tail of the log
515 already then there's not much we can do right now. */
516 if (journal->j_tail_sequence == first_tid) {
517 write_unlock(&journal->j_state_lock);
518 return 1; 446 return 1;
519 } 447 J_ASSERT(blocknr != 0);
520
521 /* OK, update the superblock to recover the freed space.
522 * Physical blocks come first: have we wrapped beyond the end of
523 * the log? */
524 freed = blocknr - journal->j_tail;
525 if (blocknr < journal->j_tail)
526 freed = freed + journal->j_last - journal->j_first;
527
528 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
529 jbd_debug(1,
530 "Cleaning journal tail from %d to %d (offset %lu), "
531 "freeing %lu\n",
532 journal->j_tail_sequence, first_tid, blocknr, freed);
533
534 journal->j_free += freed;
535 journal->j_tail_sequence = first_tid;
536 journal->j_tail = blocknr;
537 write_unlock(&journal->j_state_lock);
538 448
539 /* 449 /*
540 * If there is an external journal, we need to make sure that 450 * We need to make sure that any blocks that were recently written out
541 * any data blocks that were recently written out --- perhaps 451 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
542 * by jbd2_log_do_checkpoint() --- are flushed out before we 452 * we drop the transactions from the journal. It's unlikely this will
543 * drop the transactions from the external journal. It's 453 * be necessary, especially with an appropriately sized journal, but we
544 * unlikely this will be necessary, especially with a 454 * need this to guarantee correctness. Fortunately
545 * appropriately sized journal, but we need this to guarantee 455 * jbd2_cleanup_journal_tail() doesn't get called all that often.
546 * correctness. Fortunately jbd2_cleanup_journal_tail()
547 * doesn't get called all that often.
548 */ 456 */
549 if ((journal->j_fs_dev != journal->j_dev) && 457 if (journal->j_flags & JBD2_BARRIER)
550 (journal->j_flags & JBD2_BARRIER))
551 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 458 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
552 if (!(journal->j_flags & JBD2_ABORT)) 459
553 jbd2_journal_update_superblock(journal, 1); 460 __jbd2_update_log_tail(journal, first_tid, blocknr);
554 return 0; 461 return 0;
555} 462}
556 463
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
582 do { 489 do {
583 jh = next_jh; 490 jh = next_jh;
584 next_jh = jh->b_cpnext; 491 next_jh = jh->b_cpnext;
585 /* Use trylock because of the ranking */ 492 ret = __try_to_free_cp_buf(jh);
586 if (jbd_trylock_bh_state(jh2bh(jh))) { 493 if (ret) {
587 ret = __try_to_free_cp_buf(jh); 494 freed++;
588 if (ret) { 495 if (ret == 2) {
589 freed++; 496 *released = 1;
590 if (ret == 2) { 497 return freed;
591 *released = 1;
592 return freed;
593 }
594 } 498 }
595 } 499 }
596 /* 500 /*
@@ -673,9 +577,7 @@ out:
673 * The function can free jh and bh. 577 * The function can free jh and bh.
674 * 578 *
675 * This function is called with j_list_lock held. 579 * This function is called with j_list_lock held.
676 * This function is called with jbd_lock_bh_state(jh2bh(jh))
677 */ 580 */
678
679int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
680{ 582{
681 struct transaction_chp_stats_s *stats; 583 struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
722 transaction->t_tid, stats); 624 transaction->t_tid, stats);
723 625
724 __jbd2_journal_drop_transaction(journal, transaction); 626 __jbd2_journal_drop_transaction(journal, transaction);
725 kfree(transaction); 627 jbd2_journal_free_transaction(transaction);
726 628
727 /* Just in case anybody was waiting for more transactions to be 629 /* Just in case anybody was waiting for more transactions to be
728 checkpointed... */ 630 checkpointed... */
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
797 J_ASSERT(journal->j_committing_transaction != transaction); 699 J_ASSERT(journal->j_committing_transaction != transaction);
798 J_ASSERT(journal->j_running_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction);
799 701
702 trace_jbd2_drop_transaction(journal, transaction);
703
800 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 704 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
801} 705}
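
In the jbd2_cleanup_journal_tail() hunk above, the open-coded tail computation is gone; the removed lines show the wrap-around arithmetic that now lives in __jbd2_update_log_tail(): the number of freed blocks is the distance from the old tail to the new one, corrected by the log length when the log has wrapped. A standalone sketch of that arithmetic, assuming a circular log spanning [first, last) as in the removed code (the function name and values are illustrative):

	#include <stdio.h>

	/* Blocks freed when the tail of a circular log [first, last) moves
	 * from old_tail to new_tail; mirrors the arithmetic removed above. */
	static unsigned long tail_freed(unsigned long new_tail,
					unsigned long old_tail,
					unsigned long first, unsigned long last)
	{
		unsigned long freed = new_tail - old_tail;

		if (new_tail < old_tail)	/* log wrapped past 'last' */
			freed += last - first;
		return freed;
	}

	int main(void)
	{
		/* No wrap: tail advances 30 -> 70 in a log spanning 10..110. */
		printf("%lu\n", tail_freed(70, 30, 10, 110));	/* 40 */
		/* Wrap: tail moves 100 -> 20, crossing the end of the log. */
		printf("%lu\n", tail_freed(20, 100, 10, 110));	/* 20 */
		return 0;
	}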
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c067a8cae63b..17f557f01cf0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
331 struct buffer_head *cbh = NULL; /* For transactional checksums */ 331 struct buffer_head *cbh = NULL; /* For transactional checksums */
332 __u32 crc32_sum = ~0; 332 __u32 crc32_sum = ~0;
333 struct blk_plug plug; 333 struct blk_plug plug;
334 /* Tail of the journal */
335 unsigned long first_block;
336 tid_t first_tid;
337 int update_tail;
334 338
335 /* 339 /*
336 * First job: lock down the current transaction and wait for 340 * First job: lock down the current transaction and wait for
@@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
340 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 344 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
341 if (journal->j_flags & JBD2_FLUSHED) { 345 if (journal->j_flags & JBD2_FLUSHED) {
342 jbd_debug(3, "super block updated\n"); 346 jbd_debug(3, "super block updated\n");
343 jbd2_journal_update_superblock(journal, 1); 347 mutex_lock(&journal->j_checkpoint_mutex);
348 /*
 349 * We hold j_checkpoint_mutex so the tail cannot change under us.
 350 * We don't need any special data guarantees for writing the sb
 351 * since the journal is empty and it is ok for the write to be
 352 * flushed only with the transaction commit.
353 */
354 jbd2_journal_update_sb_log_tail(journal,
355 journal->j_tail_sequence,
356 journal->j_tail,
357 WRITE_SYNC);
358 mutex_unlock(&journal->j_checkpoint_mutex);
344 } else { 359 } else {
345 jbd_debug(3, "superblock not updated\n"); 360 jbd_debug(3, "superblock not updated\n");
346 } 361 }
@@ -677,10 +692,30 @@ start_journal_io:
677 err = 0; 692 err = 0;
678 } 693 }
679 694
695 /*
 696 * Get the current oldest transaction in the log before we issue a
 697 * flush to the filesystem device. After the flush we can be sure that
 698 * blocks of all older transactions are checkpointed to persistent
 699 * storage, so it is safe to update the journal start in the
 700 * superblock with the numbers we get here.
701 */
702 update_tail =
703 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
704
680 write_lock(&journal->j_state_lock); 705 write_lock(&journal->j_state_lock);
706 if (update_tail) {
707 long freed = first_block - journal->j_tail;
708
709 if (first_block < journal->j_tail)
710 freed += journal->j_last - journal->j_first;
 711 /* Update tail only if we free a significant amount of space */
712 if (freed < journal->j_maxlen / 4)
713 update_tail = 0;
714 }
681 J_ASSERT(commit_transaction->t_state == T_COMMIT); 715 J_ASSERT(commit_transaction->t_state == T_COMMIT);
682 commit_transaction->t_state = T_COMMIT_DFLUSH; 716 commit_transaction->t_state = T_COMMIT_DFLUSH;
683 write_unlock(&journal->j_state_lock); 717 write_unlock(&journal->j_state_lock);
718
684 /* 719 /*
685 * If the journal is not located on the file system device, 720 * If the journal is not located on the file system device,
686 * then we must flush the file system device before we issue 721 * then we must flush the file system device before we issue
@@ -831,6 +866,14 @@ wait_for_iobuf:
831 if (err) 866 if (err)
832 jbd2_journal_abort(journal, err); 867 jbd2_journal_abort(journal, err);
833 868
869 /*
 870 * The disk caches for the filesystem device are now flushed, so we are
 871 * safe to erase checkpointed transactions from the log by updating the
 872 * journal superblock.
873 */
874 if (update_tail)
875 jbd2_update_log_tail(journal, first_tid, first_block);
876
834 /* End of a transaction! Finally, we can do checkpoint 877 /* End of a transaction! Finally, we can do checkpoint
835 processing: any buffers committed as a result of this 878 processing: any buffers committed as a result of this
836 transaction can be removed from any checkpoint list it was on 879 transaction can be removed from any checkpoint list it was on
@@ -1048,7 +1091,7 @@ restart_loop:
1048 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1091 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1049 journal->j_commit_sequence, journal->j_tail_sequence); 1092 journal->j_commit_sequence, journal->j_tail_sequence);
1050 if (to_free) 1093 if (to_free)
1051 kfree(commit_transaction); 1094 jbd2_journal_free_transaction(commit_transaction);
1052 1095
1053 wake_up(&journal->j_wait_done_commit); 1096 wake_up(&journal->j_wait_done_commit);
1054} 1097}
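
The commit path above now batches tail updates: after flushing the filesystem device it recomputes the oldest transaction, but it only pays for the superblock write when the move would reclaim at least a quarter of the journal (freed < j_maxlen / 4 leaves update_tail clear). A sketch of that heuristic with abbreviated field names (the struct below is illustrative, not the real journal_t):

	#include <stdio.h>

	struct log {
		unsigned long tail, first, last, maxlen;
	};

	/* Returns 1 if moving the tail to first_block frees enough space
	 * to justify a superblock update, mirroring the hunk above. */
	static int should_update_tail(const struct log *j, unsigned long first_block)
	{
		long freed = first_block - j->tail;

		if (first_block < j->tail)	/* log wrapped */
			freed += j->last - j->first;
		/* Update tail only if we free a significant amount of space */
		return freed >= (long)(j->maxlen / 4);
	}

	int main(void)
	{
		struct log j = { .tail = 100, .first = 1, .last = 8193,
				 .maxlen = 8192 };

		printf("%d\n", should_update_tail(&j, 150));	/* 0: only 50 blocks */
		printf("%d\n", should_update_tail(&j, 4200));	/* 1: > maxlen/4 */
		return 0;
	}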
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 839377e3d624..98ed6dbfe381 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
71 71
72EXPORT_SYMBOL(jbd2_journal_init_dev); 72EXPORT_SYMBOL(jbd2_journal_init_dev);
73EXPORT_SYMBOL(jbd2_journal_init_inode); 73EXPORT_SYMBOL(jbd2_journal_init_inode);
74EXPORT_SYMBOL(jbd2_journal_update_format);
75EXPORT_SYMBOL(jbd2_journal_check_used_features); 74EXPORT_SYMBOL(jbd2_journal_check_used_features);
76EXPORT_SYMBOL(jbd2_journal_check_available_features); 75EXPORT_SYMBOL(jbd2_journal_check_available_features);
77EXPORT_SYMBOL(jbd2_journal_set_features); 76EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache); 96EXPORT_SYMBOL(jbd2_inode_cache);
98 97
99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
100static void __journal_abort_soft (journal_t *journal, int errno); 98static void __journal_abort_soft (journal_t *journal, int errno);
101static int jbd2_journal_create_slab(size_t slab_size); 99static int jbd2_journal_create_slab(size_t slab_size);
102 100
@@ -746,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
746 return jbd2_journal_add_journal_head(bh); 744 return jbd2_journal_add_journal_head(bh);
747} 745}
748 746
747/*
 748 * Return the tid of the oldest transaction in the journal and the block in
 749 * the journal where that transaction starts.
 750 *
 751 * If the journal is now empty, return the next transaction ID we will write
 752 * and the block where that transaction will start.
 753 *
 754 * The return value is 0 if the journal tail cannot be pushed any further,
 755 * 1 if it can.
756 */
757int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
758 unsigned long *block)
759{
760 transaction_t *transaction;
761 int ret;
762
763 read_lock(&journal->j_state_lock);
764 spin_lock(&journal->j_list_lock);
765 transaction = journal->j_checkpoint_transactions;
766 if (transaction) {
767 *tid = transaction->t_tid;
768 *block = transaction->t_log_start;
769 } else if ((transaction = journal->j_committing_transaction) != NULL) {
770 *tid = transaction->t_tid;
771 *block = transaction->t_log_start;
772 } else if ((transaction = journal->j_running_transaction) != NULL) {
773 *tid = transaction->t_tid;
774 *block = journal->j_head;
775 } else {
776 *tid = journal->j_transaction_sequence;
777 *block = journal->j_head;
778 }
779 ret = tid_gt(*tid, journal->j_tail_sequence);
780 spin_unlock(&journal->j_list_lock);
781 read_unlock(&journal->j_state_lock);
782
783 return ret;
784}
785
786/*
787 * Update information in journal structure and in on disk journal superblock
788 * about log tail. This function does not check whether information passed in
 789 * really pushes the log tail further. It is the caller's responsibility to
 790 * make sure the provided log tail information is valid (e.g. by holding
791 * j_checkpoint_mutex all the time between computing log tail and calling this
792 * function as is the case with jbd2_cleanup_journal_tail()).
793 *
794 * Requires j_checkpoint_mutex
795 */
796void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
797{
798 unsigned long freed;
799
800 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
801
802 /*
 803 * We cannot afford for the write to remain in the drive's caches, since
 804 * as soon as we update j_tail, the next transaction can start reusing
 805 * journal space and, if we lose the sb update during a power failure,
 806 * we'd replay an old transaction with possibly newly overwritten data.
807 */
808 jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
809 write_lock(&journal->j_state_lock);
810 freed = block - journal->j_tail;
811 if (block < journal->j_tail)
812 freed += journal->j_last - journal->j_first;
813
814 trace_jbd2_update_log_tail(journal, tid, block, freed);
815 jbd_debug(1,
816 "Cleaning journal tail from %d to %d (offset %lu), "
817 "freeing %lu\n",
818 journal->j_tail_sequence, tid, block, freed);
819
820 journal->j_free += freed;
821 journal->j_tail_sequence = tid;
822 journal->j_tail = block;
823 write_unlock(&journal->j_state_lock);
824}
825
826/*
 827 * This is a variation of __jbd2_update_log_tail which checks the validity
 828 * of the provided log tail and locks j_checkpoint_mutex, so it is safe
 829 * against races with other threads updating the log tail.
830 */
831void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
832{
833 mutex_lock(&journal->j_checkpoint_mutex);
834 if (tid_gt(tid, journal->j_tail_sequence))
835 __jbd2_update_log_tail(journal, tid, block);
836 mutex_unlock(&journal->j_checkpoint_mutex);
837}
838
749struct jbd2_stats_proc_session { 839struct jbd2_stats_proc_session {
750 journal_t *journal; 840 journal_t *journal;
751 struct transaction_stats_s *stats; 841 struct transaction_stats_s *stats;
@@ -1114,40 +1204,45 @@ static int journal_reset(journal_t *journal)
1114 1204
1115 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 1205 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1116 1206
1117 /* Add the dynamic fields and write it to disk. */
1118 jbd2_journal_update_superblock(journal, 1);
1119 return jbd2_journal_start_thread(journal);
1120}
1121
1122/**
1123 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1124 * @journal: The journal to update.
1125 * @wait: Set to '0' if you don't want to wait for IO completion.
1126 *
1127 * Update a journal's dynamic superblock fields and write it to disk,
1128 * optionally waiting for the IO to complete.
1129 */
1130void jbd2_journal_update_superblock(journal_t *journal, int wait)
1131{
1132 journal_superblock_t *sb = journal->j_superblock;
1133 struct buffer_head *bh = journal->j_sb_buffer;
1134
1135 /* 1207 /*
1136 * As a special case, if the on-disk copy is already marked as needing 1208 * As a special case, if the on-disk copy is already marked as needing
1137 * no recovery (s_start == 0) and there are no outstanding transactions 1209 * no recovery (s_start == 0), then we can safely defer the superblock
1138 * in the filesystem, then we can safely defer the superblock update 1210 * update until the next commit by setting JBD2_FLUSHED. This avoids
1139 * until the next commit by setting JBD2_FLUSHED. This avoids
1140 * attempting a write to a potential-readonly device. 1211 * attempting a write to a potential-readonly device.
1141 */ 1212 */
1142 if (sb->s_start == 0 && journal->j_tail_sequence == 1213 if (sb->s_start == 0) {
1143 journal->j_transaction_sequence) {
1144 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " 1214 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1145 "(start %ld, seq %d, errno %d)\n", 1215 "(start %ld, seq %d, errno %d)\n",
1146 journal->j_tail, journal->j_tail_sequence, 1216 journal->j_tail, journal->j_tail_sequence,
1147 journal->j_errno); 1217 journal->j_errno);
1148 goto out; 1218 journal->j_flags |= JBD2_FLUSHED;
1219 } else {
1220 /* Lock here to make assertions happy... */
1221 mutex_lock(&journal->j_checkpoint_mutex);
1222 /*
1223 * Update log tail information. We use WRITE_FUA since new
1224 * transaction will start reusing journal space and so we
1225 * must make sure information about current log tail is on
1226 * disk before that.
1227 */
1228 jbd2_journal_update_sb_log_tail(journal,
1229 journal->j_tail_sequence,
1230 journal->j_tail,
1231 WRITE_FUA);
1232 mutex_unlock(&journal->j_checkpoint_mutex);
1149 } 1233 }
1234 return jbd2_journal_start_thread(journal);
1235}
1150 1236
1237static void jbd2_write_superblock(journal_t *journal, int write_op)
1238{
1239 struct buffer_head *bh = journal->j_sb_buffer;
1240 int ret;
1241
1242 trace_jbd2_write_superblock(journal, write_op);
1243 if (!(journal->j_flags & JBD2_BARRIER))
1244 write_op &= ~(REQ_FUA | REQ_FLUSH);
1245 lock_buffer(bh);
1151 if (buffer_write_io_error(bh)) { 1246 if (buffer_write_io_error(bh)) {
1152 /* 1247 /*
1153 * Oh, dear. A previous attempt to write the journal 1248 * Oh, dear. A previous attempt to write the journal
@@ -1163,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1163 clear_buffer_write_io_error(bh); 1258 clear_buffer_write_io_error(bh);
1164 set_buffer_uptodate(bh); 1259 set_buffer_uptodate(bh);
1165 } 1260 }
1261 get_bh(bh);
1262 bh->b_end_io = end_buffer_write_sync;
1263 ret = submit_bh(write_op, bh);
1264 wait_on_buffer(bh);
1265 if (buffer_write_io_error(bh)) {
1266 clear_buffer_write_io_error(bh);
1267 set_buffer_uptodate(bh);
1268 ret = -EIO;
1269 }
1270 if (ret) {
1271 printk(KERN_ERR "JBD2: Error %d detected when updating "
1272 "journal superblock for %s.\n", ret,
1273 journal->j_devname);
1274 }
1275}
1276
1277/**
1278 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1279 * @journal: The journal to update.
1280 * @tail_tid: TID of the new transaction at the tail of the log
1281 * @tail_block: The first block of the transaction at the tail of the log
 1282 * @write_op: The block I/O operation to use when writing the journal sb
1283 *
1284 * Update a journal's superblock information about log tail and write it to
1285 * disk, waiting for the IO to complete.
1286 */
1287void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1288 unsigned long tail_block, int write_op)
1289{
1290 journal_superblock_t *sb = journal->j_superblock;
1291
1292 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1293 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1294 tail_block, tail_tid);
1295
1296 sb->s_sequence = cpu_to_be32(tail_tid);
1297 sb->s_start = cpu_to_be32(tail_block);
1298
1299 jbd2_write_superblock(journal, write_op);
1300
1301 /* Log is no longer empty */
1302 write_lock(&journal->j_state_lock);
1303 WARN_ON(!sb->s_sequence);
1304 journal->j_flags &= ~JBD2_FLUSHED;
1305 write_unlock(&journal->j_state_lock);
1306}
1307
1308/**
1309 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1310 * @journal: The journal to update.
1311 *
1312 * Update a journal's dynamic superblock fields to show that journal is empty.
1313 * Write updated superblock to disk waiting for IO to complete.
1314 */
1315static void jbd2_mark_journal_empty(journal_t *journal)
1316{
1317 journal_superblock_t *sb = journal->j_superblock;
1166 1318
1319 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1167 read_lock(&journal->j_state_lock); 1320 read_lock(&journal->j_state_lock);
1168 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", 1321 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
1169 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1322 journal->j_tail_sequence);
1170 1323
1171 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1324 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1172 sb->s_start = cpu_to_be32(journal->j_tail); 1325 sb->s_start = cpu_to_be32(0);
1173 sb->s_errno = cpu_to_be32(journal->j_errno);
1174 read_unlock(&journal->j_state_lock); 1326 read_unlock(&journal->j_state_lock);
1175 1327
1176 BUFFER_TRACE(bh, "marking dirty"); 1328 jbd2_write_superblock(journal, WRITE_FUA);
1177 mark_buffer_dirty(bh);
1178 if (wait) {
1179 sync_dirty_buffer(bh);
1180 if (buffer_write_io_error(bh)) {
1181 printk(KERN_ERR "JBD2: I/O error detected "
1182 "when updating journal superblock for %s.\n",
1183 journal->j_devname);
1184 clear_buffer_write_io_error(bh);
1185 set_buffer_uptodate(bh);
1186 }
1187 } else
1188 write_dirty_buffer(bh, WRITE);
1189
1190out:
1191 /* If we have just flushed the log (by marking s_start==0), then
1192 * any future commit will have to be careful to update the
1193 * superblock again to re-record the true start of the log. */
1194 1329
 1330 /* Log is empty */
1195 write_lock(&journal->j_state_lock); 1331 write_lock(&journal->j_state_lock);
1196 if (sb->s_start) 1332 journal->j_flags |= JBD2_FLUSHED;
1197 journal->j_flags &= ~JBD2_FLUSHED;
1198 else
1199 journal->j_flags |= JBD2_FLUSHED;
1200 write_unlock(&journal->j_state_lock); 1333 write_unlock(&journal->j_state_lock);
1201} 1334}
1202 1335
1336
1337/**
1338 * jbd2_journal_update_sb_errno() - Update error in the journal.
1339 * @journal: The journal to update.
1340 *
1341 * Update a journal's errno. Write updated superblock to disk waiting for IO
1342 * to complete.
1343 */
1344static void jbd2_journal_update_sb_errno(journal_t *journal)
1345{
1346 journal_superblock_t *sb = journal->j_superblock;
1347
1348 read_lock(&journal->j_state_lock);
1349 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1350 journal->j_errno);
1351 sb->s_errno = cpu_to_be32(journal->j_errno);
1352 read_unlock(&journal->j_state_lock);
1353
1354 jbd2_write_superblock(journal, WRITE_SYNC);
1355}
1356
1203/* 1357/*
1204 * Read the superblock for a given journal, performing initial 1358 * Read the superblock for a given journal, performing initial
1205 * validation of the format. 1359 * validation of the format.
1206 */ 1360 */
1207
1208static int journal_get_superblock(journal_t *journal) 1361static int journal_get_superblock(journal_t *journal)
1209{ 1362{
1210 struct buffer_head *bh; 1363 struct buffer_head *bh;
@@ -1398,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal)
1398 1551
1399 if (journal->j_sb_buffer) { 1552 if (journal->j_sb_buffer) {
1400 if (!is_journal_aborted(journal)) { 1553 if (!is_journal_aborted(journal)) {
1401 /* We can now mark the journal as empty. */ 1554 mutex_lock(&journal->j_checkpoint_mutex);
1402 journal->j_tail = 0; 1555 jbd2_mark_journal_empty(journal);
1403 journal->j_tail_sequence = 1556 mutex_unlock(&journal->j_checkpoint_mutex);
1404 ++journal->j_transaction_sequence; 1557 } else
1405 jbd2_journal_update_superblock(journal, 1);
1406 } else {
1407 err = -EIO; 1558 err = -EIO;
1408 }
1409 brelse(journal->j_sb_buffer); 1559 brelse(journal->j_sb_buffer);
1410 } 1560 }
1411 1561
@@ -1552,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1552EXPORT_SYMBOL(jbd2_journal_clear_features); 1702EXPORT_SYMBOL(jbd2_journal_clear_features);
1553 1703
1554/** 1704/**
1555 * int jbd2_journal_update_format () - Update on-disk journal structure.
1556 * @journal: Journal to act on.
1557 *
1558 * Given an initialised but unloaded journal struct, poke about in the
1559 * on-disk structure to update it to the most recent supported version.
1560 */
1561int jbd2_journal_update_format (journal_t *journal)
1562{
1563 journal_superblock_t *sb;
1564 int err;
1565
1566 err = journal_get_superblock(journal);
1567 if (err)
1568 return err;
1569
1570 sb = journal->j_superblock;
1571
1572 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1573 case JBD2_SUPERBLOCK_V2:
1574 return 0;
1575 case JBD2_SUPERBLOCK_V1:
1576 return journal_convert_superblock_v1(journal, sb);
1577 default:
1578 break;
1579 }
1580 return -EINVAL;
1581}
1582
1583static int journal_convert_superblock_v1(journal_t *journal,
1584 journal_superblock_t *sb)
1585{
1586 int offset, blocksize;
1587 struct buffer_head *bh;
1588
1589 printk(KERN_WARNING
1590 "JBD2: Converting superblock from version 1 to 2.\n");
1591
1592 /* Pre-initialise new fields to zero */
1593 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1594 blocksize = be32_to_cpu(sb->s_blocksize);
1595 memset(&sb->s_feature_compat, 0, blocksize-offset);
1596
1597 sb->s_nr_users = cpu_to_be32(1);
1598 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1599 journal->j_format_version = 2;
1600
1601 bh = journal->j_sb_buffer;
1602 BUFFER_TRACE(bh, "marking dirty");
1603 mark_buffer_dirty(bh);
1604 sync_dirty_buffer(bh);
1605 return 0;
1606}
1607
1608
1609/**
1610 * int jbd2_journal_flush () - Flush journal 1705 * int jbd2_journal_flush () - Flush journal
1611 * @journal: Journal to act on. 1706 * @journal: Journal to act on.
1612 * 1707 *
@@ -1619,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal)
1619{ 1714{
1620 int err = 0; 1715 int err = 0;
1621 transaction_t *transaction = NULL; 1716 transaction_t *transaction = NULL;
1622 unsigned long old_tail;
1623 1717
1624 write_lock(&journal->j_state_lock); 1718 write_lock(&journal->j_state_lock);
1625 1719
@@ -1654,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal)
1654 if (is_journal_aborted(journal)) 1748 if (is_journal_aborted(journal))
1655 return -EIO; 1749 return -EIO;
1656 1750
1751 mutex_lock(&journal->j_checkpoint_mutex);
1657 jbd2_cleanup_journal_tail(journal); 1752 jbd2_cleanup_journal_tail(journal);
1658 1753
1659 /* Finally, mark the journal as really needing no recovery. 1754 /* Finally, mark the journal as really needing no recovery.
@@ -1661,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal)
1661 * the magic code for a fully-recovered superblock. Any future 1756 * the magic code for a fully-recovered superblock. Any future
1662 * commits of data to the journal will restore the current 1757 * commits of data to the journal will restore the current
1663 * s_start value. */ 1758 * s_start value. */
1759 jbd2_mark_journal_empty(journal);
1760 mutex_unlock(&journal->j_checkpoint_mutex);
1664 write_lock(&journal->j_state_lock); 1761 write_lock(&journal->j_state_lock);
1665 old_tail = journal->j_tail;
1666 journal->j_tail = 0;
1667 write_unlock(&journal->j_state_lock);
1668 jbd2_journal_update_superblock(journal, 1);
1669 write_lock(&journal->j_state_lock);
1670 journal->j_tail = old_tail;
1671
1672 J_ASSERT(!journal->j_running_transaction); 1762 J_ASSERT(!journal->j_running_transaction);
1673 J_ASSERT(!journal->j_committing_transaction); 1763 J_ASSERT(!journal->j_committing_transaction);
1674 J_ASSERT(!journal->j_checkpoint_transactions); 1764 J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1708,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1708 write ? "Clearing" : "Ignoring"); 1798 write ? "Clearing" : "Ignoring");
1709 1799
1710 err = jbd2_journal_skip_recovery(journal); 1800 err = jbd2_journal_skip_recovery(journal);
1711 if (write) 1801 if (write) {
1712 jbd2_journal_update_superblock(journal, 1); 1802 /* Lock to make assertions happy... */
1803 mutex_lock(&journal->j_checkpoint_mutex);
1804 jbd2_mark_journal_empty(journal);
1805 mutex_unlock(&journal->j_checkpoint_mutex);
1806 }
1713 1807
1714 no_recovery: 1808 no_recovery:
1715 return err; 1809 return err;
@@ -1759,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1759 __jbd2_journal_abort_hard(journal); 1853 __jbd2_journal_abort_hard(journal);
1760 1854
1761 if (errno) 1855 if (errno)
1762 jbd2_journal_update_superblock(journal, 1); 1856 jbd2_journal_update_sb_errno(journal);
1763} 1857}
1764 1858
1765/** 1859/**
@@ -2017,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
2017static atomic_t nr_journal_heads = ATOMIC_INIT(0); 2111static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2018#endif 2112#endif
2019 2113
2020static int journal_init_jbd2_journal_head_cache(void) 2114static int jbd2_journal_init_journal_head_cache(void)
2021{ 2115{
2022 int retval; 2116 int retval;
2023 2117
@@ -2035,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2035 return retval; 2129 return retval;
2036} 2130}
2037 2131
2038static void jbd2_journal_destroy_jbd2_journal_head_cache(void) 2132static void jbd2_journal_destroy_journal_head_cache(void)
2039{ 2133{
2040 if (jbd2_journal_head_cache) { 2134 if (jbd2_journal_head_cache) {
2041 kmem_cache_destroy(jbd2_journal_head_cache); 2135 kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2323,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2323 2417
2324struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; 2418struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2325 2419
2326static int __init journal_init_handle_cache(void) 2420static int __init jbd2_journal_init_handle_cache(void)
2327{ 2421{
2328 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); 2422 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2329 if (jbd2_handle_cache == NULL) { 2423 if (jbd2_handle_cache == NULL) {
@@ -2358,17 +2452,20 @@ static int __init journal_init_caches(void)
2358 2452
2359 ret = jbd2_journal_init_revoke_caches(); 2453 ret = jbd2_journal_init_revoke_caches();
2360 if (ret == 0) 2454 if (ret == 0)
2361 ret = journal_init_jbd2_journal_head_cache(); 2455 ret = jbd2_journal_init_journal_head_cache();
2456 if (ret == 0)
2457 ret = jbd2_journal_init_handle_cache();
2362 if (ret == 0) 2458 if (ret == 0)
2363 ret = journal_init_handle_cache(); 2459 ret = jbd2_journal_init_transaction_cache();
2364 return ret; 2460 return ret;
2365} 2461}
2366 2462
2367static void jbd2_journal_destroy_caches(void) 2463static void jbd2_journal_destroy_caches(void)
2368{ 2464{
2369 jbd2_journal_destroy_revoke_caches(); 2465 jbd2_journal_destroy_revoke_caches();
2370 jbd2_journal_destroy_jbd2_journal_head_cache(); 2466 jbd2_journal_destroy_journal_head_cache();
2371 jbd2_journal_destroy_handle_cache(); 2467 jbd2_journal_destroy_handle_cache();
2468 jbd2_journal_destroy_transaction_cache();
2372 jbd2_journal_destroy_slabs(); 2469 jbd2_journal_destroy_slabs();
2373} 2470}
2374 2471
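
Both jbd2_journal_get_log_tail() and jbd2_update_log_tail() above decide whether the tail can move using tid_gt(). Transaction IDs are 32-bit counters that wrap, so jbd2 compares them by signed difference rather than with a plain less-than. A userspace sketch of that comparison (tid_gt() mirrors the helper in include/linux/jbd2.h; the demo values are illustrative):

	#include <stdio.h>

	typedef unsigned int tid_t;

	/* Wrap-safe "x is newer than y", as in include/linux/jbd2.h. */
	static int tid_gt(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference > 0;
	}

	int main(void)
	{
		printf("%d\n", tid_gt(10, 5));		/* 1 */
		printf("%d\n", tid_gt(5, 10));		/* 0 */
		/* Near wrap-around: 3 is newer than 0xfffffffe. */
		printf("%d\n", tid_gt(3, 0xfffffffeU));	/* 1 */
		return 0;
	}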
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf1390..c1a03354a22f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/blkdev.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
265 err2 = sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
266 if (!err) 267 if (!err)
267 err = err2; 268 err = err2;
268 269 /* Make sure all replayed data is on permanent storage */
270 if (journal->j_flags & JBD2_BARRIER)
271 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
269 return err; 272 return err;
270} 273}
271 274
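
The recovery hunk above follows sync_blockdev() with blkdev_issue_flush() so that replayed blocks reach stable media before the journal tail is advanced; otherwise a power cut right after recovery could leave the replayed data only in the drive's volatile cache. A rough userspace analogue of the same write-then-flush discipline (plain POSIX; on Linux, fsync() on most filesystems also triggers the device cache flush):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char buf[] = "replayed block contents";
		int fd = open("replayed.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf))
			return 1;
		/* Don't declare recovery finished until the data is durable. */
		if (fsync(fd) != 0) {
			perror("fsync");
			return 1;
		}
		close(fd);
		puts("safe to advance the journal tail");
		return 0;
	}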
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc9..6973705d6a3d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)
208 J_ASSERT(!jbd2_revoke_record_cache); 208 J_ASSERT(!jbd2_revoke_record_cache);
209 J_ASSERT(!jbd2_revoke_table_cache); 209 J_ASSERT(!jbd2_revoke_table_cache);
210 210
211 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 211 jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
212 sizeof(struct jbd2_revoke_record_s), 212 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
213 0,
214 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
215 NULL);
216 if (!jbd2_revoke_record_cache) 213 if (!jbd2_revoke_record_cache)
217 goto record_cache_failure; 214 goto record_cache_failure;
218 215
219 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 216 jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
220 sizeof(struct jbd2_revoke_table_s), 217 SLAB_TEMPORARY);
221 0, SLAB_TEMPORARY, NULL);
222 if (!jbd2_revoke_table_cache) 218 if (!jbd2_revoke_table_cache)
223 goto table_cache_failure; 219 goto table_cache_failure;
224 return 0; 220 return 0;
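
The revoke caches above are now created with the KMEM_CACHE() convenience macro, which derives the cache name, object size, and alignment from the struct type. For reference, the macro in include/linux/slab.h of this era reads roughly as follows (note the stringified struct name means the caches are now called "jbd2_revoke_record_s" and "jbd2_revoke_table_s"):

	#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
			sizeof(struct __struct), __alignof__(struct __struct),\
			(__flags), NULL)

Besides being shorter, this passes __alignof__(struct ...) as the alignment, where the old open-coded calls passed 0.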
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5aba56e1fd5..ddcd3549c6c2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34static void __jbd2_journal_unfile_buffer(struct journal_head *jh); 34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35 35
36static struct kmem_cache *transaction_cache;
37int __init jbd2_journal_init_transaction_cache(void)
38{
39 J_ASSERT(!transaction_cache);
40 transaction_cache = kmem_cache_create("jbd2_transaction_s",
41 sizeof(transaction_t),
42 0,
43 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
44 NULL);
45 if (transaction_cache)
46 return 0;
47 return -ENOMEM;
48}
49
50void jbd2_journal_destroy_transaction_cache(void)
51{
52 if (transaction_cache) {
53 kmem_cache_destroy(transaction_cache);
54 transaction_cache = NULL;
55 }
56}
57
58void jbd2_journal_free_transaction(transaction_t *transaction)
59{
60 if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61 return;
62 kmem_cache_free(transaction_cache, transaction);
63}
64
36/* 65/*
37 * jbd2_get_transaction: obtain a new transaction_t object. 66 * jbd2_get_transaction: obtain a new transaction_t object.
38 * 67 *
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
133 162
134alloc_transaction: 163alloc_transaction:
135 if (!journal->j_running_transaction) { 164 if (!journal->j_running_transaction) {
136 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); 165 new_transaction = kmem_cache_alloc(transaction_cache,
166 gfp_mask | __GFP_ZERO);
137 if (!new_transaction) { 167 if (!new_transaction) {
138 /* 168 /*
139 * If __GFP_FS is not present, then we may be 169 * If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@ repeat:
162 if (is_journal_aborted(journal) || 192 if (is_journal_aborted(journal) ||
163 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 193 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
164 read_unlock(&journal->j_state_lock); 194 read_unlock(&journal->j_state_lock);
165 kfree(new_transaction); 195 jbd2_journal_free_transaction(new_transaction);
166 return -EROFS; 196 return -EROFS;
167 } 197 }
168 198
@@ -284,7 +314,7 @@ repeat:
284 read_unlock(&journal->j_state_lock); 314 read_unlock(&journal->j_state_lock);
285 315
286 lock_map_acquire(&handle->h_lockdep_map); 316 lock_map_acquire(&handle->h_lockdep_map);
287 kfree(new_transaction); 317 jbd2_journal_free_transaction(new_transaction);
288 return 0; 318 return 0;
289} 319}
290 320
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1549 * of these pointers, it could go bad. Generally the caller needs to re-read 1579 * of these pointers, it could go bad. Generally the caller needs to re-read
1550 * the pointer from the transaction_t. 1580 * the pointer from the transaction_t.
1551 * 1581 *
1552 * Called under j_list_lock. The journal may not be locked. 1582 * Called under j_list_lock.
1553 */ 1583 */
1554void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 1584static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1555{ 1585{
1556 struct journal_head **list = NULL; 1586 struct journal_head **list = NULL;
1557 transaction_t *transaction; 1587 transaction_t *transaction;
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1646 spin_lock(&journal->j_list_lock); 1676 spin_lock(&journal->j_list_lock);
1647 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1677 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1648 /* written-back checkpointed metadata buffer */ 1678 /* written-back checkpointed metadata buffer */
1649 if (jh->b_jlist == BJ_None) { 1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1650 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1680 __jbd2_journal_remove_checkpoint(jh);
1651 __jbd2_journal_remove_checkpoint(jh);
1652 }
1653 } 1681 }
1654 spin_unlock(&journal->j_list_lock); 1682 spin_unlock(&journal->j_list_lock);
1655out: 1683out:
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked:
1949 clear_buffer_mapped(bh); 1977 clear_buffer_mapped(bh);
1950 clear_buffer_req(bh); 1978 clear_buffer_req(bh);
1951 clear_buffer_new(bh); 1979 clear_buffer_new(bh);
1980 clear_buffer_delay(bh);
1981 clear_buffer_unwritten(bh);
1952 bh->b_bdev = NULL; 1982 bh->b_bdev = NULL;
1953 return may_free; 1983 return may_free;
1954} 1984}
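
The transaction_cache changes above replace kzalloc()/kfree() with a dedicated slab: allocations pass __GFP_ZERO to keep kzalloc()'s zeroed-memory semantics, and jbd2_journal_free_transaction() tolerates NULL via ZERO_OR_NULL_PTR() so callers such as start_this_handle() can free unconditionally. A userspace sketch of the pattern, with calloc()/free() standing in for the slab cache (all names below are illustrative):

	#include <stdio.h>
	#include <stdlib.h>

	struct transaction { int tid; /* ... */ };

	/* Zeroed allocation, like kmem_cache_alloc(cache, gfp | __GFP_ZERO). */
	static struct transaction *transaction_alloc(void)
	{
		return calloc(1, sizeof(struct transaction));
	}

	/* NULL-tolerant free, like jbd2_journal_free_transaction(). */
	static void transaction_free(struct transaction *t)
	{
		if (t == NULL)		/* stands in for ZERO_OR_NULL_PTR() */
			return;
		free(t);
	}

	int main(void)
	{
		struct transaction *t = transaction_alloc();

		transaction_free(t);
		transaction_free(NULL);	/* safe: the guard makes this a no-op */
		puts("ok");
		return 0;
	}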