aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGirish Shilamkar <girish@clusterfs.com>2008-01-28 23:58:27 -0500
committerTheodore Ts'o <tytso@mit.edu>2008-01-28 23:58:27 -0500
commit818d276ceb83aa9fdebb5e0a53188290312de987 (patch)
treede3fb4ffadd72caea2876c5232ce76cd14b3646e
parent8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 (diff)
ext4: Add the journal checksum feature
The journal checksum feature adds two new flags, i.e. JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM. The JBD2_FEATURE_COMPAT_CHECKSUM flag indicates that the commit block contains the checksum for the blocks described by the descriptor blocks. Due to checksums, writing of the commit record no longer needs to be synchronous. Now the commit record can be sent to disk without waiting for the descriptor blocks to be written to disk. This behavior is controlled using the JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT flag. Older kernels/e2fsck should not be able to recover the journal with _ASYNC_COMMIT, hence it is made incompat. The commit header has been extended to hold the checksum along with the type of the checksum. During recovery, checksums are verified in the scan pass (PASS_SCAN) to ensure the sanity and completeness (in case of _ASYNC_COMMIT) of every transaction. Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Girish Shilamkar <girish@clusterfs.com> Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com>
-rw-r--r--Documentation/filesystems/ext4.txt10
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/ext4/super.c25
-rw-r--r--fs/jbd2/commit.c198
-rw-r--r--fs/jbd2/journal.c26
-rw-r--r--fs/jbd2/recovery.c151
-rw-r--r--include/linux/ext4_fs.h3
-rw-r--r--include/linux/jbd2.h36
8 files changed, 388 insertions, 62 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 6a4adcae9f9a..4f329afe20ec 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -89,6 +89,16 @@ When mounting an ext4 filesystem, the following option are accepted:
89extents ext4 will use extents to address file data. The 89extents ext4 will use extents to address file data. The
90 file system will no longer be mountable by ext3. 90 file system will no longer be mountable by ext3.
91 91
92journal_checksum Enable checksumming of the journal transactions.
93 This will allow the recovery code in e2fsck and the
94 kernel to detect corruption in the journal. It is a
95 compatible change and will be ignored by older kernels.
96
97journal_async_commit Commit block can be written to disk without waiting
98 for descriptor blocks. If enabled older kernels cannot
99 mount the device. This will enable 'journal_checksum'
100 internally.
101
92journal=update Update the ext4 file system's journal to the current 102journal=update Update the ext4 file system's journal to the current
93 format. 103 format.
94 104
diff --git a/fs/Kconfig b/fs/Kconfig
index 9656139d2e99..219ec06a8c7e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -236,6 +236,7 @@ config JBD_DEBUG
236 236
237config JBD2 237config JBD2
238 tristate 238 tristate
239 select CRC32
239 help 240 help
240 This is a generic journaling layer for block devices that support 241 This is a generic journaling layer for block devices that support
241 both 32-bit and 64-bit block numbers. It is currently used by 242 both 32-bit and 64-bit block numbers. It is currently used by
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7305443e100..f7479d30735e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -869,6 +869,7 @@ enum {
869 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 869 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
870 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 870 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
871 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 871 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
872 Opt_journal_checksum, Opt_journal_async_commit,
872 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 873 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
873 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 874 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
874 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 875 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
@@ -908,6 +909,8 @@ static match_table_t tokens = {
908 {Opt_journal_update, "journal=update"}, 909 {Opt_journal_update, "journal=update"},
909 {Opt_journal_inum, "journal=%u"}, 910 {Opt_journal_inum, "journal=%u"},
910 {Opt_journal_dev, "journal_dev=%u"}, 911 {Opt_journal_dev, "journal_dev=%u"},
912 {Opt_journal_checksum, "journal_checksum"},
913 {Opt_journal_async_commit, "journal_async_commit"},
911 {Opt_abort, "abort"}, 914 {Opt_abort, "abort"},
912 {Opt_data_journal, "data=journal"}, 915 {Opt_data_journal, "data=journal"},
913 {Opt_data_ordered, "data=ordered"}, 916 {Opt_data_ordered, "data=ordered"},
@@ -1095,6 +1098,13 @@ static int parse_options (char *options, struct super_block *sb,
1095 return 0; 1098 return 0;
1096 *journal_devnum = option; 1099 *journal_devnum = option;
1097 break; 1100 break;
1101 case Opt_journal_checksum:
1102 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1103 break;
1104 case Opt_journal_async_commit:
1105 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1106 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1107 break;
1098 case Opt_noload: 1108 case Opt_noload:
1099 set_opt (sbi->s_mount_opt, NOLOAD); 1109 set_opt (sbi->s_mount_opt, NOLOAD);
1100 break; 1110 break;
@@ -2114,6 +2124,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2114 goto failed_mount4; 2124 goto failed_mount4;
2115 } 2125 }
2116 2126
2127 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2128 jbd2_journal_set_features(sbi->s_journal,
2129 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2130 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2131 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2132 jbd2_journal_set_features(sbi->s_journal,
2133 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2134 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2135 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2136 } else {
2137 jbd2_journal_clear_features(sbi->s_journal,
2138 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2140 }
2141
2117 /* We have now updated the journal if required, so we can 2142 /* We have now updated the journal if required, so we can
2118 * validate the data journaling mode. */ 2143 * validate the data journaling mode. */
2119 switch (test_opt(sb, DATA_FLAGS)) { 2144 switch (test_opt(sb, DATA_FLAGS)) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8749a86f4175..da8d0eb3b7b9 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h>
24 25
25/* 26/*
26 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
93 return 1; 94 return 1;
94} 95}
95 96
96/* Done it all: now write the commit record. We should have 97/*
98 * Done it all: now submit the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort 99 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write 100 * mode we can now just skip the rest of the journal write
99 * entirely. 101 * entirely.
100 * 102 *
101 * Returns 1 if the journal needs to be aborted or 0 on success 103 * Returns 1 if the journal needs to be aborted or 0 on success
102 */ 104 */
103static int journal_write_commit_record(journal_t *journal, 105static int journal_submit_commit_record(journal_t *journal,
104 transaction_t *commit_transaction) 106 transaction_t *commit_transaction,
107 struct buffer_head **cbh,
108 __u32 crc32_sum)
105{ 109{
106 struct journal_head *descriptor; 110 struct journal_head *descriptor;
111 struct commit_header *tmp;
107 struct buffer_head *bh; 112 struct buffer_head *bh;
108 int i, ret; 113 int ret;
109 int barrier_done = 0; 114 int barrier_done = 0;
110 115
111 if (is_journal_aborted(journal)) 116 if (is_journal_aborted(journal))
@@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal,
117 122
118 bh = jh2bh(descriptor); 123 bh = jh2bh(descriptor);
119 124
120 /* AKPM: buglet - add `i' to tmp! */ 125 tmp = (struct commit_header *)bh->b_data;
121 for (i = 0; i < bh->b_size; i += 512) { 126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
122 journal_header_t *tmp = (journal_header_t*)bh->b_data; 127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 129
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 130 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) {
132 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
133 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
134 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
126 } 135 }
127 136
128 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "submit commit block");
138 lock_buffer(bh);
139
129 set_buffer_dirty(bh); 140 set_buffer_dirty(bh);
130 if (journal->j_flags & JBD2_BARRIER) { 141 set_buffer_uptodate(bh);
142 bh->b_end_io = journal_end_buffer_io_sync;
143
144 if (journal->j_flags & JBD2_BARRIER &&
145 !JBD2_HAS_COMPAT_FEATURE(journal,
146 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
131 set_buffer_ordered(bh); 147 set_buffer_ordered(bh);
132 barrier_done = 1; 148 barrier_done = 1;
133 } 149 }
134 ret = sync_dirty_buffer(bh); 150 ret = submit_bh(WRITE, bh);
151
135 /* is it possible for another commit to fail at roughly 152 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to 153 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want 154 * trust the barrier flag in the super, but instead want
@@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal,
152 clear_buffer_ordered(bh); 169 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh); 170 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh); 171 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh); 172 ret = submit_bh(WRITE, bh);
156 } 173 }
157 put_bh(bh); /* One for getblk() */ 174 *cbh = bh;
158 jbd2_journal_put_journal_head(descriptor); 175 return ret;
176}
177
178/*
179 * This function, together with journal_submit_commit_record(),
180 * allows the commit record to be written asynchronously.
181 */
182static int journal_wait_on_commit_record(struct buffer_head *bh)
183{
184 int ret = 0;
185
186 clear_buffer_dirty(bh);
187 wait_on_buffer(bh);
159 188
160 return (ret == -EIO); 189 if (unlikely(!buffer_uptodate(bh)))
190 ret = -EIO;
191 put_bh(bh); /* One for getblk() */
192 jbd2_journal_put_journal_head(bh2jh(bh));
193
194 return ret;
161} 195}
162 196
197/*
198 * Wait for all submitted IO to complete.
199 */
200static int journal_wait_on_locked_list(journal_t *journal,
201 transaction_t *commit_transaction)
202{
203 int ret = 0;
204 struct journal_head *jh;
205
206 while (commit_transaction->t_locked_list) {
207 struct buffer_head *bh;
208
209 jh = commit_transaction->t_locked_list->b_tprev;
210 bh = jh2bh(jh);
211 get_bh(bh);
212 if (buffer_locked(bh)) {
213 spin_unlock(&journal->j_list_lock);
214 wait_on_buffer(bh);
215 if (unlikely(!buffer_uptodate(bh)))
216 ret = -EIO;
217 spin_lock(&journal->j_list_lock);
218 }
219 if (!inverted_lock(journal, bh)) {
220 put_bh(bh);
221 spin_lock(&journal->j_list_lock);
222 continue;
223 }
224 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
225 __jbd2_journal_unfile_buffer(jh);
226 jbd_unlock_bh_state(bh);
227 jbd2_journal_remove_journal_head(bh);
228 put_bh(bh);
229 } else {
230 jbd_unlock_bh_state(bh);
231 }
232 put_bh(bh);
233 cond_resched_lock(&journal->j_list_lock);
234 }
235 return ret;
236 }
237
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 238static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{ 239{
165 int i; 240 int i;
@@ -275,7 +350,21 @@ write_out_data:
275 journal_do_submit_data(wbuf, bufs); 350 journal_do_submit_data(wbuf, bufs);
276} 351}
277 352
278static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 353static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
354{
355 struct page *page = bh->b_page;
356 char *addr;
357 __u32 checksum;
358
359 addr = kmap_atomic(page, KM_USER0);
360 checksum = crc32_be(crc32_sum,
361 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
362 kunmap_atomic(addr, KM_USER0);
363
364 return checksum;
365}
366
367static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
279 unsigned long long block) 368 unsigned long long block)
280{ 369{
281 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 370 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
@@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
307 int tag_flag; 396 int tag_flag;
308 int i; 397 int i;
309 int tag_bytes = journal_tag_bytes(journal); 398 int tag_bytes = journal_tag_bytes(journal);
399 struct buffer_head *cbh = NULL; /* For transactional checksums */
400 __u32 crc32_sum = ~0;
310 401
311 /* 402 /*
312 * First job: lock down the current transaction and wait for 403 * First job: lock down the current transaction and wait for
@@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
451 journal_submit_data_buffers(journal, commit_transaction); 542 journal_submit_data_buffers(journal, commit_transaction);
452 543
453 /* 544 /*
454 * Wait for all previously submitted IO to complete. 545 * Wait for all previously submitted IO to complete if commit
546 * record is to be written synchronously.
455 */ 547 */
456 spin_lock(&journal->j_list_lock); 548 spin_lock(&journal->j_list_lock);
457 while (commit_transaction->t_locked_list) { 549 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
458 struct buffer_head *bh; 550 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
551 err = journal_wait_on_locked_list(journal,
552 commit_transaction);
459 553
460 jh = commit_transaction->t_locked_list->b_tprev;
461 bh = jh2bh(jh);
462 get_bh(bh);
463 if (buffer_locked(bh)) {
464 spin_unlock(&journal->j_list_lock);
465 wait_on_buffer(bh);
466 if (unlikely(!buffer_uptodate(bh)))
467 err = -EIO;
468 spin_lock(&journal->j_list_lock);
469 }
470 if (!inverted_lock(journal, bh)) {
471 put_bh(bh);
472 spin_lock(&journal->j_list_lock);
473 continue;
474 }
475 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
476 __jbd2_journal_unfile_buffer(jh);
477 jbd_unlock_bh_state(bh);
478 jbd2_journal_remove_journal_head(bh);
479 put_bh(bh);
480 } else {
481 jbd_unlock_bh_state(bh);
482 }
483 put_bh(bh);
484 cond_resched_lock(&journal->j_list_lock);
485 }
486 spin_unlock(&journal->j_list_lock); 554 spin_unlock(&journal->j_list_lock);
487 555
488 if (err) 556 if (err)
@@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
656start_journal_io: 724start_journal_io:
657 for (i = 0; i < bufs; i++) { 725 for (i = 0; i < bufs; i++) {
658 struct buffer_head *bh = wbuf[i]; 726 struct buffer_head *bh = wbuf[i];
727 /*
728 * Compute checksum.
729 */
730 if (JBD2_HAS_COMPAT_FEATURE(journal,
731 JBD2_FEATURE_COMPAT_CHECKSUM)) {
732 crc32_sum =
733 jbd2_checksum_data(crc32_sum, bh);
734 }
735
659 lock_buffer(bh); 736 lock_buffer(bh);
660 clear_buffer_dirty(bh); 737 clear_buffer_dirty(bh);
661 set_buffer_uptodate(bh); 738 set_buffer_uptodate(bh);
@@ -672,6 +749,23 @@ start_journal_io:
672 } 749 }
673 } 750 }
674 751
752 /* Done it all: now write the commit record asynchronously. */
753
754 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
755 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
756 err = journal_submit_commit_record(journal, commit_transaction,
757 &cbh, crc32_sum);
758 if (err)
759 __jbd2_journal_abort_hard(journal);
760
761 spin_lock(&journal->j_list_lock);
762 err = journal_wait_on_locked_list(journal,
763 commit_transaction);
764 spin_unlock(&journal->j_list_lock);
765 if (err)
766 __jbd2_journal_abort_hard(journal);
767 }
768
675 /* Lo and behold: we have just managed to send a transaction to 769 /* Lo and behold: we have just managed to send a transaction to
676 the log. Before we can commit it, wait for the IO so far to 770 the log. Before we can commit it, wait for the IO so far to
677 complete. Control buffers being written are on the 771 complete. Control buffers being written are on the
@@ -771,8 +865,14 @@ wait_for_iobuf:
771 865
772 jbd_debug(3, "JBD: commit phase 6\n"); 866 jbd_debug(3, "JBD: commit phase 6\n");
773 867
774 if (journal_write_commit_record(journal, commit_transaction)) 868 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
775 err = -EIO; 869 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
870 err = journal_submit_commit_record(journal, commit_transaction,
871 &cbh, crc32_sum);
872 if (err)
873 __jbd2_journal_abort_hard(journal);
874 }
875 err = journal_wait_on_commit_record(cbh);
776 876
777 if (err) 877 if (err)
778 jbd2_journal_abort(journal, err); 878 jbd2_journal_abort(journal, err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3667c91bc786..59ba2494dcaf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1578,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1578 return 1; 1578 return 1;
1579} 1579}
1580 1580
1581/*
1582 * jbd2_journal_clear_features () - Clear a given journal feature in the
1583 * superblock
1584 * @journal: Journal to act on.
1585 * @compat: bitmask of compatible features
1586 * @ro: bitmask of features that force read-only mount
1587 * @incompat: bitmask of incompatible features
1588 *
1589 * Clear the given journal features from the
1590 * superblock.
1591 */
1592void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1593 unsigned long ro, unsigned long incompat)
1594{
1595 journal_superblock_t *sb;
1596
1597 jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1598 compat, ro, incompat);
1599
1600 sb = journal->j_superblock;
1601
1602 sb->s_feature_compat &= ~cpu_to_be32(compat);
1603 sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1604 sb->s_feature_incompat &= ~cpu_to_be32(incompat);
1605}
1606EXPORT_SYMBOL(jbd2_journal_clear_features);
1581 1607
1582/** 1608/**
1583 * int jbd2_journal_update_format () - Update on-disk journal structure. 1609 * int jbd2_journal_update_format () - Update on-disk journal structure.
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d0ce627539ef..921680663fa2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/crc32.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
316 return block; 317 return block;
317} 318}
318 319
320/*
321 * calc_chksums calculates the checksums for the blocks described in the
322 * descriptor block.
323 */
324static int calc_chksums(journal_t *journal, struct buffer_head *bh,
325 unsigned long *next_log_block, __u32 *crc32_sum)
326{
327 int i, num_blks, err;
328 unsigned long io_block;
329 struct buffer_head *obh;
330
331 num_blks = count_tags(journal, bh);
332 /* Calculate checksum of the descriptor block. */
333 *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
334
335 for (i = 0; i < num_blks; i++) {
336 io_block = (*next_log_block)++;
337 wrap(journal, *next_log_block);
338 err = jread(&obh, journal, io_block);
339 if (err) {
340 printk(KERN_ERR "JBD: IO error %d recovering block "
341 "%lu in log\n", err, io_block);
342 return 1;
343 } else {
344 *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
345 obh->b_size);
346 }
347 }
348 return 0;
349}
350
319static int do_one_pass(journal_t *journal, 351static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass) 352 struct recovery_info *info, enum passtype pass)
321{ 353{
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal,
328 unsigned int sequence; 360 unsigned int sequence;
329 int blocktype; 361 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal); 362 int tag_bytes = journal_tag_bytes(journal);
363 __u32 crc32_sum = ~0; /* Transactional Checksums */
331 364
332 /* Precompute the maximum metadata descriptors in a descriptor block */ 365 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC; 366 int MAX_BLOCKS_PER_DESC;
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal,
419 switch(blocktype) { 452 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK: 453 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it 454 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the 455 * in pass REPLAY; if journal_checksums enabled, then
423 * blocks it describes. */ 456 * calculate checksums in PASS_SCAN, otherwise,
457 * just skip over the blocks it describes. */
424 if (pass != PASS_REPLAY) { 458 if (pass != PASS_REPLAY) {
459 if (pass == PASS_SCAN &&
460 JBD2_HAS_COMPAT_FEATURE(journal,
461 JBD2_FEATURE_COMPAT_CHECKSUM) &&
462 !info->end_transaction) {
463 if (calc_chksums(journal, bh,
464 &next_log_block,
465 &crc32_sum)) {
466 put_bh(bh);
467 break;
468 }
469 put_bh(bh);
470 continue;
471 }
425 next_log_block += count_tags(journal, bh); 472 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block); 473 wrap(journal, next_log_block);
427 brelse(bh); 474 put_bh(bh);
428 continue; 475 continue;
429 } 476 }
430 477
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal,
516 continue; 563 continue;
517 564
518 case JBD2_COMMIT_BLOCK: 565 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to 566 /* How to differentiate between interrupted commit
520 * do other than move on to the next sequence 567 * and journal corruption ?
568 *
569 * {nth transaction}
570 * Checksum Verification Failed
571 * |
572 * ____________________
573 * | |
574 * async_commit sync_commit
575 * | |
576 * | GO TO NEXT "Journal Corruption"
577 * | TRANSACTION
578 * |
579 * {(n+1)th transaction}
580 * |
581 * _______|______________
582 * | |
583 * Commit block found Commit block not found
584 * | |
585 * "Journal Corruption" |
586 * _____________|_________
587 * | |
588 * nth trans corrupt OR nth trans
589 * and (n+1)th interrupted interrupted
590 * before commit block
591 * could reach the disk.
592 * (Cannot find the difference in above
593 * mentioned conditions. Hence assume
594 * "Interrupted Commit".)
595 */
596
597 /* Found an expected commit block: if checksums
598 * are present verify them in PASS_SCAN; else not
599 * much to do other than move on to the next sequence
521 * number. */ 600 * number. */
601 if (pass == PASS_SCAN &&
602 JBD2_HAS_COMPAT_FEATURE(journal,
603 JBD2_FEATURE_COMPAT_CHECKSUM)) {
604 int chksum_err, chksum_seen;
605 struct commit_header *cbh =
606 (struct commit_header *)bh->b_data;
607 unsigned found_chksum =
608 be32_to_cpu(cbh->h_chksum[0]);
609
610 chksum_err = chksum_seen = 0;
611
612 if (info->end_transaction) {
613 printk(KERN_ERR "JBD: Transaction %u "
614 "found to be corrupt.\n",
615 next_commit_ID - 1);
616 brelse(bh);
617 break;
618 }
619
620 if (crc32_sum == found_chksum &&
621 cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
622 cbh->h_chksum_size ==
623 JBD2_CRC32_CHKSUM_SIZE)
624 chksum_seen = 1;
625 else if (!(cbh->h_chksum_type == 0 &&
626 cbh->h_chksum_size == 0 &&
627 found_chksum == 0 &&
628 !chksum_seen))
629 /*
630 * If fs is mounted using an old kernel and then
631 * kernel with journal_chksum is used then we
632 * get a situation where the journal flag has
633 * checksum flag set but checksums are not
634 * present i.e chksum = 0, in the individual
635 * commit blocks.
636 * Hence to avoid checksum failures, in this
637 * situation, this extra check is added.
638 */
639 chksum_err = 1;
640
641 if (chksum_err) {
642 info->end_transaction = next_commit_ID;
643
644 if (!JBD2_HAS_COMPAT_FEATURE(journal,
645 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
646 printk(KERN_ERR
647 "JBD: Transaction %u "
648 "found to be corrupt.\n",
649 next_commit_ID);
650 brelse(bh);
651 break;
652 }
653 }
654 crc32_sum = ~0;
655 }
522 brelse(bh); 656 brelse(bh);
523 next_commit_ID++; 657 next_commit_ID++;
524 continue; 658 continue;
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal,
554 * transaction marks the end of the valid log. 688 * transaction marks the end of the valid log.
555 */ 689 */
556 690
557 if (pass == PASS_SCAN) 691 if (pass == PASS_SCAN) {
558 info->end_transaction = next_commit_ID; 692 if (!info->end_transaction)
559 else { 693 info->end_transaction = next_commit_ID;
694 } else {
560 /* It's really bad news if different passes end up at 695 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */ 696 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) { 697 if (info->end_transaction != next_commit_ID) {
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 300cc5a5adb9..cd406dba0e64 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -467,7 +467,8 @@ do { \
467#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 467#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
468#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 468#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
469#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ 469#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
470 470#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
471#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
471/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 472/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
472#ifndef _LINUX_EXT2_FS_H 473#ifndef _LINUX_EXT2_FS_H
473#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 474#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 685640036e81..98a2bc5d3e3f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -149,6 +149,28 @@ typedef struct journal_header_s
149 __be32 h_sequence; 149 __be32 h_sequence;
150} journal_header_t; 150} journal_header_t;
151 151
152/*
153 * Checksum types.
154 */
155#define JBD2_CRC32_CHKSUM 1
156#define JBD2_MD5_CHKSUM 2
157#define JBD2_SHA1_CHKSUM 3
158
159#define JBD2_CRC32_CHKSUM_SIZE 4
160
161#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
162/*
163 * Commit block header for storing transactional checksums:
164 */
165struct commit_header {
166 __be32 h_magic;
167 __be32 h_blocktype;
168 __be32 h_sequence;
169 unsigned char h_chksum_type;
170 unsigned char h_chksum_size;
171 unsigned char h_padding[2];
172 __be32 h_chksum[JBD2_CHECKSUM_BYTES];
173};
152 174
153/* 175/*
154 * The block tag: used to describe a single buffer in the journal. 176 * The block tag: used to describe a single buffer in the journal.
@@ -242,14 +264,18 @@ typedef struct journal_superblock_s
242 ((j)->j_format_version >= 2 && \ 264 ((j)->j_format_version >= 2 && \
243 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) 265 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
244 266
245#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 267#define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001
246#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 268
269#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001
270#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002
271#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
247 272
248/* Features known to this kernel version: */ 273/* Features known to this kernel version: */
249#define JBD2_KNOWN_COMPAT_FEATURES 0 274#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM
250#define JBD2_KNOWN_ROCOMPAT_FEATURES 0 275#define JBD2_KNOWN_ROCOMPAT_FEATURES 0
251#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ 276#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \
252 JBD2_FEATURE_INCOMPAT_64BIT) 277 JBD2_FEATURE_INCOMPAT_64BIT | \
278 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)
253 279
254#ifdef __KERNEL__ 280#ifdef __KERNEL__
255 281
@@ -997,6 +1023,8 @@ extern int jbd2_journal_check_available_features
997 (journal_t *, unsigned long, unsigned long, unsigned long); 1023 (journal_t *, unsigned long, unsigned long, unsigned long);
998extern int jbd2_journal_set_features 1024extern int jbd2_journal_set_features
999 (journal_t *, unsigned long, unsigned long, unsigned long); 1025 (journal_t *, unsigned long, unsigned long, unsigned long);
1026extern void jbd2_journal_clear_features
1027 (journal_t *, unsigned long, unsigned long, unsigned long);
1000extern int jbd2_journal_create (journal_t *); 1028extern int jbd2_journal_create (journal_t *);
1001extern int jbd2_journal_load (journal_t *journal); 1029extern int jbd2_journal_load (journal_t *journal);
1002extern void jbd2_journal_destroy (journal_t *); 1030extern void jbd2_journal_destroy (journal_t *);