diff options
author | Girish Shilamkar <girish@clusterfs.com> | 2008-01-28 23:58:27 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-01-28 23:58:27 -0500 |
commit | 818d276ceb83aa9fdebb5e0a53188290312de987 (patch) | |
tree | de3fb4ffadd72caea2876c5232ce76cd14b3646e | |
parent | 8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 (diff) |
ext4: Add the journal checksum feature
The journal checksum feature adds two new flags i.e
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM.
JBD2_FEATURE_CHECKSUM flag indicates that the commit block contains the
checksum for the blocks described by the descriptor blocks.
Due to checksums, writing of the commit record no longer needs to be
synchronous. Now commit record can be sent to disk without waiting for
descriptor blocks to be written to disk. This behavior is controlled
using JBD2_FEATURE_ASYNC_COMMIT flag. Older kernels/e2fsck should not be
able to recover the journal with _ASYNC_COMMIT hence it is made
incompat.
The commit header has been extended to hold the checksum along with the
type of the checksum.
For recovery in pass scan checksums are verified to ensure the sanity
and completeness(in case of _ASYNC_COMMIT) of every transaction.
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Girish Shilamkar <girish@clusterfs.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
-rw-r--r-- | Documentation/filesystems/ext4.txt | 10 | ||||
-rw-r--r-- | fs/Kconfig | 1 | ||||
-rw-r--r-- | fs/ext4/super.c | 25 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 198 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 26 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 151 | ||||
-rw-r--r-- | include/linux/ext4_fs.h | 3 | ||||
-rw-r--r-- | include/linux/jbd2.h | 36 |
8 files changed, 388 insertions, 62 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6a4adcae9f9a..4f329afe20ec 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -89,6 +89,16 @@ When mounting an ext4 filesystem, the following option are accepted: | |||
89 | extents ext4 will use extents to address file data. The | 89 | extents ext4 will use extents to address file data. The |
90 | file system will no longer be mountable by ext3. | 90 | file system will no longer be mountable by ext3. |
91 | 91 | ||
92 | journal_checksum Enable checksumming of the journal transactions. | ||
93 | This will allow the recovery code in e2fsck and the | ||
94 | kernel to detect corruption in the kernel. It is a | ||
95 | compatible change and will be ignored by older kernels. | ||
96 | |||
97 | journal_async_commit Commit block can be written to disk without waiting | ||
98 | for descriptor blocks. If enabled older kernels cannot | ||
99 | mount the device. This will enable 'journal_checksum' | ||
100 | internally. | ||
101 | |||
92 | journal=update Update the ext4 file system's journal to the current | 102 | journal=update Update the ext4 file system's journal to the current |
93 | format. | 103 | format. |
94 | 104 | ||
diff --git a/fs/Kconfig b/fs/Kconfig index 9656139d2e99..219ec06a8c7e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
@@ -236,6 +236,7 @@ config JBD_DEBUG | |||
236 | 236 | ||
237 | config JBD2 | 237 | config JBD2 |
238 | tristate | 238 | tristate |
239 | select CRC32 | ||
239 | help | 240 | help |
240 | This is a generic journaling layer for block devices that support | 241 | This is a generic journaling layer for block devices that support |
241 | both 32-bit and 64-bit block numbers. It is currently used by | 242 | both 32-bit and 64-bit block numbers. It is currently used by |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7305443e100..f7479d30735e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -869,6 +869,7 @@ enum { | |||
869 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, | 869 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, |
870 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, | 870 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, |
871 | Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, | 871 | Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, |
872 | Opt_journal_checksum, Opt_journal_async_commit, | ||
872 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 873 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
873 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 874 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
874 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 875 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
@@ -908,6 +909,8 @@ static match_table_t tokens = { | |||
908 | {Opt_journal_update, "journal=update"}, | 909 | {Opt_journal_update, "journal=update"}, |
909 | {Opt_journal_inum, "journal=%u"}, | 910 | {Opt_journal_inum, "journal=%u"}, |
910 | {Opt_journal_dev, "journal_dev=%u"}, | 911 | {Opt_journal_dev, "journal_dev=%u"}, |
912 | {Opt_journal_checksum, "journal_checksum"}, | ||
913 | {Opt_journal_async_commit, "journal_async_commit"}, | ||
911 | {Opt_abort, "abort"}, | 914 | {Opt_abort, "abort"}, |
912 | {Opt_data_journal, "data=journal"}, | 915 | {Opt_data_journal, "data=journal"}, |
913 | {Opt_data_ordered, "data=ordered"}, | 916 | {Opt_data_ordered, "data=ordered"}, |
@@ -1095,6 +1098,13 @@ static int parse_options (char *options, struct super_block *sb, | |||
1095 | return 0; | 1098 | return 0; |
1096 | *journal_devnum = option; | 1099 | *journal_devnum = option; |
1097 | break; | 1100 | break; |
1101 | case Opt_journal_checksum: | ||
1102 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | ||
1103 | break; | ||
1104 | case Opt_journal_async_commit: | ||
1105 | set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); | ||
1106 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | ||
1107 | break; | ||
1098 | case Opt_noload: | 1108 | case Opt_noload: |
1099 | set_opt (sbi->s_mount_opt, NOLOAD); | 1109 | set_opt (sbi->s_mount_opt, NOLOAD); |
1100 | break; | 1110 | break; |
@@ -2114,6 +2124,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
2114 | goto failed_mount4; | 2124 | goto failed_mount4; |
2115 | } | 2125 | } |
2116 | 2126 | ||
2127 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | ||
2128 | jbd2_journal_set_features(sbi->s_journal, | ||
2129 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
2130 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2131 | } else if (test_opt(sb, JOURNAL_CHECKSUM)) { | ||
2132 | jbd2_journal_set_features(sbi->s_journal, | ||
2133 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); | ||
2134 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | ||
2135 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2136 | } else { | ||
2137 | jbd2_journal_clear_features(sbi->s_journal, | ||
2138 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
2139 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2140 | } | ||
2141 | |||
2117 | /* We have now updated the journal if required, so we can | 2142 | /* We have now updated the journal if required, so we can |
2118 | * validate the data journaling mode. */ | 2143 | * validate the data journaling mode. */ |
2119 | switch (test_opt(sb, DATA_FLAGS)) { | 2144 | switch (test_opt(sb, DATA_FLAGS)) { |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8749a86f4175..da8d0eb3b7b9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Default IO end handler for temporary BJ_IO buffer_heads. | 27 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) | |||
93 | return 1; | 94 | return 1; |
94 | } | 95 | } |
95 | 96 | ||
96 | /* Done it all: now write the commit record. We should have | 97 | /* |
98 | * Done it all: now submit the commit record. We should have | ||
97 | * cleaned up our previous buffers by now, so if we are in abort | 99 | * cleaned up our previous buffers by now, so if we are in abort |
98 | * mode we can now just skip the rest of the journal write | 100 | * mode we can now just skip the rest of the journal write |
99 | * entirely. | 101 | * entirely. |
100 | * | 102 | * |
101 | * Returns 1 if the journal needs to be aborted or 0 on success | 103 | * Returns 1 if the journal needs to be aborted or 0 on success |
102 | */ | 104 | */ |
103 | static int journal_write_commit_record(journal_t *journal, | 105 | static int journal_submit_commit_record(journal_t *journal, |
104 | transaction_t *commit_transaction) | 106 | transaction_t *commit_transaction, |
107 | struct buffer_head **cbh, | ||
108 | __u32 crc32_sum) | ||
105 | { | 109 | { |
106 | struct journal_head *descriptor; | 110 | struct journal_head *descriptor; |
111 | struct commit_header *tmp; | ||
107 | struct buffer_head *bh; | 112 | struct buffer_head *bh; |
108 | int i, ret; | 113 | int ret; |
109 | int barrier_done = 0; | 114 | int barrier_done = 0; |
110 | 115 | ||
111 | if (is_journal_aborted(journal)) | 116 | if (is_journal_aborted(journal)) |
@@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, | |||
117 | 122 | ||
118 | bh = jh2bh(descriptor); | 123 | bh = jh2bh(descriptor); |
119 | 124 | ||
120 | /* AKPM: buglet - add `i' to tmp! */ | 125 | tmp = (struct commit_header *)bh->b_data; |
121 | for (i = 0; i < bh->b_size; i += 512) { | 126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
122 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | 127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
123 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
124 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 129 | |
125 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 130 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
132 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; | ||
133 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | ||
134 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | ||
126 | } | 135 | } |
127 | 136 | ||
128 | JBUFFER_TRACE(descriptor, "write commit block"); | 137 | JBUFFER_TRACE(descriptor, "submit commit block"); |
138 | lock_buffer(bh); | ||
139 | |||
129 | set_buffer_dirty(bh); | 140 | set_buffer_dirty(bh); |
130 | if (journal->j_flags & JBD2_BARRIER) { | 141 | set_buffer_uptodate(bh); |
142 | bh->b_end_io = journal_end_buffer_io_sync; | ||
143 | |||
144 | if (journal->j_flags & JBD2_BARRIER && | ||
145 | !JBD2_HAS_COMPAT_FEATURE(journal, | ||
146 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
131 | set_buffer_ordered(bh); | 147 | set_buffer_ordered(bh); |
132 | barrier_done = 1; | 148 | barrier_done = 1; |
133 | } | 149 | } |
134 | ret = sync_dirty_buffer(bh); | 150 | ret = submit_bh(WRITE, bh); |
151 | |||
135 | /* is it possible for another commit to fail at roughly | 152 | /* is it possible for another commit to fail at roughly |
136 | * the same time as this one? If so, we don't want to | 153 | * the same time as this one? If so, we don't want to |
137 | * trust the barrier flag in the super, but instead want | 154 | * trust the barrier flag in the super, but instead want |
@@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, | |||
152 | clear_buffer_ordered(bh); | 169 | clear_buffer_ordered(bh); |
153 | set_buffer_uptodate(bh); | 170 | set_buffer_uptodate(bh); |
154 | set_buffer_dirty(bh); | 171 | set_buffer_dirty(bh); |
155 | ret = sync_dirty_buffer(bh); | 172 | ret = submit_bh(WRITE, bh); |
156 | } | 173 | } |
157 | put_bh(bh); /* One for getblk() */ | 174 | *cbh = bh; |
158 | jbd2_journal_put_journal_head(descriptor); | 175 | return ret; |
176 | } | ||
177 | |||
178 | /* | ||
179 | * This function along with journal_submit_commit_record | ||
180 | * allows to write the commit record asynchronously. | ||
181 | */ | ||
182 | static int journal_wait_on_commit_record(struct buffer_head *bh) | ||
183 | { | ||
184 | int ret = 0; | ||
185 | |||
186 | clear_buffer_dirty(bh); | ||
187 | wait_on_buffer(bh); | ||
159 | 188 | ||
160 | return (ret == -EIO); | 189 | if (unlikely(!buffer_uptodate(bh))) |
190 | ret = -EIO; | ||
191 | put_bh(bh); /* One for getblk() */ | ||
192 | jbd2_journal_put_journal_head(bh2jh(bh)); | ||
193 | |||
194 | return ret; | ||
161 | } | 195 | } |
162 | 196 | ||
197 | /* | ||
198 | * Wait for all submitted IO to complete. | ||
199 | */ | ||
200 | static int journal_wait_on_locked_list(journal_t *journal, | ||
201 | transaction_t *commit_transaction) | ||
202 | { | ||
203 | int ret = 0; | ||
204 | struct journal_head *jh; | ||
205 | |||
206 | while (commit_transaction->t_locked_list) { | ||
207 | struct buffer_head *bh; | ||
208 | |||
209 | jh = commit_transaction->t_locked_list->b_tprev; | ||
210 | bh = jh2bh(jh); | ||
211 | get_bh(bh); | ||
212 | if (buffer_locked(bh)) { | ||
213 | spin_unlock(&journal->j_list_lock); | ||
214 | wait_on_buffer(bh); | ||
215 | if (unlikely(!buffer_uptodate(bh))) | ||
216 | ret = -EIO; | ||
217 | spin_lock(&journal->j_list_lock); | ||
218 | } | ||
219 | if (!inverted_lock(journal, bh)) { | ||
220 | put_bh(bh); | ||
221 | spin_lock(&journal->j_list_lock); | ||
222 | continue; | ||
223 | } | ||
224 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
225 | __jbd2_journal_unfile_buffer(jh); | ||
226 | jbd_unlock_bh_state(bh); | ||
227 | jbd2_journal_remove_journal_head(bh); | ||
228 | put_bh(bh); | ||
229 | } else { | ||
230 | jbd_unlock_bh_state(bh); | ||
231 | } | ||
232 | put_bh(bh); | ||
233 | cond_resched_lock(&journal->j_list_lock); | ||
234 | } | ||
235 | return ret; | ||
236 | } | ||
237 | |||
163 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 238 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) |
164 | { | 239 | { |
165 | int i; | 240 | int i; |
@@ -275,7 +350,21 @@ write_out_data: | |||
275 | journal_do_submit_data(wbuf, bufs); | 350 | journal_do_submit_data(wbuf, bufs); |
276 | } | 351 | } |
277 | 352 | ||
278 | static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 353 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
354 | { | ||
355 | struct page *page = bh->b_page; | ||
356 | char *addr; | ||
357 | __u32 checksum; | ||
358 | |||
359 | addr = kmap_atomic(page, KM_USER0); | ||
360 | checksum = crc32_be(crc32_sum, | ||
361 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); | ||
362 | kunmap_atomic(addr, KM_USER0); | ||
363 | |||
364 | return checksum; | ||
365 | } | ||
366 | |||
367 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | ||
279 | unsigned long long block) | 368 | unsigned long long block) |
280 | { | 369 | { |
281 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 370 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
@@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
307 | int tag_flag; | 396 | int tag_flag; |
308 | int i; | 397 | int i; |
309 | int tag_bytes = journal_tag_bytes(journal); | 398 | int tag_bytes = journal_tag_bytes(journal); |
399 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | ||
400 | __u32 crc32_sum = ~0; | ||
310 | 401 | ||
311 | /* | 402 | /* |
312 | * First job: lock down the current transaction and wait for | 403 | * First job: lock down the current transaction and wait for |
@@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
451 | journal_submit_data_buffers(journal, commit_transaction); | 542 | journal_submit_data_buffers(journal, commit_transaction); |
452 | 543 | ||
453 | /* | 544 | /* |
454 | * Wait for all previously submitted IO to complete. | 545 | * Wait for all previously submitted IO to complete if commit |
546 | * record is to be written synchronously. | ||
455 | */ | 547 | */ |
456 | spin_lock(&journal->j_list_lock); | 548 | spin_lock(&journal->j_list_lock); |
457 | while (commit_transaction->t_locked_list) { | 549 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
458 | struct buffer_head *bh; | 550 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) |
551 | err = journal_wait_on_locked_list(journal, | ||
552 | commit_transaction); | ||
459 | 553 | ||
460 | jh = commit_transaction->t_locked_list->b_tprev; | ||
461 | bh = jh2bh(jh); | ||
462 | get_bh(bh); | ||
463 | if (buffer_locked(bh)) { | ||
464 | spin_unlock(&journal->j_list_lock); | ||
465 | wait_on_buffer(bh); | ||
466 | if (unlikely(!buffer_uptodate(bh))) | ||
467 | err = -EIO; | ||
468 | spin_lock(&journal->j_list_lock); | ||
469 | } | ||
470 | if (!inverted_lock(journal, bh)) { | ||
471 | put_bh(bh); | ||
472 | spin_lock(&journal->j_list_lock); | ||
473 | continue; | ||
474 | } | ||
475 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
476 | __jbd2_journal_unfile_buffer(jh); | ||
477 | jbd_unlock_bh_state(bh); | ||
478 | jbd2_journal_remove_journal_head(bh); | ||
479 | put_bh(bh); | ||
480 | } else { | ||
481 | jbd_unlock_bh_state(bh); | ||
482 | } | ||
483 | put_bh(bh); | ||
484 | cond_resched_lock(&journal->j_list_lock); | ||
485 | } | ||
486 | spin_unlock(&journal->j_list_lock); | 554 | spin_unlock(&journal->j_list_lock); |
487 | 555 | ||
488 | if (err) | 556 | if (err) |
@@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
656 | start_journal_io: | 724 | start_journal_io: |
657 | for (i = 0; i < bufs; i++) { | 725 | for (i = 0; i < bufs; i++) { |
658 | struct buffer_head *bh = wbuf[i]; | 726 | struct buffer_head *bh = wbuf[i]; |
727 | /* | ||
728 | * Compute checksum. | ||
729 | */ | ||
730 | if (JBD2_HAS_COMPAT_FEATURE(journal, | ||
731 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
732 | crc32_sum = | ||
733 | jbd2_checksum_data(crc32_sum, bh); | ||
734 | } | ||
735 | |||
659 | lock_buffer(bh); | 736 | lock_buffer(bh); |
660 | clear_buffer_dirty(bh); | 737 | clear_buffer_dirty(bh); |
661 | set_buffer_uptodate(bh); | 738 | set_buffer_uptodate(bh); |
@@ -672,6 +749,23 @@ start_journal_io: | |||
672 | } | 749 | } |
673 | } | 750 | } |
674 | 751 | ||
752 | /* Done it all: now write the commit record asynchronously. */ | ||
753 | |||
754 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
755 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
756 | err = journal_submit_commit_record(journal, commit_transaction, | ||
757 | &cbh, crc32_sum); | ||
758 | if (err) | ||
759 | __jbd2_journal_abort_hard(journal); | ||
760 | |||
761 | spin_lock(&journal->j_list_lock); | ||
762 | err = journal_wait_on_locked_list(journal, | ||
763 | commit_transaction); | ||
764 | spin_unlock(&journal->j_list_lock); | ||
765 | if (err) | ||
766 | __jbd2_journal_abort_hard(journal); | ||
767 | } | ||
768 | |||
675 | /* Lo and behold: we have just managed to send a transaction to | 769 | /* Lo and behold: we have just managed to send a transaction to |
676 | the log. Before we can commit it, wait for the IO so far to | 770 | the log. Before we can commit it, wait for the IO so far to |
677 | complete. Control buffers being written are on the | 771 | complete. Control buffers being written are on the |
@@ -771,8 +865,14 @@ wait_for_iobuf: | |||
771 | 865 | ||
772 | jbd_debug(3, "JBD: commit phase 6\n"); | 866 | jbd_debug(3, "JBD: commit phase 6\n"); |
773 | 867 | ||
774 | if (journal_write_commit_record(journal, commit_transaction)) | 868 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
775 | err = -EIO; | 869 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
870 | err = journal_submit_commit_record(journal, commit_transaction, | ||
871 | &cbh, crc32_sum); | ||
872 | if (err) | ||
873 | __jbd2_journal_abort_hard(journal); | ||
874 | } | ||
875 | err = journal_wait_on_commit_record(cbh); | ||
776 | 876 | ||
777 | if (err) | 877 | if (err) |
778 | jbd2_journal_abort(journal, err); | 878 | jbd2_journal_abort(journal, err); |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3667c91bc786..59ba2494dcaf 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -1578,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
1578 | return 1; | 1578 | return 1; |
1579 | } | 1579 | } |
1580 | 1580 | ||
1581 | /* | ||
1582 | * jbd2_journal_clear_features () - Clear a given journal feature in the | ||
1583 | * superblock | ||
1584 | * @journal: Journal to act on. | ||
1585 | * @compat: bitmask of compatible features | ||
1586 | * @ro: bitmask of features that force read-only mount | ||
1587 | * @incompat: bitmask of incompatible features | ||
1588 | * | ||
1589 | * Clear a given journal feature as present on the | ||
1590 | * superblock. | ||
1591 | */ | ||
1592 | void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, | ||
1593 | unsigned long ro, unsigned long incompat) | ||
1594 | { | ||
1595 | journal_superblock_t *sb; | ||
1596 | |||
1597 | jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", | ||
1598 | compat, ro, incompat); | ||
1599 | |||
1600 | sb = journal->j_superblock; | ||
1601 | |||
1602 | sb->s_feature_compat &= ~cpu_to_be32(compat); | ||
1603 | sb->s_feature_ro_compat &= ~cpu_to_be32(ro); | ||
1604 | sb->s_feature_incompat &= ~cpu_to_be32(incompat); | ||
1605 | } | ||
1606 | EXPORT_SYMBOL(jbd2_journal_clear_features); | ||
1581 | 1607 | ||
1582 | /** | 1608 | /** |
1583 | * int jbd2_journal_update_format () - Update on-disk journal structure. | 1609 | * int jbd2_journal_update_format () - Update on-disk journal structure. |
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d0ce627539ef..921680663fa2 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/jbd2.h> | 21 | #include <linux/jbd2.h> |
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/crc32.h> | ||
24 | #endif | 25 | #endif |
25 | 26 | ||
26 | /* | 27 | /* |
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag | |||
316 | return block; | 317 | return block; |
317 | } | 318 | } |
318 | 319 | ||
320 | /* | ||
321 | * calc_chksums calculates the checksums for the blocks described in the | ||
322 | * descriptor block. | ||
323 | */ | ||
324 | static int calc_chksums(journal_t *journal, struct buffer_head *bh, | ||
325 | unsigned long *next_log_block, __u32 *crc32_sum) | ||
326 | { | ||
327 | int i, num_blks, err; | ||
328 | unsigned long io_block; | ||
329 | struct buffer_head *obh; | ||
330 | |||
331 | num_blks = count_tags(journal, bh); | ||
332 | /* Calculate checksum of the descriptor block. */ | ||
333 | *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); | ||
334 | |||
335 | for (i = 0; i < num_blks; i++) { | ||
336 | io_block = (*next_log_block)++; | ||
337 | wrap(journal, *next_log_block); | ||
338 | err = jread(&obh, journal, io_block); | ||
339 | if (err) { | ||
340 | printk(KERN_ERR "JBD: IO error %d recovering block " | ||
341 | "%lu in log\n", err, io_block); | ||
342 | return 1; | ||
343 | } else { | ||
344 | *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, | ||
345 | obh->b_size); | ||
346 | } | ||
347 | } | ||
348 | return 0; | ||
349 | } | ||
350 | |||
319 | static int do_one_pass(journal_t *journal, | 351 | static int do_one_pass(journal_t *journal, |
320 | struct recovery_info *info, enum passtype pass) | 352 | struct recovery_info *info, enum passtype pass) |
321 | { | 353 | { |
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal, | |||
328 | unsigned int sequence; | 360 | unsigned int sequence; |
329 | int blocktype; | 361 | int blocktype; |
330 | int tag_bytes = journal_tag_bytes(journal); | 362 | int tag_bytes = journal_tag_bytes(journal); |
363 | __u32 crc32_sum = ~0; /* Transactional Checksums */ | ||
331 | 364 | ||
332 | /* Precompute the maximum metadata descriptors in a descriptor block */ | 365 | /* Precompute the maximum metadata descriptors in a descriptor block */ |
333 | int MAX_BLOCKS_PER_DESC; | 366 | int MAX_BLOCKS_PER_DESC; |
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal, | |||
419 | switch(blocktype) { | 452 | switch(blocktype) { |
420 | case JBD2_DESCRIPTOR_BLOCK: | 453 | case JBD2_DESCRIPTOR_BLOCK: |
421 | /* If it is a valid descriptor block, replay it | 454 | /* If it is a valid descriptor block, replay it |
422 | * in pass REPLAY; otherwise, just skip over the | 455 | * in pass REPLAY; if journal_checksums enabled, then |
423 | * blocks it describes. */ | 456 | * calculate checksums in PASS_SCAN, otherwise, |
457 | * just skip over the blocks it describes. */ | ||
424 | if (pass != PASS_REPLAY) { | 458 | if (pass != PASS_REPLAY) { |
459 | if (pass == PASS_SCAN && | ||
460 | JBD2_HAS_COMPAT_FEATURE(journal, | ||
461 | JBD2_FEATURE_COMPAT_CHECKSUM) && | ||
462 | !info->end_transaction) { | ||
463 | if (calc_chksums(journal, bh, | ||
464 | &next_log_block, | ||
465 | &crc32_sum)) { | ||
466 | put_bh(bh); | ||
467 | break; | ||
468 | } | ||
469 | put_bh(bh); | ||
470 | continue; | ||
471 | } | ||
425 | next_log_block += count_tags(journal, bh); | 472 | next_log_block += count_tags(journal, bh); |
426 | wrap(journal, next_log_block); | 473 | wrap(journal, next_log_block); |
427 | brelse(bh); | 474 | put_bh(bh); |
428 | continue; | 475 | continue; |
429 | } | 476 | } |
430 | 477 | ||
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal, | |||
516 | continue; | 563 | continue; |
517 | 564 | ||
518 | case JBD2_COMMIT_BLOCK: | 565 | case JBD2_COMMIT_BLOCK: |
519 | /* Found an expected commit block: not much to | 566 | /* How to differentiate between interrupted commit |
520 | * do other than move on to the next sequence | 567 | * and journal corruption ? |
568 | * | ||
569 | * {nth transaction} | ||
570 | * Checksum Verification Failed | ||
571 | * | | ||
572 | * ____________________ | ||
573 | * | | | ||
574 | * async_commit sync_commit | ||
575 | * | | | ||
576 | * | GO TO NEXT "Journal Corruption" | ||
577 | * | TRANSACTION | ||
578 | * | | ||
579 | * {(n+1)th transanction} | ||
580 | * | | ||
581 | * _______|______________ | ||
582 | * | | | ||
583 | * Commit block found Commit block not found | ||
584 | * | | | ||
585 | * "Journal Corruption" | | ||
586 | * _____________|_________ | ||
587 | * | | | ||
588 | * nth trans corrupt OR nth trans | ||
589 | * and (n+1)th interrupted interrupted | ||
590 | * before commit block | ||
591 | * could reach the disk. | ||
592 | * (Cannot find the difference in above | ||
593 | * mentioned conditions. Hence assume | ||
594 | * "Interrupted Commit".) | ||
595 | */ | ||
596 | |||
597 | /* Found an expected commit block: if checksums | ||
598 | * are present verify them in PASS_SCAN; else not | ||
599 | * much to do other than move on to the next sequence | ||
521 | * number. */ | 600 | * number. */ |
601 | if (pass == PASS_SCAN && | ||
602 | JBD2_HAS_COMPAT_FEATURE(journal, | ||
603 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
604 | int chksum_err, chksum_seen; | ||
605 | struct commit_header *cbh = | ||
606 | (struct commit_header *)bh->b_data; | ||
607 | unsigned found_chksum = | ||
608 | be32_to_cpu(cbh->h_chksum[0]); | ||
609 | |||
610 | chksum_err = chksum_seen = 0; | ||
611 | |||
612 | if (info->end_transaction) { | ||
613 | printk(KERN_ERR "JBD: Transaction %u " | ||
614 | "found to be corrupt.\n", | ||
615 | next_commit_ID - 1); | ||
616 | brelse(bh); | ||
617 | break; | ||
618 | } | ||
619 | |||
620 | if (crc32_sum == found_chksum && | ||
621 | cbh->h_chksum_type == JBD2_CRC32_CHKSUM && | ||
622 | cbh->h_chksum_size == | ||
623 | JBD2_CRC32_CHKSUM_SIZE) | ||
624 | chksum_seen = 1; | ||
625 | else if (!(cbh->h_chksum_type == 0 && | ||
626 | cbh->h_chksum_size == 0 && | ||
627 | found_chksum == 0 && | ||
628 | !chksum_seen)) | ||
629 | /* | ||
630 | * If fs is mounted using an old kernel and then | ||
631 | * kernel with journal_chksum is used then we | ||
632 | * get a situation where the journal flag has | ||
633 | * checksum flag set but checksums are not | ||
634 | * present i.e chksum = 0, in the individual | ||
635 | * commit blocks. | ||
636 | * Hence to avoid checksum failures, in this | ||
637 | * situation, this extra check is added. | ||
638 | */ | ||
639 | chksum_err = 1; | ||
640 | |||
641 | if (chksum_err) { | ||
642 | info->end_transaction = next_commit_ID; | ||
643 | |||
644 | if (!JBD2_HAS_COMPAT_FEATURE(journal, | ||
645 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ | ||
646 | printk(KERN_ERR | ||
647 | "JBD: Transaction %u " | ||
648 | "found to be corrupt.\n", | ||
649 | next_commit_ID); | ||
650 | brelse(bh); | ||
651 | break; | ||
652 | } | ||
653 | } | ||
654 | crc32_sum = ~0; | ||
655 | } | ||
522 | brelse(bh); | 656 | brelse(bh); |
523 | next_commit_ID++; | 657 | next_commit_ID++; |
524 | continue; | 658 | continue; |
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal, | |||
554 | * transaction marks the end of the valid log. | 688 | * transaction marks the end of the valid log. |
555 | */ | 689 | */ |
556 | 690 | ||
557 | if (pass == PASS_SCAN) | 691 | if (pass == PASS_SCAN) { |
558 | info->end_transaction = next_commit_ID; | 692 | if (!info->end_transaction) |
559 | else { | 693 | info->end_transaction = next_commit_ID; |
694 | } else { | ||
560 | /* It's really bad news if different passes end up at | 695 | /* It's really bad news if different passes end up at |
561 | * different places (but possible due to IO errors). */ | 696 | * different places (but possible due to IO errors). */ |
562 | if (info->end_transaction != next_commit_ID) { | 697 | if (info->end_transaction != next_commit_ID) { |
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 300cc5a5adb9..cd406dba0e64 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h | |||
@@ -467,7 +467,8 @@ do { \ | |||
467 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | 467 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ |
468 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | 468 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ |
469 | #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ | 469 | #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ |
470 | 470 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | |
471 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | ||
471 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ | 472 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ |
472 | #ifndef _LINUX_EXT2_FS_H | 473 | #ifndef _LINUX_EXT2_FS_H |
473 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt | 474 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 685640036e81..98a2bc5d3e3f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -149,6 +149,28 @@ typedef struct journal_header_s | |||
149 | __be32 h_sequence; | 149 | __be32 h_sequence; |
150 | } journal_header_t; | 150 | } journal_header_t; |
151 | 151 | ||
152 | /* | ||
153 | * Checksum types. | ||
154 | */ | ||
155 | #define JBD2_CRC32_CHKSUM 1 | ||
156 | #define JBD2_MD5_CHKSUM 2 | ||
157 | #define JBD2_SHA1_CHKSUM 3 | ||
158 | |||
159 | #define JBD2_CRC32_CHKSUM_SIZE 4 | ||
160 | |||
161 | #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) | ||
162 | /* | ||
163 | * Commit block header for storing transactional checksums: | ||
164 | */ | ||
165 | struct commit_header { | ||
166 | __be32 h_magic; | ||
167 | __be32 h_blocktype; | ||
168 | __be32 h_sequence; | ||
169 | unsigned char h_chksum_type; | ||
170 | unsigned char h_chksum_size; | ||
171 | unsigned char h_padding[2]; | ||
172 | __be32 h_chksum[JBD2_CHECKSUM_BYTES]; | ||
173 | }; | ||
152 | 174 | ||
153 | /* | 175 | /* |
154 | * The block tag: used to describe a single buffer in the journal. | 176 | * The block tag: used to describe a single buffer in the journal. |
@@ -242,14 +264,18 @@ typedef struct journal_superblock_s | |||
242 | ((j)->j_format_version >= 2 && \ | 264 | ((j)->j_format_version >= 2 && \ |
243 | ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) | 265 | ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) |
244 | 266 | ||
245 | #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 | 267 | #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 |
246 | #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 | 268 | |
269 | #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 | ||
270 | #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 | ||
271 | #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 | ||
247 | 272 | ||
248 | /* Features known to this kernel version: */ | 273 | /* Features known to this kernel version: */ |
249 | #define JBD2_KNOWN_COMPAT_FEATURES 0 | 274 | #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM |
250 | #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 | 275 | #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 |
251 | #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ | 276 | #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ |
252 | JBD2_FEATURE_INCOMPAT_64BIT) | 277 | JBD2_FEATURE_INCOMPAT_64BIT | \ |
278 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) | ||
253 | 279 | ||
254 | #ifdef __KERNEL__ | 280 | #ifdef __KERNEL__ |
255 | 281 | ||
@@ -997,6 +1023,8 @@ extern int jbd2_journal_check_available_features | |||
997 | (journal_t *, unsigned long, unsigned long, unsigned long); | 1023 | (journal_t *, unsigned long, unsigned long, unsigned long); |
998 | extern int jbd2_journal_set_features | 1024 | extern int jbd2_journal_set_features |
999 | (journal_t *, unsigned long, unsigned long, unsigned long); | 1025 | (journal_t *, unsigned long, unsigned long, unsigned long); |
1026 | extern void jbd2_journal_clear_features | ||
1027 | (journal_t *, unsigned long, unsigned long, unsigned long); | ||
1000 | extern int jbd2_journal_create (journal_t *); | 1028 | extern int jbd2_journal_create (journal_t *); |
1001 | extern int jbd2_journal_load (journal_t *journal); | 1029 | extern int jbd2_journal_load (journal_t *journal); |
1002 | extern void jbd2_journal_destroy (journal_t *); | 1030 | extern void jbd2_journal_destroy (journal_t *); |