diff options
author | Girish Shilamkar <girish@clusterfs.com> | 2008-01-28 23:58:27 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-01-28 23:58:27 -0500 |
commit | 818d276ceb83aa9fdebb5e0a53188290312de987 (patch) | |
tree | de3fb4ffadd72caea2876c5232ce76cd14b3646e /fs/jbd2/commit.c | |
parent | 8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 (diff) |
ext4: Add the journal checksum feature
The journal checksum feature adds two new flags, i.e.
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM.
The JBD2_FEATURE_COMPAT_CHECKSUM flag indicates that the commit block
contains the checksum for the blocks described by the descriptor blocks.
Thanks to the checksums, writing of the commit record no longer needs to
be synchronous. The commit record can now be sent to disk without waiting
for the descriptor blocks to be written to disk. This behavior is
controlled using the JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT flag. Older
kernels/e2fsck should not be able to recover a journal with _ASYNC_COMMIT,
hence it is made an incompat feature.
The commit header has been extended to hold the checksum along with the
type of the checksum.
During recovery, in the scan pass, checksums are verified to ensure the
sanity and completeness (in case of _ASYNC_COMMIT) of every transaction.
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Girish Shilamkar <girish@clusterfs.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Diffstat (limited to 'fs/jbd2/commit.c')
-rw-r--r-- | fs/jbd2/commit.c | 198 |
1 files changed, 149 insertions, 49 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8749a86f4175..da8d0eb3b7b9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Default IO end handler for temporary BJ_IO buffer_heads. | 27 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) | |||
93 | return 1; | 94 | return 1; |
94 | } | 95 | } |
95 | 96 | ||
96 | /* Done it all: now write the commit record. We should have | 97 | /* |
98 | * Done it all: now submit the commit record. We should have | ||
97 | * cleaned up our previous buffers by now, so if we are in abort | 99 | * cleaned up our previous buffers by now, so if we are in abort |
98 | * mode we can now just skip the rest of the journal write | 100 | * mode we can now just skip the rest of the journal write |
99 | * entirely. | 101 | * entirely. |
100 | * | 102 | * |
101 | * Returns 1 if the journal needs to be aborted or 0 on success | 103 | * Returns 1 if the journal needs to be aborted or 0 on success |
102 | */ | 104 | */ |
103 | static int journal_write_commit_record(journal_t *journal, | 105 | static int journal_submit_commit_record(journal_t *journal, |
104 | transaction_t *commit_transaction) | 106 | transaction_t *commit_transaction, |
107 | struct buffer_head **cbh, | ||
108 | __u32 crc32_sum) | ||
105 | { | 109 | { |
106 | struct journal_head *descriptor; | 110 | struct journal_head *descriptor; |
111 | struct commit_header *tmp; | ||
107 | struct buffer_head *bh; | 112 | struct buffer_head *bh; |
108 | int i, ret; | 113 | int ret; |
109 | int barrier_done = 0; | 114 | int barrier_done = 0; |
110 | 115 | ||
111 | if (is_journal_aborted(journal)) | 116 | if (is_journal_aborted(journal)) |
@@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, | |||
117 | 122 | ||
118 | bh = jh2bh(descriptor); | 123 | bh = jh2bh(descriptor); |
119 | 124 | ||
120 | /* AKPM: buglet - add `i' to tmp! */ | 125 | tmp = (struct commit_header *)bh->b_data; |
121 | for (i = 0; i < bh->b_size; i += 512) { | 126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
122 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | 127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
123 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
124 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 129 | |
125 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 130 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
132 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; | ||
133 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | ||
134 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | ||
126 | } | 135 | } |
127 | 136 | ||
128 | JBUFFER_TRACE(descriptor, "write commit block"); | 137 | JBUFFER_TRACE(descriptor, "submit commit block"); |
138 | lock_buffer(bh); | ||
139 | |||
129 | set_buffer_dirty(bh); | 140 | set_buffer_dirty(bh); |
130 | if (journal->j_flags & JBD2_BARRIER) { | 141 | set_buffer_uptodate(bh); |
142 | bh->b_end_io = journal_end_buffer_io_sync; | ||
143 | |||
144 | if (journal->j_flags & JBD2_BARRIER && | ||
145 | !JBD2_HAS_COMPAT_FEATURE(journal, | ||
146 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
131 | set_buffer_ordered(bh); | 147 | set_buffer_ordered(bh); |
132 | barrier_done = 1; | 148 | barrier_done = 1; |
133 | } | 149 | } |
134 | ret = sync_dirty_buffer(bh); | 150 | ret = submit_bh(WRITE, bh); |
151 | |||
135 | /* is it possible for another commit to fail at roughly | 152 | /* is it possible for another commit to fail at roughly |
136 | * the same time as this one? If so, we don't want to | 153 | * the same time as this one? If so, we don't want to |
137 | * trust the barrier flag in the super, but instead want | 154 | * trust the barrier flag in the super, but instead want |
@@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, | |||
152 | clear_buffer_ordered(bh); | 169 | clear_buffer_ordered(bh); |
153 | set_buffer_uptodate(bh); | 170 | set_buffer_uptodate(bh); |
154 | set_buffer_dirty(bh); | 171 | set_buffer_dirty(bh); |
155 | ret = sync_dirty_buffer(bh); | 172 | ret = submit_bh(WRITE, bh); |
156 | } | 173 | } |
157 | put_bh(bh); /* One for getblk() */ | 174 | *cbh = bh; |
158 | jbd2_journal_put_journal_head(descriptor); | 175 | return ret; |
176 | } | ||
177 | |||
178 | /* | ||
179 | * This function along with journal_submit_commit_record | ||
180 | * allows to write the commit record asynchronously. | ||
181 | */ | ||
182 | static int journal_wait_on_commit_record(struct buffer_head *bh) | ||
183 | { | ||
184 | int ret = 0; | ||
185 | |||
186 | clear_buffer_dirty(bh); | ||
187 | wait_on_buffer(bh); | ||
159 | 188 | ||
160 | return (ret == -EIO); | 189 | if (unlikely(!buffer_uptodate(bh))) |
190 | ret = -EIO; | ||
191 | put_bh(bh); /* One for getblk() */ | ||
192 | jbd2_journal_put_journal_head(bh2jh(bh)); | ||
193 | |||
194 | return ret; | ||
161 | } | 195 | } |
162 | 196 | ||
197 | /* | ||
198 | * Wait for all submitted IO to complete. | ||
199 | */ | ||
200 | static int journal_wait_on_locked_list(journal_t *journal, | ||
201 | transaction_t *commit_transaction) | ||
202 | { | ||
203 | int ret = 0; | ||
204 | struct journal_head *jh; | ||
205 | |||
206 | while (commit_transaction->t_locked_list) { | ||
207 | struct buffer_head *bh; | ||
208 | |||
209 | jh = commit_transaction->t_locked_list->b_tprev; | ||
210 | bh = jh2bh(jh); | ||
211 | get_bh(bh); | ||
212 | if (buffer_locked(bh)) { | ||
213 | spin_unlock(&journal->j_list_lock); | ||
214 | wait_on_buffer(bh); | ||
215 | if (unlikely(!buffer_uptodate(bh))) | ||
216 | ret = -EIO; | ||
217 | spin_lock(&journal->j_list_lock); | ||
218 | } | ||
219 | if (!inverted_lock(journal, bh)) { | ||
220 | put_bh(bh); | ||
221 | spin_lock(&journal->j_list_lock); | ||
222 | continue; | ||
223 | } | ||
224 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
225 | __jbd2_journal_unfile_buffer(jh); | ||
226 | jbd_unlock_bh_state(bh); | ||
227 | jbd2_journal_remove_journal_head(bh); | ||
228 | put_bh(bh); | ||
229 | } else { | ||
230 | jbd_unlock_bh_state(bh); | ||
231 | } | ||
232 | put_bh(bh); | ||
233 | cond_resched_lock(&journal->j_list_lock); | ||
234 | } | ||
235 | return ret; | ||
236 | } | ||
237 | |||
163 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 238 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) |
164 | { | 239 | { |
165 | int i; | 240 | int i; |
@@ -275,7 +350,21 @@ write_out_data: | |||
275 | journal_do_submit_data(wbuf, bufs); | 350 | journal_do_submit_data(wbuf, bufs); |
276 | } | 351 | } |
277 | 352 | ||
278 | static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 353 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
354 | { | ||
355 | struct page *page = bh->b_page; | ||
356 | char *addr; | ||
357 | __u32 checksum; | ||
358 | |||
359 | addr = kmap_atomic(page, KM_USER0); | ||
360 | checksum = crc32_be(crc32_sum, | ||
361 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); | ||
362 | kunmap_atomic(addr, KM_USER0); | ||
363 | |||
364 | return checksum; | ||
365 | } | ||
366 | |||
367 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | ||
279 | unsigned long long block) | 368 | unsigned long long block) |
280 | { | 369 | { |
281 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 370 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
@@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
307 | int tag_flag; | 396 | int tag_flag; |
308 | int i; | 397 | int i; |
309 | int tag_bytes = journal_tag_bytes(journal); | 398 | int tag_bytes = journal_tag_bytes(journal); |
399 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | ||
400 | __u32 crc32_sum = ~0; | ||
310 | 401 | ||
311 | /* | 402 | /* |
312 | * First job: lock down the current transaction and wait for | 403 | * First job: lock down the current transaction and wait for |
@@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
451 | journal_submit_data_buffers(journal, commit_transaction); | 542 | journal_submit_data_buffers(journal, commit_transaction); |
452 | 543 | ||
453 | /* | 544 | /* |
454 | * Wait for all previously submitted IO to complete. | 545 | * Wait for all previously submitted IO to complete if commit |
546 | * record is to be written synchronously. | ||
455 | */ | 547 | */ |
456 | spin_lock(&journal->j_list_lock); | 548 | spin_lock(&journal->j_list_lock); |
457 | while (commit_transaction->t_locked_list) { | 549 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
458 | struct buffer_head *bh; | 550 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) |
551 | err = journal_wait_on_locked_list(journal, | ||
552 | commit_transaction); | ||
459 | 553 | ||
460 | jh = commit_transaction->t_locked_list->b_tprev; | ||
461 | bh = jh2bh(jh); | ||
462 | get_bh(bh); | ||
463 | if (buffer_locked(bh)) { | ||
464 | spin_unlock(&journal->j_list_lock); | ||
465 | wait_on_buffer(bh); | ||
466 | if (unlikely(!buffer_uptodate(bh))) | ||
467 | err = -EIO; | ||
468 | spin_lock(&journal->j_list_lock); | ||
469 | } | ||
470 | if (!inverted_lock(journal, bh)) { | ||
471 | put_bh(bh); | ||
472 | spin_lock(&journal->j_list_lock); | ||
473 | continue; | ||
474 | } | ||
475 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
476 | __jbd2_journal_unfile_buffer(jh); | ||
477 | jbd_unlock_bh_state(bh); | ||
478 | jbd2_journal_remove_journal_head(bh); | ||
479 | put_bh(bh); | ||
480 | } else { | ||
481 | jbd_unlock_bh_state(bh); | ||
482 | } | ||
483 | put_bh(bh); | ||
484 | cond_resched_lock(&journal->j_list_lock); | ||
485 | } | ||
486 | spin_unlock(&journal->j_list_lock); | 554 | spin_unlock(&journal->j_list_lock); |
487 | 555 | ||
488 | if (err) | 556 | if (err) |
@@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
656 | start_journal_io: | 724 | start_journal_io: |
657 | for (i = 0; i < bufs; i++) { | 725 | for (i = 0; i < bufs; i++) { |
658 | struct buffer_head *bh = wbuf[i]; | 726 | struct buffer_head *bh = wbuf[i]; |
727 | /* | ||
728 | * Compute checksum. | ||
729 | */ | ||
730 | if (JBD2_HAS_COMPAT_FEATURE(journal, | ||
731 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
732 | crc32_sum = | ||
733 | jbd2_checksum_data(crc32_sum, bh); | ||
734 | } | ||
735 | |||
659 | lock_buffer(bh); | 736 | lock_buffer(bh); |
660 | clear_buffer_dirty(bh); | 737 | clear_buffer_dirty(bh); |
661 | set_buffer_uptodate(bh); | 738 | set_buffer_uptodate(bh); |
@@ -672,6 +749,23 @@ start_journal_io: | |||
672 | } | 749 | } |
673 | } | 750 | } |
674 | 751 | ||
752 | /* Done it all: now write the commit record asynchronously. */ | ||
753 | |||
754 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
755 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
756 | err = journal_submit_commit_record(journal, commit_transaction, | ||
757 | &cbh, crc32_sum); | ||
758 | if (err) | ||
759 | __jbd2_journal_abort_hard(journal); | ||
760 | |||
761 | spin_lock(&journal->j_list_lock); | ||
762 | err = journal_wait_on_locked_list(journal, | ||
763 | commit_transaction); | ||
764 | spin_unlock(&journal->j_list_lock); | ||
765 | if (err) | ||
766 | __jbd2_journal_abort_hard(journal); | ||
767 | } | ||
768 | |||
675 | /* Lo and behold: we have just managed to send a transaction to | 769 | /* Lo and behold: we have just managed to send a transaction to |
676 | the log. Before we can commit it, wait for the IO so far to | 770 | the log. Before we can commit it, wait for the IO so far to |
677 | complete. Control buffers being written are on the | 771 | complete. Control buffers being written are on the |
@@ -771,8 +865,14 @@ wait_for_iobuf: | |||
771 | 865 | ||
772 | jbd_debug(3, "JBD: commit phase 6\n"); | 866 | jbd_debug(3, "JBD: commit phase 6\n"); |
773 | 867 | ||
774 | if (journal_write_commit_record(journal, commit_transaction)) | 868 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
775 | err = -EIO; | 869 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
870 | err = journal_submit_commit_record(journal, commit_transaction, | ||
871 | &cbh, crc32_sum); | ||
872 | if (err) | ||
873 | __jbd2_journal_abort_hard(journal); | ||
874 | } | ||
875 | err = journal_wait_on_commit_record(cbh); | ||
776 | 876 | ||
777 | if (err) | 877 | if (err) |
778 | jbd2_journal_abort(journal, err); | 878 | jbd2_journal_abort(journal, err); |