aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2012-03-13 22:22:54 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2012-03-13 22:22:54 -0400
commit: 79feb521a44705262d15cc819a4117a447b11ea7 (patch)
tree: a4de6ed084b7a68c0885049d94841ce8334b64a7 /fs
parent: a78bb11d7acd525623c6a0c2ff4e213d527573fa (diff)
jbd2: issue cache flush after checkpointing even with internal journal
When we reach jbd2_cleanup_journal_tail(), there is no guarantee that checkpointed buffers are on stable storage - especially if the buffers were written out by jbd2_log_do_checkpoint(), they are likely to be only in the disk's caches. Thus when we update the journal superblock, effectively removing the old transaction from the journal, this write of the superblock can get to stable storage before those checkpointed buffers, which can result in filesystem corruption after a crash. Thus we must unconditionally issue a cache flush before we update the journal superblock in these cases.

A similar problem can also occur if the journal superblock is written only to the disk's caches, another transaction starts reusing the space of the transaction cleaned from the log, and a power failure happens. A subsequent journal replay would still try to replay the old transaction, but some of its blocks may already be overwritten by the new transaction. For this reason we must use WRITE_FUA when updating the log tail, and we must first write the new log tail to disk and update the in-memory information only after that.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- fs/jbd2/checkpoint.c 75
-rw-r--r-- fs/jbd2/commit.c 11
-rw-r--r-- fs/jbd2/journal.c 138
-rw-r--r-- fs/jbd2/recovery.c 5
4 files changed, 143 insertions, 86 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 19dcd0b86bca..7f7ee5b90402 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -478,79 +478,28 @@ out:
478 478
479int jbd2_cleanup_journal_tail(journal_t *journal) 479int jbd2_cleanup_journal_tail(journal_t *journal)
480{ 480{
481 transaction_t * transaction;
482 tid_t first_tid; 481 tid_t first_tid;
483 unsigned long blocknr, freed; 482 unsigned long blocknr;
484 483
485 if (is_journal_aborted(journal)) 484 if (is_journal_aborted(journal))
486 return 1; 485 return 1;
487 486
488 /* OK, work out the oldest transaction remaining in the log, and 487 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
489 * the log block it starts at.
490 *
491 * If the log is now empty, we need to work out which is the
492 * next transaction ID we will write, and where it will
493 * start. */
494
495 write_lock(&journal->j_state_lock);
496 spin_lock(&journal->j_list_lock);
497 transaction = journal->j_checkpoint_transactions;
498 if (transaction) {
499 first_tid = transaction->t_tid;
500 blocknr = transaction->t_log_start;
501 } else if ((transaction = journal->j_committing_transaction) != NULL) {
502 first_tid = transaction->t_tid;
503 blocknr = transaction->t_log_start;
504 } else if ((transaction = journal->j_running_transaction) != NULL) {
505 first_tid = transaction->t_tid;
506 blocknr = journal->j_head;
507 } else {
508 first_tid = journal->j_transaction_sequence;
509 blocknr = journal->j_head;
510 }
511 spin_unlock(&journal->j_list_lock);
512 J_ASSERT(blocknr != 0);
513
514 /* If the oldest pinned transaction is at the tail of the log
515 already then there's not much we can do right now. */
516 if (journal->j_tail_sequence == first_tid) {
517 write_unlock(&journal->j_state_lock);
518 return 1; 488 return 1;
519 } 489 J_ASSERT(blocknr != 0);
520
521 /* OK, update the superblock to recover the freed space.
522 * Physical blocks come first: have we wrapped beyond the end of
523 * the log? */
524 freed = blocknr - journal->j_tail;
525 if (blocknr < journal->j_tail)
526 freed = freed + journal->j_last - journal->j_first;
527
528 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
529 jbd_debug(1,
530 "Cleaning journal tail from %d to %d (offset %lu), "
531 "freeing %lu\n",
532 journal->j_tail_sequence, first_tid, blocknr, freed);
533
534 journal->j_free += freed;
535 journal->j_tail_sequence = first_tid;
536 journal->j_tail = blocknr;
537 write_unlock(&journal->j_state_lock);
538 490
539 /* 491 /*
540 * If there is an external journal, we need to make sure that 492 * We need to make sure that any blocks that were recently written out
541 * any data blocks that were recently written out --- perhaps 493 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
542 * by jbd2_log_do_checkpoint() --- are flushed out before we 494 * we drop the transactions from the journal. It's unlikely this will
543 * drop the transactions from the external journal. It's 495 * be necessary, especially with an appropriately sized journal, but we
544 * unlikely this will be necessary, especially with a 496 * need this to guarantee correctness. Fortunately
545 * appropriately sized journal, but we need this to guarantee 497 * jbd2_cleanup_journal_tail() doesn't get called all that often.
546 * correctness. Fortunately jbd2_cleanup_journal_tail()
547 * doesn't get called all that often.
548 */ 498 */
549 if ((journal->j_fs_dev != journal->j_dev) && 499 if (journal->j_flags & JBD2_BARRIER)
550 (journal->j_flags & JBD2_BARRIER))
551 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 500 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
552 if (!(journal->j_flags & JBD2_ABORT)) 501
553 jbd2_journal_update_sb_log_tail(journal); 502 __jbd2_update_log_tail(journal, first_tid, blocknr);
554 return 0; 503 return 0;
555} 504}
556 505
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6705717d9b7f..b89ef84786a7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -341,7 +341,16 @@ void jbd2_journal_commit_transaction(journal_t *journal)
341 if (journal->j_flags & JBD2_FLUSHED) { 341 if (journal->j_flags & JBD2_FLUSHED) {
342 jbd_debug(3, "super block updated\n"); 342 jbd_debug(3, "super block updated\n");
343 mutex_lock(&journal->j_checkpoint_mutex); 343 mutex_lock(&journal->j_checkpoint_mutex);
344 jbd2_journal_update_sb_log_tail(journal); 344 /*
345 * We hold j_checkpoint_mutex so tail cannot change under us.
346 * We don't need any special data guarantees for writing sb
347 * since journal is empty and it is ok for write to be
348 * flushed only with transaction commit.
349 */
350 jbd2_journal_update_sb_log_tail(journal,
351 journal->j_tail_sequence,
352 journal->j_tail,
353 WRITE_SYNC);
345 mutex_unlock(&journal->j_checkpoint_mutex); 354 mutex_unlock(&journal->j_checkpoint_mutex);
346 } else { 355 } else {
347 jbd_debug(3, "superblock not updated\n"); 356 jbd_debug(3, "superblock not updated\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fc5f2acc9f18..c5ff177400ff 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -742,6 +742,85 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
742 return jbd2_journal_add_journal_head(bh); 742 return jbd2_journal_add_journal_head(bh);
743} 743}
744 744
745/*
746 * Return tid of the oldest transaction in the journal and block in the journal
747 * where the transaction starts.
748 *
749 * If the journal is now empty, return which will be the next transaction ID
750 * we will write and where will that transaction start.
751 *
752 * The return value is 0 if journal tail cannot be pushed any further, 1 if
753 * it can.
754 */
755int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
756 unsigned long *block)
757{
758 transaction_t *transaction;
759 int ret;
760
761 read_lock(&journal->j_state_lock);
762 spin_lock(&journal->j_list_lock);
763 transaction = journal->j_checkpoint_transactions;
764 if (transaction) {
765 *tid = transaction->t_tid;
766 *block = transaction->t_log_start;
767 } else if ((transaction = journal->j_committing_transaction) != NULL) {
768 *tid = transaction->t_tid;
769 *block = transaction->t_log_start;
770 } else if ((transaction = journal->j_running_transaction) != NULL) {
771 *tid = transaction->t_tid;
772 *block = journal->j_head;
773 } else {
774 *tid = journal->j_transaction_sequence;
775 *block = journal->j_head;
776 }
777 ret = tid_gt(*tid, journal->j_tail_sequence);
778 spin_unlock(&journal->j_list_lock);
779 read_unlock(&journal->j_state_lock);
780
781 return ret;
782}
783
784/*
785 * Update information in journal structure and in on disk journal superblock
786 * about log tail. This function does not check whether information passed in
787 * really pushes log tail further. It's responsibility of the caller to make
788 * sure provided log tail information is valid (e.g. by holding
789 * j_checkpoint_mutex all the time between computing log tail and calling this
790 * function as is the case with jbd2_cleanup_journal_tail()).
791 *
792 * Requires j_checkpoint_mutex
793 */
794void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
795{
796 unsigned long freed;
797
798 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
799
800 /*
801 * We cannot afford for write to remain in drive's caches since as
802 * soon as we update j_tail, next transaction can start reusing journal
803 * space and if we lose sb update during power failure we'd replay
804 * old transaction with possibly newly overwritten data.
805 */
806 jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
807 write_lock(&journal->j_state_lock);
808 freed = block - journal->j_tail;
809 if (block < journal->j_tail)
810 freed += journal->j_last - journal->j_first;
811
812 trace_jbd2_update_log_tail(journal, tid, block, freed);
813 jbd_debug(1,
814 "Cleaning journal tail from %d to %d (offset %lu), "
815 "freeing %lu\n",
816 journal->j_tail_sequence, tid, block, freed);
817
818 journal->j_free += freed;
819 journal->j_tail_sequence = tid;
820 journal->j_tail = block;
821 write_unlock(&journal->j_state_lock);
822}
823
745struct jbd2_stats_proc_session { 824struct jbd2_stats_proc_session {
746 journal_t *journal; 825 journal_t *journal;
747 struct transaction_stats_s *stats; 826 struct transaction_stats_s *stats;
@@ -1125,18 +1204,30 @@ static int journal_reset(journal_t *journal)
1125 } else { 1204 } else {
1126 /* Lock here to make assertions happy... */ 1205 /* Lock here to make assertions happy... */
1127 mutex_lock(&journal->j_checkpoint_mutex); 1206 mutex_lock(&journal->j_checkpoint_mutex);
1128 /* Add the dynamic fields and write it to disk. */ 1207 /*
1129 jbd2_journal_update_sb_log_tail(journal); 1208 * Update log tail information. We use WRITE_FUA since new
1209 * transaction will start reusing journal space and so we
1210 * must make sure information about current log tail is on
1211 * disk before that.
1212 */
1213 jbd2_journal_update_sb_log_tail(journal,
1214 journal->j_tail_sequence,
1215 journal->j_tail,
1216 WRITE_FUA);
1130 mutex_unlock(&journal->j_checkpoint_mutex); 1217 mutex_unlock(&journal->j_checkpoint_mutex);
1131 } 1218 }
1132 return jbd2_journal_start_thread(journal); 1219 return jbd2_journal_start_thread(journal);
1133} 1220}
1134 1221
1135static void jbd2_write_superblock(journal_t *journal) 1222static void jbd2_write_superblock(journal_t *journal, int write_op)
1136{ 1223{
1137 struct buffer_head *bh = journal->j_sb_buffer; 1224 struct buffer_head *bh = journal->j_sb_buffer;
1225 int ret;
1138 1226
1139 trace_jbd2_write_superblock(journal); 1227 trace_jbd2_write_superblock(journal, write_op);
1228 if (!(journal->j_flags & JBD2_BARRIER))
1229 write_op &= ~(REQ_FUA | REQ_FLUSH);
1230 lock_buffer(bh);
1140 if (buffer_write_io_error(bh)) { 1231 if (buffer_write_io_error(bh)) {
1141 /* 1232 /*
1142 * Oh, dear. A previous attempt to write the journal 1233 * Oh, dear. A previous attempt to write the journal
@@ -1152,40 +1243,45 @@ static void jbd2_write_superblock(journal_t *journal)
1152 clear_buffer_write_io_error(bh); 1243 clear_buffer_write_io_error(bh);
1153 set_buffer_uptodate(bh); 1244 set_buffer_uptodate(bh);
1154 } 1245 }
1155 1246 get_bh(bh);
1156 BUFFER_TRACE(bh, "marking dirty"); 1247 bh->b_end_io = end_buffer_write_sync;
1157 mark_buffer_dirty(bh); 1248 ret = submit_bh(write_op, bh);
1158 sync_dirty_buffer(bh); 1249 wait_on_buffer(bh);
1159 if (buffer_write_io_error(bh)) { 1250 if (buffer_write_io_error(bh)) {
1160 printk(KERN_ERR "JBD2: I/O error detected "
1161 "when updating journal superblock for %s.\n",
1162 journal->j_devname);
1163 clear_buffer_write_io_error(bh); 1251 clear_buffer_write_io_error(bh);
1164 set_buffer_uptodate(bh); 1252 set_buffer_uptodate(bh);
1253 ret = -EIO;
1254 }
1255 if (ret) {
1256 printk(KERN_ERR "JBD2: Error %d detected when updating "
1257 "journal superblock for %s.\n", ret,
1258 journal->j_devname);
1165 } 1259 }
1166} 1260}
1167 1261
1168/** 1262/**
1169 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. 1263 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1170 * @journal: The journal to update. 1264 * @journal: The journal to update.
1265 * @tail_tid: TID of the new transaction at the tail of the log
1266 * @tail_block: The first block of the transaction at the tail of the log
1267 * @write_op: With which operation should we write the journal sb
1171 * 1268 *
1172 * Update a journal's superblock information about log tail and write it to 1269 * Update a journal's superblock information about log tail and write it to
1173 * disk, waiting for the IO to complete. 1270 * disk, waiting for the IO to complete.
1174 */ 1271 */
1175void jbd2_journal_update_sb_log_tail(journal_t *journal) 1272void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1273 unsigned long tail_block, int write_op)
1176{ 1274{
1177 journal_superblock_t *sb = journal->j_superblock; 1275 journal_superblock_t *sb = journal->j_superblock;
1178 1276
1179 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1277 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1180 read_lock(&journal->j_state_lock); 1278 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1181 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d)\n", 1279 tail_block, tail_tid);
1182 journal->j_tail, journal->j_tail_sequence);
1183 1280
1184 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1281 sb->s_sequence = cpu_to_be32(tail_tid);
1185 sb->s_start = cpu_to_be32(journal->j_tail); 1282 sb->s_start = cpu_to_be32(tail_block);
1186 read_unlock(&journal->j_state_lock);
1187 1283
1188 jbd2_write_superblock(journal); 1284 jbd2_write_superblock(journal, write_op);
1189 1285
1190 /* Log is no longer empty */ 1286 /* Log is no longer empty */
1191 write_lock(&journal->j_state_lock); 1287 write_lock(&journal->j_state_lock);
@@ -1214,7 +1310,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1214 sb->s_start = cpu_to_be32(0); 1310 sb->s_start = cpu_to_be32(0);
1215 read_unlock(&journal->j_state_lock); 1311 read_unlock(&journal->j_state_lock);
1216 1312
1217 jbd2_write_superblock(journal); 1313 jbd2_write_superblock(journal, WRITE_FUA);
1218 1314
1219 /* Log is no longer empty */ 1315 /* Log is no longer empty */
1220 write_lock(&journal->j_state_lock); 1316 write_lock(&journal->j_state_lock);
@@ -1240,7 +1336,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
1240 sb->s_errno = cpu_to_be32(journal->j_errno); 1336 sb->s_errno = cpu_to_be32(journal->j_errno);
1241 read_unlock(&journal->j_state_lock); 1337 read_unlock(&journal->j_state_lock);
1242 1338
1243 jbd2_write_superblock(journal); 1339 jbd2_write_superblock(journal, WRITE_SYNC);
1244} 1340}
1245 1341
1246/* 1342/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf1390..c1a03354a22f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/blkdev.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
265 err2 = sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
266 if (!err) 267 if (!err)
267 err = err2; 268 err = err2;
268 269 /* Make sure all replayed data is on permanent storage */
270 if (journal->j_flags & JBD2_BARRIER)
271 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
269 return err; 272 return err;
270} 273}
271 274