Linux-2.6.12-rc2 v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jbd/recovery.c
1 files changed, 591 insertions, 0 deletions
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
new file mode 100644
index 000000000000..103c34e4fb28
--- /dev/null
+++ b/fs/jbd/recovery.c
@@ -0,0 +1,591 @@
+/*
+ * linux/fs/recovery.c
+ * 
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal recovery routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.  
+ */
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#endif
+/*
+ * Maintain information about the progress of the recovery job, so that
+ * the different passes can carry information between them. 
+ *
+ * The callers in this file zero one instance and hand the same instance
+ * to every do_one_pass() call they make.
+ */
+struct recovery_info 
+{
+        tid_t           start_transaction;      /* first transaction ID expected; set by PASS_SCAN */
+        tid_t           end_transaction;        /* transaction ID at which PASS_SCAN stopped */
+        int             nr_replays;             /* blocks copied back during PASS_REPLAY */
+        int             nr_revokes;             /* revoke entries registered by scan_revoke_records() */
+        int             nr_revoke_hits;         /* blocks skipped during PASS_REPLAY because revoked */
+};
+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};    /* listed in the order journal_recover() runs them */
+static int do_one_pass(journal_t *journal,
+                                struct recovery_info *info, enum passtype pass);
+static int scan_revoke_records(journal_t *, struct buffer_head *,
+                                tid_t, struct recovery_info *);
+#ifdef __KERNEL__
+/* Drop one reference on each of the n buffer_heads in b[], last entry first. */
+void journal_brelse_array(struct buffer_head *b[], int n)
+{
+        int idx;
+        for (idx = n - 1; idx >= 0; idx--)
+                brelse (b[idx]);
+}
+/*
+ * Journal reads go straight to the block device layer, so nothing
+ * performs readahead on our behalf.  Recovery is essentially one long
+ * sequential read, so we queue the IO ourselves in reasonably large
+ * chunks.
+ *
+ * The window does not need to be clever: 128K is an arbitrary,
+ * good-enough fixed size.  Reads are handed to the block layer in
+ * batches of at most MAXBUF buffer_heads.
+ *
+ * Returns 0 on success, the error from journal_bmap(), or -ENOMEM if a
+ * buffer could not be obtained.  Buffers still queued when an error is
+ * hit are released without being submitted.
+ */
+#define MAXBUF 8
+static int do_readahead(journal_t *journal, unsigned int start)
+{
+        struct buffer_head *pending[MAXBUF];
+        unsigned int queued = 0;
+        unsigned int limit, pos;
+        int err = 0;
+        /* Window is [start, start + 128K), clipped to the journal length */
+        limit = start + (128 * 1024 / journal->j_blocksize);
+        if (limit > journal->j_maxlen)
+                limit = journal->j_maxlen;
+        for (pos = start; pos < limit; pos++) {
+                unsigned long blocknr;
+                struct buffer_head *bh;
+                err = journal_bmap(journal, pos, &blocknr);
+                if (err) {
+                        printk (KERN_ERR "JBD: bad block at offset %u\n",
+                                pos);
+                        break;
+                }
+                bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+                if (!bh) {
+                        err = -ENOMEM;
+                        break;
+                }
+                /* Nothing to do for blocks already read or under IO */
+                if (buffer_uptodate(bh) || buffer_locked(bh)) {
+                        brelse(bh);
+                        continue;
+                }
+                pending[queued++] = bh;
+                if (queued < MAXBUF)
+                        continue;
+                /* Batch is full: submit it and start a new one */
+                ll_rw_block(READ, queued, pending);
+                journal_brelse_array(pending, queued);
+                queued = 0;
+        }
+        /* Submit the final partial batch, but only if nothing failed */
+        if (!err && queued)
+                ll_rw_block(READ, queued, pending);
+        if (queued)
+                journal_brelse_array(pending, queued);
+        return err;
+}
+#endif /* __KERNEL__ */
+/*
+ * Read one block from the journal.
+ *
+ * @offset is a block index within the journal and must be below
+ * journal->j_maxlen.  On success *bhp holds the up-to-date buffer and 0
+ * is returned; the caller must brelse() it.  On failure *bhp is NULL and
+ * the return value is -EIO (offset out of range, or the read did not
+ * complete), -ENOMEM (no buffer), or the error from journal_bmap().
+ */
+static int jread(struct buffer_head **bhp, journal_t *journal, 
+                 unsigned int offset)
+{
+        struct buffer_head *buf;
+        unsigned long phys;
+        int rc;
+        *bhp = NULL;
+        if (offset >= journal->j_maxlen) {
+                printk(KERN_ERR "JBD: corrupted journal superblock\n");
+                return -EIO;
+        }
+        rc = journal_bmap(journal, offset, &phys);
+        if (rc) {
+                printk (KERN_ERR "JBD: bad block at offset %u\n",
+                        offset);
+                return rc;
+        }
+        buf = __getblk(journal->j_dev, phys, journal->j_blocksize);
+        if (!buf)
+                return -ENOMEM;
+        if (!buffer_uptodate(buf)) {
+                /* A buffer never submitted for IO kicks off readahead
+                   from here (its result is deliberately ignored);
+                   otherwise a read is assumed to be in flight already. */
+                if (!buffer_req(buf))
+                        do_readahead(journal, offset);
+                wait_on_buffer(buf);
+        }
+        if (buffer_uptodate(buf)) {
+                *bhp = buf;
+                return 0;
+        }
+        printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+                offset);
+        brelse(buf);
+        return -EIO;
+}
+/*
+ * Count the number of in-use tags in a journal descriptor block.
+ *
+ * Tags start right after the journal header.  A tag without
+ * JFS_FLAG_SAME_UUID is followed by 16 extra bytes that are stepped
+ * over.  Counting stops at the tag carrying JFS_FLAG_LAST_TAG, or when
+ * another whole tag would no longer fit within @size bytes.
+ */
+static int count_tags(struct buffer_head *bh, int size)
+{
+        char *cursor = &bh->b_data[sizeof(journal_header_t)];
+        int found = 0;
+        for (;;) {
+                journal_block_tag_t *tag;
+                if ((cursor - bh->b_data + sizeof(journal_block_tag_t)) > size)
+                        break;
+                tag = (journal_block_tag_t *) cursor;
+                found++;
+                if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
+                        break;
+                cursor += sizeof(journal_block_tag_t);
+                if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
+                        cursor += 16;
+        }
+        return found;
+}
+/*
+ * Make sure we wrap around the log correctly!
+ *
+ * If @var has reached or passed journal->j_last, pull it back by the
+ * length of the log area (j_last - j_first).  @var is evaluated more
+ * than once and assigned to, so it must be a plain lvalue with no side
+ * effects.
+ */
+#define wrap(journal, var)                                              \
+do {                                                                    \
+        if ((var) >= (journal)->j_last)                                 \
+                (var) -= ((journal)->j_last - (journal)->j_first);      \
+} while (0)
+/**
+ * int journal_recover(journal_t *journal) - recovers an on-disk journal
+ * @journal: the journal to recover
+ * 
+ * The primary function for recovering the log contents when mounting a
+ * journaled device.  
+ *
+ * Recovery is done in three passes.  In the first pass, we look for the
+ * end of the log.  In the second, we assemble the list of revoke
+ * blocks.  In the third and final pass, we replay any un-revoked blocks
+ * in the log.  
+ *
+ * Returns 0 on success.  Otherwise returns the error from the first
+ * pass that failed or, if every pass succeeded, the error from flushing
+ * the replayed blocks to the filesystem device.
+ */
+int journal_recover(journal_t *journal)
+{
+        int                     err, err2;
+        journal_superblock_t *  sb;
+        struct recovery_info    info;
+        memset(&info, 0, sizeof(info));
+        sb = journal->j_superblock;
+        /* 
+         * The journal superblock's s_start field (the current log head)
+         * is always zero if, and only if, the journal was cleanly
+         * unmounted.  
+         */
+        if (!sb->s_start) {
+                jbd_debug(1, "No recovery required, last transaction %d\n",
+                          be32_to_cpu(sb->s_sequence));
+                journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+                return 0;
+        }
+        /* Each pass runs only if the previous one succeeded */
+        err = do_one_pass(journal, &info, PASS_SCAN);
+        if (!err)
+                err = do_one_pass(journal, &info, PASS_REVOKE);
+        if (!err)
+                err = do_one_pass(journal, &info, PASS_REPLAY);
+        jbd_debug(0, "JBD: recovery, exit status %d, "
+                  "recovered transactions %u to %u\n",
+                  err, info.start_transaction, info.end_transaction);
+        jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 
+                  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+        /* Restart the log at the next transaction ID, thus invalidating
+         * any existing commit records in the log. */
+        journal->j_transaction_sequence = ++info.end_transaction;
+        journal_clear_revoke(journal);
+        /* The replay pass only marks buffers dirty.  If flushing them
+         * fails, the caller must hear about it, but an earlier pass
+         * error takes precedence. */
+        err2 = sync_blockdev(journal->j_fs_dev);
+        if (!err)
+                err = err2;
+        return err;
+}
+/**
+ * int journal_skip_recovery() - Start journal and wipe existing records 
+ * @journal: journal to startup
+ * 
+ * Locate any valid recovery information from the journal and set up the
+ * journal structures in memory to ignore it (presumably because the
+ * caller has evidence that it is out of date).  
+ * This function doesn't appear to be exported.
+ *
+ * A single scan pass over the journal lets us tell the user how much
+ * recovery information is being discarded and lets us initialise the
+ * journal transaction sequence number to the next unused ID.  If the
+ * scan fails, the sequence number is simply advanced by one and the
+ * scan error is returned.  The journal tail is reset to 0 either way.
+ */
+int journal_skip_recovery(journal_t *journal)
+{
+        struct recovery_info    info;
+        journal_superblock_t *  sb;
+        int                     err;
+        memset (&info, 0, sizeof(info));
+        sb = journal->j_superblock;
+        err = do_one_pass(journal, &info, PASS_SCAN);
+        if (!err) {
+#ifdef CONFIG_JBD_DEBUG
+                int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+#endif
+                jbd_debug(0, 
+                          "JBD: ignoring %d transaction%s from the journal.\n",
+                          dropped, (dropped == 1) ? "" : "s");
+                /* Continue numbering just past the last transaction seen */
+                journal->j_transaction_sequence = info.end_transaction + 1;
+        } else {
+                printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+                ++journal->j_transaction_sequence;
+        }
+        journal->j_tail = 0;
+        return err;
+}
+/*
+ * Walk the journal once, starting from the position and transaction ID
+ * recorded in the journal superblock, doing the work that @pass calls
+ * for:
+ *
+ *   PASS_SCAN   - find where the valid log ends; stores the first and
+ *                 last transaction IDs in @info.
+ *   PASS_REVOKE - hand every revoke block to scan_revoke_records().
+ *   PASS_REPLAY - copy each logged block that is not revoked into a
+ *                 buffer on journal->j_fs_dev and mark it dirty.
+ *
+ * Returns 0 on success or a negative errno.  An IO error on a single
+ * logged block during replay does not stop the pass; it is remembered
+ * and returned once the pass has finished.  The non-scan passes return
+ * -EIO if they stop at a different transaction than PASS_SCAN did.
+ */
+static int do_one_pass(journal_t *journal,
+                        struct recovery_info *info, enum passtype pass)
+{
+        unsigned int            first_commit_ID, next_commit_ID;
+        unsigned long           next_log_block;
+        int                     err, success = 0;
+        journal_superblock_t *  sb;
+        journal_header_t *      tmp;
+        struct buffer_head *    bh;
+        unsigned int            sequence;
+        int                     blocktype;
+        /* 
+         * First thing is to establish what we expect to find in the log
+         * (in terms of transaction IDs), and where (in terms of log
+         * block offsets): query the superblock.  
+         */
+        sb = journal->j_superblock;
+        next_commit_ID = be32_to_cpu(sb->s_sequence);
+        next_log_block = be32_to_cpu(sb->s_start);
+        first_commit_ID = next_commit_ID;
+        if (pass == PASS_SCAN)
+                info->start_transaction = first_commit_ID;
+        jbd_debug(1, "Starting recovery pass %d\n", pass);
+        /*
+         * Now we walk through the log, transaction by transaction,
+         * making sure that each transaction has a commit block in the
+         * expected place.  Each complete transaction gets replayed back
+         * into the main filesystem. 
+         */
+        while (1) {
+                int                     flags;
+                char *                  tagp;
+                journal_block_tag_t *   tag;
+                struct buffer_head *    obh;
+                struct buffer_head *    nbh;
+                cond_resched();         /* We're under lock_kernel() */
+                /* If we already know where to stop the log traversal,
+                 * check right now that we haven't gone past the end of
+                 * the log. */
+                if (pass != PASS_SCAN)
+                        if (tid_geq(next_commit_ID, info->end_transaction))
+                                break;
+                jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+                          next_commit_ID, next_log_block, journal->j_last);
+                /* Skip over each chunk of the transaction looking
+                 * either the next descriptor block or the final commit
+                 * record. */
+                jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+                err = jread(&bh, journal, next_log_block);
+                if (err)
+                        goto failed;
+                next_log_block++;
+                wrap(journal, next_log_block);
+                /* What kind of buffer is it? 
+                 * 
+                 * If it is a descriptor block, check that it has the
+                 * expected sequence number.  Otherwise, we're all done
+                 * here. */
+                tmp = (journal_header_t *)bh->b_data;
+                if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
+                        brelse(bh);
+                        break;
+                }
+                blocktype = be32_to_cpu(tmp->h_blocktype);
+                sequence = be32_to_cpu(tmp->h_sequence);
+                jbd_debug(3, "Found magic %d, sequence %d\n", 
+                          blocktype, sequence);
+                if (sequence != next_commit_ID) {
+                        brelse(bh);
+                        break;
+                }
+                /* OK, we have a valid descriptor block which matches
+                 * all of the sequence number checks.  What are we going
+                 * to do with it?  That depends on the pass... */
+                switch(blocktype) {
+                case JFS_DESCRIPTOR_BLOCK:
+                        /* If it is a valid descriptor block, replay it
+                         * in pass REPLAY; otherwise, just skip over the
+                         * blocks it describes. */
+                        if (pass != PASS_REPLAY) {
+                                next_log_block +=
+                                        count_tags(bh, journal->j_blocksize);
+                                wrap(journal, next_log_block);
+                                brelse(bh);
+                                continue;
+                        }
+                        /* A descriptor block: we can now write all of
+                         * the data blocks.  Yay, useful work is finally
+                         * getting done here! */
+                        tagp = &bh->b_data[sizeof(journal_header_t)];
+                        while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+                               <= journal->j_blocksize) {
+                                unsigned long io_block;
+                                tag = (journal_block_tag_t *) tagp;
+                                flags = be32_to_cpu(tag->t_flags);
+                                /* Each tag consumes one log block, even
+                                 * if that block turns out unreadable. */
+                                io_block = next_log_block++;
+                                wrap(journal, next_log_block);
+                                err = jread(&obh, journal, io_block);
+                                if (err) {
+                                        /* Recover what we can, but
+                                         * report failure at the end. */
+                                        success = err;
+                                        printk (KERN_ERR 
+                                                "JBD: IO error %d recovering "
+                                                "block %ld in log\n",
+                                                err, io_block);
+                                } else {
+                                        unsigned long blocknr;
+                                        J_ASSERT(obh != NULL);
+                                        blocknr = be32_to_cpu(tag->t_blocknr);
+                                        /* If the block has been
+                                         * revoked, then we're all done
+                                         * here. */
+                                        if (journal_test_revoke
+                                            (journal, blocknr, 
+                                             next_commit_ID)) {
+                                                brelse(obh);
+                                                ++info->nr_revoke_hits;
+                                                goto skip_write;
+                                        }
+                                        /* Find a buffer for the new
+                                         * data being restored */
+                                        nbh = __getblk(journal->j_fs_dev,
+                                                        blocknr,
+                                                        journal->j_blocksize);
+                                        if (nbh == NULL) {
+                                                printk(KERN_ERR 
+                                                       "JBD: Out of memory "
+                                                       "during recovery.\n");
+                                                err = -ENOMEM;
+                                                brelse(bh);
+                                                brelse(obh);
+                                                goto failed;
+                                        }
+                                        lock_buffer(nbh);
+                                        memcpy(nbh->b_data, obh->b_data,
+                                                        journal->j_blocksize);
+                                        /* An escaped block gets the magic
+                                         * number put back over its first
+                                         * four bytes.  That belongs in the
+                                         * restored copy (nbh), never in the
+                                         * descriptor block (bh) we are
+                                         * still walking. */
+                                        if (flags & JFS_FLAG_ESCAPE) {
+                                                *((__be32 *)nbh->b_data) =
+                                                cpu_to_be32(JFS_MAGIC_NUMBER);
+                                        }
+                                        BUFFER_TRACE(nbh, "marking dirty");
+                                        set_buffer_uptodate(nbh);
+                                        mark_buffer_dirty(nbh);
+                                        BUFFER_TRACE(nbh, "marking uptodate");
+                                        ++info->nr_replays;
+                                        /* ll_rw_block(WRITE, 1, &nbh); */
+                                        unlock_buffer(nbh);
+                                        brelse(obh);
+                                        brelse(nbh);
+                                }
+                        skip_write:
+                                /* Step to the next tag; one without
+                                 * SAME_UUID is followed by 16 more bytes. */
+                                tagp += sizeof(journal_block_tag_t);
+                                if (!(flags & JFS_FLAG_SAME_UUID))
+                                        tagp += 16;
+                                if (flags & JFS_FLAG_LAST_TAG)
+                                        break;
+                        }
+                        brelse(bh);
+                        continue;
+                case JFS_COMMIT_BLOCK:
+                        /* Found an expected commit block: not much to
+                         * do other than move on to the next sequence
+                         * number. */
+                        brelse(bh);
+                        next_commit_ID++;
+                        continue;
+                case JFS_REVOKE_BLOCK:
+                        /* If we aren't in the REVOKE pass, then we can
+                         * just skip over this block. */
+                        if (pass != PASS_REVOKE) {
+                                brelse(bh);
+                                continue;
+                        }
+                        err = scan_revoke_records(journal, bh,
+                                                  next_commit_ID, info);
+                        brelse(bh);
+                        if (err)
+                                goto failed;
+                        continue;
+                default:
+                        jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+                                  blocktype);
+                        /* Drop our reference before leaving the loop,
+                         * as every other exit path does. */
+                        brelse(bh);
+                        goto done;
+                }
+        }
+ done:
+        /* 
+         * We broke out of the log scan loop: either we came to the
+         * known end of the log or we found an unexpected block in the
+         * log.  If the latter happened, then we know that the "current"
+         * transaction marks the end of the valid log.
+         */
+        if (pass == PASS_SCAN)
+                info->end_transaction = next_commit_ID;
+        else {
+                /* It's really bad news if different passes end up at
+                 * different places (but possible due to IO errors). */
+                if (info->end_transaction != next_commit_ID) {
+                        printk (KERN_ERR "JBD: recovery pass %d ended at "
+                                "transaction %u, expected %u\n",
+                                pass, next_commit_ID, info->end_transaction);
+                        if (!success)
+                                success = -EIO;
+                }
+        }
+        return success;
+ failed:
+        return err;
+}
+/*
+ * Scan a revoke record, marking all blocks mentioned as revoked.
+ *
+ * @bh holds the revoke block and @sequence is the ID of the transaction
+ * it belongs to.  The header's r_count is used as the byte offset at
+ * which the entries end; each entry is a 4-byte big-endian block number
+ * passed to journal_set_revoke().  info->nr_revokes is bumped once per
+ * entry registered.
+ *
+ * Returns 0 on success, -EINVAL if r_count claims more bytes than a
+ * journal block holds, or the error from journal_set_revoke().
+ */
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 
+                               tid_t sequence, struct recovery_info *info)
+{
+        journal_revoke_header_t *header;
+        int offset, max;
+        header = (journal_revoke_header_t *) bh->b_data;
+        offset = sizeof(journal_revoke_header_t);
+        max = be32_to_cpu(header->r_count);
+        /* r_count comes straight off disk: never let a corrupt value
+         * walk us past the end of the buffer. */
+        if (max > journal->j_blocksize)
+                return -EINVAL;
+        /* Only consume whole entries; a trailing fragment is ignored. */
+        while (offset + 4 <= max) {
+                unsigned long blocknr;
+                int err;
+                blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
+                offset += 4;
+                err = journal_set_revoke(journal, blocknr, sequence);
+                if (err)
+                        return err;
+                ++info->nr_revokes;
+        }
+        return 0;
+}
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jbd/recovery.c

diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c new file mode 100644 index 000000000000..103c34e4fb28 --- /dev/null +++ b/fs/jbd/recovery.c
@@ -0,0 +1,591 @@
	1	/*
	2	* linux/fs/recovery.c
	3	*
	4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
	5	*
	6	* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
	7	*
	8	* This file is part of the Linux kernel and is made available under
	9	* the terms of the GNU General Public License, version 2, or at your
	10	* option, any later version, incorporated herein by reference.
	11	*
	12	* Journal recovery routines for the generic filesystem journaling code;
	13	* part of the ext2fs journaling system.
	14	*/
	15
	16	#ifndef __KERNEL__
	17	#include "jfs_user.h"
	18	#else
	19	#include <linux/time.h>
	20	#include <linux/fs.h>
	21	#include <linux/jbd.h>
	22	#include <linux/errno.h>
	23	#include <linux/slab.h>
	24	#endif
	25
	26	/*
	27	* Maintain information about the progress of the recovery job, so that
	28	* the different passes can carry information between them.
	29	*/
	30	struct recovery_info
	31	{
	32	tid_t start_transaction;
	33	tid_t end_transaction;
	34
	35	int nr_replays;
	36	int nr_revokes;
	37	int nr_revoke_hits;
	38	};
	39
	40	enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
	41	static int do_one_pass(journal_t *journal,
	42	struct recovery_info *info, enum passtype pass);
	43	static int scan_revoke_records(journal_t *, struct buffer_head *,
	44	tid_t, struct recovery_info *);
	45
	46	#ifdef __KERNEL__
	47
	48	/* Release readahead buffers after use */
	49	void journal_brelse_array(struct buffer_head *b[], int n)
	50	{
	51	while (--n >= 0)
	52	brelse (b[n]);
	53	}
	54
	55
	56	/*
	57	* When reading from the journal, we are going through the block device
	58	* layer directly and so there is no readahead being done for us. We
	59	* need to implement any readahead ourselves if we want it to happen at
	60	* all. Recovery is basically one long sequential read, so make sure we
	61	* do the IO in reasonably large chunks.
	62	*
	63	* This is not so critical that we need to be enormously clever about
	64	* the readahead size, though. 128K is a purely arbitrary, good-enough
	65	* fixed value.
	66	*/
	67
	68	#define MAXBUF 8
	69	static int do_readahead(journal_t *journal, unsigned int start)
	70	{
	71	int err;
	72	unsigned int max, nbufs, next;
	73	unsigned long blocknr;
	74	struct buffer_head *bh;
	75
	76	struct buffer_head * bufs[MAXBUF];
	77
	78	/* Do up to 128K of readahead */
	79	max = start + (128 * 1024 / journal->j_blocksize);
	80	if (max > journal->j_maxlen)
	81	max = journal->j_maxlen;
	82
	83	/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
	84	* a time to the block device IO layer. */
	85
	86	nbufs = 0;
	87
	88	for (next = start; next < max; next++) {
	89	err = journal_bmap(journal, next, &blocknr);
	90
	91	if (err) {
	92	printk (KERN_ERR "JBD: bad block at offset %u\n",
	93	next);
	94	goto failed;
	95	}
	96
	97	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	98	if (!bh) {
	99	err = -ENOMEM;
	100	goto failed;
	101	}
	102
	103	if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
	104	bufs[nbufs++] = bh;
	105	if (nbufs == MAXBUF) {
	106	ll_rw_block(READ, nbufs, bufs);
	107	journal_brelse_array(bufs, nbufs);
	108	nbufs = 0;
	109	}
	110	} else
	111	brelse(bh);
	112	}
	113
	114	if (nbufs)
	115	ll_rw_block(READ, nbufs, bufs);
	116	err = 0;
	117
	118	failed:
	119	if (nbufs)
	120	journal_brelse_array(bufs, nbufs);
	121	return err;
	122	}
	123
	124	#endif /* __KERNEL__ */
	125
	126
	127	/*
	128	* Read a block from the journal
	129	*/
	130
	131	static int jread(struct buffer_head **bhp, journal_t *journal,
	132	unsigned int offset)
	133	{
	134	int err;
	135	unsigned long blocknr;
	136	struct buffer_head *bh;
	137
	138	*bhp = NULL;
	139
	140	if (offset >= journal->j_maxlen) {
	141	printk(KERN_ERR "JBD: corrupted journal superblock\n");
	142	return -EIO;
	143	}
	144
	145	err = journal_bmap(journal, offset, &blocknr);
	146
	147	if (err) {
	148	printk (KERN_ERR "JBD: bad block at offset %u\n",
	149	offset);
	150	return err;
	151	}
	152
	153	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	154	if (!bh)
	155	return -ENOMEM;
	156
	157	if (!buffer_uptodate(bh)) {
	158	/* If this is a brand new buffer, start readahead.
	159	Otherwise, we assume we are already reading it. */
	160	if (!buffer_req(bh))
	161	do_readahead(journal, offset);
	162	wait_on_buffer(bh);
	163	}
	164
	165	if (!buffer_uptodate(bh)) {
	166	printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
	167	offset);
	168	brelse(bh);
	169	return -EIO;
	170	}
	171
	172	*bhp = bh;
	173	return 0;
	174	}
	175
	176
	177	/*
	178	* Count the number of in-use tags in a journal descriptor block.
	179	*/
	180
	181	static int count_tags(struct buffer_head *bh, int size)
	182	{
	183	char * tagp;
	184	journal_block_tag_t * tag;
	185	int nr = 0;
	186
	187	tagp = &bh->b_data[sizeof(journal_header_t)];
	188
	189	while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
	190	tag = (journal_block_tag_t *) tagp;
	191
	192	nr++;
	193	tagp += sizeof(journal_block_tag_t);
	194	if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
	195	tagp += 16;
	196
	197	if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
	198	break;
	199	}
	200
	201	return nr;
	202	}
	203
	204
	205	/* Make sure we wrap around the log correctly! */
	206	#define wrap(journal, var) \
	207	do { \
	208	if (var >= (journal)->j_last) \
	209	var -= ((journal)->j_last - (journal)->j_first); \
	210	} while (0)
	211
	212	/**
	213	* int journal_recover(journal_t *journal) - recovers a on-disk journal
	214	* @journal: the journal to recover
	215	*
	216	* The primary function for recovering the log contents when mounting a
	217	* journaled device.
	218	*
	219	* Recovery is done in three passes. In the first pass, we look for the
	220	* end of the log. In the second, we assemble the list of revoke
	221	* blocks. In the third and final pass, we replay any un-revoked blocks
	222	* in the log.
	223	*/
	224	int journal_recover(journal_t *journal)
	225	{
	226	int err;
	227	journal_superblock_t * sb;
	228
	229	struct recovery_info info;
	230
	231	memset(&info, 0, sizeof(info));
	232	sb = journal->j_superblock;
	233
	234	/*
	235	* The journal superblock's s_start field (the current log head)
	236	* is always zero if, and only if, the journal was cleanly
	237	* unmounted.
	238	*/
	239
	240	if (!sb->s_start) {
	241	jbd_debug(1, "No recovery required, last transaction %d\n",
	242	be32_to_cpu(sb->s_sequence));
	243	journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
	244	return 0;
	245	}
	246
	247	err = do_one_pass(journal, &info, PASS_SCAN);
	248	if (!err)
	249	err = do_one_pass(journal, &info, PASS_REVOKE);
	250	if (!err)
	251	err = do_one_pass(journal, &info, PASS_REPLAY);
	252
	253	jbd_debug(0, "JBD: recovery, exit status %d, "
	254	"recovered transactions %u to %u\n",
	255	err, info.start_transaction, info.end_transaction);
	256	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
	257	info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
	258
	259	/* Restart the log at the next transaction ID, thus invalidating
	260	* any existing commit records in the log. */
	261	journal->j_transaction_sequence = ++info.end_transaction;
	262
	263	journal_clear_revoke(journal);
	264	sync_blockdev(journal->j_fs_dev);
	265	return err;
	266	}
	267
	268	/**
	269	* int journal_skip_recovery() - Start journal and wipe exiting records
	270	* @journal: journal to startup
	271	*
	272	* Locate any valid recovery information from the journal and set up the
	273	* journal structures in memory to ignore it (presumably because the
	274	* caller has evidence that it is out of date).
	275	* This function does'nt appear to be exorted..
	276	*
	277	* We perform one pass over the journal to allow us to tell the user how
	278	* much recovery information is being erased, and to let us initialise
	279	* the journal transaction sequence numbers to the next unused ID.
	280	*/
	281	int journal_skip_recovery(journal_t *journal)
	282	{
	283	int err;
	284	journal_superblock_t * sb;
	285
	286	struct recovery_info info;
	287
	288	memset (&info, 0, sizeof(info));
	289	sb = journal->j_superblock;
	290
	291	err = do_one_pass(journal, &info, PASS_SCAN);
	292
	293	if (err) {
	294	printk(KERN_ERR "JBD: error %d scanning journal\n", err);
	295	++journal->j_transaction_sequence;
	296	} else {
	297	#ifdef CONFIG_JBD_DEBUG
	298	int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
	299	#endif
	300	jbd_debug(0,
	301	"JBD: ignoring %d transaction%s from the journal.\n",
	302	dropped, (dropped == 1) ? "" : "s");
	303	journal->j_transaction_sequence = ++info.end_transaction;
	304	}
	305
	306	journal->j_tail = 0;
	307	return err;
	308	}
	309
	310	static int do_one_pass(journal_t *journal,
	311	struct recovery_info *info, enum passtype pass)
	312	{
	313	unsigned int first_commit_ID, next_commit_ID;
	314	unsigned long next_log_block;
	315	int err, success = 0;
	316	journal_superblock_t * sb;
	317	journal_header_t * tmp;
	318	struct buffer_head * bh;
	319	unsigned int sequence;
	320	int blocktype;
	321
	322	/* Precompute the maximum metadata descriptors in a descriptor block */
	323	int MAX_BLOCKS_PER_DESC;
	324	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
	325	/ sizeof(journal_block_tag_t));
	326
	327	/*
	328	* First thing is to establish what we expect to find in the log
	329	* (in terms of transaction IDs), and where (in terms of log
	330	* block offsets): query the superblock.
	331	*/
	332
	333	sb = journal->j_superblock;
	334	next_commit_ID = be32_to_cpu(sb->s_sequence);
	335	next_log_block = be32_to_cpu(sb->s_start);
	336
	337	first_commit_ID = next_commit_ID;
	338	if (pass == PASS_SCAN)
	339	info->start_transaction = first_commit_ID;
	340
	341	jbd_debug(1, "Starting recovery pass %d\n", pass);
	342
	343	/*
	344	* Now we walk through the log, transaction by transaction,
	345	* making sure that each transaction has a commit block in the
	346	* expected place. Each complete transaction gets replayed back
	347	* into the main filesystem.
	348	*/
	349
	350	while (1) {
	351	int flags;
	352	char * tagp;
	353	journal_block_tag_t * tag;
	354	struct buffer_head * obh;
	355	struct buffer_head * nbh;
	356
	357	cond_resched(); /* We're under lock_kernel() */
	358
	359	/* If we already know where to stop the log traversal,
	360	* check right now that we haven't gone past the end of
	361	* the log. */
	362
	363	if (pass != PASS_SCAN)
	364	if (tid_geq(next_commit_ID, info->end_transaction))
	365	break;
	366
	367	jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
	368	next_commit_ID, next_log_block, journal->j_last);
	369
	370	/* Skip over each chunk of the transaction looking
	371	* either the next descriptor block or the final commit
	372	* record. */
	373
	374	jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
	375	err = jread(&bh, journal, next_log_block);
	376	if (err)
	377	goto failed;
	378
	379	next_log_block++;
	380	wrap(journal, next_log_block);
	381
	382	/* What kind of buffer is it?
	383	*
	384	* If it is a descriptor block, check that it has the
	385	* expected sequence number. Otherwise, we're all done
	386	* here. */
	387
	388	tmp = (journal_header_t *)bh->b_data;
	389
	390	if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
	391	brelse(bh);
	392	break;
	393	}
	394
	395	blocktype = be32_to_cpu(tmp->h_blocktype);
	396	sequence = be32_to_cpu(tmp->h_sequence);
	397	jbd_debug(3, "Found magic %d, sequence %d\n",
	398	blocktype, sequence);
	399
	400	if (sequence != next_commit_ID) {
	401	brelse(bh);
	402	break;
	403	}
	404
	405	/* OK, we have a valid descriptor block which matches
	406	* all of the sequence number checks. What are we going
	407	* to do with it? That depends on the pass... */
	408
	409	switch(blocktype) {
	410	case JFS_DESCRIPTOR_BLOCK:
	411	/* If it is a valid descriptor block, replay it
	412	* in pass REPLAY; otherwise, just skip over the
	413	* blocks it describes. */
	414	if (pass != PASS_REPLAY) {
	415	next_log_block +=
	416	count_tags(bh, journal->j_blocksize);
	417	wrap(journal, next_log_block);
	418	brelse(bh);
	419	continue;
	420	}
	421
	422	/* A descriptor block: we can now write all of
	423	* the data blocks. Yay, useful work is finally
	424	* getting done here! */
	425
	426	tagp = &bh->b_data[sizeof(journal_header_t)];
	427	while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
	428	<= journal->j_blocksize) {
	429	unsigned long io_block;
	430
	431	tag = (journal_block_tag_t *) tagp;
	432	flags = be32_to_cpu(tag->t_flags);
	433
	434	io_block = next_log_block++;
	435	wrap(journal, next_log_block);
	436	err = jread(&obh, journal, io_block);
	437	if (err) {
	438	/* Recover what we can, but
	439	* report failure at the end. */
	440	success = err;
	441	printk (KERN_ERR
	442	"JBD: IO error %d recovering "
	443	"block %ld in log\n",
	444	err, io_block);
	445	} else {
	446	unsigned long blocknr;
	447
	448	J_ASSERT(obh != NULL);
	449	blocknr = be32_to_cpu(tag->t_blocknr);
	450
	451	/* If the block has been
	452	* revoked, then we're all done
	453	* here. */
	454	if (journal_test_revoke
	455	(journal, blocknr,
	456	next_commit_ID)) {
	457	brelse(obh);
	458	++info->nr_revoke_hits;
	459	goto skip_write;
	460	}
	461
	462	/* Find a buffer for the new
	463	* data being restored */
	464	nbh = __getblk(journal->j_fs_dev,
	465	blocknr,
	466	journal->j_blocksize);
	467	if (nbh == NULL) {
	468	printk(KERN_ERR
	469	"JBD: Out of memory "
	470	"during recovery.\n");
	471	err = -ENOMEM;
	472	brelse(bh);
	473	brelse(obh);
	474	goto failed;
	475	}
	476
	477	lock_buffer(nbh);
	478	memcpy(nbh->b_data, obh->b_data,
	479	journal->j_blocksize);
	480	if (flags & JFS_FLAG_ESCAPE) {
	481	((__be32 )bh->b_data) =
	482	cpu_to_be32(JFS_MAGIC_NUMBER);
	483	}
	484
	485	BUFFER_TRACE(nbh, "marking dirty");
	486	set_buffer_uptodate(nbh);
	487	mark_buffer_dirty(nbh);
	488	BUFFER_TRACE(nbh, "marking uptodate");
	489	++info->nr_replays;
	490	/* ll_rw_block(WRITE, 1, &nbh); */
	491	unlock_buffer(nbh);
	492	brelse(obh);
	493	brelse(nbh);
	494	}
	495
	496	skip_write:
	497	tagp += sizeof(journal_block_tag_t);
	498	if (!(flags & JFS_FLAG_SAME_UUID))
	499	tagp += 16;
	500
	501	if (flags & JFS_FLAG_LAST_TAG)
	502	break;
	503	}
	504
	505	brelse(bh);
	506	continue;
	507
	508	case JFS_COMMIT_BLOCK:
	509	/* Found an expected commit block: not much to
	510	* do other than move on to the next sequence
	511	* number. */
	512	brelse(bh);
	513	next_commit_ID++;
	514	continue;
	515
	516	case JFS_REVOKE_BLOCK:
	517	/* If we aren't in the REVOKE pass, then we can
	518	* just skip over this block. */
	519	if (pass != PASS_REVOKE) {
	520	brelse(bh);
	521	continue;
	522	}
	523
	524	err = scan_revoke_records(journal, bh,
	525	next_commit_ID, info);
	526	brelse(bh);
	527	if (err)
	528	goto failed;
	529	continue;
	530
	531	default:
	532	jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
	533	blocktype);
	534	goto done;
	535	}
	536	}
	537
	538	done:
	539	/*
	540	* We broke out of the log scan loop: either we came to the
	541	* known end of the log or we found an unexpected block in the
	542	* log. If the latter happened, then we know that the "current"
	543	* transaction marks the end of the valid log.
	544	*/
	545
	546	if (pass == PASS_SCAN)
	547	info->end_transaction = next_commit_ID;
	548	else {
	549	/* It's really bad news if different passes end up at
	550	* different places (but possible due to IO errors). */
	551	if (info->end_transaction != next_commit_ID) {
	552	printk (KERN_ERR "JBD: recovery pass %d ended at "
	553	"transaction %u, expected %u\n",
	554	pass, next_commit_ID, info->end_transaction);
	555	if (!success)
	556	success = -EIO;
	557	}
	558	}
	559
	560	return success;
	561
	562	failed:
	563	return err;
	564	}
	565
	566
	567	/* Scan a revoke record, marking all blocks mentioned as revoked. */
	568
	569	static int scan_revoke_records(journal_t journal, struct buffer_head bh,
	570	tid_t sequence, struct recovery_info *info)
	571	{
	572	journal_revoke_header_t *header;
	573	int offset, max;
	574
	575	header = (journal_revoke_header_t *) bh->b_data;
	576	offset = sizeof(journal_revoke_header_t);
	577	max = be32_to_cpu(header->r_count);
	578
	579	while (offset < max) {
	580	unsigned long blocknr;
	581	int err;
	582
	583	blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
	584	offset += 4;
	585	err = journal_set_revoke(journal, blocknr, sequence);
	586	if (err)
	587	return err;
	588	++info->nr_revokes;
	589	}
	590	return 0;
	591	}