author		Sunil Mushran <sunil.mushran@oracle.com>	2008-07-14 20:31:10 -0400
committer	Mark Fasheh <mfasheh@suse.com>	2008-07-31 19:21:14 -0400
commit		539d8264093560b917ee3afe4c7f74e5da09d6a5 (patch)
tree		1fce83387272c0b2d61bd945769f4984aa5e79ce /fs
parent		c69991aac71a8beb57c11d651c7fd4b24c32aa8b (diff)
[PATCH 2/2] ocfs2: Fix race between mount and recovery
As the fs recovery is asynchronous, there is a small chance that another
node can mount (and thus recover) the slot before the recovery thread
gets to it. If this happens, the recovery thread will block indefinitely
on the journal/slot lock as that lock will be held for the duration of
the mount (by design) by the node assigned to that slot.

The solution implemented is to keep track of the journal replays using
a recovery generation in the journal inode, which will be incremented
by the thread replaying that journal. The recovery thread, before
attempting the blocking lock on the journal/slot lock, will compare the
generation on disk with what it has cached and skip recovery if it does
not match.

This bug appears to have been inadvertently introduced during the
mount/umount vote removal by mainline commit
34d024f84345807bf44163fac84e921513dde323. In the mount voting scheme,
the messaging would indirectly indicate that the slot was being
recovered.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
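In outline, the generation check works as in the sketch below. This is a
simplified, standalone rendering of the mechanism described above, not the
ocfs2 code itself (which follows in the diff): plain integers instead of
little-endian on-disk fields, no cluster locking or block I/O, and the
names journal_inode / slot_recovery_generations only echo the patch.

/*
 * Minimal sketch of the recovery-generation check (illustrative only).
 * A node caches the generation it saw at mount time; the node that
 * actually replays a journal bumps the on-disk value. A recovery
 * thread that later finds a different value on disk knows the slot
 * was already recovered and skips the blocking journal/slot lock.
 */
#include <stdio.h>

#define MAX_SLOTS 4

struct journal_inode {
	unsigned int ij_recovery_generation;	/* simplified on-disk field */
};

/* per-mount cache, one entry per slot (mirrors osb->slot_recovery_generations) */
static unsigned int slot_recovery_generations[MAX_SLOTS];

static void bump_recovery_generation(struct journal_inode *ji)
{
	ji->ij_recovery_generation++;		/* done by the node that replays */
}

/* Return 1 if the slot still needs recovery, 0 if another node beat us to it. */
static int slot_needs_recovery(int slot, const struct journal_inode *ji)
{
	if (slot_recovery_generations[slot] != ji->ij_recovery_generation) {
		/* already recovered elsewhere: refresh the cache and skip */
		slot_recovery_generations[slot] = ji->ij_recovery_generation;
		return 0;
	}
	return 1;
}

int main(void)
{
	struct journal_inode ji = { .ij_recovery_generation = 0 };

	slot_recovery_generations[0] = 0;	/* cached when we mounted */
	bump_recovery_generation(&ji);		/* another node replays slot 0 */

	printf("slot 0 needs recovery: %d\n", slot_needs_recovery(0, &ji));
	return 0;
}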
Diffstat (limited to 'fs')
-rw-r--r--	fs/ocfs2/journal.c	173
-rw-r--r--	fs/ocfs2/journal.h	3
-rw-r--r--	fs/ocfs2/ocfs2.h	2
-rw-r--r--	fs/ocfs2/super.c	12
4 files changed, 148 insertions, 42 deletions
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..7a37240f7a31 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-				      int dirty);
+				      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 				 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -562,8 +562,18 @@ done:
 	return status;
 }
 
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+	le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+	return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-				      int dirty)
+				      int dirty, int replayed)
 {
 	int status;
 	unsigned int flags;
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
 	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
 
+	if (replayed)
+		ocfs2_bump_recovery_generation(fe);
+
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 		 * Do not toggle if flush was unsuccessful otherwise
 		 * will leave dirty metadata in a "clean" journal
 		 */
-		status = ocfs2_journal_toggle_dirty(osb, 0);
+		status = ocfs2_journal_toggle_dirty(osb, 0, 0);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
 	}
 }
 
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 {
 	int status = 0;
 	struct ocfs2_super *osb;
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 
 	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
 
-	status = ocfs2_journal_toggle_dirty(osb, 1);
+	status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
 	if (status < 0) {
 		mlog_errno(status);
 		goto done;
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 		goto bail;
 	}
 
-	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1034,6 +1047,12 @@ restart:
 	spin_unlock(&osb->osb_lock);
 	mlog(0, "All nodes recovered\n");
 
+	/* Refresh all journal recovery generations from disk */
+	status = ocfs2_check_journals_nolocks(osb);
+	status = (status == -EROFS) ? 0 : status;
+	if (status < 0)
+		mlog_errno(status);
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1115,42 @@ out:
 	mlog_exit_void();
 }
 
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+				    int slot_num,
+				    struct buffer_head **bh,
+				    struct inode **ret_inode)
+{
+	int status = -EACCES;
+	struct inode *inode = NULL;
+
+	BUG_ON(slot_num >= osb->max_slots);
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (!inode || is_bad_inode(inode)) {
+		mlog_errno(status);
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+
+bail:
+	if (inode) {
+		if (status || !ret_inode)
+			iput(inode);
+		else
+			*ret_inode = inode;
+	}
+	return status;
+}
+
 /* Does the actual journal replay and marks the journal inode as
  * clean. Will only replay if the journal inode is marked dirty. */
 static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	struct ocfs2_dinode *fe;
 	journal_t *journal = NULL;
 	struct buffer_head *bh = NULL;
+	u32 slot_reco_gen;
 
-	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
-					    slot_num);
-	if (inode == NULL) {
-		status = -EACCES;
+	status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+	if (status) {
 		mlog_errno(status);
 		goto done;
 	}
-	if (is_bad_inode(inode)) {
-		status = -EACCES;
-		iput(inode);
-		inode = NULL;
-		mlog_errno(status);
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
+	brelse(bh);
+	bh = NULL;
+
+	/*
+	 * As the fs recovery is asynchronous, there is a small chance that
+	 * another node mounted (and recovered) the slot before the recovery
+	 * thread could get the lock. To handle that, we dirty read the journal
+	 * inode for that slot to get the recovery generation. If it is
+	 * different than what we expected, the slot has been recovered.
+	 * If not, it needs recovery.
+	 */
+	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+		mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+		status = -EBUSY;
 		goto done;
 	}
-	SET_INODE_JOURNAL(inode);
+
+	/* Continue with recovery as the journal has not yet been recovered */
 
 	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	fe = (struct ocfs2_dinode *) bh->b_data;
 
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
 
 	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
 		mlog(0, "No recovery required for node %d\n", node_num);
+		/* Refresh recovery generation for the slot */
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		goto done;
 	}
 
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
 	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
 
+	/* Increment recovery generation to indicate successful recovery */
+	ocfs2_bump_recovery_generation(fe);
+	osb->slot_recovery_generations[slot_num] =
+					ocfs2_get_recovery_generation(fe);
+
 	status = ocfs2_write_block(osb, bh, inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
+		if (status == -EBUSY) {
+			mlog(0, "Skipping recovery for slot %u (node %u) "
+			     "as another node has recovered it\n", slot_num,
+			     node_num);
+			status = 0;
+			goto done;
+		}
 		mlog_errno(status);
 		goto done;
 	}
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
 	unsigned int node_num;
 	int status, i;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *di;
 
 	/* This is called with the super block cluster lock, so we
 	 * know that the slot map can't change underneath us. */
 
 	spin_lock(&osb->osb_lock);
 	for (i = 0; i < osb->max_slots; i++) {
+		/* Read journal inode to get the recovery generation */
+		status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		osb->slot_recovery_generations[i] =
+					ocfs2_get_recovery_generation(di);
+		brelse(bh);
+		bh = NULL;
+
+		mlog(0, "Slot %u recovery generation is %u\n", i,
+		     osb->slot_recovery_generations[i]);
+
 		if (i == osb->slot_num)
 			continue;
 
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg)
 	return 0;
 }
 
-/* Look for a dirty journal without taking any cluster locks. Used for
- * hard readonly access to determine whether the file system journals
- * require recovery. */
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
 {
 	int ret = 0;
 	unsigned int slot;
-	struct buffer_head *di_bh;
+	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	struct inode *journal = NULL;
+	int journal_dirty = 0;
 
 	for(slot = 0; slot < osb->max_slots; slot++) {
-		journal = ocfs2_get_system_file_inode(osb,
-						      JOURNAL_SYSTEM_INODE,
-						      slot);
-		if (!journal || is_bad_inode(journal)) {
-			ret = -EACCES;
-			mlog_errno(ret);
-			goto out;
-		}
-
-		di_bh = NULL;
-		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
-				       0, journal);
-		if (ret < 0) {
+		ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
 		di = (struct ocfs2_dinode *) di_bh->b_data;
 
+		osb->slot_recovery_generations[slot] =
+			ocfs2_get_recovery_generation(di);
+
 		if (le32_to_cpu(di->id1.journal1.ij_flags) &
 		    OCFS2_JOURNAL_DIRTY_FL)
-			ret = -EROFS;
+			journal_dirty = 1;
 
 		brelse(di_bh);
-		if (ret)
-			break;
+		di_bh = NULL;
 	}
 
 out:
-	if (journal)
-		iput(journal);
-
+	if (journal_dirty)
+		ret = -EROFS;
 	return ret;
 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532ed..2178ebffa05f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -161,7 +161,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal,
 void ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int ocfs2_journal_wipe(struct ocfs2_journal *journal,
 		       int full);
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+		       int replayed);
 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
 void ocfs2_recovery_thread(struct ocfs2_super *osb,
 			   int node_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1cb814be8ef1..7f625f2b1117 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -204,6 +204,8 @@ struct ocfs2_super
 
 	struct ocfs2_slot_info *slot_info;
 
+	u32 *slot_recovery_generations;
+
 	spinlock_t node_map_lock;
 
 	u64 root_blkno;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2560b33889aa..88255d3f52b4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
 
+	osb->slot_recovery_generations =
+		kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+			GFP_KERNEL);
+	if (!osb->slot_recovery_generations) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	init_waitqueue_head(&osb->osb_wipe_event);
 	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
 					sizeof(*osb->osb_orphan_wipes),
@@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	local = ocfs2_mount_local(osb);
 
 	/* will play back anything left in the journal. */
-	status = ocfs2_journal_load(osb->journal, local);
+	status = ocfs2_journal_load(osb->journal, local, dirty);
 	if (status < 0) {
 		mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
 		goto finally;
@@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	ocfs2_free_slot_info(osb);
 
 	kfree(osb->osb_orphan_wipes);
+	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs2_initalize_osb(),