4 files changed, 148 insertions, 42 deletions
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..7a37240f7a31 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-                                      int dirty);
+                                      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
                                 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -562,8 +562,18 @@ done:
        return status;
 }
+/* Advance by one the little-endian recovery generation kept in a journal dinode. */
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+        __le32 *gen = &di->id1.journal1.ij_recovery_generation;
+        le32_add_cpu(gen, 1);
+}
+/* Return a journal dinode's recovery generation converted to CPU byte order. */
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+        u32 generation = le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+        return generation;
+}
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-                                      int dirty)
+                                      int dirty, int replayed)
 {
        int status;
        unsigned int flags;
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+        if (replayed)
+                ocfs2_bump_recovery_generation(fe);
        status = ocfs2_write_block(osb, bh, journal->j_inode);
        if (status < 0)
                mlog_errno(status);
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
                 * Do not toggle if flush was unsuccessful otherwise
                 * will leave dirty metadata in a "clean" journal
                 */
-                status = ocfs2_journal_toggle_dirty(osb, 0);
+                status = ocfs2_journal_toggle_dirty(osb, 0, 0);
                if (status < 0)
                        mlog_errno(status);
        }
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
        }
 }
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 {
        int status = 0;
        struct ocfs2_super *osb;
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
        ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
-        status = ocfs2_journal_toggle_dirty(osb, 1);
+        status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
        if (status < 0) {
                mlog_errno(status);
                goto done;
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
                goto bail;
        }
-        status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+        status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
        if (status < 0)
                mlog_errno(status);
@@ -1034,6 +1047,12 @@ restart:
        spin_unlock(&osb->osb_lock);
        mlog(0, "All nodes recovered\n");
+        /* Refresh all journal recovery generations from disk */
+        status = ocfs2_check_journals_nolocks(osb);
+        status = (status == -EROFS) ? 0 : status;
+        if (status < 0)
+                mlog_errno(status);
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1115,42 @@ out:
        mlog_exit_void();
 }
+/*
+ * Look up the journal system inode of @slot_num and read its dinode block
+ * into *@bh.
+ *
+ * Returns 0 on success; the callers in this file brelse() *@bh when done.
+ * If @ret_inode is non-NULL the inode reference is handed to the caller
+ * through it, otherwise the reference is dropped here.
+ *
+ * Returns -EACCES when the inode is missing or bad, or the negative status
+ * of ocfs2_read_block(); no inode reference is kept on failure and
+ * *@ret_inode is left untouched.
+ */
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+                                    int slot_num,
+                                    struct buffer_head **bh,
+                                    struct inode **ret_inode)
+{
+        int status;
+        struct inode *inode;
+        BUG_ON(slot_num >= osb->max_slots);
+        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+                                            slot_num);
+        if (!inode) {
+                /* Nothing to release: the lookup gave us no reference */
+                status = -EACCES;
+                mlog_errno(status);
+                return status;
+        }
+        if (is_bad_inode(inode)) {
+                status = -EACCES;
+                mlog_errno(status);
+                goto drop_inode;
+        }
+        SET_INODE_JOURNAL(inode);
+        status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+        if (status < 0) {
+                mlog_errno(status);
+                goto drop_inode;
+        }
+        if (ret_inode) {
+                /* Success and the caller wants the inode: pass the reference on */
+                *ret_inode = inode;
+                return 0;
+        }
+        status = 0;
+drop_inode:
+        iput(inode);
+        return status;
+}
 /* Does the actual journal replay and marks the journal inode as
 * clean. Will only replay if the journal inode is marked dirty. */
 static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        struct ocfs2_dinode *fe;
        journal_t *journal = NULL;
        struct buffer_head *bh = NULL;
+        u32 slot_reco_gen;
-        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+        status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
-                                            slot_num);
+        if (status) {
-        if (inode == NULL) {
-                status = -EACCES;
                mlog_errno(status);
                goto done;
        }
-        if (is_bad_inode(inode)) {
-                status = -EACCES;
+        fe = (struct ocfs2_dinode *)bh->b_data;
-                iput(inode);
+        slot_reco_gen = ocfs2_get_recovery_generation(fe);
-                inode = NULL;
+        brelse(bh);
-                mlog_errno(status);
+        bh = NULL;
+        /*
+         * As the fs recovery is asynchronous, there is a small chance that
+         * another node mounted (and recovered) the slot before the recovery
+         * thread could get the lock. To handle that, we dirty read the journal
+         * inode for that slot to get the recovery generation. If it is
+         * different than what we expected, the slot has been recovered.
+         * If not, it needs recovery.
+         */
+        if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+                mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+                     osb->slot_recovery_generations[slot_num], slot_reco_gen);
+                osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+                status = -EBUSY;
                goto done;
        }
-        SET_INODE_JOURNAL(inode);
+        /* Continue with recovery as the journal has not yet been recovered */
        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        fe = (struct ocfs2_dinode *) bh->b_data;
        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+        slot_reco_gen = ocfs2_get_recovery_generation(fe);
        if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
                mlog(0, "No recovery required for node %d\n", node_num);
+                /* Refresh recovery generation for the slot */
+                osb->slot_recovery_generations[slot_num] = slot_reco_gen;
                goto done;
        }
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+        /* Increment recovery generation to indicate successful recovery */
+        ocfs2_bump_recovery_generation(fe);
+        osb->slot_recovery_generations[slot_num] =
+                                        ocfs2_get_recovery_generation(fe);
        status = ocfs2_write_block(osb, bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
        status = ocfs2_replay_journal(osb, node_num, slot_num);
        if (status < 0) {
+                if (status == -EBUSY) {
+                        mlog(0, "Skipping recovery for slot %u (node %u) "
+                             "as another node has recovered it\n", slot_num,
+                             node_num);
+                        status = 0;
+                        goto done;
+                }
                mlog_errno(status);
                goto done;
        }
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
        unsigned int node_num;
        int status, i;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_dinode *di;
        /* This is called with the super block cluster lock, so we
         * know that the slot map can't change underneath us. */
        spin_lock(&osb->osb_lock);
        for (i = 0; i < osb->max_slots; i++) {
+                /* Read journal inode to get the recovery generation */
+                status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                di = (struct ocfs2_dinode *)bh->b_data;
+                osb->slot_recovery_generations[i] =
+                                        ocfs2_get_recovery_generation(di);
+                brelse(bh);
+                bh = NULL;
+                mlog(0, "Slot %u recovery generation is %u\n", i,
+                     osb->slot_recovery_generations[i]);
                if (i == osb->slot_num)
                        continue;
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg)
        return 0;
 }
-/* Look for a dirty journal without taking any cluster locks. Used for
+/* Reads all the journal inodes without taking any cluster locks. Used
- * hard readonly access to determine whether the file system journals
+ * for hard readonly access to determine whether any journal requires
- * require recovery. */
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
 {
        int ret = 0;
        unsigned int slot;
-        struct buffer_head *di_bh;
+        struct buffer_head *di_bh = NULL;
        struct ocfs2_dinode *di;
-        struct inode *journal = NULL;
+        int journal_dirty = 0; /* set once any slot's journal has OCFS2_JOURNAL_DIRTY_FL */
        for(slot = 0; slot < osb->max_slots; slot++) {
-                journal = ocfs2_get_system_file_inode(osb,
+                ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); /* NULL ret_inode: the helper drops its own inode reference */
-                                                      JOURNAL_SYSTEM_INODE,
+                if (ret) {
-                                                      slot);
-                if (!journal || is_bad_inode(journal)) {
-                        ret = -EACCES;
-                        mlog_errno(ret);
-                        goto out;
-                }
-                di_bh = NULL;
-                ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
-                                       0, journal);
-                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }
                di = (struct ocfs2_dinode *) di_bh->b_data;
+                osb->slot_recovery_generations[slot] =
+                                        ocfs2_get_recovery_generation(di); /* cache this slot's on-disk generation */
                if (le32_to_cpu(di->id1.journal1.ij_flags) &
                    OCFS2_JOURNAL_DIRTY_FL)
-                        ret = -EROFS;
+                        journal_dirty = 1; /* remember it, but keep scanning the remaining slots */
                brelse(di_bh);
-                if (ret)
+                di_bh = NULL;
-                        break;
        }
 out:
-        if (journal)
+        if (journal_dirty)
-                iput(journal);
+                ret = -EROFS; /* a dirty journal overrides any read error already stored in ret */
        return ret;
 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532ed..2178ebffa05f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -161,7 +161,8 @@ int    ocfs2_journal_init(struct ocfs2_journal *journal,
 void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
                          int full);
-int    ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int    ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+                          int replayed);
 int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
 void   ocfs2_recovery_thread(struct ocfs2_super *osb,
                             int node_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1cb814be8ef1..7f625f2b1117 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -204,6 +204,8 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
+        u32 *slot_recovery_generations;
        spinlock_t node_map_lock;
        u64 root_blkno;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2560b33889aa..88255d3f52b4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+        osb->slot_recovery_generations =
+                kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+                        GFP_KERNEL);
+        if (!osb->slot_recovery_generations) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
        init_waitqueue_head(&osb->osb_wipe_event);
        osb->osb_orphan_wipes = kcalloc(osb->max_slots,
                                        sizeof(*osb->osb_orphan_wipes),
@@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
        local = ocfs2_mount_local(osb);
        /* will play back anything left in the journal. */
-        status = ocfs2_journal_load(osb->journal, local);
+        status = ocfs2_journal_load(osb->journal, local, dirty);
        if (status < 0) {
                mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
                goto finally;
@@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
        ocfs2_free_slot_info(osb);
        kfree(osb->osb_orphan_wipes);
+        kfree(osb->slot_recovery_generations);
        /* FIXME
         * This belongs in journal shutdown, but because we have to
         * allocate osb->journal at the start of ocfs2_initalize_osb(),

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a8c19cb3cfdd..7a37240f7a31 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
57	static int ocfs2_commit_cache(struct ocfs2_super *osb);	57	static int ocfs2_commit_cache(struct ocfs2_super *osb);
58	static int ocfs2_wait_on_mount(struct ocfs2_super *osb);	58	static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
59	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,	59	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
60	int dirty);	60	int dirty, int replayed);
61	static int ocfs2_trylock_journal(struct ocfs2_super *osb,	61	static int ocfs2_trylock_journal(struct ocfs2_super *osb,
62	int slot_num);	62	int slot_num);
63	static int ocfs2_recover_orphans(struct ocfs2_super *osb,	63	static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -562,8 +562,18 @@ done:
562	return status;	562	return status;
563	}	563	}
564		564
		565	static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
		566	{
		567	le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
		568	}
		569
		570	static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
		571	{
		572	return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
		573	}
		574
565	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,	575	static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
566	int dirty)	576	int dirty, int replayed)
567	{	577	{
568	int status;	578	int status;
569	unsigned int flags;	579	unsigned int flags;
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
593	flags &= ~OCFS2_JOURNAL_DIRTY_FL;	603	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
594	fe->id1.journal1.ij_flags = cpu_to_le32(flags);	604	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
595		605
		606	if (replayed)
		607	ocfs2_bump_recovery_generation(fe);
		608
596	status = ocfs2_write_block(osb, bh, journal->j_inode);	609	status = ocfs2_write_block(osb, bh, journal->j_inode);
597	if (status < 0)	610	if (status < 0)
598	mlog_errno(status);	611	mlog_errno(status);
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
667	* Do not toggle if flush was unsuccessful otherwise	680	* Do not toggle if flush was unsuccessful otherwise
668	* will leave dirty metadata in a "clean" journal	681	* will leave dirty metadata in a "clean" journal
669	*/	682	*/
670	status = ocfs2_journal_toggle_dirty(osb, 0);	683	status = ocfs2_journal_toggle_dirty(osb, 0, 0);
671	if (status < 0)	684	if (status < 0)
672	mlog_errno(status);	685	mlog_errno(status);
673	}	686	}
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
710	}	723	}
711	}	724	}
712		725
713	int ocfs2_journal_load(struct ocfs2_journal *journal, int local)	726	int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
714	{	727	{
715	int status = 0;	728	int status = 0;
716	struct ocfs2_super *osb;	729	struct ocfs2_super *osb;
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
729		742
730	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);	743	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
731		744
732	status = ocfs2_journal_toggle_dirty(osb, 1);	745	status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
733	if (status < 0) {	746	if (status < 0) {
734	mlog_errno(status);	747	mlog_errno(status);
735	goto done;	748	goto done;
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
771	goto bail;	784	goto bail;
772	}	785	}
773		786
774	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);	787	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
775	if (status < 0)	788	if (status < 0)
776	mlog_errno(status);	789	mlog_errno(status);
777		790
@@ -1034,6 +1047,12 @@ restart:
1034	spin_unlock(&osb->osb_lock);	1047	spin_unlock(&osb->osb_lock);
1035	mlog(0, "All nodes recovered\n");	1048	mlog(0, "All nodes recovered\n");
1036		1049
		1050	/* Refresh all journal recovery generations from disk */
		1051	status = ocfs2_check_journals_nolocks(osb);
		1052	status = (status == -EROFS) ? 0 : status;
		1053	if (status < 0)
		1054	mlog_errno(status);
		1055
1037	ocfs2_super_unlock(osb, 1);	1056	ocfs2_super_unlock(osb, 1);
1038		1057
1039	/* We always run recovery on our own orphan dir - the dead	1058	/* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1115,42 @@ out:
1096	mlog_exit_void();	1115	mlog_exit_void();
1097	}	1116	}
1098		1117
		1118	static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
		1119	int slot_num,
		1120	struct buffer_head **bh,
		1121	struct inode **ret_inode)
		1122	{
		1123	int status = -EACCES;
		1124	struct inode *inode = NULL;
		1125
		1126	BUG_ON(slot_num >= osb->max_slots);
		1127
		1128	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
		1129	slot_num);
		1130	if (!inode || is_bad_inode(inode)) {
		1131	mlog_errno(status);
		1132	goto bail;
		1133	}
		1134	SET_INODE_JOURNAL(inode);
		1135
		1136	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
		1137	if (status < 0) {
		1138	mlog_errno(status);
		1139	goto bail;
		1140	}
		1141
		1142	status = 0;
		1143
		1144	bail:
		1145	if (inode) {
		1146	if (status || !ret_inode)
		1147	iput(inode);
		1148	else
		1149	*ret_inode = inode;
		1150	}
		1151	return status;
		1152	}
		1153
1099	/* Does the actual journal replay and marks the journal inode as	1154	/* Does the actual journal replay and marks the journal inode as
1100	* clean. Will only replay if the journal inode is marked dirty. */	1155	* clean. Will only replay if the journal inode is marked dirty. */
1101	static int ocfs2_replay_journal(struct ocfs2_super *osb,	1156	static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1109	struct ocfs2_dinode *fe;	1164	struct ocfs2_dinode *fe;
1110	journal_t *journal = NULL;	1165	journal_t *journal = NULL;
1111	struct buffer_head *bh = NULL;	1166	struct buffer_head *bh = NULL;
		1167	u32 slot_reco_gen;
1112		1168
1113	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,	1169	status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
1114	slot_num);	1170	if (status) {
1115	if (inode == NULL) {
1116	status = -EACCES;
1117	mlog_errno(status);	1171	mlog_errno(status);
1118	goto done;	1172	goto done;
1119	}	1173	}
1120	if (is_bad_inode(inode)) {	1174
1121	status = -EACCES;	1175	fe = (struct ocfs2_dinode *)bh->b_data;
1122	iput(inode);	1176	slot_reco_gen = ocfs2_get_recovery_generation(fe);
1123	inode = NULL;	1177	brelse(bh);
1124	mlog_errno(status);	1178	bh = NULL;
		1179
		1180	/*
		1181	* As the fs recovery is asynchronous, there is a small chance that
		1182	* another node mounted (and recovered) the slot before the recovery
		1183	* thread could get the lock. To handle that, we dirty read the journal
		1184	* inode for that slot to get the recovery generation. If it is
		1185	* different than what we expected, the slot has been recovered.
		1186	* If not, it needs recovery.
		1187	*/
		1188	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
		1189	mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
		1190	osb->slot_recovery_generations[slot_num], slot_reco_gen);
		1191	osb->slot_recovery_generations[slot_num] = slot_reco_gen;
		1192	status = -EBUSY;
1125	goto done;	1193	goto done;
1126	}	1194	}
1127	SET_INODE_JOURNAL(inode);	1195
		1196	/* Continue with recovery as the journal has not yet been recovered */
1128		1197
1129	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);	1198	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
1130	if (status < 0) {	1199	if (status < 0) {
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1138	fe = (struct ocfs2_dinode *) bh->b_data;	1207	fe = (struct ocfs2_dinode *) bh->b_data;
1139		1208
1140	flags = le32_to_cpu(fe->id1.journal1.ij_flags);	1209	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
		1210	slot_reco_gen = ocfs2_get_recovery_generation(fe);
1141		1211
1142	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {	1212	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1143	mlog(0, "No recovery required for node %d\n", node_num);	1213	mlog(0, "No recovery required for node %d\n", node_num);
		1214	/* Refresh recovery generation for the slot */
		1215	osb->slot_recovery_generations[slot_num] = slot_reco_gen;
1144	goto done;	1216	goto done;
1145	}	1217	}
1146		1218
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1188	flags &= ~OCFS2_JOURNAL_DIRTY_FL;	1260	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1189	fe->id1.journal1.ij_flags = cpu_to_le32(flags);	1261	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1190		1262
		1263	/* Increment recovery generation to indicate successful recovery */
		1264	ocfs2_bump_recovery_generation(fe);
		1265	osb->slot_recovery_generations[slot_num] =
		1266	ocfs2_get_recovery_generation(fe);
		1267
1191	status = ocfs2_write_block(osb, bh, inode);	1268	status = ocfs2_write_block(osb, bh, inode);
1192	if (status < 0)	1269	if (status < 0)
1193	mlog_errno(status);	1270	mlog_errno(status);
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1252		1329
1253	status = ocfs2_replay_journal(osb, node_num, slot_num);	1330	status = ocfs2_replay_journal(osb, node_num, slot_num);
1254	if (status < 0) {	1331	if (status < 0) {
		1332	if (status == -EBUSY) {
		1333	mlog(0, "Skipping recovery for slot %u (node %u) "
		1334	"as another node has recovered it\n", slot_num,
		1335	node_num);
		1336	status = 0;
		1337	goto done;
		1338	}
1255	mlog_errno(status);	1339	mlog_errno(status);
1256	goto done;	1340	goto done;
1257	}	1341	}
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1334	{	1418	{
1335	unsigned int node_num;	1419	unsigned int node_num;
1336	int status, i;	1420	int status, i;
		1421	struct buffer_head *bh = NULL;
		1422	struct ocfs2_dinode *di;
1337		1423
1338	/* This is called with the super block cluster lock, so we	1424	/* This is called with the super block cluster lock, so we
1339	* know that the slot map can't change underneath us. */	1425	* know that the slot map can't change underneath us. */
1340		1426
1341	spin_lock(&osb->osb_lock);	1427	spin_lock(&osb->osb_lock);
1342	for (i = 0; i < osb->max_slots; i++) {	1428	for (i = 0; i < osb->max_slots; i++) {
		1429	/* Read journal inode to get the recovery generation */
		1430	status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
		1431	if (status) {
		1432	mlog_errno(status);
		1433	goto bail;
		1434	}
		1435	di = (struct ocfs2_dinode *)bh->b_data;
		1436	osb->slot_recovery_generations[i] =
		1437	ocfs2_get_recovery_generation(di);
		1438	brelse(bh);
		1439	bh = NULL;
		1440
		1441	mlog(0, "Slot %u recovery generation is %u\n", i,
		1442	osb->slot_recovery_generations[i]);
		1443
1343	if (i == osb->slot_num)	1444	if (i == osb->slot_num)
1344	continue;	1445	continue;
1345		1446
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg)
1603	return 0;	1704	return 0;
1604	}	1705	}
1605		1706
1606	/* Look for a dirty journal without taking any cluster locks. Used for	1707	/* Reads all the journal inodes without taking any cluster locks. Used
1607	* hard readonly access to determine whether the file system journals	1708	* for hard readonly access to determine whether any journal requires
1608	* require recovery. */	1709	* recovery. Also used to refresh the recovery generation numbers after
		1710	* a journal has been recovered by another node.
		1711	*/
1609	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)	1712	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1610	{	1713	{
1611	int ret = 0;	1714	int ret = 0;
1612	unsigned int slot;	1715	unsigned int slot;
1613	struct buffer_head *di_bh;	1716	struct buffer_head *di_bh = NULL;
1614	struct ocfs2_dinode *di;	1717	struct ocfs2_dinode *di;
1615	struct inode *journal = NULL;	1718	int journal_dirty = 0;
1616		1719
1617	for(slot = 0; slot < osb->max_slots; slot++) {	1720	for(slot = 0; slot < osb->max_slots; slot++) {
1618	journal = ocfs2_get_system_file_inode(osb,	1721	ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
1619	JOURNAL_SYSTEM_INODE,	1722	if (ret) {
1620	slot);
1621	if (!journal || is_bad_inode(journal)) {
1622	ret = -EACCES;
1623	mlog_errno(ret);
1624	goto out;
1625	}
1626
1627	di_bh = NULL;
1628	ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1629	0, journal);
1630	if (ret < 0) {
1631	mlog_errno(ret);	1723	mlog_errno(ret);
1632	goto out;	1724	goto out;
1633	}	1725	}
1634		1726
1635	di = (struct ocfs2_dinode *) di_bh->b_data;	1727	di = (struct ocfs2_dinode *) di_bh->b_data;
1636		1728
		1729	osb->slot_recovery_generations[slot] =
		1730	ocfs2_get_recovery_generation(di);
		1731
1637	if (le32_to_cpu(di->id1.journal1.ij_flags) &	1732	if (le32_to_cpu(di->id1.journal1.ij_flags) &
1638	OCFS2_JOURNAL_DIRTY_FL)	1733	OCFS2_JOURNAL_DIRTY_FL)
1639	ret = -EROFS;	1734	journal_dirty = 1;
1640		1735
1641	brelse(di_bh);	1736	brelse(di_bh);
1642	if (ret)	1737	di_bh = NULL;
1643	break;
1644	}	1738	}
1645		1739
1646	out:	1740	out:
1647	if (journal)	1741	if (journal_dirty)
1648	iput(journal);	1742	ret = -EROFS;
1649
1650	return ret;	1743	return ret;
1651	}	1744	}


diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index db82be2532ed..2178ebffa05f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h
@@ -161,7 +161,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal,
161	void ocfs2_journal_shutdown(struct ocfs2_super *osb);	161	void ocfs2_journal_shutdown(struct ocfs2_super *osb);
162	int ocfs2_journal_wipe(struct ocfs2_journal *journal,	162	int ocfs2_journal_wipe(struct ocfs2_journal *journal,
163	int full);	163	int full);
164	int ocfs2_journal_load(struct ocfs2_journal *journal, int local);	164	int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
		165	int replayed);
165	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);	166	int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
166	void ocfs2_recovery_thread(struct ocfs2_super *osb,	167	void ocfs2_recovery_thread(struct ocfs2_super *osb,
167	int node_num);	168	int node_num);


diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1cb814be8ef1..7f625f2b1117 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h
@@ -204,6 +204,8 @@ struct ocfs2_super
204		204
205	struct ocfs2_slot_info *slot_info;	205	struct ocfs2_slot_info *slot_info;
206		206
		207	u32 *slot_recovery_generations;
		208
207	spinlock_t node_map_lock;	209	spinlock_t node_map_lock;
208		210
209	u64 root_blkno;	211	u64 root_blkno;


diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2560b33889aa..88255d3f52b4 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c
@@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
1442	}	1442	}
1443	mlog(0, "max_slots for this device: %u\n", osb->max_slots);	1443	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
1444		1444
		1445	osb->slot_recovery_generations =
		1446	kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
		1447	GFP_KERNEL);
		1448	if (!osb->slot_recovery_generations) {
		1449	status = -ENOMEM;
		1450	mlog_errno(status);
		1451	goto bail;
		1452	}
		1453
1445	init_waitqueue_head(&osb->osb_wipe_event);	1454	init_waitqueue_head(&osb->osb_wipe_event);
1446	osb->osb_orphan_wipes = kcalloc(osb->max_slots,	1455	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
1447	sizeof(*osb->osb_orphan_wipes),	1456	sizeof(*osb->osb_orphan_wipes),
@@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
1703	local = ocfs2_mount_local(osb);	1712	local = ocfs2_mount_local(osb);
1704		1713
1705	/* will play back anything left in the journal. */	1714	/* will play back anything left in the journal. */
1706	status = ocfs2_journal_load(osb->journal, local);	1715	status = ocfs2_journal_load(osb->journal, local, dirty);
1707	if (status < 0) {	1716	if (status < 0) {
1708	mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);	1717	mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
1709	goto finally;	1718	goto finally;
@@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
1768	ocfs2_free_slot_info(osb);	1777	ocfs2_free_slot_info(osb);
1769		1778
1770	kfree(osb->osb_orphan_wipes);	1779	kfree(osb->osb_orphan_wipes);
		1780	kfree(osb->slot_recovery_generations);
1771	/* FIXME	1781	/* FIXME
1772	* This belongs in journal shutdown, but because we have to	1782	* This belongs in journal shutdown, but because we have to
1773	* allocate osb->journal at the start of ocfs2_initalize_osb(),	1783	* allocate osb->journal at the start of ocfs2_initalize_osb(),