Diffstat (limited to 'fs/ocfs2/journal.c')
-rw-r--r--  fs/ocfs2/journal.c  181
1 file changed, 166 insertions, 15 deletions
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ed0c6d0850d7..ca4c0ea5a4cd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
 
+
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	int rm_used;
+	unsigned int *rm_entries;
+};
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	mutex_init(&osb->recovery_lock);
+	osb->disable_recovery = 0;
+	osb->recovery_thread_task = NULL;
+	init_waitqueue_head(&osb->recovery_event);
+
+	rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+		     osb->max_slots * sizeof(unsigned int),
+		     GFP_KERNEL);
+	if (!rm) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	rm->rm_entries = (unsigned int *)((char *)rm +
+					  sizeof(struct ocfs2_recovery_map));
+	osb->recovery_map = rm;
+
+	return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+	mb();
+	return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	/* disable any new recovery threads and wait for any currently
+	 * running ones to exit. Do this before setting the vol_state. */
+	mutex_lock(&osb->recovery_lock);
+	osb->disable_recovery = 1;
+	mutex_unlock(&osb->recovery_lock);
+	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+	/* At this point, we know that no more recovery threads can be
+	 * launched, so wait for any recovery completion work to
+	 * complete. */
+	flush_workqueue(ocfs2_wq);
+
+	/*
+	 * Now that recovery is shut down, and the osb is about to be
+	 * freed, the osb_lock is not taken here.
+	 */
+	rm = osb->recovery_map;
+	/* XXX: Should we bug if there are dirty entries? */
+
+	kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	assert_spin_locked(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Behaves like test-and-set. Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+				  unsigned int node_num)
+{
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	if (__ocfs2_recovery_map_test(osb, node_num)) {
+		spin_unlock(&osb->osb_lock);
+		return 1;
+	}
+
+	/* XXX: Can this be exploited? Not from o2dlm... */
+	BUG_ON(rm->rm_used >= osb->max_slots);
+
+	rm->rm_entries[rm->rm_used] = node_num;
+	rm->rm_used++;
+	spin_unlock(&osb->osb_lock);
+
+	return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			break;
+	}
+
+	if (i < rm->rm_used) {
+		/* XXX: be careful with the pointer math */
+		memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+			(rm->rm_used - i - 1) * sizeof(unsigned int));
+		rm->rm_used--;
+	}
+
+	spin_unlock(&osb->osb_lock);
+}
+
 static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
 	int status = 0;
@@ -650,6 +781,23 @@ bail:
 	return status;
 }
 
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+	int empty;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	empty = (rm->rm_used == 0);
+	spin_unlock(&osb->osb_lock);
+
+	return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+	wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
 /*
  * JBD Might read a cached version of another nodes journal file. We
  * don't want this as this file changes often and we get no
@@ -848,6 +996,7 @@ static int __ocfs2_recovery_thread(void *arg)
 {
 	int status, node_num;
 	struct ocfs2_super *osb = arg;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
 
 	mlog_entry_void();
 
@@ -863,26 +1012,29 @@ restart:
 		goto bail;
 	}
 
-	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
-		node_num = ocfs2_node_map_first_set_bit(osb,
-						&osb->recovery_map);
-		if (node_num == O2NM_INVALID_NODE_NUM) {
-			mlog(0, "Out of nodes to recover.\n");
-			break;
-		}
+	spin_lock(&osb->osb_lock);
+	while (rm->rm_used) {
+		/* It's always safe to remove entry zero, as we won't
+		 * clear it until ocfs2_recover_node() has succeeded. */
+		node_num = rm->rm_entries[0];
+		spin_unlock(&osb->osb_lock);
 
 		status = ocfs2_recover_node(osb, node_num);
-		if (status < 0) {
+		if (!status) {
+			ocfs2_recovery_map_clear(osb, node_num);
+		} else {
 			mlog(ML_ERROR,
 			     "Error %d recovering node %d on device (%u,%u)!\n",
 			     status, node_num,
 			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 			mlog(ML_ERROR, "Volume requires unmount.\n");
-			continue;
 		}
 
-		ocfs2_recovery_map_clear(osb, node_num);
+		spin_lock(&osb->osb_lock);
 	}
+	spin_unlock(&osb->osb_lock);
+	mlog(0, "All nodes recovered\n");
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1045,7 @@ restart:
 
 bail:
 	mutex_lock(&osb->recovery_lock);
-	if (!status &&
-	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+	if (!status && !ocfs2_recovery_completed(osb)) {
 		mutex_unlock(&osb->recovery_lock);
 		goto restart;
 	}
@@ -924,8 +1075,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 
 	/* People waiting on recovery will wait on
 	 * the recovery map to empty. */
-	if (!ocfs2_recovery_map_set(osb, node_num))
-		mlog(0, "node %d already be in recovery.\n", node_num);
+	if (ocfs2_recovery_map_set(osb, node_num))
+		mlog(0, "node %d already in recovery map.\n", node_num);
 
 	mlog(0, "starting recovery thread...\n");
 
@@ -1197,7 +1348,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 		if (status == -ENOENT)
 			continue;
 
-		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+		if (__ocfs2_recovery_map_test(osb, node_num))
 			continue;
 		spin_unlock(&osb->osb_lock);
 