[PATCH] ocfs2: dlm recovery fixes

When starting lock mastery (excepting the recovery lock), wait on any nodes needing recovery. Fix one instance where lock resources were left attached to the recovery list after recovery completed. Ensure that the node_down code is run uniformly regardless of which node found the dead node first.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
author: Kurt Hackel <kurt.hackel@oracle.com> 2006-03-06 17:08:49 -0500
committer: Mark Fasheh <mark.fasheh@oracle.com> 2006-03-24 17:58:25 -0500
commit: c03872f5f50bc10f2a1a485f08879a8d01bcfe49 (patch)
tree: 9ac370cf1a7c015522af75af3f60e9d6c4425bbc /fs/ocfs2/dlm/dlmmaster.c
parent: 9c6510a5bfe2f1c5f5b93386c06954be02e974e4 (diff)
1 files changed, 103 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 78ac3a00eb54..940be4c13b1f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
                                       struct dlm_lock_resource *res,
                                       u8 target);
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                       struct dlm_lock_resource *res);
 int dlm_is_host_down(int errno)
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
        struct dlm_node_iter iter;
        unsigned int namelen;
        int tries = 0;
+        int bit, wait_on_recovery = 0;
        BUG_ON(!lockid);
@@ -762,6 +765,18 @@ lookup:
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
                list_add(&mle->list, &dlm->master_list);
+                /* still holding the dlm spinlock, check the recovery map
+                 * to see if there are any nodes that still need to be 
+                 * considered.  these will not appear in the mle nodemap
+                 * but they might own this lockres.  wait on them. */
+                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+                if (bit < O2NM_MAX_NODES) {
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                             "recover before lock mastery can begin\n",
+                             dlm->name, namelen, (char *)lockid, bit);
+                        wait_on_recovery = 1;
+                }
        }
        /* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +794,39 @@ lookup:
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
+        while (wait_on_recovery) {
+                /* any cluster changes that occurred after dropping the
+                 * dlm spinlock would be detectable by a change on the mle,
+                 * so we only need to clear out the recovery map once. */
+                if (dlm_is_recovery_lock(lockid, namelen)) {
+                        mlog(ML_NOTICE, "%s: recovery map is not empty, but "
+                             "must master $RECOVERY lock now\n", dlm->name);
+                        if (!dlm_pre_master_reco_lockres(dlm, res))
+                                wait_on_recovery = 0;
+                        else {
+                                mlog(0, "%s: waiting 500ms for heartbeat state "
+                                    "change\n", dlm->name);
+                                msleep(500);
+                        }
+                        continue;
+                } 
+                dlm_kick_recovery_thread(dlm);
+                msleep(100);
+                dlm_wait_for_recovery(dlm);
+                spin_lock(&dlm->spinlock);
+                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+                if (bit < O2NM_MAX_NODES) {
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                             "recover before lock mastery can begin\n",
+                             dlm->name, namelen, (char *)lockid, bit);
+                        wait_on_recovery = 1;
+                } else
+                        wait_on_recovery = 0;
+                spin_unlock(&dlm->spinlock);
+        }
        /* must wait for lock to be mastered elsewhere */
        if (blocked)
                goto wait;
@@ -1835,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
        mlog(0, "finished with dlm_assert_master_worker\n");
 }
+/*
+ * SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
+ *
+ * We cannot wait for node recovery to complete before mastering this
+ * lockres, because this lockres is what is used to kick off recovery.
+ * So, do a pre-check on every other node in the domain map to see if any
+ * of them thinks that $RECOVERY is currently mastered by a dead node
+ * (one still set in our recovery map).  If so, the caller waits a short
+ * time to allow that node to get notified by its own heartbeat stack,
+ * then checks again.  All $RECOVERY lock resources mastered by dead
+ * nodes are purged when the heartbeat callback is fired, so we can know
+ * for sure that it is safe to continue once the node returns a live node
+ * or no node.
+ *
+ * The scan stops at the first node that reports a known master.
+ *
+ * Returns 0 if mastery of $RECOVERY may proceed, or -EAGAIN if a queried
+ * node still names a master that is in the recovery map and the caller
+ * should retry later.  A requery failure other than "host is down" is
+ * treated as fatal (BUG).
+ */
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                       struct dlm_lock_resource *res)
+{
+        struct dlm_node_iter iter;
+        int nodenum;
+        int ret = 0;
+        u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
+        /* snapshot the domain map under the lock; iterate it unlocked */
+        spin_lock(&dlm->spinlock);
+        dlm_node_iter_init(dlm->domain_map, &iter);
+        spin_unlock(&dlm->spinlock);
+        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+                /* do not send to self */
+                if (nodenum == dlm->node_num)
+                        continue;
+                ret = dlm_do_master_requery(dlm, res, nodenum, &master);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        if (!dlm_is_host_down(ret))
+                                BUG();
+                        /* host is down, so answer for that node would be
+                         * DLM_LOCK_RES_OWNER_UNKNOWN.  clear the error so
+                         * a dead peer is not reported to the caller as a
+                         * reason to wait, and continue. */
+                        ret = 0;
+                }
+                if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+                        /* check to see if this master is in the recovery map */
+                        spin_lock(&dlm->spinlock);
+                        if (test_bit(master, dlm->recovery_map)) {
+                                mlog(ML_NOTICE, "%s: node %u has not seen "
+                                     "node %u go down yet, and thinks the "
+                                     "dead node is mastering the recovery "
+                                     "lock.  must wait.\n", dlm->name,
+                                     nodenum, master);
+                                ret = -EAGAIN;
+                        }
+                        spin_unlock(&dlm->spinlock);
+                        mlog(0, "%s: reco lock master is %u\n", dlm->name, 
+                             master);
+                        break;
+                }
+        }
+        return ret;
+}
 /*
 * DLM_MIGRATE_LOCKRES
author	Kurt Hackel <kurt.hackel@oracle.com>	2006-03-06 17:08:49 -0500
committer	Mark Fasheh <mark.fasheh@oracle.com>	2006-03-24 17:58:25 -0500
commit	c03872f5f50bc10f2a1a485f08879a8d01bcfe49 (patch)
tree	9ac370cf1a7c015522af75af3f60e9d6c4425bbc /fs/ocfs2/dlm/dlmmaster.c
parent	9c6510a5bfe2f1c5f5b93386c06954be02e974e4 (diff)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 78ac3a00eb54..940be4c13b1f 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
239	static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,	239	static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
240	struct dlm_lock_resource *res,	240	struct dlm_lock_resource *res,
241	u8 target);	241	u8 target);
		242	static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
		243	struct dlm_lock_resource *res);
242		244
243		245
244	int dlm_is_host_down(int errno)	246	int dlm_is_host_down(int errno)
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
677	struct dlm_node_iter iter;	679	struct dlm_node_iter iter;
678	unsigned int namelen;	680	unsigned int namelen;
679	int tries = 0;	681	int tries = 0;
		682	int bit, wait_on_recovery = 0;
680		683
681	BUG_ON(!lockid);	684	BUG_ON(!lockid);
682		685
@@ -762,6 +765,18 @@ lookup:
762	dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);	765	dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
763	set_bit(dlm->node_num, mle->maybe_map);	766	set_bit(dlm->node_num, mle->maybe_map);
764	list_add(&mle->list, &dlm->master_list);	767	list_add(&mle->list, &dlm->master_list);
		768
		769	/* still holding the dlm spinlock, check the recovery map
		770	* to see if there are any nodes that still need to be
		771	* considered. these will not appear in the mle nodemap
		772	* but they might own this lockres. wait on them. */
		773	bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		774	if (bit < O2NM_MAX_NODES) {
		775	mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
		776	"recover before lock mastery can begin\n",
		777	dlm->name, namelen, (char *)lockid, bit);
		778	wait_on_recovery = 1;
		779	}
765	}	780	}
766		781
767	/* at this point there is either a DLM_MLE_BLOCK or a	782	/* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +794,39 @@ lookup:
779	spin_unlock(&dlm->master_lock);	794	spin_unlock(&dlm->master_lock);
780	spin_unlock(&dlm->spinlock);	795	spin_unlock(&dlm->spinlock);
781		796
		797	while (wait_on_recovery) {
		798	/* any cluster changes that occurred after dropping the
		799	* dlm spinlock would be detectable be a change on the mle,
		800	* so we only need to clear out the recovery map once. */
		801	if (dlm_is_recovery_lock(lockid, namelen)) {
		802	mlog(ML_NOTICE, "%s: recovery map is not empty, but "
		803	"must master $RECOVERY lock now\n", dlm->name);
		804	if (!dlm_pre_master_reco_lockres(dlm, res))
		805	wait_on_recovery = 0;
		806	else {
		807	mlog(0, "%s: waiting 500ms for heartbeat state "
		808	"change\n", dlm->name);
		809	msleep(500);
		810	}
		811	continue;
		812	}
		813
		814	dlm_kick_recovery_thread(dlm);
		815	msleep(100);
		816	dlm_wait_for_recovery(dlm);
		817
		818	spin_lock(&dlm->spinlock);
		819	bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		820	if (bit < O2NM_MAX_NODES) {
		821	mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
		822	"recover before lock mastery can begin\n",
		823	dlm->name, namelen, (char *)lockid, bit);
		824	wait_on_recovery = 1;
		825	} else
		826	wait_on_recovery = 0;
		827	spin_unlock(&dlm->spinlock);
		828	}
		829
782	/* must wait for lock to be mastered elsewhere */	830	/* must wait for lock to be mastered elsewhere */
783	if (blocked)	831	if (blocked)
784	goto wait;	832	goto wait;
@@ -1835,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1835	mlog(0, "finished with dlm_assert_master_worker\n");	1883	mlog(0, "finished with dlm_assert_master_worker\n");
1836	}	1884	}
1837		1885
		1886	/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
		1887	* We cannot wait for node recovery to complete to begin mastering this
		1888	* lockres because this lockres is used to kick off recovery! ;-)
		1889	* So, do a pre-check on all living nodes to see if any of those nodes
		1890	* think that $RECOVERY is currently mastered by a dead node. If so,
		1891	* we wait a short time to allow that node to get notified by its own
		1892	* heartbeat stack, then check again. All $RECOVERY lock resources
		1893	* mastered by dead nodes are purged when the hearbeat callback is
		1894	* fired, so we can know for sure that it is safe to continue once
		1895	* the node returns a live node or no node. */
		1896	static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
		1897	struct dlm_lock_resource *res)
		1898	{
		1899	struct dlm_node_iter iter;
		1900	int nodenum;
		1901	int ret = 0;
		1902	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
		1903
		1904	spin_lock(&dlm->spinlock);
		1905	dlm_node_iter_init(dlm->domain_map, &iter);
		1906	spin_unlock(&dlm->spinlock);
		1907
		1908	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		1909	/* do not send to self */
		1910	if (nodenum == dlm->node_num)
		1911	continue;
		1912	ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		1913	if (ret < 0) {
		1914	mlog_errno(ret);
		1915	if (!dlm_is_host_down(ret))
		1916	BUG();
		1917	/* host is down, so answer for that node would be
		1918	* DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
		1919	}
		1920
		1921	if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
		1922	/* check to see if this master is in the recovery map */
		1923	spin_lock(&dlm->spinlock);
		1924	if (test_bit(master, dlm->recovery_map)) {
		1925	mlog(ML_NOTICE, "%s: node %u has not seen "
		1926	"node %u go down yet, and thinks the "
		1927	"dead node is mastering the recovery "
		1928	"lock. must wait.\n", dlm->name,
		1929	nodenum, master);
		1930	ret = -EAGAIN;
		1931	}
		1932	spin_unlock(&dlm->spinlock);
		1933	mlog(0, "%s: reco lock master is %u\n", dlm->name,
		1934	master);
		1935	break;
		1936	}
		1937	}
		1938	return ret;
		1939	}
		1940
1838		1941
1839	/*	1942	/*
1840	* DLM_MIGRATE_LOCKRES	1943	* DLM_MIGRATE_LOCKRES