1 files changed, 103 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 78ac3a00eb54..940be4c13b1f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
                                       struct dlm_lock_resource *res,
                                       u8 target);
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                       struct dlm_lock_resource *res);
 int dlm_is_host_down(int errno)
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
        struct dlm_node_iter iter;
        unsigned int namelen;
        int tries = 0;
+        int bit, wait_on_recovery = 0;
        BUG_ON(!lockid);
@@ -762,6 +765,18 @@ lookup:
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
                list_add(&mle->list, &dlm->master_list);
+                /* still holding the dlm spinlock, check the recovery map
+                 * to see if there are any nodes that still need to be 
+                 * considered.  these will not appear in the mle nodemap
+                 * but they might own this lockres.  wait on them. */
+                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+                if (bit < O2NM_MAX_NODES) {
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                             "recover before lock mastery can begin\n",
+                             dlm->name, namelen, (char *)lockid, bit);
+                        wait_on_recovery = 1;
+                }
        }
        /* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +794,39 @@ lookup:
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
+        while (wait_on_recovery) {
+                /* any cluster changes that occurred after dropping the
+                 * dlm spinlock would be detectable by a change on the mle,
+                 * so we only need to clear out the recovery map once. */
+                if (dlm_is_recovery_lock(lockid, namelen)) {
+                        mlog(ML_NOTICE, "%s: recovery map is not empty, but "
+                             "must master $RECOVERY lock now\n", dlm->name);
+                        if (!dlm_pre_master_reco_lockres(dlm, res))
+                                wait_on_recovery = 0;
+                        else {
+                                mlog(0, "%s: waiting 500ms for heartbeat state "
+                                    "change\n", dlm->name);
+                                msleep(500);
+                        }
+                        continue;
+                } 
+                dlm_kick_recovery_thread(dlm);
+                msleep(100);
+                dlm_wait_for_recovery(dlm);
+                spin_lock(&dlm->spinlock);
+                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+                if (bit < O2NM_MAX_NODES) {
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                             "recover before lock mastery can begin\n",
+                             dlm->name, namelen, (char *)lockid, bit);
+                        wait_on_recovery = 1;
+                } else
+                        wait_on_recovery = 0;
+                spin_unlock(&dlm->spinlock);
+        }
        /* must wait for lock to be mastered elsewhere */
        if (blocked)
                goto wait;
@@ -1835,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
        mlog(0, "finished with dlm_assert_master_worker\n");
 }
+/*
+ * dlm_pre_master_reco_lockres - pre-check before mastering $RECOVERY
+ * @dlm: domain whose domain_map is iterated and whose recovery_map is
+ *       consulted
+ * @res: lock resource handed to dlm_do_master_requery() for every node
+ *
+ * SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
+ * We cannot wait for node recovery to complete to begin mastering this
+ * lockres because this lockres is used to kick off recovery! ;-)
+ * So, do a pre-check on all living nodes to see if any of those nodes
+ * think that $RECOVERY is currently mastered by a dead node.  If so,
+ * we wait a short time to allow that node to get notified by its own
+ * heartbeat stack, then check again.  All $RECOVERY lock resources
+ * mastered by dead nodes are purged when the heartbeat callback is
+ * fired, so we can know for sure that it is safe to continue once
+ * the node returns a live node or no node.
+ *
+ * Every node in the domain map except the local node is queried in turn.
+ * The walk stops at the first node that reports a known master.
+ *
+ * Returns -EAGAIN if the first known master reported by a peer is still
+ * set in the recovery map (the caller is expected to wait and retry).
+ * Otherwise returns the status of the last requery, with a host-down
+ * failure normalized to 0.  Any other requery failure triggers BUG().
+ */
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                       struct dlm_lock_resource *res)
+{
+        struct dlm_node_iter iter;
+        int nodenum;
+        int ret = 0;
+        u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+        /* the iterator is set up from domain_map under the dlm spinlock;
+         * the lock is dropped again before any requery is issued */
+        spin_lock(&dlm->spinlock);
+        dlm_node_iter_init(dlm->domain_map, &iter);
+        spin_unlock(&dlm->spinlock);
+
+        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+                /* do not send to self */
+                if (nodenum == dlm->node_num)
+                        continue;
+                ret = dlm_do_master_requery(dlm, res, nodenum, &master);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        if (!dlm_is_host_down(ret))
+                                BUG();
+                        /* host is down, so answer for that node would be
+                         * DLM_LOCK_RES_OWNER_UNKNOWN.  continue.
+                         * clear the error: a down host is not a reason
+                         * to make the caller wait, and leaving ret
+                         * negative would be returned as such if no
+                         * later node reports a master. */
+                        ret = 0;
+                }
+
+                if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+                        /* check to see if this master is in the recovery map */
+                        spin_lock(&dlm->spinlock);
+                        if (test_bit(master, dlm->recovery_map)) {
+                                mlog(ML_NOTICE, "%s: node %u has not seen "
+                                     "node %u go down yet, and thinks the "
+                                     "dead node is mastering the recovery "
+                                     "lock.  must wait.\n", dlm->name,
+                                     nodenum, master);
+                                ret = -EAGAIN;
+                        }
+                        spin_unlock(&dlm->spinlock);
+                        mlog(0, "%s: reco lock master is %u\n", dlm->name, 
+                             master);
+                        /* first known answer decides the outcome */
+                        break;
+                }
+        }
+        return ret;
+}
 /*
 * DLM_MIGRATE_LOCKRES

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 78ac3a00eb54..940be4c13b1f 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
239	static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,	239	static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
240	struct dlm_lock_resource *res,	240	struct dlm_lock_resource *res,
241	u8 target);	241	u8 target);
		242	static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
		243	struct dlm_lock_resource *res);
242		244
243		245
244	int dlm_is_host_down(int errno)	246	int dlm_is_host_down(int errno)
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
677	struct dlm_node_iter iter;	679	struct dlm_node_iter iter;
678	unsigned int namelen;	680	unsigned int namelen;
679	int tries = 0;	681	int tries = 0;
		682	int bit, wait_on_recovery = 0;
680		683
681	BUG_ON(!lockid);	684	BUG_ON(!lockid);
682		685
@@ -762,6 +765,18 @@ lookup:
762	dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);	765	dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
763	set_bit(dlm->node_num, mle->maybe_map);	766	set_bit(dlm->node_num, mle->maybe_map);
764	list_add(&mle->list, &dlm->master_list);	767	list_add(&mle->list, &dlm->master_list);
		768
		769	/* still holding the dlm spinlock, check the recovery map
		770	* to see if there are any nodes that still need to be
		771	* considered. these will not appear in the mle nodemap
		772	* but they might own this lockres. wait on them. */
		773	bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		774	if (bit < O2NM_MAX_NODES) {
		775	mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
		776	"recover before lock mastery can begin\n",
		777	dlm->name, namelen, (char *)lockid, bit);
		778	wait_on_recovery = 1;
		779	}
765	}	780	}
766		781
767	/* at this point there is either a DLM_MLE_BLOCK or a	782	/* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +794,39 @@ lookup:
779	spin_unlock(&dlm->master_lock);	794	spin_unlock(&dlm->master_lock);
780	spin_unlock(&dlm->spinlock);	795	spin_unlock(&dlm->spinlock);
781		796
		797	while (wait_on_recovery) {
		798	/* any cluster changes that occurred after dropping the
		799	* dlm spinlock would be detectable by a change on the mle,
		800	* so we only need to clear out the recovery map once. */
		801	if (dlm_is_recovery_lock(lockid, namelen)) {
		802	mlog(ML_NOTICE, "%s: recovery map is not empty, but "
		803	"must master $RECOVERY lock now\n", dlm->name);
		804	if (!dlm_pre_master_reco_lockres(dlm, res))
		805	wait_on_recovery = 0;
		806	else {
		807	mlog(0, "%s: waiting 500ms for heartbeat state "
		808	"change\n", dlm->name);
		809	msleep(500);
		810	}
		811	continue;
		812	}
		813
		814	dlm_kick_recovery_thread(dlm);
		815	msleep(100);
		816	dlm_wait_for_recovery(dlm);
		817
		818	spin_lock(&dlm->spinlock);
		819	bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		820	if (bit < O2NM_MAX_NODES) {
		821	mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
		822	"recover before lock mastery can begin\n",
		823	dlm->name, namelen, (char *)lockid, bit);
		824	wait_on_recovery = 1;
		825	} else
		826	wait_on_recovery = 0;
		827	spin_unlock(&dlm->spinlock);
		828	}
		829
782	/* must wait for lock to be mastered elsewhere */	830	/* must wait for lock to be mastered elsewhere */
783	if (blocked)	831	if (blocked)
784	goto wait;	832	goto wait;
@@ -1835,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item item, void data)
1835	mlog(0, "finished with dlm_assert_master_worker\n");	1883	mlog(0, "finished with dlm_assert_master_worker\n");
1836	}	1884	}
1837		1885
		1886	/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
		1887	* We cannot wait for node recovery to complete to begin mastering this
		1888	* lockres because this lockres is used to kick off recovery! ;-)
		1889	* So, do a pre-check on all living nodes to see if any of those nodes
		1890	* think that $RECOVERY is currently mastered by a dead node. If so,
		1891	* we wait a short time to allow that node to get notified by its own
		1892	* heartbeat stack, then check again. All $RECOVERY lock resources
		1893	* mastered by dead nodes are purged when the heartbeat callback is
		1894	* fired, so we can know for sure that it is safe to continue once
		1895	* the node returns a live node or no node. */
		1896	static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
		1897	struct dlm_lock_resource *res)
		1898	{
		1899	struct dlm_node_iter iter;
		1900	int nodenum;
		1901	int ret = 0;
		1902	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
		1903
		1904	spin_lock(&dlm->spinlock);
		1905	dlm_node_iter_init(dlm->domain_map, &iter);
		1906	spin_unlock(&dlm->spinlock);
		1907
		1908	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		1909	/* do not send to self */
		1910	if (nodenum == dlm->node_num)
		1911	continue;
		1912	ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		1913	if (ret < 0) {
		1914	mlog_errno(ret);
		1915	if (!dlm_is_host_down(ret))
		1916	BUG();
		1917	/* host is down, so answer for that node would be
		1918	* DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
		1919	}
		1920
		1921	if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
		1922	/* check to see if this master is in the recovery map */
		1923	spin_lock(&dlm->spinlock);
		1924	if (test_bit(master, dlm->recovery_map)) {
		1925	mlog(ML_NOTICE, "%s: node %u has not seen "
		1926	"node %u go down yet, and thinks the "
		1927	"dead node is mastering the recovery "
		1928	"lock. must wait.\n", dlm->name,
		1929	nodenum, master);
		1930	ret = -EAGAIN;
		1931	}
		1932	spin_unlock(&dlm->spinlock);
		1933	mlog(0, "%s: reco lock master is %u\n", dlm->name,
		1934	master);
		1935	break;
		1936	}
		1937	}
		1938	return ret;
		1939	}
		1940
1838		1941
1839	/*	1942	/*
1840	* DLM_MIGRATE_LOCKRES	1943	* DLM_MIGRATE_LOCKRES