ocfs2: dlm_remaster_locks() should never exit without completing

We cannot restart recovery. Once we begin to recover a node, keep the state of the recovery intact and follow through, regardless of any other node deaths that may occur.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
author: Kurt Hackel <kurt.hackel@oracle.com> 2006-05-01 16:49:20 -0400
committer: Mark Fasheh <mark.fasheh@oracle.com> 2006-06-26 17:43:09 -0400
commit: 6a41321121ee2af33b8ac55c87657603df480b25 (patch)
tree: 648abdd1bf2ede54a3e9759bd4b989587381dcc4 /fs/ocfs2/dlm
parent: c8df412e1c746dd21094966d04b3a79aad0f4d08 (diff)
1 files changed, 62 insertions, 54 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00209f4a2916..22a0b055cfcd 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -480,6 +480,7 @@ master_here:
        status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
        if (status < 0) {
+                /* we should never hit this anymore */
                mlog(ML_ERROR, "error %d remastering locks for node %u, "
                     "retrying.\n", status, dlm->reco.dead_node);
                /* yield a bit to allow any final network messages
@@ -506,9 +507,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        int destroy = 0;
        int pass = 0;
-        status = dlm_init_recovery_area(dlm, dead_node);
+        do {
-        if (status < 0)
+                /* we have become recovery master.  there is no escaping
-                goto leave;
+                 * this, so just keep trying until we get it. */
+                status = dlm_init_recovery_area(dlm, dead_node);
+                if (status < 0) {
+                        mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+                             "retrying\n", dlm->name);
+                        msleep(1000);
+                }
+        } while (status != 0);
        /* safe to access the node data list without a lock, since this
         * process is the only one to change the list */
@@ -525,16 +533,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                        continue;
                }
-                status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+                do {
-                if (status < 0) {
+                        status = dlm_request_all_locks(dlm, ndata->node_num,
-                        mlog_errno(status);
+                                                       dead_node);
-                        if (dlm_is_host_down(status))
+                        if (status < 0) {
-                                ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                mlog_errno(status);
-                        else {
+                                if (dlm_is_host_down(status)) {
-                                destroy = 1;
+                                        /* node died, ignore it for recovery */
-                                goto leave;
+                                        status = 0;
+                                        ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                        /* wait for the domain map to catch up
+                                         * with the network state. */
+                                        wait_event_timeout(dlm->dlm_reco_thread_wq,
+                                                           dlm_is_node_dead(dlm,
+                                                                ndata->node_num),
+                                                           msecs_to_jiffies(1000));
+                                        mlog(0, "waited 1 sec for %u, "
+                                             "dead? %s\n", ndata->node_num,
+                                             dlm_is_node_dead(dlm, ndata->node_num) ?
+                                             "yes" : "no");
+                                } else {
+                                        /* -ENOMEM on the other node */
+                                        mlog(0, "%s: node %u returned "
+                                             "%d during recovery, retrying "
+                                             "after a short wait\n",
+                                             dlm->name, ndata->node_num,
+                                             status);
+                                        msleep(100);
+                                }
                        }
-                }
+                } while (status != 0);
                switch (ndata->state) {
                        case DLM_RECO_NODE_DATA_INIT:
@@ -546,10 +574,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                mlog(0, "node %u died after requesting "
                                     "recovery info for node %u\n",
                                     ndata->node_num, dead_node);
-                                // start all over
+                                /* fine.  don't need this node's info.
-                                destroy = 1;
+                                 * continue without it. */
-                                status = -EAGAIN;
+                                break;
-                                goto leave;
                        case DLM_RECO_NODE_DATA_REQUESTING:
                                ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
                                mlog(0, "now receiving recovery data from "
@@ -593,28 +620,12 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                        BUG();
                                        break;
                                case DLM_RECO_NODE_DATA_DEAD:
-                                        mlog(ML_NOTICE, "node %u died after "
+                                        mlog(0, "node %u died after "
                                             "requesting recovery info for "
                                             "node %u\n", ndata->node_num,
                                             dead_node);
                                        spin_unlock(&dlm_reco_state_lock);
-                                        // start all over
+                                        break;
-                                        destroy = 1;
-                                        status = -EAGAIN;
-                                        /* instead of spinning like crazy here,
-                                         * wait for the domain map to catch up
-                                         * with the network state.  otherwise this
-                                         * can be hit hundreds of times before
-                                         * the node is really seen as dead. */
-                                        wait_event_timeout(dlm->dlm_reco_thread_wq,
-                                                           dlm_is_node_dead(dlm,
-                                                                ndata->node_num),
-                                                           msecs_to_jiffies(1000));
-                                        mlog(0, "waited 1 sec for %u, "
-                                             "dead? %s\n", ndata->node_num,
-                                             dlm_is_node_dead(dlm, ndata->node_num) ?
-                                             "yes" : "no");
-                                        goto leave;
                                case DLM_RECO_NODE_DATA_RECEIVING:
                                case DLM_RECO_NODE_DATA_REQUESTED:
                                        mlog(0, "%s: node %u still in state %s\n",
@@ -659,7 +670,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                             jiffies, dlm->reco.dead_node,
                             dlm->node_num, dlm->reco.new_master);
                        destroy = 1;
-                        status = ret;
+                        status = 0;
                        /* rescan everything marked dirty along the way */
                        dlm_kick_thread(dlm, NULL);
                        break;
@@ -672,7 +683,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        }
-leave:
        if (destroy)
                dlm_destroy_recovery_area(dlm, dead_node);
@@ -832,24 +842,22 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        if (dead_node != dlm->reco.dead_node ||
            reco_master != dlm->reco.new_master) {
-                /* show extra debug info if the recovery state is messed */
+                /* worker could have been created before the recovery master
-                mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
+                 * died.  if so, do not continue, but do not error. */
-                     "request(dead=%u, master=%u)\n",
+                if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
-                     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
+                        mlog(ML_NOTICE, "%s: will not send recovery state, "
-                     dead_node, reco_master);
+                             "recovery master %u died, thread=(dead=%u,mas=%u)"
-                mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
+                             " current=(dead=%u,mas=%u)\n", dlm->name,
-                     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
+                             reco_master, dead_node, reco_master,
-                     dlm->name, mres->lockname_len, mres->lockname, mres->master,
+                             dlm->reco.dead_node, dlm->reco.new_master);
-                     mres->num_locks, mres->total_locks, mres->flags,
+                } else {
-                     dlm_get_lock_cookie_node(mres->ml[0].cookie),
+                        mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
-                     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
+                             "master=%u), request(dead=%u, master=%u)\n",
-                     mres->ml[0].list, mres->ml[0].flags,
+                             dlm->name, dlm->reco.dead_node,
-                     mres->ml[0].type, mres->ml[0].convert_type,
+                             dlm->reco.new_master, dead_node, reco_master);
-                     mres->ml[0].highest_blocked, mres->ml[0].node);
+                }
-                BUG();
+                goto leave;
        }
-        BUG_ON(dead_node != dlm->reco.dead_node);
-        BUG_ON(reco_master != dlm->reco.new_master);
        /* lock resources should have already been moved to the
         * dlm->reco.resources list.  now move items from that list
@@ -889,7 +897,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
                             dlm->name, reco_master, dead_node, ret);
                }
        }
+leave:
        free_page((unsigned long)data);
 }
author	Kurt Hackel <kurt.hackel@oracle.com>	2006-05-01 16:49:20 -0400
committer	Mark Fasheh <mark.fasheh@oracle.com>	2006-06-26 17:43:09 -0400
commit	6a41321121ee2af33b8ac55c87657603df480b25 (patch)
tree	648abdd1bf2ede54a3e9759bd4b989587381dcc4 /fs/ocfs2/dlm
parent	c8df412e1c746dd21094966d04b3a79aad0f4d08 (diff)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00209f4a2916..22a0b055cfcd 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -480,6 +480,7 @@ master_here:
480		480
481	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);	481	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
482	if (status < 0) {	482	if (status < 0) {
		483	/* we should never hit this anymore */
483	mlog(ML_ERROR, "error %d remastering locks for node %u, "	484	mlog(ML_ERROR, "error %d remastering locks for node %u, "
484	"retrying.\n", status, dlm->reco.dead_node);	485	"retrying.\n", status, dlm->reco.dead_node);
485	/* yield a bit to allow any final network messages	486	/* yield a bit to allow any final network messages
@@ -506,9 +507,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
506	int destroy = 0;	507	int destroy = 0;
507	int pass = 0;	508	int pass = 0;
508		509
509	status = dlm_init_recovery_area(dlm, dead_node);	510	do {
510	if (status < 0)	511	/* we have become recovery master. there is no escaping
511	goto leave;	512	* this, so just keep trying until we get it. */
		513	status = dlm_init_recovery_area(dlm, dead_node);
		514	if (status < 0) {
		515	mlog(ML_ERROR, "%s: failed to alloc recovery area, "
		516	"retrying\n", dlm->name);
		517	msleep(1000);
		518	}
		519	} while (status != 0);
512		520
513	/* safe to access the node data list without a lock, since this	521	/* safe to access the node data list without a lock, since this
514	* process is the only one to change the list */	522	* process is the only one to change the list */
@@ -525,16 +533,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
525	continue;	533	continue;
526	}	534	}
527		535
528	status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);	536	do {
529	if (status < 0) {	537	status = dlm_request_all_locks(dlm, ndata->node_num,
530	mlog_errno(status);	538	dead_node);
531	if (dlm_is_host_down(status))	539	if (status < 0) {
532	ndata->state = DLM_RECO_NODE_DATA_DEAD;	540	mlog_errno(status);
533	else {	541	if (dlm_is_host_down(status)) {
534	destroy = 1;	542	/* node died, ignore it for recovery */
535	goto leave;	543	status = 0;
		544	ndata->state = DLM_RECO_NODE_DATA_DEAD;
		545	/* wait for the domain map to catch up
		546	* with the network state. */
		547	wait_event_timeout(dlm->dlm_reco_thread_wq,
		548	dlm_is_node_dead(dlm,
		549	ndata->node_num),
		550	msecs_to_jiffies(1000));
		551	mlog(0, "waited 1 sec for %u, "
		552	"dead? %s\n", ndata->node_num,
		553	dlm_is_node_dead(dlm, ndata->node_num) ?
		554	"yes" : "no");
		555	} else {
		556	/* -ENOMEM on the other node */
		557	mlog(0, "%s: node %u returned "
		558	"%d during recovery, retrying "
		559	"after a short wait\n",
		560	dlm->name, ndata->node_num,
		561	status);
		562	msleep(100);
		563	}
536	}	564	}
537	}	565	} while (status != 0);
538		566
539	switch (ndata->state) {	567	switch (ndata->state) {
540	case DLM_RECO_NODE_DATA_INIT:	568	case DLM_RECO_NODE_DATA_INIT:
@@ -546,10 +574,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
546	mlog(0, "node %u died after requesting "	574	mlog(0, "node %u died after requesting "
547	"recovery info for node %u\n",	575	"recovery info for node %u\n",
548	ndata->node_num, dead_node);	576	ndata->node_num, dead_node);
549	// start all over	577	/* fine. don't need this node's info.
550	destroy = 1;	578	* continue without it. */
551	status = -EAGAIN;	579	break;
552	goto leave;
553	case DLM_RECO_NODE_DATA_REQUESTING:	580	case DLM_RECO_NODE_DATA_REQUESTING:
554	ndata->state = DLM_RECO_NODE_DATA_REQUESTED;	581	ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
555	mlog(0, "now receiving recovery data from "	582	mlog(0, "now receiving recovery data from "
@@ -593,28 +620,12 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
593	BUG();	620	BUG();
594	break;	621	break;
595	case DLM_RECO_NODE_DATA_DEAD:	622	case DLM_RECO_NODE_DATA_DEAD:
596	mlog(ML_NOTICE, "node %u died after "	623	mlog(0, "node %u died after "
597	"requesting recovery info for "	624	"requesting recovery info for "
598	"node %u\n", ndata->node_num,	625	"node %u\n", ndata->node_num,
599	dead_node);	626	dead_node);
600	spin_unlock(&dlm_reco_state_lock);	627	spin_unlock(&dlm_reco_state_lock);
601	// start all over	628	break;
602	destroy = 1;
603	status = -EAGAIN;
604	/* instead of spinning like crazy here,
605	* wait for the domain map to catch up
606	* with the network state. otherwise this
607	* can be hit hundreds of times before
608	* the node is really seen as dead. */
609	wait_event_timeout(dlm->dlm_reco_thread_wq,
610	dlm_is_node_dead(dlm,
611	ndata->node_num),
612	msecs_to_jiffies(1000));
613	mlog(0, "waited 1 sec for %u, "
614	"dead? %s\n", ndata->node_num,
615	dlm_is_node_dead(dlm, ndata->node_num) ?
616	"yes" : "no");
617	goto leave;
618	case DLM_RECO_NODE_DATA_RECEIVING:	629	case DLM_RECO_NODE_DATA_RECEIVING:
619	case DLM_RECO_NODE_DATA_REQUESTED:	630	case DLM_RECO_NODE_DATA_REQUESTED:
620	mlog(0, "%s: node %u still in state %s\n",	631	mlog(0, "%s: node %u still in state %s\n",
@@ -659,7 +670,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
659	jiffies, dlm->reco.dead_node,	670	jiffies, dlm->reco.dead_node,
660	dlm->node_num, dlm->reco.new_master);	671	dlm->node_num, dlm->reco.new_master);
661	destroy = 1;	672	destroy = 1;
662	status = ret;	673	status = 0;
663	/* rescan everything marked dirty along the way */	674	/* rescan everything marked dirty along the way */
664	dlm_kick_thread(dlm, NULL);	675	dlm_kick_thread(dlm, NULL);
665	break;	676	break;
@@ -672,7 +683,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
672		683
673	}	684	}
674		685
675	leave:
676	if (destroy)	686	if (destroy)
677	dlm_destroy_recovery_area(dlm, dead_node);	687	dlm_destroy_recovery_area(dlm, dead_node);
678		688
@@ -832,24 +842,22 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
832		842
833	if (dead_node != dlm->reco.dead_node ||	843	if (dead_node != dlm->reco.dead_node ||
834	reco_master != dlm->reco.new_master) {	844	reco_master != dlm->reco.new_master) {
835	/* show extra debug info if the recovery state is messed */	845	/* worker could have been created before the recovery master
836	mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "	846	* died. if so, do not continue, but do not error. */
837	"request(dead=%u, master=%u)\n",	847	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
838	dlm->name, dlm->reco.dead_node, dlm->reco.new_master,	848	mlog(ML_NOTICE, "%s: will not send recovery state, "
839	dead_node, reco_master);	849	"recovery master %u died, thread=(dead=%u,mas=%u)"
840	mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "	850	" current=(dead=%u,mas=%u)\n", dlm->name,
841	"entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",	851	reco_master, dead_node, reco_master,
842	dlm->name, mres->lockname_len, mres->lockname, mres->master,	852	dlm->reco.dead_node, dlm->reco.new_master);
843	mres->num_locks, mres->total_locks, mres->flags,	853	} else {
844	dlm_get_lock_cookie_node(mres->ml[0].cookie),	854	mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
845	dlm_get_lock_cookie_seq(mres->ml[0].cookie),	855	"master=%u), request(dead=%u, master=%u)\n",
846	mres->ml[0].list, mres->ml[0].flags,	856	dlm->name, dlm->reco.dead_node,
847	mres->ml[0].type, mres->ml[0].convert_type,	857	dlm->reco.new_master, dead_node, reco_master);
848	mres->ml[0].highest_blocked, mres->ml[0].node);	858	}
849	BUG();	859	goto leave;
850	}	860	}
851	BUG_ON(dead_node != dlm->reco.dead_node);
852	BUG_ON(reco_master != dlm->reco.new_master);
853		861
854	/* lock resources should have already been moved to the	862	/* lock resources should have already been moved to the
855	* dlm->reco.resources list. now move items from that list	863	* dlm->reco.resources list. now move items from that list
@@ -889,7 +897,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
889	dlm->name, reco_master, dead_node, ret);	897	dlm->name, reco_master, dead_node, ret);
890	}	898	}
891	}	899	}
892		900	leave:
893	free_page((unsigned long)data);	901	free_page((unsigned long)data);
894	}	902	}
895		903