diff options
author | Kurt Hackel <kurt.hackel@oracle.com> | 2006-05-01 16:49:20 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-06-26 17:43:09 -0400 |
commit | 6a41321121ee2af33b8ac55c87657603df480b25 (patch) | |
tree | 648abdd1bf2ede54a3e9759bd4b989587381dcc4 /fs/ocfs2/dlm | |
parent | c8df412e1c746dd21094966d04b3a79aad0f4d08 (diff) |
ocfs2: dlm_remaster_locks() should never exit without completing
We cannot restart recovery. Once we begin to recover a node, keep the state
of the recovery intact and follow through, regardless of any other node
deaths that may occur.
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/dlm')
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 116 |
1 file changed, 62 insertions, 54 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00209f4a2916..22a0b055cfcd 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -480,6 +480,7 @@ master_here: | |||
480 | 480 | ||
481 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | 481 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); |
482 | if (status < 0) { | 482 | if (status < 0) { |
483 | /* we should never hit this anymore */ | ||
483 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | 484 | mlog(ML_ERROR, "error %d remastering locks for node %u, " |
484 | "retrying.\n", status, dlm->reco.dead_node); | 485 | "retrying.\n", status, dlm->reco.dead_node); |
485 | /* yield a bit to allow any final network messages | 486 | /* yield a bit to allow any final network messages |
@@ -506,9 +507,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
506 | int destroy = 0; | 507 | int destroy = 0; |
507 | int pass = 0; | 508 | int pass = 0; |
508 | 509 | ||
509 | status = dlm_init_recovery_area(dlm, dead_node); | 510 | do { |
510 | if (status < 0) | 511 | /* we have become recovery master. there is no escaping |
511 | goto leave; | 512 | * this, so just keep trying until we get it. */ |
513 | status = dlm_init_recovery_area(dlm, dead_node); | ||
514 | if (status < 0) { | ||
515 | mlog(ML_ERROR, "%s: failed to alloc recovery area, " | ||
516 | "retrying\n", dlm->name); | ||
517 | msleep(1000); | ||
518 | } | ||
519 | } while (status != 0); | ||
512 | 520 | ||
513 | /* safe to access the node data list without a lock, since this | 521 | /* safe to access the node data list without a lock, since this |
514 | * process is the only one to change the list */ | 522 | * process is the only one to change the list */ |
@@ -525,16 +533,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
525 | continue; | 533 | continue; |
526 | } | 534 | } |
527 | 535 | ||
528 | status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); | 536 | do { |
529 | if (status < 0) { | 537 | status = dlm_request_all_locks(dlm, ndata->node_num, |
530 | mlog_errno(status); | 538 | dead_node); |
531 | if (dlm_is_host_down(status)) | 539 | if (status < 0) { |
532 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | 540 | mlog_errno(status); |
533 | else { | 541 | if (dlm_is_host_down(status)) { |
534 | destroy = 1; | 542 | /* node died, ignore it for recovery */ |
535 | goto leave; | 543 | status = 0; |
544 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | ||
545 | /* wait for the domain map to catch up | ||
546 | * with the network state. */ | ||
547 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
548 | dlm_is_node_dead(dlm, | ||
549 | ndata->node_num), | ||
550 | msecs_to_jiffies(1000)); | ||
551 | mlog(0, "waited 1 sec for %u, " | ||
552 | "dead? %s\n", ndata->node_num, | ||
553 | dlm_is_node_dead(dlm, ndata->node_num) ? | ||
554 | "yes" : "no"); | ||
555 | } else { | ||
556 | /* -ENOMEM on the other node */ | ||
557 | mlog(0, "%s: node %u returned " | ||
558 | "%d during recovery, retrying " | ||
559 | "after a short wait\n", | ||
560 | dlm->name, ndata->node_num, | ||
561 | status); | ||
562 | msleep(100); | ||
563 | } | ||
536 | } | 564 | } |
537 | } | 565 | } while (status != 0); |
538 | 566 | ||
539 | switch (ndata->state) { | 567 | switch (ndata->state) { |
540 | case DLM_RECO_NODE_DATA_INIT: | 568 | case DLM_RECO_NODE_DATA_INIT: |
@@ -546,10 +574,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
546 | mlog(0, "node %u died after requesting " | 574 | mlog(0, "node %u died after requesting " |
547 | "recovery info for node %u\n", | 575 | "recovery info for node %u\n", |
548 | ndata->node_num, dead_node); | 576 | ndata->node_num, dead_node); |
549 | // start all over | 577 | /* fine. don't need this node's info. |
550 | destroy = 1; | 578 | * continue without it. */ |
551 | status = -EAGAIN; | 579 | break; |
552 | goto leave; | ||
553 | case DLM_RECO_NODE_DATA_REQUESTING: | 580 | case DLM_RECO_NODE_DATA_REQUESTING: |
554 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; | 581 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; |
555 | mlog(0, "now receiving recovery data from " | 582 | mlog(0, "now receiving recovery data from " |
@@ -593,28 +620,12 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
593 | BUG(); | 620 | BUG(); |
594 | break; | 621 | break; |
595 | case DLM_RECO_NODE_DATA_DEAD: | 622 | case DLM_RECO_NODE_DATA_DEAD: |
596 | mlog(ML_NOTICE, "node %u died after " | 623 | mlog(0, "node %u died after " |
597 | "requesting recovery info for " | 624 | "requesting recovery info for " |
598 | "node %u\n", ndata->node_num, | 625 | "node %u\n", ndata->node_num, |
599 | dead_node); | 626 | dead_node); |
600 | spin_unlock(&dlm_reco_state_lock); | 627 | spin_unlock(&dlm_reco_state_lock); |
601 | // start all over | 628 | break; |
602 | destroy = 1; | ||
603 | status = -EAGAIN; | ||
604 | /* instead of spinning like crazy here, | ||
605 | * wait for the domain map to catch up | ||
606 | * with the network state. otherwise this | ||
607 | * can be hit hundreds of times before | ||
608 | * the node is really seen as dead. */ | ||
609 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
610 | dlm_is_node_dead(dlm, | ||
611 | ndata->node_num), | ||
612 | msecs_to_jiffies(1000)); | ||
613 | mlog(0, "waited 1 sec for %u, " | ||
614 | "dead? %s\n", ndata->node_num, | ||
615 | dlm_is_node_dead(dlm, ndata->node_num) ? | ||
616 | "yes" : "no"); | ||
617 | goto leave; | ||
618 | case DLM_RECO_NODE_DATA_RECEIVING: | 629 | case DLM_RECO_NODE_DATA_RECEIVING: |
619 | case DLM_RECO_NODE_DATA_REQUESTED: | 630 | case DLM_RECO_NODE_DATA_REQUESTED: |
620 | mlog(0, "%s: node %u still in state %s\n", | 631 | mlog(0, "%s: node %u still in state %s\n", |
@@ -659,7 +670,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
659 | jiffies, dlm->reco.dead_node, | 670 | jiffies, dlm->reco.dead_node, |
660 | dlm->node_num, dlm->reco.new_master); | 671 | dlm->node_num, dlm->reco.new_master); |
661 | destroy = 1; | 672 | destroy = 1; |
662 | status = ret; | 673 | status = 0; |
663 | /* rescan everything marked dirty along the way */ | 674 | /* rescan everything marked dirty along the way */ |
664 | dlm_kick_thread(dlm, NULL); | 675 | dlm_kick_thread(dlm, NULL); |
665 | break; | 676 | break; |
@@ -672,7 +683,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
672 | 683 | ||
673 | } | 684 | } |
674 | 685 | ||
675 | leave: | ||
676 | if (destroy) | 686 | if (destroy) |
677 | dlm_destroy_recovery_area(dlm, dead_node); | 687 | dlm_destroy_recovery_area(dlm, dead_node); |
678 | 688 | ||
@@ -832,24 +842,22 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
832 | 842 | ||
833 | if (dead_node != dlm->reco.dead_node || | 843 | if (dead_node != dlm->reco.dead_node || |
834 | reco_master != dlm->reco.new_master) { | 844 | reco_master != dlm->reco.new_master) { |
835 | /* show extra debug info if the recovery state is messed */ | 845 | /* worker could have been created before the recovery master |
836 | mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " | 846 | * died. if so, do not continue, but do not error. */ |
837 | "request(dead=%u, master=%u)\n", | 847 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { |
838 | dlm->name, dlm->reco.dead_node, dlm->reco.new_master, | 848 | mlog(ML_NOTICE, "%s: will not send recovery state, " |
839 | dead_node, reco_master); | 849 | "recovery master %u died, thread=(dead=%u,mas=%u)" |
840 | mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " | 850 | " current=(dead=%u,mas=%u)\n", dlm->name, |
841 | "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", | 851 | reco_master, dead_node, reco_master, |
842 | dlm->name, mres->lockname_len, mres->lockname, mres->master, | 852 | dlm->reco.dead_node, dlm->reco.new_master); |
843 | mres->num_locks, mres->total_locks, mres->flags, | 853 | } else { |
844 | dlm_get_lock_cookie_node(mres->ml[0].cookie), | 854 | mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " |
845 | dlm_get_lock_cookie_seq(mres->ml[0].cookie), | 855 | "master=%u), request(dead=%u, master=%u)\n", |
846 | mres->ml[0].list, mres->ml[0].flags, | 856 | dlm->name, dlm->reco.dead_node, |
847 | mres->ml[0].type, mres->ml[0].convert_type, | 857 | dlm->reco.new_master, dead_node, reco_master); |
848 | mres->ml[0].highest_blocked, mres->ml[0].node); | 858 | } |
849 | BUG(); | 859 | goto leave; |
850 | } | 860 | } |
851 | BUG_ON(dead_node != dlm->reco.dead_node); | ||
852 | BUG_ON(reco_master != dlm->reco.new_master); | ||
853 | 861 | ||
854 | /* lock resources should have already been moved to the | 862 | /* lock resources should have already been moved to the |
855 | * dlm->reco.resources list. now move items from that list | 863 | * dlm->reco.resources list. now move items from that list |
@@ -889,7 +897,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
889 | dlm->name, reco_master, dead_node, ret); | 897 | dlm->name, reco_master, dead_node, ret); |
890 | } | 898 | } |
891 | } | 899 | } |
892 | 900 | leave: | |
893 | free_page((unsigned long)data); | 901 | free_page((unsigned long)data); |
894 | } | 902 | } |
895 | 903 | ||