about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ocfs2/dlm/dlmrecovery.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 53
1 files changed, 31 insertions, 22 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a21..a3c312c43b90 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -430,6 +430,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
430{ 430{
431 spin_lock(&dlm->spinlock); 431 spin_lock(&dlm->spinlock);
432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
433 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
434 dlm->name, dlm->reco.dead_node);
433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 435 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
434 spin_unlock(&dlm->spinlock); 436 spin_unlock(&dlm->spinlock);
435} 437}
@@ -440,9 +442,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 442 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 443 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
442 spin_unlock(&dlm->spinlock); 444 spin_unlock(&dlm->spinlock);
445 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
443 wake_up(&dlm->reco.event); 446 wake_up(&dlm->reco.event);
444} 447}
445 448
449static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
450{
451 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
452 "dead node %u in domain %s\n", dlm->reco.new_master,
453 (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
454 dlm->reco.dead_node, dlm->name);
455}
456
446static int dlm_do_recovery(struct dlm_ctxt *dlm) 457static int dlm_do_recovery(struct dlm_ctxt *dlm)
447{ 458{
448 int status = 0; 459 int status = 0;
@@ -505,9 +516,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
505 } 516 }
506 mlog(0, "another node will master this recovery session.\n"); 517 mlog(0, "another node will master this recovery session.\n");
507 } 518 }
508 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 519
509 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 520 dlm_print_recovery_master(dlm);
510 dlm->node_num, dlm->reco.dead_node);
511 521
512 /* it is safe to start everything back up here 522 /* it is safe to start everything back up here
513 * because all of the dead node's lock resources 523 * because all of the dead node's lock resources
@@ -518,15 +528,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
518 return 0; 528 return 0;
519 529
520master_here: 530master_here:
521 mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 531 dlm_print_recovery_master(dlm);
522 "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
523 dlm->node_num, dlm->reco.dead_node, dlm->name);
524 532
525 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 533 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
526 if (status < 0) { 534 if (status < 0) {
527 /* we should never hit this anymore */ 535 /* we should never hit this anymore */
528 mlog(ML_ERROR, "error %d remastering locks for node %u, " 536 mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
529 "retrying.\n", status, dlm->reco.dead_node); 537 "retrying.\n", dlm->name, status, dlm->reco.dead_node);
530 /* yield a bit to allow any final network messages 538 /* yield a bit to allow any final network messages
531 * to get handled on remaining nodes */ 539 * to get handled on remaining nodes */
532 msleep(100); 540 msleep(100);
@@ -567,7 +575,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 575 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 576 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
569 577
570 mlog(0, "requesting lock info from node %u\n", 578 mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
571 ndata->node_num); 579 ndata->node_num);
572 580
573 if (ndata->node_num == dlm->node_num) { 581 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +648,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
640 spin_unlock(&dlm_reco_state_lock); 648 spin_unlock(&dlm_reco_state_lock);
641 } 649 }
642 650
643 mlog(0, "done requesting all lock info\n"); 651 mlog(0, "%s: Done requesting all lock info\n", dlm->name);
644 652
645 /* nodes should be sending reco data now 653 /* nodes should be sending reco data now
646 * just need to wait */ 654 * just need to wait */
@@ -802,10 +810,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 810
803 /* negative status is handled by caller */ 811 /* negative status is handled by caller */
804 if (ret < 0) 812 if (ret < 0)
805 mlog(ML_ERROR, "Error %d when sending message %u (key " 813 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
806 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 814 "to recover dead node %u\n", dlm->name, ret,
807 dlm->key, request_from); 815 request_from, dead_node);
808
809 // return from here, then 816 // return from here, then
810 // sleep until all received or error 817 // sleep until all received or error
811 return ret; 818 return ret;
@@ -956,9 +963,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 963 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
957 sizeof(done_msg), send_to, &tmpret); 964 sizeof(done_msg), send_to, &tmpret);
958 if (ret < 0) { 965 if (ret < 0) {
959 mlog(ML_ERROR, "Error %d when sending message %u (key " 966 mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
960 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 967 "to recover dead node %u\n", dlm->name, ret, send_to,
961 dlm->key, send_to); 968 dead_node);
962 if (!dlm_is_host_down(ret)) { 969 if (!dlm_is_host_down(ret)) {
963 BUG(); 970 BUG();
964 } 971 }
@@ -1127,9 +1134,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1127 if (ret < 0) { 1134 if (ret < 0) {
1128 /* XXX: negative status is not handled. 1135 /* XXX: negative status is not handled.
1129 * this will end up killing this node. */ 1136 * this will end up killing this node. */
1130 mlog(ML_ERROR, "Error %d when sending message %u (key " 1137 mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1131 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1138 "node %u (%s)\n", dlm->name, mres->lockname_len,
1132 dlm->key, send_to); 1139 mres->lockname, ret, send_to,
1140 (orig_flags & DLM_MRES_MIGRATION ?
1141 "migration" : "recovery"));
1133 } else { 1142 } else {
1134 /* might get an -ENOMEM back here */ 1143 /* might get an -ENOMEM back here */
1135 ret = status; 1144 ret = status;
@@ -2324,9 +2333,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2324 dlm_revalidate_lvb(dlm, res, dead_node); 2333 dlm_revalidate_lvb(dlm, res, dead_node);
2325 if (res->owner == dead_node) { 2334 if (res->owner == dead_node) {
2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2335 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2327 mlog(ML_NOTICE, "Ignore %.*s for " 2336 mlog(ML_NOTICE, "%s: res %.*s, Skip "
2328 "recovery as it is being freed\n", 2337 "recovery as it is being freed\n",
2329 res->lockname.len, 2338 dlm->name, res->lockname.len,
2330 res->lockname.name); 2339 res->lockname.name);
2331 } else 2340 } else
2332 dlm_move_lockres_to_recovery_list(dlm, 2341 dlm_move_lockres_to_recovery_list(dlm,