diff options
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 102 |
1 files changed, 90 insertions, 12 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 59c8976915a9..81bd2400e221 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -239,6 +239,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) | |||
239 | * | 239 | * |
240 | */ | 240 | */ |
241 | 241 | ||
242 | static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) | ||
243 | { | ||
244 | struct dlm_reco_node_data *ndata; | ||
245 | struct dlm_lock_resource *res; | ||
246 | |||
247 | mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", | ||
248 | dlm->name, dlm->dlm_reco_thread_task->pid, | ||
249 | dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", | ||
250 | dlm->reco.dead_node, dlm->reco.new_master); | ||
251 | |||
252 | list_for_each_entry(ndata, &dlm->reco.node_data, list) { | ||
253 | char *st = "unknown"; | ||
254 | switch (ndata->state) { | ||
255 | case DLM_RECO_NODE_DATA_INIT: | ||
256 | st = "init"; | ||
257 | break; | ||
258 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
259 | st = "requesting"; | ||
260 | break; | ||
261 | case DLM_RECO_NODE_DATA_DEAD: | ||
262 | st = "dead"; | ||
263 | break; | ||
264 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
265 | st = "receiving"; | ||
266 | break; | ||
267 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
268 | st = "requested"; | ||
269 | break; | ||
270 | case DLM_RECO_NODE_DATA_DONE: | ||
271 | st = "done"; | ||
272 | break; | ||
273 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
274 | st = "finalize-sent"; | ||
275 | break; | ||
276 | default: | ||
277 | st = "bad"; | ||
278 | break; | ||
279 | } | ||
280 | mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", | ||
281 | dlm->name, ndata->node_num, st); | ||
282 | } | ||
283 | list_for_each_entry(res, &dlm->reco.resources, recovering) { | ||
284 | mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", | ||
285 | dlm->name, res->lockname.len, res->lockname.name); | ||
286 | } | ||
287 | } | ||
242 | 288 | ||
243 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) | 289 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) |
244 | 290 | ||
@@ -385,7 +431,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
385 | /* return to main thread loop and sleep. */ | 431 | /* return to main thread loop and sleep. */ |
386 | return 0; | 432 | return 0; |
387 | } | 433 | } |
388 | mlog(0, "recovery thread found node %u in the recovery map!\n", | 434 | mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", |
435 | dlm->name, dlm->dlm_reco_thread_task->pid, | ||
389 | dlm->reco.dead_node); | 436 | dlm->reco.dead_node); |
390 | spin_unlock(&dlm->spinlock); | 437 | spin_unlock(&dlm->spinlock); |
391 | 438 | ||
@@ -408,8 +455,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
408 | } | 455 | } |
409 | mlog(0, "another node will master this recovery session.\n"); | 456 | mlog(0, "another node will master this recovery session.\n"); |
410 | } | 457 | } |
411 | mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", | 458 | mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", |
412 | dlm->name, dlm->reco.new_master, | 459 | dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, |
413 | dlm->node_num, dlm->reco.dead_node); | 460 | dlm->node_num, dlm->reco.dead_node); |
414 | 461 | ||
415 | /* it is safe to start everything back up here | 462 | /* it is safe to start everything back up here |
@@ -421,7 +468,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
421 | return 0; | 468 | return 0; |
422 | 469 | ||
423 | master_here: | 470 | master_here: |
424 | mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", | 471 | mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", |
472 | dlm->dlm_reco_thread_task->pid, | ||
425 | dlm->name, dlm->reco.dead_node, dlm->node_num); | 473 | dlm->name, dlm->reco.dead_node, dlm->node_num); |
426 | 474 | ||
427 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | 475 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); |
@@ -563,11 +611,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
563 | goto leave; | 611 | goto leave; |
564 | case DLM_RECO_NODE_DATA_RECEIVING: | 612 | case DLM_RECO_NODE_DATA_RECEIVING: |
565 | case DLM_RECO_NODE_DATA_REQUESTED: | 613 | case DLM_RECO_NODE_DATA_REQUESTED: |
614 | mlog(0, "%s: node %u still in state %s\n", | ||
615 | dlm->name, ndata->node_num, | ||
616 | ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? | ||
617 | "receiving" : "requested"); | ||
566 | all_nodes_done = 0; | 618 | all_nodes_done = 0; |
567 | break; | 619 | break; |
568 | case DLM_RECO_NODE_DATA_DONE: | 620 | case DLM_RECO_NODE_DATA_DONE: |
621 | mlog(0, "%s: node %u state is done\n", | ||
622 | dlm->name, ndata->node_num); | ||
569 | break; | 623 | break; |
570 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | 624 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: |
625 | mlog(0, "%s: node %u state is finalize\n", | ||
626 | dlm->name, ndata->node_num); | ||
571 | break; | 627 | break; |
572 | } | 628 | } |
573 | } | 629 | } |
@@ -714,6 +770,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) | |||
714 | mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " | 770 | mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " |
715 | "dead_node is %u\n", dlm->name, lr->node_idx, | 771 | "dead_node is %u\n", dlm->name, lr->node_idx, |
716 | lr->dead_node, dlm->reco.dead_node); | 772 | lr->dead_node, dlm->reco.dead_node); |
773 | dlm_print_reco_node_status(dlm); | ||
717 | /* this is a hack */ | 774 | /* this is a hack */ |
718 | dlm_put(dlm); | 775 | dlm_put(dlm); |
719 | return -ENOMEM; | 776 | return -ENOMEM; |
@@ -764,6 +821,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
764 | reco_master = item->u.ral.reco_master; | 821 | reco_master = item->u.ral.reco_master; |
765 | mres = (struct dlm_migratable_lockres *)data; | 822 | mres = (struct dlm_migratable_lockres *)data; |
766 | 823 | ||
824 | mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", | ||
825 | dlm->name, dead_node, reco_master); | ||
826 | |||
767 | if (dead_node != dlm->reco.dead_node || | 827 | if (dead_node != dlm->reco.dead_node || |
768 | reco_master != dlm->reco.new_master) { | 828 | reco_master != dlm->reco.new_master) { |
769 | /* show extra debug info if the recovery state is messed */ | 829 | /* show extra debug info if the recovery state is messed */ |
@@ -802,7 +862,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
802 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | 862 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, |
803 | DLM_MRES_RECOVERY); | 863 | DLM_MRES_RECOVERY); |
804 | if (ret < 0) { | 864 | if (ret < 0) { |
805 | mlog_errno(ret); | 865 | mlog(ML_ERROR, "%s: node %u went down while sending " |
866 | "recovery state for dead node %u, ret=%d\n", dlm->name, | ||
867 | reco_master, dead_node, ret); | ||
806 | skip_all_done = 1; | 868 | skip_all_done = 1; |
807 | break; | 869 | break; |
808 | } | 870 | } |
@@ -816,7 +878,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
816 | if (!skip_all_done) { | 878 | if (!skip_all_done) { |
817 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); | 879 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); |
818 | if (ret < 0) { | 880 | if (ret < 0) { |
819 | mlog_errno(ret); | 881 | mlog(ML_ERROR, "%s: node %u went down while sending " |
882 | "recovery all-done for dead node %u, ret=%d\n", | ||
883 | dlm->name, reco_master, dead_node, ret); | ||
820 | } | 884 | } |
821 | } | 885 | } |
822 | 886 | ||
@@ -865,7 +929,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) | |||
865 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " | 929 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " |
866 | "node_idx=%u, this node=%u\n", done->dead_node, | 930 | "node_idx=%u, this node=%u\n", done->dead_node, |
867 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | 931 | dlm->reco.dead_node, done->node_idx, dlm->node_num); |
868 | BUG_ON(done->dead_node != dlm->reco.dead_node); | 932 | |
933 | mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), | ||
934 | "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " | ||
935 | "node_idx=%u, this node=%u\n", done->dead_node, | ||
936 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | ||
869 | 937 | ||
870 | spin_lock(&dlm_reco_state_lock); | 938 | spin_lock(&dlm_reco_state_lock); |
871 | list_for_each(iter, &dlm->reco.node_data) { | 939 | list_for_each(iter, &dlm->reco.node_data) { |
@@ -2228,7 +2296,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | |||
2228 | 2296 | ||
2229 | mlog_entry("%u\n", dead_node); | 2297 | mlog_entry("%u\n", dead_node); |
2230 | 2298 | ||
2231 | mlog(0, "dead node is %u\n", dead_node); | 2299 | mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); |
2232 | 2300 | ||
2233 | spin_lock(&dlm->spinlock); | 2301 | spin_lock(&dlm->spinlock); |
2234 | dlm_node_iter_init(dlm->domain_map, &iter); | 2302 | dlm_node_iter_init(dlm->domain_map, &iter); |
@@ -2301,8 +2369,9 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2301 | if (!dlm_grab(dlm)) | 2369 | if (!dlm_grab(dlm)) |
2302 | return 0; | 2370 | return 0; |
2303 | 2371 | ||
2304 | mlog(0, "node %u wants to recover node %u\n", | 2372 | mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", |
2305 | br->node_idx, br->dead_node); | 2373 | dlm->name, br->node_idx, br->dead_node, |
2374 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2306 | 2375 | ||
2307 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); | 2376 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); |
2308 | 2377 | ||
@@ -2344,6 +2413,11 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2344 | spin_unlock(&dlm->spinlock); | 2413 | spin_unlock(&dlm->spinlock); |
2345 | 2414 | ||
2346 | dlm_kick_recovery_thread(dlm); | 2415 | dlm_kick_recovery_thread(dlm); |
2416 | |||
2417 | mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", | ||
2418 | dlm->name, br->node_idx, br->dead_node, | ||
2419 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2420 | |||
2347 | dlm_put(dlm); | 2421 | dlm_put(dlm); |
2348 | return 0; | 2422 | return 0; |
2349 | } | 2423 | } |
@@ -2401,8 +2475,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2401 | if (!dlm_grab(dlm)) | 2475 | if (!dlm_grab(dlm)) |
2402 | return 0; | 2476 | return 0; |
2403 | 2477 | ||
2404 | mlog(0, "node %u finalizing recovery of node %u\n", | 2478 | mlog(0, "%s: node %u finalizing recovery of node %u (%u:%u)\n", |
2405 | fr->node_idx, fr->dead_node); | 2479 | dlm->name, fr->node_idx, fr->dead_node, |
2480 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2406 | 2481 | ||
2407 | spin_lock(&dlm->spinlock); | 2482 | spin_lock(&dlm->spinlock); |
2408 | 2483 | ||
@@ -2426,6 +2501,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2426 | dlm_reset_recovery(dlm); | 2501 | dlm_reset_recovery(dlm); |
2427 | 2502 | ||
2428 | dlm_kick_recovery_thread(dlm); | 2503 | dlm_kick_recovery_thread(dlm); |
2504 | mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", | ||
2505 | dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); | ||
2506 | |||
2429 | dlm_put(dlm); | 2507 | dlm_put(dlm); |
2430 | return 0; | 2508 | return 0; |
2431 | } | 2509 | } |