about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ocfs2/dlm/dlmrecovery.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 102
1 files changed, 90 insertions, 12 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 59c8976915a9..81bd2400e221 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -239,6 +239,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
239 * 239 *
240 */ 240 */
241 241
242static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
243{
244 struct dlm_reco_node_data *ndata;
245 struct dlm_lock_resource *res;
246
247 mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
248 dlm->name, dlm->dlm_reco_thread_task->pid,
249 dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
250 dlm->reco.dead_node, dlm->reco.new_master);
251
252 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
253 char *st = "unknown";
254 switch (ndata->state) {
255 case DLM_RECO_NODE_DATA_INIT:
256 st = "init";
257 break;
258 case DLM_RECO_NODE_DATA_REQUESTING:
259 st = "requesting";
260 break;
261 case DLM_RECO_NODE_DATA_DEAD:
262 st = "dead";
263 break;
264 case DLM_RECO_NODE_DATA_RECEIVING:
265 st = "receiving";
266 break;
267 case DLM_RECO_NODE_DATA_REQUESTED:
268 st = "requested";
269 break;
270 case DLM_RECO_NODE_DATA_DONE:
271 st = "done";
272 break;
273 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
274 st = "finalize-sent";
275 break;
276 default:
277 st = "bad";
278 break;
279 }
280 mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
281 dlm->name, ndata->node_num, st);
282 }
283 list_for_each_entry(res, &dlm->reco.resources, recovering) {
284 mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
285 dlm->name, res->lockname.len, res->lockname.name);
286 }
287}
242 288
243#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) 289#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
244 290
@@ -385,7 +431,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
385 /* return to main thread loop and sleep. */ 431 /* return to main thread loop and sleep. */
386 return 0; 432 return 0;
387 } 433 }
388 mlog(0, "recovery thread found node %u in the recovery map!\n", 434 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
435 dlm->name, dlm->dlm_reco_thread_task->pid,
389 dlm->reco.dead_node); 436 dlm->reco.dead_node);
390 spin_unlock(&dlm->spinlock); 437 spin_unlock(&dlm->spinlock);
391 438
@@ -408,8 +455,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
408 } 455 }
409 mlog(0, "another node will master this recovery session.\n"); 456 mlog(0, "another node will master this recovery session.\n");
410 } 457 }
411 mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", 458 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
412 dlm->name, dlm->reco.new_master, 459 dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
413 dlm->node_num, dlm->reco.dead_node); 460 dlm->node_num, dlm->reco.dead_node);
414 461
415 /* it is safe to start everything back up here 462 /* it is safe to start everything back up here
@@ -421,7 +468,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
421 return 0; 468 return 0;
422 469
423master_here: 470master_here:
424 mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", 471 mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
472 dlm->dlm_reco_thread_task->pid,
425 dlm->name, dlm->reco.dead_node, dlm->node_num); 473 dlm->name, dlm->reco.dead_node, dlm->node_num);
426 474
427 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 475 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
@@ -563,11 +611,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
563 goto leave; 611 goto leave;
564 case DLM_RECO_NODE_DATA_RECEIVING: 612 case DLM_RECO_NODE_DATA_RECEIVING:
565 case DLM_RECO_NODE_DATA_REQUESTED: 613 case DLM_RECO_NODE_DATA_REQUESTED:
614 mlog(0, "%s: node %u still in state %s\n",
615 dlm->name, ndata->node_num,
616 ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
617 "receiving" : "requested");
566 all_nodes_done = 0; 618 all_nodes_done = 0;
567 break; 619 break;
568 case DLM_RECO_NODE_DATA_DONE: 620 case DLM_RECO_NODE_DATA_DONE:
621 mlog(0, "%s: node %u state is done\n",
622 dlm->name, ndata->node_num);
569 break; 623 break;
570 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 624 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
625 mlog(0, "%s: node %u state is finalize\n",
626 dlm->name, ndata->node_num);
571 break; 627 break;
572 } 628 }
573 } 629 }
@@ -714,6 +770,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
714 mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " 770 mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
715 "dead_node is %u\n", dlm->name, lr->node_idx, 771 "dead_node is %u\n", dlm->name, lr->node_idx,
716 lr->dead_node, dlm->reco.dead_node); 772 lr->dead_node, dlm->reco.dead_node);
773 dlm_print_reco_node_status(dlm);
717 /* this is a hack */ 774 /* this is a hack */
718 dlm_put(dlm); 775 dlm_put(dlm);
719 return -ENOMEM; 776 return -ENOMEM;
@@ -764,6 +821,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
764 reco_master = item->u.ral.reco_master; 821 reco_master = item->u.ral.reco_master;
765 mres = (struct dlm_migratable_lockres *)data; 822 mres = (struct dlm_migratable_lockres *)data;
766 823
824 mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
825 dlm->name, dead_node, reco_master);
826
767 if (dead_node != dlm->reco.dead_node || 827 if (dead_node != dlm->reco.dead_node ||
768 reco_master != dlm->reco.new_master) { 828 reco_master != dlm->reco.new_master) {
769 /* show extra debug info if the recovery state is messed */ 829 /* show extra debug info if the recovery state is messed */
@@ -802,7 +862,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
802 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 862 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
803 DLM_MRES_RECOVERY); 863 DLM_MRES_RECOVERY);
804 if (ret < 0) { 864 if (ret < 0) {
805 mlog_errno(ret); 865 mlog(ML_ERROR, "%s: node %u went down while sending "
866 "recovery state for dead node %u, ret=%d\n", dlm->name,
867 reco_master, dead_node, ret);
806 skip_all_done = 1; 868 skip_all_done = 1;
807 break; 869 break;
808 } 870 }
@@ -816,7 +878,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
816 if (!skip_all_done) { 878 if (!skip_all_done) {
817 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); 879 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
818 if (ret < 0) { 880 if (ret < 0) {
819 mlog_errno(ret); 881 mlog(ML_ERROR, "%s: node %u went down while sending "
882 "recovery all-done for dead node %u, ret=%d\n",
883 dlm->name, reco_master, dead_node, ret);
820 } 884 }
821 } 885 }
822 886
@@ -865,7 +929,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
865 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " 929 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
866 "node_idx=%u, this node=%u\n", done->dead_node, 930 "node_idx=%u, this node=%u\n", done->dead_node,
867 dlm->reco.dead_node, done->node_idx, dlm->node_num); 931 dlm->reco.dead_node, done->node_idx, dlm->node_num);
868 BUG_ON(done->dead_node != dlm->reco.dead_node); 932
933 mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
934 "Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
935 "node_idx=%u, this node=%u\n", done->dead_node,
936 dlm->reco.dead_node, done->node_idx, dlm->node_num);
869 937
870 spin_lock(&dlm_reco_state_lock); 938 spin_lock(&dlm_reco_state_lock);
871 list_for_each(iter, &dlm->reco.node_data) { 939 list_for_each(iter, &dlm->reco.node_data) {
@@ -2228,7 +2296,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2228 2296
2229 mlog_entry("%u\n", dead_node); 2297 mlog_entry("%u\n", dead_node);
2230 2298
2231 mlog(0, "dead node is %u\n", dead_node); 2299 mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
2232 2300
2233 spin_lock(&dlm->spinlock); 2301 spin_lock(&dlm->spinlock);
2234 dlm_node_iter_init(dlm->domain_map, &iter); 2302 dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2301,8 +2369,9 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2301 if (!dlm_grab(dlm)) 2369 if (!dlm_grab(dlm))
2302 return 0; 2370 return 0;
2303 2371
2304 mlog(0, "node %u wants to recover node %u\n", 2372 mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
2305 br->node_idx, br->dead_node); 2373 dlm->name, br->node_idx, br->dead_node,
2374 dlm->reco.dead_node, dlm->reco.new_master);
2306 2375
2307 dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); 2376 dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
2308 2377
@@ -2344,6 +2413,11 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2344 spin_unlock(&dlm->spinlock); 2413 spin_unlock(&dlm->spinlock);
2345 2414
2346 dlm_kick_recovery_thread(dlm); 2415 dlm_kick_recovery_thread(dlm);
2416
2417 mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
2418 dlm->name, br->node_idx, br->dead_node,
2419 dlm->reco.dead_node, dlm->reco.new_master);
2420
2347 dlm_put(dlm); 2421 dlm_put(dlm);
2348 return 0; 2422 return 0;
2349} 2423}
@@ -2401,8 +2475,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2401 if (!dlm_grab(dlm)) 2475 if (!dlm_grab(dlm))
2402 return 0; 2476 return 0;
2403 2477
2404 mlog(0, "node %u finalizing recovery of node %u\n", 2478 mlog(0, "%s: node %u finalizing recovery of node %u (%u:%u)\n",
2405 fr->node_idx, fr->dead_node); 2479 dlm->name, fr->node_idx, fr->dead_node,
2480 dlm->reco.dead_node, dlm->reco.new_master);
2406 2481
2407 spin_lock(&dlm->spinlock); 2482 spin_lock(&dlm->spinlock);
2408 2483
@@ -2426,6 +2501,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2426 dlm_reset_recovery(dlm); 2501 dlm_reset_recovery(dlm);
2427 2502
2428 dlm_kick_recovery_thread(dlm); 2503 dlm_kick_recovery_thread(dlm);
2504 mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
2505 dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
2506
2429 dlm_put(dlm); 2507 dlm_put(dlm);
2430 return 0; 2508 return 0;
2431} 2509}