summary | refs | log | tree | commit | diff | stats
path: root/fs/ocfs2/dlm
diff options
context:
space:
mode:
author: piaojun <piaojun@huawei.com> 2018-04-05 19:19:11 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-04-06 00:36:22 -0400
commit: 60c7ec9ee4a3410c2cb08850102d363c7e207f48 (patch)
tree: 77b1cff7c9c386e7e338c07a1ad66ea249ab0d63 /fs/ocfs2/dlm
parent: a43d24cb3b0b395de8bb176355a907a9c9a1c42e (diff)
ocfs2/dlm: wait for dlm recovery done when migrating all lock resources
Wait for dlm recovery to finish when migrating all lock resources, in case a new lock resource is left behind after leaving the dlm domain. Such a leftover lock resource will cause other nodes to BUG.

Sequence of events across NodeA, NodeB and NodeC:

NodeA: umount:
  dlm_unregister_domain()
    dlm_migrate_all_locks()

NodeB: NodeB down

NodeA: do recovery for NodeB and collect a new lockres from other live nodes:
  dlm_do_recovery
    dlm_remaster_locks
      dlm_request_all_locks:

  dlm_mig_lockres_handler
    dlm_new_lockres
      __dlm_insert_lockres

NodeA: at last NodeA becomes the master of the new lockres and leaves the domain:
  dlm_leave_domain()

NodeC: mount:
  dlm_join_domain()

NodeC: touches a file and requests the owner of the new lockres, but all the other nodes say 'NO', so NodeC decides to be the owner and sends an assert master msg to the other nodes:
  dlmlock()
    dlm_get_lock_resource()
      dlm_do_assert_master()

The other nodes receive the msg and find that two masters exist, which finally causes a BUG in dlm_assert_master_handler() --> BUG();

Link: http://lkml.kernel.org/r/5AAA6E25.7090303@huawei.com
Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Alex Chen <alex.chen@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Acked-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/ocfs2/dlm')
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h    1
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c    15
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c  13
3 files changed, 26 insertions, 3 deletions
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 953c200e1c30..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 u8 node_num; 140 u8 node_num;
141 u32 key; 141 u32 key;
142 u8 joining_node; 142 u8 joining_node;
143 u8 migrate_done; /* set to 1 means node has migrated all lock resources */
143 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
144 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
145 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 25b76f0d082b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 } 463 }
464
465 if (!num) {
466 if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
467 mlog(0, "%s: perhaps there are more lock resources "
468 "need to be migrated after dlm recovery\n", dlm->name);
469 ret = -EAGAIN;
470 } else {
471 mlog(0, "%s: we won't do dlm recovery after migrating "
472 "all lock resources\n", dlm->name);
473 dlm->migrate_done = 1;
474 }
475 }
476
464 spin_unlock(&dlm->spinlock); 477 spin_unlock(&dlm->spinlock);
465 wake_up(&dlm->dlm_thread_wq); 478 wake_up(&dlm->dlm_thread_wq);
466 479
@@ -2038,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2038 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; 2051 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
2039 init_waitqueue_head(&dlm->dlm_join_events); 2052 init_waitqueue_head(&dlm->dlm_join_events);
2040 2053
2054 dlm->migrate_done = 0;
2055
2041 dlm->reco.new_master = O2NM_INVALID_NODE_NUM; 2056 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
2042 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; 2057 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
2043 2058
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 86204b81ef34..b454eb371b77 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
423 423
424static void dlm_begin_recovery(struct dlm_ctxt *dlm) 424static void dlm_begin_recovery(struct dlm_ctxt *dlm)
425{ 425{
426 spin_lock(&dlm->spinlock); 426 assert_spin_locked(&dlm->spinlock);
427 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 427 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
428 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", 428 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
429 dlm->name, dlm->reco.dead_node); 429 dlm->name, dlm->reco.dead_node);
430 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 430 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
431 spin_unlock(&dlm->spinlock);
432} 431}
433 432
434static void dlm_end_recovery(struct dlm_ctxt *dlm) 433static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
456 455
457 spin_lock(&dlm->spinlock); 456 spin_lock(&dlm->spinlock);
458 457
458 if (dlm->migrate_done) {
459 mlog(0, "%s: no need do recovery after migrating all "
460 "lock resources\n", dlm->name);
461 spin_unlock(&dlm->spinlock);
462 return 0;
463 }
464
459 /* check to see if the new master has died */ 465 /* check to see if the new master has died */
460 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && 466 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
461 test_bit(dlm->reco.new_master, dlm->recovery_map)) { 467 test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
490 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", 496 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
491 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), 497 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
492 dlm->reco.dead_node); 498 dlm->reco.dead_node);
493 spin_unlock(&dlm->spinlock);
494 499
495 /* take write barrier */ 500 /* take write barrier */
496 /* (stops the list reshuffling thread, proxy ast handling) */ 501 /* (stops the list reshuffling thread, proxy ast handling) */
497 dlm_begin_recovery(dlm); 502 dlm_begin_recovery(dlm);
498 503
504 spin_unlock(&dlm->spinlock);
505
499 if (dlm->reco.new_master == dlm->node_num) 506 if (dlm->reco.new_master == dlm->node_num)
500 goto master_here; 507 goto master_here;
501 508