aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorWengang Wang <wen.gang.wang@oracle.com>2010-07-30 04:14:44 -0400
committerJoel Becker <joel.becker@oracle.com>2010-08-07 13:49:41 -0400
commita524812b7eaa7783d7811198921100f079034e61 (patch)
treecbab978f01806cf71b20249519157a4044223c82 /fs
parent845b6cf34150100deb5f58c8a37a372b111f2918 (diff)
ocfs2/dlm: avoid incorrect bit set in refmap on recovery master
In the following situation, there remains an incorrect bit in refmap on the recovery master. Finally the recovery master will fail at purging the lockres due to the incorrect bit in refmap. 1) node A has no interest on lockres A any longer, so it is purging it. 2) the owner of lockres A is node B, so node A is sending de-ref message to node B. 3) at this time, node B crashed. node C becomes the recovery master. it recovers lockres A(because the master is the dead node B). 4) node A migrated lockres A to node C with a refbit there. 5) node A failed to send de-ref message to node B because it crashed. The failure is ignored. no other action is done for lockres A any more. For mormal, re-send the deref message to it to recovery master can fix it. Well, ignoring the failure of deref to the original master and not recovering the lockres to recovery master has the same effect. And the later is simpler. Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Acked-by: Srinivas Eeda <srinivas.eeda@oracle.com> Cc: stable@kernel.org Signed-off-by: Joel Becker <joel.becker@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c34
2 files changed, 31 insertions, 25 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36d..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1997 struct list_head *queue; 1997 struct list_head *queue;
1998 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1999 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
2000 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
2001 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
2002 mlog(0, 2004 mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2326 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2327 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2328 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2329 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2330 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2331 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2332 "dropping its ref when it died. " 2334 res->lockname.len,
2333 "continue, dropping the flag.\n", 2335 res->lockname.name);
2334 dlm->name, res->lockname.len, 2336 } else
2335 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2336 2338 res);
2337 /* the wake_up for this will happen when the
2338 * RECOVERING flag is dropped later */
2339 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2340 2339
2341 dlm_move_lockres_to_recovery_list(dlm, res);
2342 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2343 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2344 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index dd78ca3f360f..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118