about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author David Teigland <teigland@redhat.com> 2012-04-23 13:18:18 -0400
committer David Teigland <teigland@redhat.com> 2012-04-26 16:36:04 -0400
commit 13ef11110fa2173b9d03e6616574914e12e2a90f (patch)
tree b62224982191e5dd572aae6003f71c68cae0a7ea /fs
parent 513ef596d43cc35a72ae21170075136855641493 (diff)
dlm: fix waiter recovery
An outstanding remote operation (an lkb on the "waiter" list) could sometimes miss being resent during recovery. The decision was based on the lkb_nodeid field, which could have changed during an earlier aborted recovery, so it no longer represents the actual remote destination. The lkb_wait_nodeid is always the actual remote node, so it is the best value to use.

Signed-off-by: David Teigland <teigland@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/dlm/lock.c 43
1 files changed, 31 insertions, 12 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 4c58d4a3adc4..3d35c593f4c1 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4187,15 +4187,19 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
4187/* A waiting lkb needs recovery if the master node has failed, or 4187/* A waiting lkb needs recovery if the master node has failed, or
4188 the master node is changing (only when no directory is used) */ 4188 the master node is changing (only when no directory is used) */
4189 4189
4190static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) 4190static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
4191 int dir_nodeid)
4191{ 4192{
4192 if (dlm_is_removed(ls, lkb->lkb_nodeid)) 4193 if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
4193 return 1; 4194 return 1;
4194 4195
4195 if (!dlm_no_directory(ls)) 4196 if (!dlm_no_directory(ls))
4196 return 0; 4197 return 0;
4197 4198
4198 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) 4199 if (dir_nodeid == dlm_our_nodeid())
4200 return 1;
4201
4202 if (dir_nodeid != lkb->lkb_wait_nodeid)
4199 return 1; 4203 return 1;
4200 4204
4201 return 0; 4205 return 0;
@@ -4212,6 +4216,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4212 struct dlm_lkb *lkb, *safe; 4216 struct dlm_lkb *lkb, *safe;
4213 struct dlm_message *ms_stub; 4217 struct dlm_message *ms_stub;
4214 int wait_type, stub_unlock_result, stub_cancel_result; 4218 int wait_type, stub_unlock_result, stub_cancel_result;
4219 int dir_nodeid;
4215 4220
4216 ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); 4221 ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
4217 if (!ms_stub) { 4222 if (!ms_stub) {
@@ -4223,13 +4228,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4223 4228
4224 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { 4229 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
4225 4230
4231 dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
4232
4226 /* exclude debug messages about unlocks because there can be so 4233 /* exclude debug messages about unlocks because there can be so
4227 many and they aren't very interesting */ 4234 many and they aren't very interesting */
4228 4235
4229 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { 4236 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
4230 log_debug(ls, "recover_waiter %x nodeid %d " 4237 log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
4231 "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, 4238 "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
4232 lkb->lkb_wait_type, lkb->lkb_wait_nodeid); 4239 lkb->lkb_id,
4240 lkb->lkb_remid,
4241 lkb->lkb_wait_type,
4242 lkb->lkb_resource->res_nodeid,
4243 lkb->lkb_nodeid,
4244 lkb->lkb_wait_nodeid,
4245 dir_nodeid);
4233 } 4246 }
4234 4247
4235 /* all outstanding lookups, regardless of destination will be 4248 /* all outstanding lookups, regardless of destination will be
@@ -4240,7 +4253,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
4240 continue; 4253 continue;
4241 } 4254 }
4242 4255
4243 if (!waiter_needs_recovery(ls, lkb)) 4256 if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
4244 continue; 4257 continue;
4245 4258
4246 wait_type = lkb->lkb_wait_type; 4259 wait_type = lkb->lkb_wait_type;
@@ -4373,8 +4386,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
4373 ou = is_overlap_unlock(lkb); 4386 ou = is_overlap_unlock(lkb);
4374 err = 0; 4387 err = 0;
4375 4388
4376 log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", 4389 log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
4377 lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); 4390 "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
4391 "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
4392 r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
4393 dlm_dir_nodeid(r), oc, ou);
4378 4394
4379 /* At this point we assume that we won't get a reply to any 4395 /* At this point we assume that we won't get a reply to any
4380 previous op or overlap op on this lock. First, do a big 4396 previous op or overlap op on this lock. First, do a big
@@ -4426,9 +4442,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
4426 } 4442 }
4427 } 4443 }
4428 4444
4429 if (err) 4445 if (err) {
4430 log_error(ls, "recover_waiters_post %x %d %x %d %d", 4446 log_error(ls, "waiter %x msg %d r_nodeid %d "
4431 lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); 4447 "dir_nodeid %d overlap %d %d",
4448 lkb->lkb_id, mstype, r->res_nodeid,
4449 dlm_dir_nodeid(r), oc, ou);
4450 }
4432 unlock_rsb(r); 4451 unlock_rsb(r);
4433 put_rsb(r); 4452 put_rsb(r);
4434 dlm_put_lkb(lkb); 4453 dlm_put_lkb(lkb);