diff options
author | David Teigland <teigland@redhat.com> | 2012-04-23 13:18:18 -0400 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2012-04-26 16:36:04 -0400 |
commit | 13ef11110fa2173b9d03e6616574914e12e2a90f (patch) | |
tree | b62224982191e5dd572aae6003f71c68cae0a7ea /fs | |
parent | 513ef596d43cc35a72ae21170075136855641493 (diff) |
dlm: fix waiter recovery
An outstanding remote operation (an lkb on the "waiter"
list) could sometimes miss being resent during recovery.
The decision was based on the lkb_nodeid field, which
could have changed during an earlier aborted recovery,
so it no longer represents the actual remote destination.
The lkb_wait_nodeid is always the actual remote node,
so it is the best value to use.
Signed-off-by: David Teigland <teigland@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/dlm/lock.c | 43 |
1 file changed, 31 insertions, 12 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 4c58d4a3adc4..3d35c593f4c1 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
@@ -4187,15 +4187,19 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, | |||
4187 | /* A waiting lkb needs recovery if the master node has failed, or | 4187 | /* A waiting lkb needs recovery if the master node has failed, or |
4188 | the master node is changing (only when no directory is used) */ | 4188 | the master node is changing (only when no directory is used) */ |
4189 | 4189 | ||
4190 | static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) | 4190 | static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, |
4191 | int dir_nodeid) | ||
4191 | { | 4192 | { |
4192 | if (dlm_is_removed(ls, lkb->lkb_nodeid)) | 4193 | if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) |
4193 | return 1; | 4194 | return 1; |
4194 | 4195 | ||
4195 | if (!dlm_no_directory(ls)) | 4196 | if (!dlm_no_directory(ls)) |
4196 | return 0; | 4197 | return 0; |
4197 | 4198 | ||
4198 | if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) | 4199 | if (dir_nodeid == dlm_our_nodeid()) |
4200 | return 1; | ||
4201 | |||
4202 | if (dir_nodeid != lkb->lkb_wait_nodeid) | ||
4199 | return 1; | 4203 | return 1; |
4200 | 4204 | ||
4201 | return 0; | 4205 | return 0; |
@@ -4212,6 +4216,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
4212 | struct dlm_lkb *lkb, *safe; | 4216 | struct dlm_lkb *lkb, *safe; |
4213 | struct dlm_message *ms_stub; | 4217 | struct dlm_message *ms_stub; |
4214 | int wait_type, stub_unlock_result, stub_cancel_result; | 4218 | int wait_type, stub_unlock_result, stub_cancel_result; |
4219 | int dir_nodeid; | ||
4215 | 4220 | ||
4216 | ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); | 4221 | ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); |
4217 | if (!ms_stub) { | 4222 | if (!ms_stub) { |
@@ -4223,13 +4228,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
4223 | 4228 | ||
4224 | list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { | 4229 | list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { |
4225 | 4230 | ||
4231 | dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); | ||
4232 | |||
4226 | /* exclude debug messages about unlocks because there can be so | 4233 | /* exclude debug messages about unlocks because there can be so |
4227 | many and they aren't very interesting */ | 4234 | many and they aren't very interesting */ |
4228 | 4235 | ||
4229 | if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { | 4236 | if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { |
4230 | log_debug(ls, "recover_waiter %x nodeid %d " | 4237 | log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " |
4231 | "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, | 4238 | "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", |
4232 | lkb->lkb_wait_type, lkb->lkb_wait_nodeid); | 4239 | lkb->lkb_id, |
4240 | lkb->lkb_remid, | ||
4241 | lkb->lkb_wait_type, | ||
4242 | lkb->lkb_resource->res_nodeid, | ||
4243 | lkb->lkb_nodeid, | ||
4244 | lkb->lkb_wait_nodeid, | ||
4245 | dir_nodeid); | ||
4233 | } | 4246 | } |
4234 | 4247 | ||
4235 | /* all outstanding lookups, regardless of destination will be | 4248 | /* all outstanding lookups, regardless of destination will be |
@@ -4240,7 +4253,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) | |||
4240 | continue; | 4253 | continue; |
4241 | } | 4254 | } |
4242 | 4255 | ||
4243 | if (!waiter_needs_recovery(ls, lkb)) | 4256 | if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) |
4244 | continue; | 4257 | continue; |
4245 | 4258 | ||
4246 | wait_type = lkb->lkb_wait_type; | 4259 | wait_type = lkb->lkb_wait_type; |
@@ -4373,8 +4386,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) | |||
4373 | ou = is_overlap_unlock(lkb); | 4386 | ou = is_overlap_unlock(lkb); |
4374 | err = 0; | 4387 | err = 0; |
4375 | 4388 | ||
4376 | log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", | 4389 | log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " |
4377 | lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); | 4390 | "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " |
4391 | "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype, | ||
4392 | r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid, | ||
4393 | dlm_dir_nodeid(r), oc, ou); | ||
4378 | 4394 | ||
4379 | /* At this point we assume that we won't get a reply to any | 4395 | /* At this point we assume that we won't get a reply to any |
4380 | previous op or overlap op on this lock. First, do a big | 4396 | previous op or overlap op on this lock. First, do a big |
@@ -4426,9 +4442,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) | |||
4426 | } | 4442 | } |
4427 | } | 4443 | } |
4428 | 4444 | ||
4429 | if (err) | 4445 | if (err) { |
4430 | log_error(ls, "recover_waiters_post %x %d %x %d %d", | 4446 | log_error(ls, "waiter %x msg %d r_nodeid %d " |
4431 | lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); | 4447 | "dir_nodeid %d overlap %d %d", |
4448 | lkb->lkb_id, mstype, r->res_nodeid, | ||
4449 | dlm_dir_nodeid(r), oc, ou); | ||
4450 | } | ||
4432 | unlock_rsb(r); | 4451 | unlock_rsb(r); |
4433 | put_rsb(r); | 4452 | put_rsb(r); |
4434 | dlm_put_lkb(lkb); | 4453 | dlm_put_lkb(lkb); |