author     | David Teigland <teigland@redhat.com> | 2007-09-27 16:53:38 -0400
committer  | Steven Whitehouse <swhiteho@redhat.com> | 2007-10-10 03:56:38 -0400
commit     | c36258b5925e6cf6bf72904635100593573bfcff (patch)
tree       | 565f1ce29a7f8a2cd1c25f2d36c932727adbdbc2 /fs/dlm/member.c
parent     | b434eda6fda5bcdcc2dd918e5ffbf7184f2d4e17 (diff)
[DLM] block dlm_recv in recovery transition
Introduce a per-lockspace rwsem that's held in read mode by dlm_recv
threads while working in the dlm. This allows dlm_recv activity to be
suspended when the lockspace transitions to, from and between recovery
cycles.
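
The read side of this rwsem lives in the dlm_recv message path, which is outside member.c and therefore not visible in the diff below. As a rough illustration of the pattern only, here is a minimal kernel-style sketch; apart from ls_recv_active and the rwsem primitives, which come from the patch, every name is hypothetical:

#include <linux/rwsem.h>

/* Hypothetical stand-in for the lockspace; only ls_recv_active mirrors the
   field this patch adds to struct dlm_ls (init_rwsem() at creation omitted). */
struct example_lockspace {
	struct rw_semaphore ls_recv_active;
};

/* dlm_recv side: hold the rwsem shared while one message is handled, so a
   recovery transition cannot begin in the middle of it. */
static void example_receive(struct example_lockspace *ls, void *msg)
{
	down_read(&ls->ls_recv_active);
	/* ... process the message, or save it for later ... */
	up_read(&ls->ls_recv_active);
}

/* Recovery-transition side: taking the rwsem exclusively waits for any
   in-flight message and keeps new ones out until the transition is done. */
static void example_suspend_recv(struct example_lockspace *ls)
{
	down_write(&ls->ls_recv_active);
	/* ... abort/clean up the old recovery, switch modes ... */
	up_write(&ls->ls_recv_active);
}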
The specific bug prompting this change is one where an in-progress
recovery cycle is aborted by a new recovery cycle. While dlm_recv was
processing a recovery message, the recovery cycle was aborted and
dlm_recoverd began cleaning up. dlm_recv decremented recover_locks_count
on an rsb after dlm_recoverd had reset it to zero. This is fixed by
suspending dlm_recv (taking write lock on the rwsem) before aborting the
current recovery.
The transitions to/from normal and recovery modes are simplified by using
this new ability to block dlm_recv. The switch from normal to recovery
mode means dlm_recv goes from processing locking messages to saving them
for later, and vice versa. Races are avoided by blocking dlm_recv when
setting the flag that switches between modes.
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
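
To make the mode switch described above concrete, here is a hedged sketch of why checking the mode and queueing a message is race-free while dlm_recv holds the rwsem in read mode. Only ls_recv_active and the rwsem calls appear in the patch; the flag, list, and function names below are placeholders (the dlm tests its real flag through dlm_locking_stopped(), which the diff's comments reference):

#include <linux/list.h>
#include <linux/rwsem.h>

/* Same placeholder lockspace as in the earlier sketch, with the extra fields
   this sketch needs; assume both are initialised at lockspace creation. */
struct example_lockspace {
	struct rw_semaphore ls_recv_active;
	int running;			/* placeholder for the RUNNING flag */
	struct list_head requestqueue;	/* messages saved for later replay */
};

/* Receive path, called with ls_recv_active held in read mode (see the
   earlier sketch): the mode cannot flip between the test and the queueing. */
static void example_receive_message(struct example_lockspace *ls,
				    struct list_head *msg)
{
	if (ls->running) {
		/* ... process the locking message now ... */
	} else {
		/* locking is stopped: save the message on the requestqueue */
		list_add_tail(msg, &ls->requestqueue);
	}
}

/* Stop path, mirroring the dlm_ls_stop() change in the diff below: flip the
   mode with dlm_recv excluded, then let it run again in "save" mode. */
static void example_stop(struct example_lockspace *ls)
{
	down_write(&ls->ls_recv_active);
	ls->running = 0;
	up_write(&ls->ls_recv_active);
}

Taking the write lock before the old recovery is aborted is also what closes the recover_locks_count race described above: dlm_recv can no longer be partway through a recovery message when dlm_recoverd resets that count.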
Diffstat (limited to 'fs/dlm/member.c')
-rw-r--r-- | fs/dlm/member.c | 41
1 file changed, 27 insertions(+), 14 deletions(-)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index d09977528f69..e9cdcab306e2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -18,10 +18,6 @@
 #include "rcom.h"
 #include "config.h"
 
-/*
- * Following called by dlm_recoverd thread
- */
-
 static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
 {
 	struct dlm_member *memb = NULL;
@@ -250,18 +246,30 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 	return error;
 }
 
-/*
- * Following called from lockspace.c
- */
+/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
+   dlm_ls_start() is called on any of them to start the new recovery. */
 
 int dlm_ls_stop(struct dlm_ls *ls)
 {
 	int new;
 
 	/*
-	 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
-	 * dlm_recovery_stopped()) and prevents any new locks from being
-	 * processed (see RUNNING, dlm_locking_stopped()).
+	 * Prevent dlm_recv from being in the middle of something when we do
+	 * the stop. This includes ensuring dlm_recv isn't processing a
+	 * recovery message (rcom), while dlm_recoverd is aborting and
+	 * resetting things from an in-progress recovery. i.e. we want
+	 * dlm_recoverd to abort its recovery without worrying about dlm_recv
+	 * processing an rcom at the same time. Stopping dlm_recv also makes
+	 * it easy for dlm_receive_message() to check locking stopped and add a
+	 * message to the requestqueue without races.
+	 */
+
+	down_write(&ls->ls_recv_active);
+
+	/*
+	 * Abort any recovery that's in progress (see RECOVERY_STOP,
+	 * dlm_recovery_stopped()) and tell any other threads running in the
+	 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
 	 */
 
 	spin_lock(&ls->ls_recover_lock);
@@ -271,8 +279,14 @@ int dlm_ls_stop(struct dlm_ls *ls)
 	spin_unlock(&ls->ls_recover_lock);
 
 	/*
+	 * Let dlm_recv run again, now any normal messages will be saved on the
+	 * requestqueue for later.
+	 */
+
+	up_write(&ls->ls_recv_active);
+
+	/*
 	 * This in_recovery lock does two things:
-	 *
 	 * 1) Keeps this function from returning until all threads are out
 	 *    of locking routines and locking is truely stopped.
 	 * 2) Keeps any new requests from being processed until it's unlocked
@@ -284,9 +298,8 @@ int dlm_ls_stop(struct dlm_ls *ls)
 
 	/*
 	 * The recoverd suspend/resume makes sure that dlm_recoverd (if
-	 * running) has noticed the clearing of RUNNING above and quit
-	 * processing the previous recovery. This will be true for all nodes
-	 * before any nodes start the new recovery.
+	 * running) has noticed RECOVERY_STOP above and quit processing the
+	 * previous recovery.
 	 */
 
 	dlm_recoverd_suspend(ls);