Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
 fs/ocfs2/dlm/dlmrecovery.c | 2132
 1 file changed, 2132 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000000..0c8eb1093f00
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmrecovery.c | ||
5 | * | ||
6 | * recovery stuff | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/timer.h> | ||
41 | #include <linux/kthread.h> | ||
42 | |||
43 | |||
44 | #include "cluster/heartbeat.h" | ||
45 | #include "cluster/nodemanager.h" | ||
46 | #include "cluster/tcp.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | #include "dlmdomain.h" | ||
51 | |||
52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) | ||
53 | #include "cluster/masklog.h" | ||
54 | |||
55 | static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); | ||
56 | |||
57 | static int dlm_recovery_thread(void *data); | ||
58 | void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); | ||
59 | int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); | ||
60 | static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); | ||
61 | static int dlm_do_recovery(struct dlm_ctxt *dlm); | ||
62 | |||
63 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); | ||
64 | static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); | ||
65 | static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); | ||
66 | static int dlm_request_all_locks(struct dlm_ctxt *dlm, | ||
67 | u8 request_from, u8 dead_node); | ||
68 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); | ||
69 | |||
70 | static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); | ||
71 | static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | ||
72 | const char *lockname, int namelen, | ||
73 | int total_locks, u64 cookie, | ||
74 | u8 flags, u8 master); | ||
75 | static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | ||
76 | struct dlm_migratable_lockres *mres, | ||
77 | u8 send_to, | ||
78 | struct dlm_lock_resource *res, | ||
79 | int total_locks); | ||
80 | static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, | ||
81 | struct dlm_lock_resource *res, | ||
82 | u8 *real_master); | ||
83 | static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | ||
84 | struct dlm_lock_resource *res, | ||
85 | struct dlm_migratable_lockres *mres); | ||
86 | static int dlm_do_master_requery(struct dlm_ctxt *dlm, | ||
87 | struct dlm_lock_resource *res, | ||
88 | u8 nodenum, u8 *real_master); | ||
89 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); | ||
90 | static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, | ||
91 | u8 dead_node, u8 send_to); | ||
92 | static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); | ||
93 | static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | ||
94 | struct list_head *list, u8 dead_node); | ||
95 | static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | ||
96 | u8 dead_node, u8 new_master); | ||
97 | static void dlm_reco_ast(void *astdata); | ||
98 | static void dlm_reco_bast(void *astdata, int blocked_type); | ||
99 | static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); | ||
100 | static void dlm_request_all_locks_worker(struct dlm_work_item *item, | ||
101 | void *data); | ||
102 | static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); | ||
103 | |||
104 | static u64 dlm_get_next_mig_cookie(void); | ||
105 | |||
106 | static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; | ||
107 | static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; | ||
108 | static u64 dlm_mig_cookie = 1; | ||
109 | |||
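/* Migration cookies tie together the several messages needed when a single
 * lockres carries more locks than fit in one dlm_migratable_lockres page
 * (a summary inferred from dlm_send_one_lockres below).  The counter starts
 * at 1 and wraps back to 1, so 0 is never handed out. */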
110 | static u64 dlm_get_next_mig_cookie(void) | ||
111 | { | ||
112 | u64 c; | ||
113 | spin_lock(&dlm_mig_cookie_lock); | ||
114 | c = dlm_mig_cookie; | ||
115 | if (dlm_mig_cookie == (~0ULL)) | ||
116 | dlm_mig_cookie = 1; | ||
117 | else | ||
118 | dlm_mig_cookie++; | ||
119 | spin_unlock(&dlm_mig_cookie_lock); | ||
120 | return c; | ||
121 | } | ||
122 | |||
123 | static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) | ||
124 | { | ||
125 | spin_lock(&dlm->spinlock); | ||
126 | clear_bit(dlm->reco.dead_node, dlm->recovery_map); | ||
127 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
128 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
129 | spin_unlock(&dlm->spinlock); | ||
130 | } | ||
131 | |||
132 | /* Worker function used during recovery. */ | ||
133 | void dlm_dispatch_work(void *data) | ||
134 | { | ||
135 | struct dlm_ctxt *dlm = (struct dlm_ctxt *)data; | ||
136 | LIST_HEAD(tmp_list); | ||
137 | struct list_head *iter, *iter2; | ||
138 | struct dlm_work_item *item; | ||
139 | dlm_workfunc_t *workfunc; | ||
140 | |||
141 | spin_lock(&dlm->work_lock); | ||
142 | list_splice_init(&dlm->work_list, &tmp_list); | ||
143 | spin_unlock(&dlm->work_lock); | ||
144 | |||
145 | list_for_each_safe(iter, iter2, &tmp_list) { | ||
146 | item = list_entry(iter, struct dlm_work_item, list); | ||
147 | workfunc = item->func; | ||
148 | list_del_init(&item->list); | ||
149 | |||
150 | /* already have ref on dlm to avoid having | ||
151 | * it disappear. just double-check. */ | ||
152 | BUG_ON(item->dlm != dlm); | ||
153 | |||
154 | /* this is allowed to sleep and | ||
155 | * call network stuff */ | ||
156 | workfunc(item, item->data); | ||
157 | |||
158 | dlm_put(dlm); | ||
159 | kfree(item); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * RECOVERY THREAD | ||
165 | */ | ||
166 | |||
167 | static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) | ||
168 | { | ||
169 | /* wake the recovery thread | ||
170 | * this will wake the reco thread in one of three places | ||
171 | * 1) sleeping with no recovery happening | ||
172 | * 2) sleeping with recovery mastered elsewhere | ||
173 | * 3) recovery mastered here, waiting on reco data */ | ||
174 | |||
175 | wake_up(&dlm->dlm_reco_thread_wq); | ||
176 | } | ||
177 | |||
178 | /* Launch the recovery thread */ | ||
179 | int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) | ||
180 | { | ||
181 | mlog(0, "starting dlm recovery thread...\n"); | ||
182 | |||
183 | dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, | ||
184 | "dlm_reco_thread"); | ||
185 | if (IS_ERR(dlm->dlm_reco_thread_task)) { | ||
186 | mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); | ||
187 | dlm->dlm_reco_thread_task = NULL; | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | |||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) | ||
195 | { | ||
196 | if (dlm->dlm_reco_thread_task) { | ||
197 | mlog(0, "waiting for dlm recovery thread to exit\n"); | ||
198 | kthread_stop(dlm->dlm_reco_thread_task); | ||
199 | dlm->dlm_reco_thread_task = NULL; | ||
200 | } | ||
201 | } | ||
202 | |||
203 | |||
204 | |||
205 | /* | ||
206 | * this is lame, but here's how recovery works... | ||
207 | * 1) all recovery threads cluster wide will work on recovering | ||
208 | * ONE node at a time | ||
209 | * 2) negotiate who will take over all the locks for the dead node. | ||
210 | * that's right... ALL the locks. | ||
211 | * 3) once a new master is chosen, everyone scans all locks | ||
212 | * and moves aside those mastered by the dead guy | ||
213 | * 4) each of these locks should be locked until recovery is done | ||
214 | * 5) the new master collects up all of secondary lock queue info | ||
215 | * one lock at a time, forcing each node to communicate back | ||
216 | * before continuing | ||
217 | * 6) each secondary lock queue responds with the full known lock info | ||
218 | * 7) once the new master has run all its locks, it sends an ALLDONE! | ||
219 | * message to everyone | ||
220 | * 8) upon receiving this message, the secondary queue node unlocks | ||
221 | * and responds to the ALLDONE | ||
222 | * 9) once the new master gets responses from everyone, he unlocks | ||
223 | * everything and recovery for this dead node is done | ||
224 | *10) go back to 2) while there are still dead nodes | ||
225 | * | ||
226 | */ | ||
227 | */ | ||
228 | |||
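/* Rough per-node state flow for the recovery master, as used by
 * dlm_remaster_locks() and the handlers below (a sketch, not exhaustive;
 * DLM_RECO_NODE_DATA_ prefixes dropped):
 *
 *	INIT -> REQUESTING -> REQUESTED -> RECEIVING -> DONE -> FINALIZE_SENT
 *	          \
 *	           -> DEAD  (requestee died: destroy the area, retry with -EAGAIN)
 */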
229 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) | ||
230 | |||
231 | static int dlm_recovery_thread(void *data) | ||
232 | { | ||
233 | int status; | ||
234 | struct dlm_ctxt *dlm = data; | ||
235 | unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); | ||
236 | |||
237 | mlog(0, "dlm thread running for %s...\n", dlm->name); | ||
238 | |||
239 | while (!kthread_should_stop()) { | ||
240 | if (dlm_joined(dlm)) { | ||
241 | status = dlm_do_recovery(dlm); | ||
242 | if (status == -EAGAIN) { | ||
243 | /* do not sleep, recheck immediately. */ | ||
244 | continue; | ||
245 | } | ||
246 | if (status < 0) | ||
247 | mlog_errno(status); | ||
248 | } | ||
249 | |||
250 | wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, | ||
251 | kthread_should_stop(), | ||
252 | timeout); | ||
253 | } | ||
254 | |||
255 | mlog(0, "quitting DLM recovery thread\n"); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | ||
260 | * block on the dlm->reco.event when recovery is in progress. | ||
261 | * the dlm recovery thread will set this state when it begins | ||
262 | * recovering a dead node (as the new master or not) and clear | ||
263 | * the state and wake as soon as all affected lock resources have | ||
264 | * been marked with the RECOVERY flag */ | ||
265 | static int dlm_in_recovery(struct dlm_ctxt *dlm) | ||
266 | { | ||
267 | int in_recovery; | ||
268 | spin_lock(&dlm->spinlock); | ||
269 | in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | ||
270 | spin_unlock(&dlm->spinlock); | ||
271 | return in_recovery; | ||
272 | } | ||
273 | |||
274 | |||
275 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm) | ||
276 | { | ||
277 | wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); | ||
278 | } | ||
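/* Typical use from the top-level API paths (a sketch; the callers,
 * dlmlock/dlmunlock, live in other files):
 *
 *	dlm_wait_for_recovery(dlm);	   blocks while DLM_RECO_STATE_ACTIVE is set
 *	... then issue the lock/unlock request ...
 */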
279 | |||
280 | static void dlm_begin_recovery(struct dlm_ctxt *dlm) | ||
281 | { | ||
282 | spin_lock(&dlm->spinlock); | ||
283 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | ||
284 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; | ||
285 | spin_unlock(&dlm->spinlock); | ||
286 | } | ||
287 | |||
288 | static void dlm_end_recovery(struct dlm_ctxt *dlm) | ||
289 | { | ||
290 | spin_lock(&dlm->spinlock); | ||
291 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); | ||
292 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; | ||
293 | spin_unlock(&dlm->spinlock); | ||
294 | wake_up(&dlm->reco.event); | ||
295 | } | ||
296 | |||
297 | static int dlm_do_recovery(struct dlm_ctxt *dlm) | ||
298 | { | ||
299 | int status = 0; | ||
300 | |||
301 | spin_lock(&dlm->spinlock); | ||
302 | |||
303 | /* check to see if the new master has died */ | ||
304 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && | ||
305 | test_bit(dlm->reco.new_master, dlm->recovery_map)) { | ||
306 | mlog(0, "new master %u died while recovering %u!\n", | ||
307 | dlm->reco.new_master, dlm->reco.dead_node); | ||
308 | /* unset the new_master, leave dead_node */ | ||
309 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
310 | } | ||
311 | |||
312 | /* select a target to recover */ | ||
313 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | ||
314 | int bit; | ||
315 | |||
316 | bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); | ||
317 | if (bit >= O2NM_MAX_NODES || bit < 0) | ||
318 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
319 | else | ||
320 | dlm->reco.dead_node = bit; | ||
321 | } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { | ||
322 | /* BUG? */ | ||
323 | mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", | ||
324 | dlm->reco.dead_node); | ||
325 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
326 | } | ||
327 | |||
328 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | ||
329 | // mlog(0, "nothing to recover! sleeping now!\n"); | ||
330 | spin_unlock(&dlm->spinlock); | ||
331 | /* return to main thread loop and sleep. */ | ||
332 | return 0; | ||
333 | } | ||
334 | mlog(0, "recovery thread found node %u in the recovery map!\n", | ||
335 | dlm->reco.dead_node); | ||
336 | spin_unlock(&dlm->spinlock); | ||
337 | |||
338 | /* take write barrier */ | ||
339 | /* (stops the list reshuffling thread, proxy ast handling) */ | ||
340 | dlm_begin_recovery(dlm); | ||
341 | |||
342 | if (dlm->reco.new_master == dlm->node_num) | ||
343 | goto master_here; | ||
344 | |||
345 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { | ||
346 | /* choose a new master */ | ||
347 | if (!dlm_pick_recovery_master(dlm)) { | ||
348 | /* already notified everyone. go. */ | ||
349 | dlm->reco.new_master = dlm->node_num; | ||
350 | goto master_here; | ||
351 | } | ||
352 | mlog(0, "another node will master this recovery session.\n"); | ||
353 | } | ||
354 | mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", | ||
355 | dlm->name, dlm->reco.new_master, | ||
356 | dlm->node_num, dlm->reco.dead_node); | ||
357 | |||
358 | /* it is safe to start everything back up here | ||
359 | * because all of the dead node's lock resources | ||
360 | * have been marked as in-recovery */ | ||
361 | dlm_end_recovery(dlm); | ||
362 | |||
363 | /* sleep out in main dlm_recovery_thread loop. */ | ||
364 | return 0; | ||
365 | |||
366 | master_here: | ||
367 | mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", | ||
368 | dlm->name, dlm->reco.dead_node, dlm->node_num); | ||
369 | |||
370 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | ||
371 | if (status < 0) { | ||
372 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | ||
373 | "retrying.\n", status, dlm->reco.dead_node); | ||
374 | } else { | ||
375 | /* success! see if any other nodes need recovery */ | ||
376 | dlm_reset_recovery(dlm); | ||
377 | } | ||
378 | dlm_end_recovery(dlm); | ||
379 | |||
380 | /* continue and look for another dead node */ | ||
381 | return -EAGAIN; | ||
382 | } | ||
383 | |||
384 | static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | ||
385 | { | ||
386 | int status = 0; | ||
387 | struct dlm_reco_node_data *ndata; | ||
388 | struct list_head *iter; | ||
389 | int all_nodes_done; | ||
390 | int destroy = 0; | ||
391 | int pass = 0; | ||
392 | |||
393 | status = dlm_init_recovery_area(dlm, dead_node); | ||
394 | if (status < 0) | ||
395 | goto leave; | ||
396 | |||
397 | /* safe to access the node data list without a lock, since this | ||
398 | * process is the only one to change the list */ | ||
399 | list_for_each(iter, &dlm->reco.node_data) { | ||
400 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
401 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); | ||
402 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; | ||
403 | |||
404 | mlog(0, "requesting lock info from node %u\n", | ||
405 | ndata->node_num); | ||
406 | |||
407 | if (ndata->node_num == dlm->node_num) { | ||
408 | ndata->state = DLM_RECO_NODE_DATA_DONE; | ||
409 | continue; | ||
410 | } | ||
411 | |||
412 | status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); | ||
413 | if (status < 0) { | ||
414 | mlog_errno(status); | ||
415 | if (dlm_is_host_down(status)) | ||
416 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | ||
417 | else { | ||
418 | destroy = 1; | ||
419 | goto leave; | ||
420 | } | ||
421 | } | ||
422 | |||
423 | switch (ndata->state) { | ||
424 | case DLM_RECO_NODE_DATA_INIT: | ||
425 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
426 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
427 | BUG(); | ||
428 | break; | ||
429 | case DLM_RECO_NODE_DATA_DEAD: | ||
430 | mlog(0, "node %u died after requesting " | ||
431 | "recovery info for node %u\n", | ||
432 | ndata->node_num, dead_node); | ||
433 | // start all over | ||
434 | destroy = 1; | ||
435 | status = -EAGAIN; | ||
436 | goto leave; | ||
437 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
438 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; | ||
439 | mlog(0, "now receiving recovery data from " | ||
440 | "node %u for dead node %u\n", | ||
441 | ndata->node_num, dead_node); | ||
442 | break; | ||
443 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
444 | mlog(0, "already receiving recovery data from " | ||
445 | "node %u for dead node %u\n", | ||
446 | ndata->node_num, dead_node); | ||
447 | break; | ||
448 | case DLM_RECO_NODE_DATA_DONE: | ||
449 | mlog(0, "already DONE receiving recovery data " | ||
450 | "from node %u for dead node %u\n", | ||
451 | ndata->node_num, dead_node); | ||
452 | break; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | mlog(0, "done requesting all lock info\n"); | ||
457 | |||
458 | /* nodes should be sending reco data now | ||
459 | * just need to wait */ | ||
460 | |||
461 | while (1) { | ||
462 | /* check all the nodes now to see if we are | ||
463 | * done, or if anyone died */ | ||
464 | all_nodes_done = 1; | ||
465 | spin_lock(&dlm_reco_state_lock); | ||
466 | list_for_each(iter, &dlm->reco.node_data) { | ||
467 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
468 | |||
469 | mlog(0, "checking recovery state of node %u\n", | ||
470 | ndata->node_num); | ||
471 | switch (ndata->state) { | ||
472 | case DLM_RECO_NODE_DATA_INIT: | ||
473 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
474 | mlog(ML_ERROR, "bad ndata state for " | ||
475 | "node %u: state=%d\n", | ||
476 | ndata->node_num, ndata->state); | ||
477 | BUG(); | ||
478 | break; | ||
479 | case DLM_RECO_NODE_DATA_DEAD: | ||
480 | mlog(0, "node %u died after " | ||
481 | "requesting recovery info for " | ||
482 | "node %u\n", ndata->node_num, | ||
483 | dead_node); | ||
484 | spin_unlock(&dlm_reco_state_lock); | ||
485 | // start all over | ||
486 | destroy = 1; | ||
487 | status = -EAGAIN; | ||
488 | goto leave; | ||
489 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
490 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
491 | all_nodes_done = 0; | ||
492 | break; | ||
493 | case DLM_RECO_NODE_DATA_DONE: | ||
494 | break; | ||
495 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
496 | break; | ||
497 | } | ||
498 | } | ||
499 | spin_unlock(&dlm_reco_state_lock); | ||
500 | |||
501 | mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, | ||
502 | all_nodes_done?"yes":"no"); | ||
503 | if (all_nodes_done) { | ||
504 | int ret; | ||
505 | |||
506 | /* all nodes are now in DLM_RECO_NODE_DATA_DONE state | ||
507 | * just send a finalize message to everyone and | ||
508 | * clean up */ | ||
509 | mlog(0, "all nodes are done! send finalize\n"); | ||
510 | ret = dlm_send_finalize_reco_message(dlm); | ||
511 | if (ret < 0) | ||
512 | mlog_errno(ret); | ||
513 | |||
514 | spin_lock(&dlm->spinlock); | ||
515 | dlm_finish_local_lockres_recovery(dlm, dead_node, | ||
516 | dlm->node_num); | ||
517 | spin_unlock(&dlm->spinlock); | ||
518 | mlog(0, "should be done with recovery!\n"); | ||
519 | |||
520 | mlog(0, "finishing recovery of %s at %lu, " | ||
521 | "dead=%u, this=%u, new=%u\n", dlm->name, | ||
522 | jiffies, dlm->reco.dead_node, | ||
523 | dlm->node_num, dlm->reco.new_master); | ||
524 | destroy = 1; | ||
525 | status = ret; | ||
526 | /* rescan everything marked dirty along the way */ | ||
527 | dlm_kick_thread(dlm, NULL); | ||
528 | break; | ||
529 | } | ||
530 | /* wait to be signalled, with periodic timeout | ||
531 | * to check for node death */ | ||
532 | wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, | ||
533 | kthread_should_stop(), | ||
534 | msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); | ||
535 | |||
536 | } | ||
537 | |||
538 | leave: | ||
539 | if (destroy) | ||
540 | dlm_destroy_recovery_area(dlm, dead_node); | ||
541 | |||
542 | mlog_exit(status); | ||
543 | return status; | ||
544 | } | ||
545 | |||
546 | static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | ||
547 | { | ||
548 | int num=0; | ||
549 | struct dlm_reco_node_data *ndata; | ||
550 | |||
551 | spin_lock(&dlm->spinlock); | ||
552 | memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); | ||
553 | /* nodes can only be removed (by dying) after dropping | ||
554 | * this lock, and death will be trapped later, so this should do */ | ||
555 | spin_unlock(&dlm->spinlock); | ||
556 | |||
557 | while (1) { | ||
558 | num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); | ||
559 | if (num >= O2NM_MAX_NODES) { | ||
560 | break; | ||
561 | } | ||
562 | BUG_ON(num == dead_node); | ||
563 | |||
564 | ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); | ||
565 | if (!ndata) { | ||
566 | dlm_destroy_recovery_area(dlm, dead_node); | ||
567 | return -ENOMEM; | ||
568 | } | ||
569 | ndata->node_num = num; | ||
570 | ndata->state = DLM_RECO_NODE_DATA_INIT; | ||
571 | spin_lock(&dlm_reco_state_lock); | ||
572 | list_add_tail(&ndata->list, &dlm->reco.node_data); | ||
573 | spin_unlock(&dlm_reco_state_lock); | ||
574 | num++; | ||
575 | } | ||
576 | |||
577 | return 0; | ||
578 | } | ||
579 | |||
580 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | ||
581 | { | ||
582 | struct list_head *iter, *iter2; | ||
583 | struct dlm_reco_node_data *ndata; | ||
584 | LIST_HEAD(tmplist); | ||
585 | |||
586 | spin_lock(&dlm_reco_state_lock); | ||
587 | list_splice_init(&dlm->reco.node_data, &tmplist); | ||
588 | spin_unlock(&dlm_reco_state_lock); | ||
589 | |||
590 | list_for_each_safe(iter, iter2, &tmplist) { | ||
591 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
592 | list_del_init(&ndata->list); | ||
593 | kfree(ndata); | ||
594 | } | ||
595 | } | ||
596 | |||
597 | static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, | ||
598 | u8 dead_node) | ||
599 | { | ||
600 | struct dlm_lock_request lr; | ||
601 | enum dlm_status ret; | ||
602 | |||
603 | mlog(0, "\n"); | ||
604 | |||
605 | |||
606 | mlog(0, "dlm_request_all_locks: dead node is %u, sending request " | ||
607 | "to %u\n", dead_node, request_from); | ||
608 | |||
609 | memset(&lr, 0, sizeof(lr)); | ||
610 | lr.node_idx = dlm->node_num; | ||
611 | lr.dead_node = dead_node; | ||
612 | |||
613 | // send message | ||
614 | ret = DLM_NOLOCKMGR; | ||
615 | ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, | ||
616 | &lr, sizeof(lr), request_from, NULL); | ||
617 | |||
618 | /* negative status is handled by caller */ | ||
619 | if (ret < 0) | ||
620 | mlog_errno(ret); | ||
621 | |||
622 | // return from here, then | ||
623 | // sleep until all received or error | ||
624 | return ret; | ||
625 | |||
626 | } | ||
627 | |||
628 | int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) | ||
629 | { | ||
630 | struct dlm_ctxt *dlm = data; | ||
631 | struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; | ||
632 | char *buf = NULL; | ||
633 | struct dlm_work_item *item = NULL; | ||
634 | |||
635 | if (!dlm_grab(dlm)) | ||
636 | return -EINVAL; | ||
637 | |||
638 | BUG_ON(lr->dead_node != dlm->reco.dead_node); | ||
639 | |||
640 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | ||
641 | if (!item) { | ||
642 | dlm_put(dlm); | ||
643 | return -ENOMEM; | ||
644 | } | ||
645 | |||
646 | /* this will get freed by dlm_request_all_locks_worker */ | ||
647 | buf = (char *) __get_free_page(GFP_KERNEL); | ||
648 | if (!buf) { | ||
649 | kfree(item); | ||
650 | dlm_put(dlm); | ||
651 | return -ENOMEM; | ||
652 | } | ||
653 | |||
654 | /* queue up work for dlm_request_all_locks_worker */ | ||
655 | dlm_grab(dlm); /* get an extra ref for the work item */ | ||
656 | dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); | ||
657 | item->u.ral.reco_master = lr->node_idx; | ||
658 | item->u.ral.dead_node = lr->dead_node; | ||
659 | spin_lock(&dlm->work_lock); | ||
660 | list_add_tail(&item->list, &dlm->work_list); | ||
661 | spin_unlock(&dlm->work_lock); | ||
662 | schedule_work(&dlm->dispatched_work); | ||
663 | |||
664 | dlm_put(dlm); | ||
665 | return 0; | ||
666 | } | ||
667 | |||
668 | static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | ||
669 | { | ||
670 | struct dlm_migratable_lockres *mres; | ||
671 | struct dlm_lock_resource *res; | ||
672 | struct dlm_ctxt *dlm; | ||
673 | LIST_HEAD(resources); | ||
674 | struct list_head *iter; | ||
675 | int ret; | ||
676 | u8 dead_node, reco_master; | ||
677 | |||
678 | dlm = item->dlm; | ||
679 | dead_node = item->u.ral.dead_node; | ||
680 | reco_master = item->u.ral.reco_master; | ||
681 | BUG_ON(dead_node != dlm->reco.dead_node); | ||
682 | BUG_ON(reco_master != dlm->reco.new_master); | ||
683 | |||
684 | mres = (struct dlm_migratable_lockres *)data; | ||
685 | |||
686 | /* lock resources should have already been moved to the | ||
687 | * dlm->reco.resources list. now move items from that list | ||
688 | * to a temp list if the dead owner matches. note that the | ||
689 | * whole cluster recovers only one node at a time, so we | ||
690 | * can safely move UNKNOWN lock resources for each recovery | ||
691 | * session. */ | ||
692 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); | ||
693 | |||
694 | /* now we can begin blasting lockreses without the dlm lock */ | ||
695 | list_for_each(iter, &resources) { | ||
696 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
697 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | ||
698 | DLM_MRES_RECOVERY); | ||
699 | if (ret < 0) | ||
700 | mlog_errno(ret); | ||
701 | } | ||
702 | |||
703 | /* move the resources back to the list */ | ||
704 | spin_lock(&dlm->spinlock); | ||
705 | list_splice_init(&resources, &dlm->reco.resources); | ||
706 | spin_unlock(&dlm->spinlock); | ||
707 | |||
708 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); | ||
709 | if (ret < 0) | ||
710 | mlog_errno(ret); | ||
711 | |||
712 | free_page((unsigned long)data); | ||
713 | } | ||
714 | |||
715 | |||
716 | static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | ||
717 | { | ||
718 | int ret, tmpret; | ||
719 | struct dlm_reco_data_done done_msg; | ||
720 | |||
721 | memset(&done_msg, 0, sizeof(done_msg)); | ||
722 | done_msg.node_idx = dlm->node_num; | ||
723 | done_msg.dead_node = dead_node; | ||
724 | mlog(0, "sending DATA DONE message to %u, " | ||
725 | "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, | ||
726 | done_msg.dead_node); | ||
727 | |||
728 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | ||
729 | sizeof(done_msg), send_to, &tmpret); | ||
730 | /* negative status is ignored by the caller */ | ||
731 | if (ret >= 0) | ||
732 | ret = tmpret; | ||
733 | return ret; | ||
734 | } | ||
735 | |||
736 | |||
737 | int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) | ||
738 | { | ||
739 | struct dlm_ctxt *dlm = data; | ||
740 | struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; | ||
741 | struct list_head *iter; | ||
742 | struct dlm_reco_node_data *ndata = NULL; | ||
743 | int ret = -EINVAL; | ||
744 | |||
745 | if (!dlm_grab(dlm)) | ||
746 | return -EINVAL; | ||
747 | |||
748 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " | ||
749 | "node_idx=%u, this node=%u\n", done->dead_node, | ||
750 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | ||
751 | BUG_ON(done->dead_node != dlm->reco.dead_node); | ||
752 | |||
753 | spin_lock(&dlm_reco_state_lock); | ||
754 | list_for_each(iter, &dlm->reco.node_data) { | ||
755 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
756 | if (ndata->node_num != done->node_idx) | ||
757 | continue; | ||
758 | |||
759 | switch (ndata->state) { | ||
760 | case DLM_RECO_NODE_DATA_INIT: | ||
761 | case DLM_RECO_NODE_DATA_DEAD: | ||
762 | case DLM_RECO_NODE_DATA_DONE: | ||
763 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
764 | mlog(ML_ERROR, "bad ndata state for node %u:" | ||
765 | " state=%d\n", ndata->node_num, | ||
766 | ndata->state); | ||
767 | BUG(); | ||
768 | break; | ||
769 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
770 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
771 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
772 | mlog(0, "node %u is DONE sending " | ||
773 | "recovery data!\n", | ||
774 | ndata->node_num); | ||
775 | |||
776 | ndata->state = DLM_RECO_NODE_DATA_DONE; | ||
777 | ret = 0; | ||
778 | break; | ||
779 | } | ||
780 | } | ||
781 | spin_unlock(&dlm_reco_state_lock); | ||
782 | |||
783 | /* wake the recovery thread, some node is done */ | ||
784 | if (!ret) | ||
785 | dlm_kick_recovery_thread(dlm); | ||
786 | |||
787 | if (ret < 0) | ||
788 | mlog(ML_ERROR, "failed to find recovery node data for node " | ||
789 | "%u\n", done->node_idx); | ||
790 | dlm_put(dlm); | ||
791 | |||
792 | mlog(0, "leaving reco data done handler, ret=%d\n", ret); | ||
793 | return ret; | ||
794 | } | ||
795 | |||
796 | static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | ||
797 | struct list_head *list, | ||
798 | u8 dead_node) | ||
799 | { | ||
800 | struct dlm_lock_resource *res; | ||
801 | struct list_head *iter, *iter2; | ||
802 | |||
803 | spin_lock(&dlm->spinlock); | ||
804 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | ||
805 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
806 | if (dlm_is_recovery_lock(res->lockname.name, | ||
807 | res->lockname.len)) | ||
808 | continue; | ||
809 | if (res->owner == dead_node) { | ||
810 | mlog(0, "found lockres owned by dead node while " | ||
811 | "doing recovery for node %u. sending it.\n", | ||
812 | dead_node); | ||
813 | list_del_init(&res->recovering); | ||
814 | list_add_tail(&res->recovering, list); | ||
815 | } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
816 | mlog(0, "found UNKNOWN owner while doing recovery " | ||
817 | "for node %u. sending it.\n", dead_node); | ||
818 | list_del_init(&res->recovering); | ||
819 | list_add_tail(&res->recovering, list); | ||
820 | } | ||
821 | } | ||
822 | spin_unlock(&dlm->spinlock); | ||
823 | } | ||
824 | |||
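/* Counts the locks on all three queues.  This relies on granted, converting
 * and blocked being consecutive struct list_heads inside dlm_lock_resource,
 * so that "queue++" below (and dlm_list_num_to_pointer() further down) can
 * step from one queue to the next by pointer arithmetic. */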
825 | static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) | ||
826 | { | ||
827 | int total_locks = 0; | ||
828 | struct list_head *iter, *queue = &res->granted; | ||
829 | int i; | ||
830 | |||
831 | for (i=0; i<3; i++) { | ||
832 | list_for_each(iter, queue) | ||
833 | total_locks++; | ||
834 | queue++; | ||
835 | } | ||
836 | return total_locks; | ||
837 | } | ||
838 | |||
839 | |||
840 | static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | ||
841 | struct dlm_migratable_lockres *mres, | ||
842 | u8 send_to, | ||
843 | struct dlm_lock_resource *res, | ||
844 | int total_locks) | ||
845 | { | ||
846 | u64 mig_cookie = be64_to_cpu(mres->mig_cookie); | ||
847 | int mres_total_locks = be32_to_cpu(mres->total_locks); | ||
848 | int sz, ret = 0, status = 0; | ||
849 | u8 orig_flags = mres->flags, | ||
850 | orig_master = mres->master; | ||
851 | |||
852 | BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); | ||
853 | if (!mres->num_locks) | ||
854 | return 0; | ||
855 | |||
856 | sz = sizeof(struct dlm_migratable_lockres) + | ||
857 | (mres->num_locks * sizeof(struct dlm_migratable_lock)); | ||
858 | |||
859 | /* add an all-done flag if we reached the last lock */ | ||
860 | orig_flags = mres->flags; | ||
861 | BUG_ON(total_locks > mres_total_locks); | ||
862 | if (total_locks == mres_total_locks) | ||
863 | mres->flags |= DLM_MRES_ALL_DONE; | ||
864 | |||
865 | /* send it */ | ||
866 | ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, | ||
867 | sz, send_to, &status); | ||
868 | if (ret < 0) { | ||
869 | /* XXX: negative status is not handled. | ||
870 | * this will end up killing this node. */ | ||
871 | mlog_errno(ret); | ||
872 | } else { | ||
873 | /* might get an -ENOMEM back here */ | ||
874 | ret = status; | ||
875 | if (ret < 0) { | ||
876 | mlog_errno(ret); | ||
877 | |||
878 | if (ret == -EFAULT) { | ||
879 | mlog(ML_ERROR, "node %u told me to kill " | ||
880 | "myself!\n", send_to); | ||
881 | BUG(); | ||
882 | } | ||
883 | } | ||
884 | } | ||
885 | |||
886 | /* zero and reinit the message buffer */ | ||
887 | dlm_init_migratable_lockres(mres, res->lockname.name, | ||
888 | res->lockname.len, mres_total_locks, | ||
889 | mig_cookie, orig_flags, orig_master); | ||
890 | return ret; | ||
891 | } | ||
892 | |||
893 | static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | ||
894 | const char *lockname, int namelen, | ||
895 | int total_locks, u64 cookie, | ||
896 | u8 flags, u8 master) | ||
897 | { | ||
898 | /* mres here is one full page */ | ||
899 | memset(mres, 0, PAGE_SIZE); | ||
900 | mres->lockname_len = namelen; | ||
901 | memcpy(mres->lockname, lockname, namelen); | ||
902 | mres->num_locks = 0; | ||
903 | mres->total_locks = cpu_to_be32(total_locks); | ||
904 | mres->mig_cookie = cpu_to_be64(cookie); | ||
905 | mres->flags = flags; | ||
906 | mres->master = master; | ||
907 | } | ||
908 | |||
909 | |||
910 | /* returns 1 if this lock fills the network structure, | ||
911 | * 0 otherwise */ | ||
912 | static int dlm_add_lock_to_array(struct dlm_lock *lock, | ||
913 | struct dlm_migratable_lockres *mres, int queue) | ||
914 | { | ||
915 | struct dlm_migratable_lock *ml; | ||
916 | int lock_num = mres->num_locks; | ||
917 | |||
918 | ml = &(mres->ml[lock_num]); | ||
919 | ml->cookie = lock->ml.cookie; | ||
920 | ml->type = lock->ml.type; | ||
921 | ml->convert_type = lock->ml.convert_type; | ||
922 | ml->highest_blocked = lock->ml.highest_blocked; | ||
923 | ml->list = queue; | ||
924 | if (lock->lksb) { | ||
925 | ml->flags = lock->lksb->flags; | ||
926 | /* send our current lvb */ | ||
927 | if (ml->type == LKM_EXMODE || | ||
928 | ml->type == LKM_PRMODE) { | ||
929 | /* if it is already set, this had better be a PR | ||
930 | * and it has to match */ | ||
931 | if (mres->lvb[0] && (ml->type == LKM_EXMODE || | ||
932 | memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { | ||
933 | mlog(ML_ERROR, "mismatched lvbs!\n"); | ||
934 | __dlm_print_one_lock_resource(lock->lockres); | ||
935 | BUG(); | ||
936 | } | ||
937 | memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); | ||
938 | } | ||
939 | } | ||
940 | ml->node = lock->ml.node; | ||
941 | mres->num_locks++; | ||
942 | /* we reached the max, send this network message */ | ||
943 | if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) | ||
944 | return 1; | ||
945 | return 0; | ||
946 | } | ||
947 | |||
948 | |||
949 | int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
950 | struct dlm_migratable_lockres *mres, | ||
951 | u8 send_to, u8 flags) | ||
952 | { | ||
953 | struct list_head *queue, *iter; | ||
954 | int total_locks, i; | ||
955 | u64 mig_cookie = 0; | ||
956 | struct dlm_lock *lock; | ||
957 | int ret = 0; | ||
958 | |||
959 | BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | ||
960 | |||
961 | mlog(0, "sending to %u\n", send_to); | ||
962 | |||
963 | total_locks = dlm_num_locks_in_lockres(res); | ||
964 | if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { | ||
965 | /* rare, but possible */ | ||
966 | mlog(0, "argh. lockres has %d locks. this will " | ||
967 | "require more than one network packet to " | ||
968 | "migrate\n", total_locks); | ||
969 | mig_cookie = dlm_get_next_mig_cookie(); | ||
970 | } | ||
971 | |||
972 | dlm_init_migratable_lockres(mres, res->lockname.name, | ||
973 | res->lockname.len, total_locks, | ||
974 | mig_cookie, flags, res->owner); | ||
975 | |||
976 | total_locks = 0; | ||
977 | for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { | ||
978 | queue = dlm_list_idx_to_ptr(res, i); | ||
979 | list_for_each(iter, queue) { | ||
980 | lock = list_entry (iter, struct dlm_lock, list); | ||
981 | |||
982 | /* add another lock. */ | ||
983 | total_locks++; | ||
984 | if (!dlm_add_lock_to_array(lock, mres, i)) | ||
985 | continue; | ||
986 | |||
987 | /* this filled the lock message, | ||
988 | * we must send it immediately. */ | ||
989 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, | ||
990 | res, total_locks); | ||
991 | if (ret < 0) { | ||
992 | // TODO | ||
993 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg " | ||
994 | "returned %d, TODO\n", ret); | ||
995 | BUG(); | ||
996 | } | ||
997 | } | ||
998 | } | ||
999 | /* flush any remaining locks */ | ||
1000 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); | ||
1001 | if (ret < 0) { | ||
1002 | // TODO | ||
1003 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " | ||
1004 | "TODO\n", ret); | ||
1005 | BUG(); | ||
1006 | } | ||
1007 | return ret; | ||
1008 | } | ||
1009 | |||
1010 | |||
1011 | |||
1012 | /* | ||
1013 | * this message will contain no more than one page worth of | ||
1014 | * recovery data, and it will work on only one lockres. | ||
1015 | * there may be many locks in this page, and we may need to wait | ||
1016 | * for additional packets to complete all the locks (rare, but | ||
1017 | * possible). | ||
1018 | */ | ||
1019 | /* | ||
1020 | * NOTE: the allocation error cases here are scary | ||
1021 | * we really cannot afford to fail an alloc in recovery | ||
1022 | * do we spin? returning an error only delays the problem really | ||
1023 | */ | ||
1024 | |||
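/* Rough flow of the handler below: look up the lockres (or allocate and
 * hash a new one), mark it RECOVERING or MIGRATING, copy the message into
 * a private buffer and queue dlm_mig_lockres_worker(), so the network
 * handler itself never blocks on lock processing. */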
1025 | int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1026 | { | ||
1027 | struct dlm_ctxt *dlm = data; | ||
1028 | struct dlm_migratable_lockres *mres = | ||
1029 | (struct dlm_migratable_lockres *)msg->buf; | ||
1030 | int ret = 0; | ||
1031 | u8 real_master; | ||
1032 | char *buf = NULL; | ||
1033 | struct dlm_work_item *item = NULL; | ||
1034 | struct dlm_lock_resource *res = NULL; | ||
1035 | |||
1036 | if (!dlm_grab(dlm)) | ||
1037 | return -EINVAL; | ||
1038 | |||
1039 | BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | ||
1040 | |||
1041 | real_master = mres->master; | ||
1042 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1043 | /* cannot migrate a lockres with no master */ | ||
1044 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | ||
1045 | } | ||
1046 | |||
1047 | mlog(0, "%s message received from node %u\n", | ||
1048 | (mres->flags & DLM_MRES_RECOVERY) ? | ||
1049 | "recovery" : "migration", mres->master); | ||
1050 | if (mres->flags & DLM_MRES_ALL_DONE) | ||
1051 | mlog(0, "all done flag. all lockres data received!\n"); | ||
1052 | |||
1053 | ret = -ENOMEM; | ||
1054 | buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); | ||
1055 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | ||
1056 | if (!buf || !item) | ||
1057 | goto leave; | ||
1058 | |||
1059 | /* lookup the lock to see if we have a secondary queue for this | ||
1060 | * already... just add the locks in and this will have its owner | ||
1061 | * and RECOVERY flag changed when it completes. */ | ||
1062 | res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); | ||
1063 | if (res) { | ||
1064 | /* this will get a ref on res */ | ||
1065 | /* mark it as recovering/migrating and hash it */ | ||
1066 | spin_lock(&res->spinlock); | ||
1067 | if (mres->flags & DLM_MRES_RECOVERY) { | ||
1068 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1069 | } else { | ||
1070 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
1071 | /* this is at least the second | ||
1072 | * lockres message */ | ||
1073 | mlog(0, "lock %.*s is already migrating\n", | ||
1074 | mres->lockname_len, | ||
1075 | mres->lockname); | ||
1076 | } else if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1077 | /* caller should BUG */ | ||
1078 | mlog(ML_ERROR, "node is attempting to migrate " | ||
1079 | "lock %.*s, but marked as recovering!\n", | ||
1080 | mres->lockname_len, mres->lockname); | ||
1081 | ret = -EFAULT; | ||
1082 | spin_unlock(&res->spinlock); | ||
1083 | goto leave; | ||
1084 | } | ||
1085 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
1086 | } | ||
1087 | spin_unlock(&res->spinlock); | ||
1088 | } else { | ||
1089 | /* need to allocate, just like if it was | ||
1090 | * mastered here normally */ | ||
1091 | res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); | ||
1092 | if (!res) | ||
1093 | goto leave; | ||
1094 | |||
1095 | /* to match the ref that we would have gotten if | ||
1096 | * dlm_lookup_lockres had succeeded */ | ||
1097 | dlm_lockres_get(res); | ||
1098 | |||
1099 | /* mark it as recovering/migrating and hash it */ | ||
1100 | if (mres->flags & DLM_MRES_RECOVERY) | ||
1101 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1102 | else | ||
1103 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
1104 | |||
1105 | spin_lock(&dlm->spinlock); | ||
1106 | __dlm_insert_lockres(dlm, res); | ||
1107 | spin_unlock(&dlm->spinlock); | ||
1108 | |||
1109 | /* now that the new lockres is inserted, | ||
1110 | * make it usable by other processes */ | ||
1111 | spin_lock(&res->spinlock); | ||
1112 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
1113 | spin_unlock(&res->spinlock); | ||
1114 | |||
1115 | /* add an extra ref for just-allocated lockres | ||
1116 | * otherwise the lockres will be purged immediately */ | ||
1117 | dlm_lockres_get(res); | ||
1118 | |||
1119 | } | ||
1120 | |||
1121 | /* at this point we have allocated everything we need, | ||
1122 | * and we have a hashed lockres with an extra ref and | ||
1123 | * the proper res->state flags. */ | ||
1124 | ret = 0; | ||
1125 | if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1126 | /* migration cannot have an unknown master */ | ||
1127 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | ||
1128 | mlog(0, "recovery has passed me a lockres with an " | ||
1129 | "unknown owner.. will need to requery: " | ||
1130 | "%.*s\n", mres->lockname_len, mres->lockname); | ||
1131 | } else { | ||
1132 | spin_lock(&res->spinlock); | ||
1133 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | ||
1134 | spin_unlock(&res->spinlock); | ||
1135 | } | ||
1136 | |||
1137 | /* queue up work for dlm_mig_lockres_worker */ | ||
1138 | dlm_grab(dlm); /* get an extra ref for the work item */ | ||
1139 | memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ | ||
1140 | dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); | ||
1141 | item->u.ml.lockres = res; /* already have a ref */ | ||
1142 | item->u.ml.real_master = real_master; | ||
1143 | spin_lock(&dlm->work_lock); | ||
1144 | list_add_tail(&item->list, &dlm->work_list); | ||
1145 | spin_unlock(&dlm->work_lock); | ||
1146 | schedule_work(&dlm->dispatched_work); | ||
1147 | |||
1148 | leave: | ||
1149 | dlm_put(dlm); | ||
1150 | if (ret < 0) { | ||
1151 | if (buf) | ||
1152 | kfree(buf); | ||
1153 | if (item) | ||
1154 | kfree(item); | ||
1155 | } | ||
1156 | |||
1157 | mlog_exit(ret); | ||
1158 | return ret; | ||
1159 | } | ||
1160 | |||
1161 | |||
1162 | static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) | ||
1163 | { | ||
1164 | struct dlm_ctxt *dlm = data; | ||
1165 | struct dlm_migratable_lockres *mres; | ||
1166 | int ret = 0; | ||
1167 | struct dlm_lock_resource *res; | ||
1168 | u8 real_master; | ||
1169 | |||
1170 | dlm = item->dlm; | ||
1171 | mres = (struct dlm_migratable_lockres *)data; | ||
1172 | |||
1173 | res = item->u.ml.lockres; | ||
1174 | real_master = item->u.ml.real_master; | ||
1175 | |||
1176 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1177 | /* this case is super-rare. only occurs if | ||
1178 | * node death happens during migration. */ | ||
1179 | again: | ||
1180 | ret = dlm_lockres_master_requery(dlm, res, &real_master); | ||
1181 | if (ret < 0) { | ||
1182 | mlog(0, "dlm_lockres_master_requery failure: %d\n", | ||
1183 | ret); | ||
1184 | goto again; | ||
1185 | } | ||
1186 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1187 | mlog(0, "lockres %.*s not claimed. " | ||
1188 | "this node will take it.\n", | ||
1189 | res->lockname.len, res->lockname.name); | ||
1190 | } else { | ||
1191 | mlog(0, "master needs to respond to sender " | ||
1192 | "that node %u still owns %.*s\n", | ||
1193 | real_master, res->lockname.len, | ||
1194 | res->lockname.name); | ||
1195 | /* cannot touch this lockres */ | ||
1196 | goto leave; | ||
1197 | } | ||
1198 | } | ||
1199 | |||
1200 | ret = dlm_process_recovery_data(dlm, res, mres); | ||
1201 | if (ret < 0) | ||
1202 | mlog(0, "dlm_process_recovery_data returned %d\n", ret); | ||
1203 | else | ||
1204 | mlog(0, "dlm_process_recovery_data succeeded\n"); | ||
1205 | |||
1206 | if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == | ||
1207 | (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { | ||
1208 | ret = dlm_finish_migration(dlm, res, mres->master); | ||
1209 | if (ret < 0) | ||
1210 | mlog_errno(ret); | ||
1211 | } | ||
1212 | |||
1213 | leave: | ||
1214 | kfree(data); | ||
1215 | mlog_exit(ret); | ||
1216 | } | ||
1217 | |||
1218 | |||
1219 | |||
1220 | static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, | ||
1221 | struct dlm_lock_resource *res, | ||
1222 | u8 *real_master) | ||
1223 | { | ||
1224 | struct dlm_node_iter iter; | ||
1225 | int nodenum; | ||
1226 | int ret = 0; | ||
1227 | |||
1228 | *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1229 | |||
1230 | /* we only reach here if one of the two nodes in a | ||
1231 | * migration died while the migration was in progress. | ||
1232 | * at this point we need to requery the master. we | ||
1233 | * know that the new_master got as far as creating | ||
1234 | * an mle on at least one node, but we do not know | ||
1235 | * if any nodes had actually cleared the mle and set | ||
1236 | * the master to the new_master. the old master | ||
1237 | * is supposed to set the owner to UNKNOWN in the | ||
1238 | * event of a new_master death, so the only possible | ||
1239 | * responses that we can get from nodes here are | ||
1240 | * that the master is new_master, or that the master | ||
1241 | * is UNKNOWN. | ||
1242 | * if all nodes come back with UNKNOWN then we know | ||
1243 | * the lock needs remastering here. | ||
1244 | * if any node comes back with a valid master, check | ||
1245 | * to see if that master is the one that we are | ||
1246 | * recovering. if so, then the new_master died and | ||
1247 | * we need to remaster this lock. if not, then the | ||
1248 | * new_master survived and that node will respond to | ||
1249 | * other nodes about the owner. | ||
1250 | * if there is an owner, this node needs to dump this | ||
1251 | * lockres and alert the sender that this lockres | ||
1252 | * was rejected. */ | ||
1253 | spin_lock(&dlm->spinlock); | ||
1254 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
1255 | spin_unlock(&dlm->spinlock); | ||
1256 | |||
1257 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
1258 | /* do not send to self */ | ||
1259 | if (nodenum == dlm->node_num) | ||
1260 | continue; | ||
1261 | ret = dlm_do_master_requery(dlm, res, nodenum, real_master); | ||
1262 | if (ret < 0) { | ||
1263 | mlog_errno(ret); | ||
1264 | BUG(); | ||
1265 | /* TODO: need to figure a way to restart this */ | ||
1266 | } | ||
1267 | if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1268 | mlog(0, "lock master is %u\n", *real_master); | ||
1269 | break; | ||
1270 | } | ||
1271 | } | ||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | static int dlm_do_master_requery(struct dlm_ctxt *dlm, | ||
1277 | struct dlm_lock_resource *res, | ||
1278 | u8 nodenum, u8 *real_master) | ||
1279 | { | ||
1280 | int ret = -EINVAL; | ||
1281 | struct dlm_master_requery req; | ||
1282 | int status = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1283 | |||
1284 | memset(&req, 0, sizeof(req)); | ||
1285 | req.node_idx = dlm->node_num; | ||
1286 | req.namelen = res->lockname.len; | ||
1287 | memcpy(req.name, res->lockname.name, res->lockname.len); | ||
1288 | |||
1289 | ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, | ||
1290 | &req, sizeof(req), nodenum, &status); | ||
1291 | /* XXX: negative status not handled properly here. */ | ||
1292 | if (ret < 0) | ||
1293 | mlog_errno(ret); | ||
1294 | else { | ||
1295 | BUG_ON(status < 0); | ||
1296 | BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); | ||
1297 | *real_master = (u8) (status & 0xff); | ||
1298 | mlog(0, "node %u responded to master requery with %u\n", | ||
1299 | nodenum, *real_master); | ||
1300 | ret = 0; | ||
1301 | } | ||
1302 | return ret; | ||
1303 | } | ||
1304 | |||
1305 | |||
1306 | /* this function cannot error, so unless the sending | ||
1307 | * or receiving of the message failed, the owner can | ||
1308 | * be trusted */ | ||
1309 | int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1310 | { | ||
1311 | struct dlm_ctxt *dlm = data; | ||
1312 | struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; | ||
1313 | struct dlm_lock_resource *res = NULL; | ||
1314 | int master = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1315 | u32 flags = DLM_ASSERT_MASTER_REQUERY; | ||
1316 | |||
1317 | if (!dlm_grab(dlm)) { | ||
1318 | /* since the domain has gone away on this | ||
1319 | * node, the proper response is UNKNOWN */ | ||
1320 | return master; | ||
1321 | } | ||
1322 | |||
1323 | spin_lock(&dlm->spinlock); | ||
1324 | res = __dlm_lookup_lockres(dlm, req->name, req->namelen); | ||
1325 | if (res) { | ||
1326 | spin_lock(&res->spinlock); | ||
1327 | master = res->owner; | ||
1328 | if (master == dlm->node_num) { | ||
1329 | int ret = dlm_dispatch_assert_master(dlm, res, | ||
1330 | 0, 0, flags); | ||
1331 | if (ret < 0) { | ||
1332 | mlog_errno(-ENOMEM); | ||
1333 | /* retry!? */ | ||
1334 | BUG(); | ||
1335 | } | ||
1336 | } | ||
1337 | spin_unlock(&res->spinlock); | ||
1338 | } | ||
1339 | spin_unlock(&dlm->spinlock); | ||
1340 | |||
1341 | dlm_put(dlm); | ||
1342 | return master; | ||
1343 | } | ||
1344 | |||
1345 | static inline struct list_head * | ||
1346 | dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) | ||
1347 | { | ||
1348 | struct list_head *ret; | ||
1349 | BUG_ON(list_num < 0); | ||
1350 | BUG_ON(list_num > 2); | ||
1351 | ret = &(res->granted); | ||
1352 | ret += list_num; | ||
1353 | return ret; | ||
1354 | } | ||
1355 | /* TODO: do ast flush business | ||
1356 | * TODO: do MIGRATING and RECOVERING spinning | ||
1357 | */ | ||
1358 | |||
1359 | /* | ||
1360 | * NOTE about in-flight requests during migration: | ||
1361 | * | ||
1362 | * Before attempting the migrate, the master has marked the lockres as | ||
1363 | * MIGRATING and then flushed all of its pending ASTS. So any in-flight | ||
1364 | * requests either got queued before the MIGRATING flag got set, in which | ||
1365 | * case the lock data will reflect the change and a return message is on | ||
1366 | * the way, or the request failed to get in before MIGRATING got set. In | ||
1367 | * this case, the caller will be told to spin and wait for the MIGRATING | ||
1368 | * flag to be dropped, then recheck the master. | ||
1369 | * This holds true for the convert, cancel and unlock cases, and since lvb | ||
1370 | * updates are tied to these same messages, it applies to lvb updates as | ||
1371 | * well. For the lock case, there is no way a lock can be on the master | ||
1372 | * queue and not be on the secondary queue since the lock is always added | ||
1373 | * locally first. This means that the new target node will never be sent | ||
1374 | * a lock that he doesn't already have on the list. | ||
1375 | * In total, this means that the local lock is correct and should not be | ||
1376 | * updated to match the one sent by the master. Any messages sent back | ||
1377 | * from the master before the MIGRATING flag will bring the lock properly | ||
1378 | * up-to-date, and the change will be ordered properly for the waiter. | ||
1379 | * We will *not* attempt to modify the lock underneath the waiter. | ||
1380 | */ | ||
1381 | |||
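/* Summary of dlm_process_recovery_data() below, per migratable lock: a lock
 * belonging to this node is only moved to the queue named by ml->list
 * (migration case, no new allocation); a lock from any other node gets a
 * freshly allocated dlm_lock attached to the lockres, its lvb copied when
 * one was sent, and is appended to that queue. */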
1382 | static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | ||
1383 | struct dlm_lock_resource *res, | ||
1384 | struct dlm_migratable_lockres *mres) | ||
1385 | { | ||
1386 | struct dlm_migratable_lock *ml; | ||
1387 | struct list_head *queue; | ||
1388 | struct dlm_lock *newlock = NULL; | ||
1389 | struct dlm_lockstatus *lksb = NULL; | ||
1390 | int ret = 0; | ||
1391 | int i; | ||
1392 | struct list_head *iter; | ||
1393 | struct dlm_lock *lock = NULL; | ||
1394 | |||
1395 | mlog(0, "running %d locks for this lockres\n", mres->num_locks); | ||
1396 | for (i=0; i<mres->num_locks; i++) { | ||
1397 | ml = &(mres->ml[i]); | ||
1398 | BUG_ON(ml->highest_blocked != LKM_IVMODE); | ||
1399 | newlock = NULL; | ||
1400 | lksb = NULL; | ||
1401 | |||
1402 | queue = dlm_list_num_to_pointer(res, ml->list); | ||
1403 | |||
1404 | /* if the lock is for the local node it needs to | ||
1405 | * be moved to the proper location within the queue. | ||
1406 | * do not allocate a new lock structure. */ | ||
1407 | if (ml->node == dlm->node_num) { | ||
1408 | /* MIGRATION ONLY! */ | ||
1409 | BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); | ||
1410 | |||
1411 | spin_lock(&res->spinlock); | ||
1412 | list_for_each(iter, queue) { | ||
1413 | lock = list_entry (iter, struct dlm_lock, list); | ||
1414 | if (lock->ml.cookie != ml->cookie) | ||
1415 | lock = NULL; | ||
1416 | else | ||
1417 | break; | ||
1418 | } | ||
1419 | |||
1420 | /* lock is always created locally first, and | ||
1421 | * destroyed locally last. it must be on the list */ | ||
1422 | if (!lock) { | ||
1423 | mlog(ML_ERROR, "could not find local lock " | ||
1424 | "with cookie %"MLFu64"!\n", | ||
1425 | ml->cookie); | ||
1426 | BUG(); | ||
1427 | } | ||
1428 | BUG_ON(lock->ml.node != ml->node); | ||
1429 | |||
1430 | /* see NOTE above about why we do not update | ||
1431 | * to match the master here */ | ||
1432 | |||
1433 | /* move the lock to its proper place */ | ||
1434 | /* do not alter lock refcount. switching lists. */ | ||
1435 | list_del_init(&lock->list); | ||
1436 | list_add_tail(&lock->list, queue); | ||
1437 | spin_unlock(&res->spinlock); | ||
1438 | |||
1439 | mlog(0, "just reordered a local lock!\n"); | ||
1440 | continue; | ||
1441 | } | ||
1442 | |||
1443 | /* lock is for another node. */ | ||
1444 | newlock = dlm_new_lock(ml->type, ml->node, | ||
1445 | be64_to_cpu(ml->cookie), NULL); | ||
1446 | if (!newlock) { | ||
1447 | ret = -ENOMEM; | ||
1448 | goto leave; | ||
1449 | } | ||
1450 | lksb = newlock->lksb; | ||
1451 | dlm_lock_attach_lockres(newlock, res); | ||
1452 | |||
1453 | if (ml->convert_type != LKM_IVMODE) { | ||
1454 | BUG_ON(queue != &res->converting); | ||
1455 | newlock->ml.convert_type = ml->convert_type; | ||
1456 | } | ||
1457 | lksb->flags |= (ml->flags & | ||
1458 | (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | ||
1459 | |||
1460 | if (mres->lvb[0]) { | ||
1461 | if (lksb->flags & DLM_LKSB_PUT_LVB) { | ||
1462 | /* other node was trying to update | ||
1463 | * lvb when node died. recreate the | ||
1464 | * lksb with the updated lvb. */ | ||
1465 | memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); | ||
1466 | } else { | ||
1467 | /* otherwise, the node is sending its | ||
1468 | * most recent valid lvb info */ | ||
1469 | BUG_ON(ml->type != LKM_EXMODE && | ||
1470 | ml->type != LKM_PRMODE); | ||
1471 | if (res->lvb[0] && (ml->type == LKM_EXMODE || | ||
1472 | memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { | ||
1473 | mlog(ML_ERROR, "received bad lvb!\n"); | ||
1474 | __dlm_print_one_lock_resource(res); | ||
1475 | BUG(); | ||
1476 | } | ||
1477 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | ||
1478 | } | ||
1479 | } | ||
1480 | |||
1481 | |||
1482 | /* NOTE: | ||
1483 | * wrt lock queue ordering and recovery: | ||
1484 | * 1. order of locks on granted queue is | ||
1485 | * meaningless. | ||
1486 | * 2. order of locks on converting queue is | ||
1487 | * LOST with the node death. sorry charlie. | ||
1488 | * 3. order of locks on the blocked queue is | ||
1489 | * also LOST. | ||
1490 | * order of locks does not affect integrity, it | ||
1491 | * just means that a lock request may get pushed | ||
1492 | * back in line as a result of the node death. | ||
1493 | * also note that for a given node the lock order | ||
1494 | * for its secondary queue locks is preserved | ||
1495 | * relative to each other, but clearly *not* | ||
1496 | * preserved relative to locks from other nodes. | ||
1497 | */ | ||
1498 | spin_lock(&res->spinlock); | ||
1499 | dlm_lock_get(newlock); | ||
1500 | list_add_tail(&newlock->list, queue); | ||
1501 | spin_unlock(&res->spinlock); | ||
1502 | } | ||
1503 | mlog(0, "done running all the locks\n"); | ||
1504 | |||
1505 | leave: | ||
1506 | if (ret < 0) { | ||
1507 | mlog_errno(ret); | ||
1508 | if (newlock) | ||
1509 | dlm_lock_put(newlock); | ||
1510 | } | ||
1511 | |||
1512 | mlog_exit(ret); | ||
1513 | return ret; | ||
1514 | } | ||
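/*
 * Editor's note -- a hedged summary of the replay loop above, not part of
 * the original patch: each migratable lock entry is steered onto one of the
 * three lockres queues via dlm_list_num_to_pointer(res, ml->list)
 * (presumably mapping DLM_GRANTED_LIST/DLM_CONVERTING_LIST/DLM_BLOCKED_LIST
 * onto &res->granted/&res->converting/&res->blocked, matching the queue
 * walks elsewhere in this file).  Locks owned by the local node are only
 * reordered in place (migration only); locks owned by other nodes are
 * rebuilt with dlm_new_lock() and appended.  For the LVB, DLM_LKSB_PUT_LVB
 * means the sender died mid-update and its copy is taken as authoritative;
 * otherwise the incoming LVB is sanity-checked against any LVB already
 * cached on the lockres before being copied in.
 */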
1515 | |||
1516 | void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | ||
1517 | struct dlm_lock_resource *res) | ||
1518 | { | ||
1519 | int i; | ||
1520 | struct list_head *queue, *iter, *iter2; | ||
1521 | struct dlm_lock *lock; | ||
1522 | |||
1523 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1524 | if (!list_empty(&res->recovering)) | ||
1525 | list_del_init(&res->recovering); | ||
1526 | list_add_tail(&res->recovering, &dlm->reco.resources); | ||
1527 | |||
1528 | /* find any pending locks and put them back on proper list */ | ||
1529 | for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { | ||
1530 | queue = dlm_list_idx_to_ptr(res, i); | ||
1531 | list_for_each_safe(iter, iter2, queue) { | ||
1532 | lock = list_entry (iter, struct dlm_lock, list); | ||
1533 | dlm_lock_get(lock); | ||
1534 | if (lock->convert_pending) { | ||
1535 | /* move converting lock back to granted */ | ||
1536 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
1537 | mlog(0, "node died with convert pending " | ||
1538 | "on %.*s. move back to granted list.\n", | ||
1539 | res->lockname.len, res->lockname.name); | ||
1540 | dlm_revert_pending_convert(res, lock); | ||
1541 | lock->convert_pending = 0; | ||
1542 | } else if (lock->lock_pending) { | ||
1543 | /* remove pending lock requests completely */ | ||
1544 | BUG_ON(i != DLM_BLOCKED_LIST); | ||
1545 | mlog(0, "node died with lock pending " | ||
1546 | "on %.*s. remove from blocked list and skip.\n", | ||
1547 | res->lockname.len, res->lockname.name); | ||
1548 | /* lock will be floating until ref in | ||
1549 | * dlmlock_remote is freed after the network | ||
1550 | * call returns. ok for it to not be on any | ||
1551 | * list since no ast can be called | ||
1552 | * (the master is dead). */ | ||
1553 | dlm_revert_pending_lock(res, lock); | ||
1554 | lock->lock_pending = 0; | ||
1555 | } else if (lock->unlock_pending) { | ||
1556 | /* if an unlock was in progress, treat as | ||
1557 | * if this had completed successfully | ||
1558 | * before sending this lock state to the | ||
1559 | * new master. note that the dlm_unlock | ||
1560 | * call is still responsible for calling | ||
1561 | * the unlockast. that will happen after | ||
1562 | * the network call times out. for now, | ||
1563 | * just move lists to prepare the new | ||
1564 | * recovery master. */ | ||
1565 | BUG_ON(i != DLM_GRANTED_LIST); | ||
1566 | mlog(0, "node died with unlock pending " | ||
1567 | "on %.*s. remove from blocked list and skip.\n", | ||
1568 | res->lockname.len, res->lockname.name); | ||
1569 | dlm_commit_pending_unlock(res, lock); | ||
1570 | lock->unlock_pending = 0; | ||
1571 | } else if (lock->cancel_pending) { | ||
1572 | /* if a cancel was in progress, treat as | ||
1573 | * if this had completed successfully | ||
1574 | * before sending this lock state to the | ||
1575 | * new master */ | ||
1576 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
1577 | mlog(0, "node died with cancel pending " | ||
1578 | "on %.*s. move back to granted list.\n", | ||
1579 | res->lockname.len, res->lockname.name); | ||
1580 | dlm_commit_pending_cancel(res, lock); | ||
1581 | lock->cancel_pending = 0; | ||
1582 | } | ||
1583 | dlm_lock_put(lock); | ||
1584 | } | ||
1585 | } | ||
1586 | } | ||
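/*
 * Editor's note -- not part of the original patch: the four pending cases
 * above resolve the dead master's in-flight operations before the lockres
 * is handed to the recovery master.  A convert that never got an answer is
 * reverted to the granted queue, a brand-new lock request is dropped from
 * the blocked queue, and a pending unlock or cancel is treated as having
 * completed, since the dead master will never send the reply.
 */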
1587 | |||
1588 | |||
1589 | |||
1590 | /* removes all recovered locks from the recovery list. | ||
1591 | * sets the res->owner to the new master. | ||
1592 | * unsets the RECOVERY flag and wakes waiters. */ | ||
1593 | static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | ||
1594 | u8 dead_node, u8 new_master) | ||
1595 | { | ||
1596 | int i; | ||
1597 | struct list_head *iter, *iter2, *bucket; | ||
1598 | struct dlm_lock_resource *res; | ||
1599 | |||
1600 | mlog_entry_void(); | ||
1601 | |||
1602 | assert_spin_locked(&dlm->spinlock); | ||
1603 | |||
1604 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | ||
1605 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
1606 | if (res->owner == dead_node) { | ||
1607 | list_del_init(&res->recovering); | ||
1608 | spin_lock(&res->spinlock); | ||
1609 | dlm_change_lockres_owner(dlm, res, new_master); | ||
1610 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
1611 | __dlm_dirty_lockres(dlm, res); | ||
1612 | spin_unlock(&res->spinlock); | ||
1613 | wake_up(&res->wq); | ||
1614 | } | ||
1615 | } | ||
1616 | |||
1617 | /* this will become unnecessary eventually, but | ||
1618 | * for now we need to run the whole hash, clear | ||
1619 | * the RECOVERING state and set the owner | ||
1620 | * if necessary */ | ||
1621 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
1622 | bucket = &(dlm->resources[i]); | ||
1623 | list_for_each(iter, bucket) { | ||
1624 | res = list_entry (iter, struct dlm_lock_resource, list); | ||
1625 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1626 | if (res->owner == dead_node) { | ||
1627 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
1628 | "was not on recovering list, but " | ||
1629 | "clearing state anyway\n", | ||
1630 | dlm->node_num, res->lockname.len, | ||
1631 | res->lockname.name, new_master); | ||
1632 | } else if (res->owner == dlm->node_num) { | ||
1633 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
1634 | "was not on recovering list, " | ||
1635 | "owner is THIS node, clearing\n", | ||
1636 | dlm->node_num, res->lockname.len, | ||
1637 | res->lockname.name, new_master); | ||
1638 | } else | ||
1639 | continue; | ||
1640 | |||
1641 | spin_lock(&res->spinlock); | ||
1642 | dlm_change_lockres_owner(dlm, res, new_master); | ||
1643 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
1644 | __dlm_dirty_lockres(dlm, res); | ||
1645 | spin_unlock(&res->spinlock); | ||
1646 | wake_up(&res->wq); | ||
1647 | } | ||
1648 | } | ||
1649 | } | ||
1650 | } | ||
1651 | |||
1652 | static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) | ||
1653 | { | ||
1654 | if (local) { | ||
1655 | if (lock->ml.type != LKM_EXMODE && | ||
1656 | lock->ml.type != LKM_PRMODE) | ||
1657 | return 1; | ||
1658 | } else if (lock->ml.type == LKM_EXMODE) | ||
1659 | return 1; | ||
1660 | return 0; | ||
1661 | } | ||
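/*
 * Editor's note -- a restatement of the rule above, not part of the
 * original patch:
 *
 *   whose locks are scanned              lock mode        invalidate LVB?
 *   ----------------------------------   -------------    ---------------
 *   this node's (non-owner, local=1)     EX or PR         no
 *   this node's (non-owner, local=1)     anything else    yes
 *   dead node's (owner here, local=0)    EX               yes
 *   dead node's (owner here, local=0)    anything else    no
 *
 * i.e. a non-owner keeps its LVB only while it holds a readable or
 * writable copy itself, and the owner blanks the LVB whenever the dead
 * node could have written it (held EX).
 */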
1662 | |||
1663 | static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | ||
1664 | struct dlm_lock_resource *res, u8 dead_node) | ||
1665 | { | ||
1666 | struct list_head *iter, *queue; | ||
1667 | struct dlm_lock *lock; | ||
1668 | int blank_lvb = 0, local = 0; | ||
1669 | int i; | ||
1670 | u8 search_node; | ||
1671 | |||
1672 | assert_spin_locked(&dlm->spinlock); | ||
1673 | assert_spin_locked(&res->spinlock); | ||
1674 | |||
1675 | if (res->owner == dlm->node_num) | ||
1676 | /* if this node owned the lockres, and if the dead node | ||
1677 | * had an EX when he died, blank out the lvb */ | ||
1678 | search_node = dead_node; | ||
1679 | else { | ||
1680 | /* if this is a secondary lockres, and we had no EX or PR | ||
1681 | * locks granted, we can no longer trust the lvb */ | ||
1682 | search_node = dlm->node_num; | ||
1683 | local = 1; /* check local state for valid lvb */ | ||
1684 | } | ||
1685 | |||
1686 | for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { | ||
1687 | queue = dlm_list_idx_to_ptr(res, i); | ||
1688 | list_for_each(iter, queue) { | ||
1689 | lock = list_entry (iter, struct dlm_lock, list); | ||
1690 | if (lock->ml.node == search_node) { | ||
1691 | if (dlm_lvb_needs_invalidation(lock, local)) { | ||
1692 | /* zero the lksb lvb and lockres lvb */ | ||
1693 | blank_lvb = 1; | ||
1694 | memset(lock->lksb->lvb, 0, DLM_LVB_LEN); | ||
1695 | } | ||
1696 | } | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | if (blank_lvb) { | ||
1701 | mlog(0, "clearing %.*s lvb, dead node %u had EX\n", | ||
1702 | res->lockname.len, res->lockname.name, dead_node); | ||
1703 | memset(res->lvb, 0, DLM_LVB_LEN); | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | ||
1708 | struct dlm_lock_resource *res, u8 dead_node) | ||
1709 | { | ||
1710 | struct list_head *iter, *tmpiter; | ||
1711 | struct dlm_lock *lock; | ||
1712 | |||
1713 | /* this node is the lockres master: | ||
1714 | * 1) remove any stale locks for the dead node | ||
1715 | * 2) if the dead node had an EX when he died, blank out the lvb | ||
1716 | */ | ||
1717 | assert_spin_locked(&dlm->spinlock); | ||
1718 | assert_spin_locked(&res->spinlock); | ||
1719 | |||
1720 | /* TODO: check pending_asts, pending_basts here */ | ||
1721 | list_for_each_safe(iter, tmpiter, &res->granted) { | ||
1722 | lock = list_entry (iter, struct dlm_lock, list); | ||
1723 | if (lock->ml.node == dead_node) { | ||
1724 | list_del_init(&lock->list); | ||
1725 | dlm_lock_put(lock); | ||
1726 | } | ||
1727 | } | ||
1728 | list_for_each_safe(iter, tmpiter, &res->converting) { | ||
1729 | lock = list_entry (iter, struct dlm_lock, list); | ||
1730 | if (lock->ml.node == dead_node) { | ||
1731 | list_del_init(&lock->list); | ||
1732 | dlm_lock_put(lock); | ||
1733 | } | ||
1734 | } | ||
1735 | list_for_each_safe(iter, tmpiter, &res->blocked) { | ||
1736 | lock = list_entry (iter, struct dlm_lock, list); | ||
1737 | if (lock->ml.node == dead_node) { | ||
1738 | list_del_init(&lock->list); | ||
1739 | dlm_lock_put(lock); | ||
1740 | } | ||
1741 | } | ||
1742 | |||
1743 | /* do not kick thread yet */ | ||
1744 | __dlm_dirty_lockres(dlm, res); | ||
1745 | } | ||
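/*
 * Editor's note -- not part of the original patch: as the lockres master,
 * this node simply drops the dead node's entries from all three queues and
 * marks the lockres dirty; the dlm thread is deliberately not kicked here,
 * so the queues are re-evaluated (and any now-unblocked ASTs/BASTs fired)
 * only after recovery completes.
 */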
1746 | |||
1747 | /* if this node is the recovery master, and there are no | ||
1748 | * locks for a given lockres owned by this node that are in | ||
1749 | * either PR or EX mode, zero out the lvb before requesting. | ||
1750 | * | ||
1751 | */ | ||
1752 | |||
1753 | |||
1754 | static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | ||
1755 | { | ||
1756 | struct list_head *iter; | ||
1757 | struct dlm_lock_resource *res; | ||
1758 | int i; | ||
1759 | struct list_head *bucket; | ||
1760 | |||
1761 | |||
1762 | /* purge any stale mles */ | ||
1763 | dlm_clean_master_list(dlm, dead_node); | ||
1764 | |||
1765 | /* | ||
1766 | * now clean up all lock resources. there are two rules: | ||
1767 | * | ||
1768 | * 1) if the dead node was the master, move the lockres | ||
1769 | * to the recovering list. set the RECOVERING flag. | ||
1770 | * this lockres needs to be cleaned up before it can | ||
1771 | * be used further. | ||
1772 | * | ||
1773 | * 2) if this node was the master, remove all locks from | ||
1774 | * each of the lockres queues that were owned by the | ||
1775 | * dead node. once recovery finishes, the dlm thread | ||
1776 | * can be kicked again to see if any ASTs or BASTs | ||
1777 | * need to be fired as a result. | ||
1778 | */ | ||
1779 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
1780 | bucket = &(dlm->resources[i]); | ||
1781 | list_for_each(iter, bucket) { | ||
1782 | res = list_entry (iter, struct dlm_lock_resource, list); | ||
1783 | if (dlm_is_recovery_lock(res->lockname.name, | ||
1784 | res->lockname.len)) | ||
1785 | continue; | ||
1786 | |||
1787 | spin_lock(&res->spinlock); | ||
1788 | /* zero the lvb if necessary */ | ||
1789 | dlm_revalidate_lvb(dlm, res, dead_node); | ||
1790 | if (res->owner == dead_node) | ||
1791 | dlm_move_lockres_to_recovery_list(dlm, res); | ||
1792 | else if (res->owner == dlm->node_num) { | ||
1793 | dlm_free_dead_locks(dlm, res, dead_node); | ||
1794 | __dlm_lockres_calc_usage(dlm, res); | ||
1795 | } | ||
1796 | spin_unlock(&res->spinlock); | ||
1797 | } | ||
1798 | } | ||
1799 | |||
1800 | } | ||
1801 | |||
1802 | static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) | ||
1803 | { | ||
1804 | assert_spin_locked(&dlm->spinlock); | ||
1805 | |||
1806 | /* check to see if the node is already considered dead */ | ||
1807 | if (!test_bit(idx, dlm->live_nodes_map)) { | ||
1808 | mlog(0, "for domain %s, node %d is already dead. " | ||
1809 | "another node likely did recovery already.\n", | ||
1810 | dlm->name, idx); | ||
1811 | return; | ||
1812 | } | ||
1813 | |||
1814 | /* check to see if we do not care about this node */ | ||
1815 | if (!test_bit(idx, dlm->domain_map)) { | ||
1816 | /* This also catches the case that we get a node down | ||
1817 | * but haven't joined the domain yet. */ | ||
1818 | mlog(0, "node %u already removed from domain!\n", idx); | ||
1819 | return; | ||
1820 | } | ||
1821 | |||
1822 | clear_bit(idx, dlm->live_nodes_map); | ||
1823 | |||
1824 | /* Clean up join state on node death. */ | ||
1825 | if (dlm->joining_node == idx) { | ||
1826 | mlog(0, "Clearing join state for node %u\n", idx); | ||
1827 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
1828 | } | ||
1829 | |||
1830 | /* make sure local cleanup occurs before the heartbeat events */ | ||
1831 | if (!test_bit(idx, dlm->recovery_map)) | ||
1832 | dlm_do_local_recovery_cleanup(dlm, idx); | ||
1833 | |||
1834 | /* notify anything attached to the heartbeat events */ | ||
1835 | dlm_hb_event_notify_attached(dlm, idx, 0); | ||
1836 | |||
1837 | mlog(0, "node %u being removed from domain map!\n", idx); | ||
1838 | clear_bit(idx, dlm->domain_map); | ||
1839 | /* wake up migration waiters if a node goes down. | ||
1840 | * perhaps later we can genericize this for other waiters. */ | ||
1841 | wake_up(&dlm->migration_wq); | ||
1842 | |||
1843 | if (test_bit(idx, dlm->recovery_map)) | ||
1844 | mlog(0, "domain %s, node %u already added " | ||
1845 | "to recovery map!\n", dlm->name, idx); | ||
1846 | else | ||
1847 | set_bit(idx, dlm->recovery_map); | ||
1848 | } | ||
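/*
 * Editor's note -- not part of the original patch: the ordering above
 * matters.  The node is first dropped from live_nodes_map, local lockres
 * cleanup runs (skipped if an earlier down event already put the node in
 * the recovery map), heartbeat listeners are notified, the node is removed
 * from the domain map (waking any migration waiters), and only then is it
 * flagged in recovery_map for the recovery thread to pick up.
 */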
1849 | |||
1850 | void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) | ||
1851 | { | ||
1852 | struct dlm_ctxt *dlm = data; | ||
1853 | |||
1854 | if (!dlm_grab(dlm)) | ||
1855 | return; | ||
1856 | |||
1857 | spin_lock(&dlm->spinlock); | ||
1858 | __dlm_hb_node_down(dlm, idx); | ||
1859 | spin_unlock(&dlm->spinlock); | ||
1860 | |||
1861 | dlm_put(dlm); | ||
1862 | } | ||
1863 | |||
1864 | void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) | ||
1865 | { | ||
1866 | struct dlm_ctxt *dlm = data; | ||
1867 | |||
1868 | if (!dlm_grab(dlm)) | ||
1869 | return; | ||
1870 | |||
1871 | spin_lock(&dlm->spinlock); | ||
1872 | |||
1873 | set_bit(idx, dlm->live_nodes_map); | ||
1874 | |||
1875 | /* notify any mles attached to the heartbeat events */ | ||
1876 | dlm_hb_event_notify_attached(dlm, idx, 1); | ||
1877 | |||
1878 | spin_unlock(&dlm->spinlock); | ||
1879 | |||
1880 | dlm_put(dlm); | ||
1881 | } | ||
1882 | |||
1883 | static void dlm_reco_ast(void *astdata) | ||
1884 | { | ||
1885 | struct dlm_ctxt *dlm = astdata; | ||
1886 | mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", | ||
1887 | dlm->node_num, dlm->name); | ||
1888 | } | ||
1889 | static void dlm_reco_bast(void *astdata, int blocked_type) | ||
1890 | { | ||
1891 | struct dlm_ctxt *dlm = astdata; | ||
1892 | mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", | ||
1893 | dlm->node_num, dlm->name); | ||
1894 | } | ||
1895 | static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) | ||
1896 | { | ||
1897 | mlog(0, "unlockast for recovery lock fired!\n"); | ||
1898 | } | ||
1899 | |||
1900 | |||
1901 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) | ||
1902 | { | ||
1903 | enum dlm_status ret; | ||
1904 | struct dlm_lockstatus lksb; | ||
1905 | int status = -EINVAL; | ||
1906 | |||
1907 | mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", | ||
1908 | dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); | ||
1909 | retry: | ||
1910 | memset(&lksb, 0, sizeof(lksb)); | ||
1911 | |||
1912 | ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, | ||
1913 | DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); | ||
1914 | |||
1915 | if (ret == DLM_NORMAL) { | ||
1916 | mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", | ||
1917 | dlm->name, dlm->node_num); | ||
1918 | /* I am master, send message to all nodes saying | ||
1919 | * that I am beginning a recovery session */ | ||
1920 | status = dlm_send_begin_reco_message(dlm, | ||
1921 | dlm->reco.dead_node); | ||
1922 | |||
1923 | /* recovery lock is a special case. ast will not get fired, | ||
1924 | * so just go ahead and unlock it. */ | ||
1925 | ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); | ||
1926 | if (ret != DLM_NORMAL) { | ||
1927 | /* this would really suck. this could only happen | ||
1928 | * if there was a network error during the unlock | ||
1929 | * because of node death. this means the unlock | ||
1930 | * is actually "done" and the lock structure is | ||
1931 | * even freed. we can continue, but only | ||
1932 | * because this specific lock name is special. */ | ||
1933 | mlog(0, "dlmunlock returned %d\n", ret); | ||
1934 | } | ||
1935 | |||
1936 | if (status < 0) { | ||
1937 | mlog(0, "failed to send recovery message. " | ||
1938 | "must retry with new node map.\n"); | ||
1939 | goto retry; | ||
1940 | } | ||
1941 | } else if (ret == DLM_NOTQUEUED) { | ||
1942 | mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", | ||
1943 | dlm->name, dlm->node_num); | ||
1944 | /* another node is master. wait on | ||
1945 | * reco.new_master != O2NM_INVALID_NODE_NUM */ | ||
1946 | status = -EEXIST; | ||
1947 | } | ||
1948 | |||
1949 | return status; | ||
1950 | } | ||
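/*
 * Editor's note -- not part of the original patch: the recovery-lock
 * request above is effectively the election.  DLM_NORMAL means this node
 * won and has already broadcast DLM_BEGIN_RECO_MSG (retrying with an
 * updated node map if a target dies mid-broadcast); DLM_NOTQUEUED
 * (returned as -EEXIST here) means another node holds the lock, and the
 * caller should instead wait for reco.new_master to be filled in by that
 * node's begin-reco message.
 */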
1951 | |||
1952 | static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | ||
1953 | { | ||
1954 | struct dlm_begin_reco br; | ||
1955 | int ret = 0; | ||
1956 | struct dlm_node_iter iter; | ||
1957 | int nodenum; | ||
1958 | int status; | ||
1959 | |||
1960 | mlog_entry("%u\n", dead_node); | ||
1961 | |||
1962 | mlog(0, "dead node is %u\n", dead_node); | ||
1963 | |||
1964 | spin_lock(&dlm->spinlock); | ||
1965 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
1966 | spin_unlock(&dlm->spinlock); | ||
1967 | |||
1968 | clear_bit(dead_node, iter.node_map); | ||
1969 | |||
1970 | memset(&br, 0, sizeof(br)); | ||
1971 | br.node_idx = dlm->node_num; | ||
1972 | br.dead_node = dead_node; | ||
1973 | |||
1974 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
1975 | ret = 0; | ||
1976 | if (nodenum == dead_node) { | ||
1977 | mlog(0, "not sending begin reco to dead node " | ||
1978 | "%u\n", dead_node); | ||
1979 | continue; | ||
1980 | } | ||
1981 | if (nodenum == dlm->node_num) { | ||
1982 | mlog(0, "not sending begin reco to self\n"); | ||
1983 | continue; | ||
1984 | } | ||
1985 | |||
1986 | ret = -EINVAL; | ||
1987 | mlog(0, "attempting to send begin reco msg to %d\n", | ||
1988 | nodenum); | ||
1989 | ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, | ||
1990 | &br, sizeof(br), nodenum, &status); | ||
1991 | /* negative status is handled ok by caller here */ | ||
1992 | if (ret >= 0) | ||
1993 | ret = status; | ||
1994 | if (ret < 0) { | ||
1995 | struct dlm_lock_resource *res; | ||
1996 | mlog_errno(ret); | ||
1997 | mlog(ML_ERROR, "begin reco of dlm %s to node %u " | ||
1998 | " returned %d\n", dlm->name, nodenum, ret); | ||
1999 | res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, | ||
2000 | DLM_RECOVERY_LOCK_NAME_LEN); | ||
2001 | if (res) { | ||
2002 | dlm_print_one_lock_resource(res); | ||
2003 | dlm_lockres_put(res); | ||
2004 | } else { | ||
2005 | mlog(ML_ERROR, "recovery lock not found\n"); | ||
2006 | } | ||
2007 | break; | ||
2008 | } | ||
2009 | } | ||
2010 | |||
2011 | return ret; | ||
2012 | } | ||
2013 | |||
2014 | int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | ||
2015 | { | ||
2016 | struct dlm_ctxt *dlm = data; | ||
2017 | struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; | ||
2018 | |||
2019 | /* ok to return 0, domain has gone away */ | ||
2020 | if (!dlm_grab(dlm)) | ||
2021 | return 0; | ||
2022 | |||
2023 | mlog(0, "node %u wants to recover node %u\n", | ||
2024 | br->node_idx, br->dead_node); | ||
2025 | |||
2026 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); | ||
2027 | |||
2028 | spin_lock(&dlm->spinlock); | ||
2029 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { | ||
2030 | mlog(0, "new_master already set to %u!\n", | ||
2031 | dlm->reco.new_master); | ||
2032 | } | ||
2033 | if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { | ||
2034 | mlog(0, "dead_node already set to %u!\n", | ||
2035 | dlm->reco.dead_node); | ||
2036 | } | ||
2037 | dlm->reco.new_master = br->node_idx; | ||
2038 | dlm->reco.dead_node = br->dead_node; | ||
2039 | if (!test_bit(br->dead_node, dlm->recovery_map)) { | ||
2040 | mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " | ||
2041 | "node has not yet. marking %u as dead\n", | ||
2042 | br->node_idx, br->dead_node, br->dead_node); | ||
2043 | __dlm_hb_node_down(dlm, br->dead_node); | ||
2044 | } | ||
2045 | spin_unlock(&dlm->spinlock); | ||
2046 | |||
2047 | dlm_kick_recovery_thread(dlm); | ||
2048 | dlm_put(dlm); | ||
2049 | return 0; | ||
2050 | } | ||
2051 | |||
2052 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | ||
2053 | { | ||
2054 | int ret = 0; | ||
2055 | struct dlm_finalize_reco fr; | ||
2056 | struct dlm_node_iter iter; | ||
2057 | int nodenum; | ||
2058 | int status; | ||
2059 | |||
2060 | mlog(0, "finishing recovery for node %s:%u\n", | ||
2061 | dlm->name, dlm->reco.dead_node); | ||
2062 | |||
2063 | spin_lock(&dlm->spinlock); | ||
2064 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
2065 | spin_unlock(&dlm->spinlock); | ||
2066 | |||
2067 | memset(&fr, 0, sizeof(fr)); | ||
2068 | fr.node_idx = dlm->node_num; | ||
2069 | fr.dead_node = dlm->reco.dead_node; | ||
2070 | |||
2071 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
2072 | if (nodenum == dlm->node_num) | ||
2073 | continue; | ||
2074 | ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, | ||
2075 | &fr, sizeof(fr), nodenum, &status); | ||
2076 | if (ret >= 0) { | ||
2077 | ret = status; | ||
2078 | if (dlm_is_host_down(ret)) { | ||
2079 | /* this has no effect on this recovery | ||
2080 | * session, so set the status to zero to | ||
2081 | * finish out the last recovery */ | ||
2082 | mlog(ML_ERROR, "node %u went down after this " | ||
2083 | "node finished recovery.\n", nodenum); | ||
2084 | ret = 0; | ||
2085 | } | ||
2086 | } | ||
2087 | if (ret < 0) { | ||
2088 | mlog_errno(ret); | ||
2089 | break; | ||
2090 | } | ||
2091 | } | ||
2092 | |||
2093 | return ret; | ||
2094 | } | ||
2095 | |||
2096 | int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | ||
2097 | { | ||
2098 | struct dlm_ctxt *dlm = data; | ||
2099 | struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; | ||
2100 | |||
2101 | /* ok to return 0, domain has gone away */ | ||
2102 | if (!dlm_grab(dlm)) | ||
2103 | return 0; | ||
2104 | |||
2105 | mlog(0, "node %u finalizing recovery of node %u\n", | ||
2106 | fr->node_idx, fr->dead_node); | ||
2107 | |||
2108 | spin_lock(&dlm->spinlock); | ||
2109 | |||
2110 | if (dlm->reco.new_master != fr->node_idx) { | ||
2111 | mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " | ||
2112 | "%u is supposed to be the new master, dead=%u\n", | ||
2113 | fr->node_idx, dlm->reco.new_master, fr->dead_node); | ||
2114 | BUG(); | ||
2115 | } | ||
2116 | if (dlm->reco.dead_node != fr->dead_node) { | ||
2117 | mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " | ||
2118 | "node %u, but node %u is supposed to be dead\n", | ||
2119 | fr->node_idx, fr->dead_node, dlm->reco.dead_node); | ||
2120 | BUG(); | ||
2121 | } | ||
2122 | |||
2123 | dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); | ||
2124 | |||
2125 | spin_unlock(&dlm->spinlock); | ||
2126 | |||
2127 | dlm_reset_recovery(dlm); | ||
2128 | |||
2129 | dlm_kick_recovery_thread(dlm); | ||
2130 | dlm_put(dlm); | ||
2131 | return 0; | ||
2132 | } | ||
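/*
 * Editor's note -- not part of the original patch: taken together, the
 * handlers above implement the recovery handshake.  The elected recovery
 * master broadcasts DLM_BEGIN_RECO_MSG; every node records new_master and
 * dead_node (marking the dead node down locally if heartbeat has not yet);
 * lock state is then migrated to the recovery master; and finally
 * DLM_FINALIZE_RECO_MSG tells each node to re-own its recovered lockres
 * entries via dlm_finish_local_lockres_recovery() and reset its recovery
 * state.
 */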