diff options
author | Kurt Hackel <kurt.hackel@oracle.com> | 2005-12-15 17:31:23 -0500 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2006-01-03 14:45:47 -0500 |
commit | 6714d8e86bf443f6f7af50f9d432025649f091f5 (patch) | |
tree | 2c484bd1894a90cad7020869c7054f192d3bf34d /fs/ocfs2/dlm/dlmdomain.c | |
parent | 98211489d4147e41b11703e4245846d60b3acce4 (diff) |
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
A distributed lock manager built with the cluster file system use case
in mind. The OCFS2 dlm exposes a VMS style API, though things have
been simplified internally. The only lock levels implemented currently
are NLMODE, PRMODE and EXMODE.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Diffstat (limited to 'fs/ocfs2/dlm/dlmdomain.c')
-rw-r--r-- | fs/ocfs2/dlm/dlmdomain.c | 1469 |
1 files changed, 1469 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000000..da3c22045f89 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -0,0 +1,1469 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdomain.c | ||
5 | * | ||
6 | * defines domain join / leave apis | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/utsname.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/delay.h> | ||
35 | #include <linux/err.h> | ||
36 | |||
37 | #include "cluster/heartbeat.h" | ||
38 | #include "cluster/nodemanager.h" | ||
39 | #include "cluster/tcp.h" | ||
40 | |||
41 | #include "dlmapi.h" | ||
42 | #include "dlmcommon.h" | ||
43 | |||
44 | #include "dlmdebug.h" | ||
45 | #include "dlmdomain.h" | ||
46 | |||
47 | #include "dlmver.h" | ||
48 | |||
49 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) | ||
50 | #include "cluster/masklog.h" | ||
51 | |||
52 | /* | ||
53 | * | ||
54 | * spinlock lock ordering: if multiple locks are needed, obey this ordering: | ||
55 | * dlm_domain_lock | ||
56 | * struct dlm_ctxt->spinlock | ||
57 | * struct dlm_lock_resource->spinlock | ||
58 | * struct dlm_ctxt->master_lock | ||
59 | * struct dlm_ctxt->ast_lock | ||
60 | * dlm_master_list_entry->spinlock | ||
61 | * dlm_lock->spinlock | ||
62 | * | ||
63 | */ | ||
64 | |||
65 | spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; | ||
66 | LIST_HEAD(dlm_domains); | ||
67 | static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); | ||
68 | |||
69 | #define DLM_DOMAIN_BACKOFF_MS 200 | ||
70 | |||
71 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); | ||
72 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); | ||
73 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); | ||
74 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); | ||
75 | |||
76 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); | ||
77 | |||
78 | void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) | ||
79 | { | ||
80 | list_del_init(&lockres->list); | ||
81 | dlm_lockres_put(lockres); | ||
82 | } | ||
83 | |||
84 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | ||
85 | struct dlm_lock_resource *res) | ||
86 | { | ||
87 | struct list_head *bucket; | ||
88 | struct qstr *q; | ||
89 | |||
90 | assert_spin_locked(&dlm->spinlock); | ||
91 | |||
92 | q = &res->lockname; | ||
93 | q->hash = full_name_hash(q->name, q->len); | ||
94 | bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); | ||
95 | |||
96 | /* get a reference for our hashtable */ | ||
97 | dlm_lockres_get(res); | ||
98 | |||
99 | list_add_tail(&res->list, bucket); | ||
100 | } | ||
101 | |||
102 | struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
103 | const char *name, | ||
104 | unsigned int len) | ||
105 | { | ||
106 | unsigned int hash; | ||
107 | struct list_head *iter; | ||
108 | struct dlm_lock_resource *tmpres=NULL; | ||
109 | struct list_head *bucket; | ||
110 | |||
111 | mlog_entry("%.*s\n", len, name); | ||
112 | |||
113 | assert_spin_locked(&dlm->spinlock); | ||
114 | |||
115 | hash = full_name_hash(name, len); | ||
116 | |||
117 | bucket = &(dlm->resources[hash & DLM_HASH_MASK]); | ||
118 | |||
119 | /* check for pre-existing lock */ | ||
120 | list_for_each(iter, bucket) { | ||
121 | tmpres = list_entry(iter, struct dlm_lock_resource, list); | ||
122 | if (tmpres->lockname.len == len && | ||
123 | memcmp(tmpres->lockname.name, name, len) == 0) { | ||
124 | dlm_lockres_get(tmpres); | ||
125 | break; | ||
126 | } | ||
127 | |||
128 | tmpres = NULL; | ||
129 | } | ||
130 | return tmpres; | ||
131 | } | ||
132 | |||
133 | struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
134 | const char *name, | ||
135 | unsigned int len) | ||
136 | { | ||
137 | struct dlm_lock_resource *res; | ||
138 | |||
139 | spin_lock(&dlm->spinlock); | ||
140 | res = __dlm_lookup_lockres(dlm, name, len); | ||
141 | spin_unlock(&dlm->spinlock); | ||
142 | return res; | ||
143 | } | ||
144 | |||
145 | static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) | ||
146 | { | ||
147 | struct dlm_ctxt *tmp = NULL; | ||
148 | struct list_head *iter; | ||
149 | |||
150 | assert_spin_locked(&dlm_domain_lock); | ||
151 | |||
152 | /* tmp->name here is always NULL terminated, | ||
153 | * but domain may not be! */ | ||
154 | list_for_each(iter, &dlm_domains) { | ||
155 | tmp = list_entry (iter, struct dlm_ctxt, list); | ||
156 | if (strlen(tmp->name) == len && | ||
157 | memcmp(tmp->name, domain, len)==0) | ||
158 | break; | ||
159 | tmp = NULL; | ||
160 | } | ||
161 | |||
162 | return tmp; | ||
163 | } | ||
164 | |||
165 | /* For null terminated domain strings ONLY */ | ||
166 | static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) | ||
167 | { | ||
168 | assert_spin_locked(&dlm_domain_lock); | ||
169 | |||
170 | return __dlm_lookup_domain_full(domain, strlen(domain)); | ||
171 | } | ||
172 | |||
173 | |||
174 | /* returns true on one of two conditions: | ||
175 | * 1) the domain does not exist | ||
176 | * 2) the domain exists and it's state is "joined" */ | ||
177 | static int dlm_wait_on_domain_helper(const char *domain) | ||
178 | { | ||
179 | int ret = 0; | ||
180 | struct dlm_ctxt *tmp = NULL; | ||
181 | |||
182 | spin_lock(&dlm_domain_lock); | ||
183 | |||
184 | tmp = __dlm_lookup_domain(domain); | ||
185 | if (!tmp) | ||
186 | ret = 1; | ||
187 | else if (tmp->dlm_state == DLM_CTXT_JOINED) | ||
188 | ret = 1; | ||
189 | |||
190 | spin_unlock(&dlm_domain_lock); | ||
191 | return ret; | ||
192 | } | ||
193 | |||
194 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | ||
195 | { | ||
196 | if (dlm->resources) | ||
197 | free_page((unsigned long) dlm->resources); | ||
198 | |||
199 | if (dlm->name) | ||
200 | kfree(dlm->name); | ||
201 | |||
202 | kfree(dlm); | ||
203 | } | ||
204 | |||
205 | /* A little strange - this function will be called while holding | ||
206 | * dlm_domain_lock and is expected to be holding it on the way out. We | ||
207 | * will however drop and reacquire it multiple times */ | ||
208 | static void dlm_ctxt_release(struct kref *kref) | ||
209 | { | ||
210 | struct dlm_ctxt *dlm; | ||
211 | |||
212 | dlm = container_of(kref, struct dlm_ctxt, dlm_refs); | ||
213 | |||
214 | BUG_ON(dlm->num_joins); | ||
215 | BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); | ||
216 | |||
217 | /* we may still be in the list if we hit an error during join. */ | ||
218 | list_del_init(&dlm->list); | ||
219 | |||
220 | spin_unlock(&dlm_domain_lock); | ||
221 | |||
222 | mlog(0, "freeing memory from domain %s\n", dlm->name); | ||
223 | |||
224 | wake_up(&dlm_domain_events); | ||
225 | |||
226 | dlm_free_ctxt_mem(dlm); | ||
227 | |||
228 | spin_lock(&dlm_domain_lock); | ||
229 | } | ||
230 | |||
231 | void dlm_put(struct dlm_ctxt *dlm) | ||
232 | { | ||
233 | spin_lock(&dlm_domain_lock); | ||
234 | kref_put(&dlm->dlm_refs, dlm_ctxt_release); | ||
235 | spin_unlock(&dlm_domain_lock); | ||
236 | } | ||
237 | |||
238 | static void __dlm_get(struct dlm_ctxt *dlm) | ||
239 | { | ||
240 | kref_get(&dlm->dlm_refs); | ||
241 | } | ||
242 | |||
243 | /* given a questionable reference to a dlm object, gets a reference if | ||
244 | * it can find it in the list, otherwise returns NULL in which case | ||
245 | * you shouldn't trust your pointer. */ | ||
246 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) | ||
247 | { | ||
248 | struct list_head *iter; | ||
249 | struct dlm_ctxt *target = NULL; | ||
250 | |||
251 | spin_lock(&dlm_domain_lock); | ||
252 | |||
253 | list_for_each(iter, &dlm_domains) { | ||
254 | target = list_entry (iter, struct dlm_ctxt, list); | ||
255 | |||
256 | if (target == dlm) { | ||
257 | __dlm_get(target); | ||
258 | break; | ||
259 | } | ||
260 | |||
261 | target = NULL; | ||
262 | } | ||
263 | |||
264 | spin_unlock(&dlm_domain_lock); | ||
265 | |||
266 | return target; | ||
267 | } | ||
268 | |||
269 | int dlm_domain_fully_joined(struct dlm_ctxt *dlm) | ||
270 | { | ||
271 | int ret; | ||
272 | |||
273 | spin_lock(&dlm_domain_lock); | ||
274 | ret = (dlm->dlm_state == DLM_CTXT_JOINED) || | ||
275 | (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); | ||
276 | spin_unlock(&dlm_domain_lock); | ||
277 | |||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | ||
282 | { | ||
283 | dlm_unregister_domain_handlers(dlm); | ||
284 | dlm_complete_thread(dlm); | ||
285 | dlm_complete_recovery_thread(dlm); | ||
286 | |||
287 | /* We've left the domain. Now we can take ourselves out of the | ||
288 | * list and allow the kref stuff to help us free the | ||
289 | * memory. */ | ||
290 | spin_lock(&dlm_domain_lock); | ||
291 | list_del_init(&dlm->list); | ||
292 | spin_unlock(&dlm_domain_lock); | ||
293 | |||
294 | /* Wake up anyone waiting for us to remove this domain */ | ||
295 | wake_up(&dlm_domain_events); | ||
296 | } | ||
297 | |||
298 | static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) | ||
299 | { | ||
300 | int i; | ||
301 | struct dlm_lock_resource *res; | ||
302 | |||
303 | mlog(0, "Migrating locks from domain %s\n", dlm->name); | ||
304 | restart: | ||
305 | spin_lock(&dlm->spinlock); | ||
306 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
307 | while (!list_empty(&dlm->resources[i])) { | ||
308 | res = list_entry(dlm->resources[i].next, | ||
309 | struct dlm_lock_resource, list); | ||
310 | /* need reference when manually grabbing lockres */ | ||
311 | dlm_lockres_get(res); | ||
312 | /* this should unhash the lockres | ||
313 | * and exit with dlm->spinlock */ | ||
314 | mlog(0, "purging res=%p\n", res); | ||
315 | if (dlm_lockres_is_dirty(dlm, res)) { | ||
316 | /* HACK! this should absolutely go. | ||
317 | * need to figure out why some empty | ||
318 | * lockreses are still marked dirty */ | ||
319 | mlog(ML_ERROR, "lockres %.*s dirty!\n", | ||
320 | res->lockname.len, res->lockname.name); | ||
321 | |||
322 | spin_unlock(&dlm->spinlock); | ||
323 | dlm_kick_thread(dlm, res); | ||
324 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | ||
325 | dlm_lockres_put(res); | ||
326 | goto restart; | ||
327 | } | ||
328 | dlm_purge_lockres(dlm, res); | ||
329 | dlm_lockres_put(res); | ||
330 | } | ||
331 | } | ||
332 | spin_unlock(&dlm->spinlock); | ||
333 | |||
334 | mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); | ||
335 | } | ||
336 | |||
337 | static int dlm_no_joining_node(struct dlm_ctxt *dlm) | ||
338 | { | ||
339 | int ret; | ||
340 | |||
341 | spin_lock(&dlm->spinlock); | ||
342 | ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; | ||
343 | spin_unlock(&dlm->spinlock); | ||
344 | |||
345 | return ret; | ||
346 | } | ||
347 | |||
348 | static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) | ||
349 | { | ||
350 | /* Yikes, a double spinlock! I need domain_lock for the dlm | ||
351 | * state and the dlm spinlock for join state... Sorry! */ | ||
352 | again: | ||
353 | spin_lock(&dlm_domain_lock); | ||
354 | spin_lock(&dlm->spinlock); | ||
355 | |||
356 | if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
357 | mlog(0, "Node %d is joining, we wait on it.\n", | ||
358 | dlm->joining_node); | ||
359 | spin_unlock(&dlm->spinlock); | ||
360 | spin_unlock(&dlm_domain_lock); | ||
361 | |||
362 | wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); | ||
363 | goto again; | ||
364 | } | ||
365 | |||
366 | dlm->dlm_state = DLM_CTXT_LEAVING; | ||
367 | spin_unlock(&dlm->spinlock); | ||
368 | spin_unlock(&dlm_domain_lock); | ||
369 | } | ||
370 | |||
371 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) | ||
372 | { | ||
373 | int node = -1; | ||
374 | |||
375 | assert_spin_locked(&dlm->spinlock); | ||
376 | |||
377 | mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); | ||
378 | |||
379 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
380 | node + 1)) < O2NM_MAX_NODES) { | ||
381 | mlog(ML_NOTICE, " node %d\n", node); | ||
382 | } | ||
383 | } | ||
384 | |||
385 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) | ||
386 | { | ||
387 | struct dlm_ctxt *dlm = data; | ||
388 | unsigned int node; | ||
389 | struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; | ||
390 | |||
391 | mlog_entry("%p %u %p", msg, len, data); | ||
392 | |||
393 | if (!dlm_grab(dlm)) | ||
394 | return 0; | ||
395 | |||
396 | node = exit_msg->node_idx; | ||
397 | |||
398 | mlog(0, "Node %u leaves domain %s\n", node, dlm->name); | ||
399 | |||
400 | spin_lock(&dlm->spinlock); | ||
401 | clear_bit(node, dlm->domain_map); | ||
402 | __dlm_print_nodes(dlm); | ||
403 | |||
404 | /* notify anything attached to the heartbeat events */ | ||
405 | dlm_hb_event_notify_attached(dlm, node, 0); | ||
406 | |||
407 | spin_unlock(&dlm->spinlock); | ||
408 | |||
409 | dlm_put(dlm); | ||
410 | |||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, | ||
415 | unsigned int node) | ||
416 | { | ||
417 | int status; | ||
418 | struct dlm_exit_domain leave_msg; | ||
419 | |||
420 | mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", | ||
421 | node, dlm->name, dlm->node_num); | ||
422 | |||
423 | memset(&leave_msg, 0, sizeof(leave_msg)); | ||
424 | leave_msg.node_idx = dlm->node_num; | ||
425 | |||
426 | status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
427 | &leave_msg, sizeof(leave_msg), node, | ||
428 | NULL); | ||
429 | |||
430 | mlog(0, "status return %d from o2net_send_message\n", status); | ||
431 | |||
432 | return status; | ||
433 | } | ||
434 | |||
435 | |||
436 | static void dlm_leave_domain(struct dlm_ctxt *dlm) | ||
437 | { | ||
438 | int node, clear_node, status; | ||
439 | |||
440 | /* At this point we've migrated away all our locks and won't | ||
441 | * accept mastership of new ones. The dlm is responsible for | ||
442 | * almost nothing now. We make sure not to confuse any joining | ||
443 | * nodes and then commence shutdown procedure. */ | ||
444 | |||
445 | spin_lock(&dlm->spinlock); | ||
446 | /* Clear ourselves from the domain map */ | ||
447 | clear_bit(dlm->node_num, dlm->domain_map); | ||
448 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
449 | 0)) < O2NM_MAX_NODES) { | ||
450 | /* Drop the dlm spinlock. This is safe wrt the domain_map. | ||
451 | * -nodes cannot be added now as the | ||
452 | * query_join_handlers knows to respond with OK_NO_MAP | ||
453 | * -we catch the right network errors if a node is | ||
454 | * removed from the map while we're sending him the | ||
455 | * exit message. */ | ||
456 | spin_unlock(&dlm->spinlock); | ||
457 | |||
458 | clear_node = 1; | ||
459 | |||
460 | status = dlm_send_one_domain_exit(dlm, node); | ||
461 | if (status < 0 && | ||
462 | status != -ENOPROTOOPT && | ||
463 | status != -ENOTCONN) { | ||
464 | mlog(ML_NOTICE, "Error %d sending domain exit message " | ||
465 | "to node %d\n", status, node); | ||
466 | |||
467 | /* Not sure what to do here but lets sleep for | ||
468 | * a bit in case this was a transient | ||
469 | * error... */ | ||
470 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
471 | clear_node = 0; | ||
472 | } | ||
473 | |||
474 | spin_lock(&dlm->spinlock); | ||
475 | /* If we're not clearing the node bit then we intend | ||
476 | * to loop back around to try again. */ | ||
477 | if (clear_node) | ||
478 | clear_bit(node, dlm->domain_map); | ||
479 | } | ||
480 | spin_unlock(&dlm->spinlock); | ||
481 | } | ||
482 | |||
483 | int dlm_joined(struct dlm_ctxt *dlm) | ||
484 | { | ||
485 | int ret = 0; | ||
486 | |||
487 | spin_lock(&dlm_domain_lock); | ||
488 | |||
489 | if (dlm->dlm_state == DLM_CTXT_JOINED) | ||
490 | ret = 1; | ||
491 | |||
492 | spin_unlock(&dlm_domain_lock); | ||
493 | |||
494 | return ret; | ||
495 | } | ||
496 | |||
497 | int dlm_shutting_down(struct dlm_ctxt *dlm) | ||
498 | { | ||
499 | int ret = 0; | ||
500 | |||
501 | spin_lock(&dlm_domain_lock); | ||
502 | |||
503 | if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) | ||
504 | ret = 1; | ||
505 | |||
506 | spin_unlock(&dlm_domain_lock); | ||
507 | |||
508 | return ret; | ||
509 | } | ||
510 | |||
511 | void dlm_unregister_domain(struct dlm_ctxt *dlm) | ||
512 | { | ||
513 | int leave = 0; | ||
514 | |||
515 | spin_lock(&dlm_domain_lock); | ||
516 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); | ||
517 | BUG_ON(!dlm->num_joins); | ||
518 | |||
519 | dlm->num_joins--; | ||
520 | if (!dlm->num_joins) { | ||
521 | /* We mark it "in shutdown" now so new register | ||
522 | * requests wait until we've completely left the | ||
523 | * domain. Don't use DLM_CTXT_LEAVING yet as we still | ||
524 | * want new domain joins to communicate with us at | ||
525 | * least until we've completed migration of our | ||
526 | * resources. */ | ||
527 | dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; | ||
528 | leave = 1; | ||
529 | } | ||
530 | spin_unlock(&dlm_domain_lock); | ||
531 | |||
532 | if (leave) { | ||
533 | mlog(0, "shutting down domain %s\n", dlm->name); | ||
534 | |||
535 | /* We changed dlm state, notify the thread */ | ||
536 | dlm_kick_thread(dlm, NULL); | ||
537 | |||
538 | dlm_migrate_all_locks(dlm); | ||
539 | dlm_mark_domain_leaving(dlm); | ||
540 | dlm_leave_domain(dlm); | ||
541 | dlm_complete_dlm_shutdown(dlm); | ||
542 | } | ||
543 | dlm_put(dlm); | ||
544 | } | ||
545 | EXPORT_SYMBOL_GPL(dlm_unregister_domain); | ||
546 | |||
547 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
548 | { | ||
549 | struct dlm_query_join_request *query; | ||
550 | enum dlm_query_join_response response; | ||
551 | struct dlm_ctxt *dlm = NULL; | ||
552 | |||
553 | query = (struct dlm_query_join_request *) msg->buf; | ||
554 | |||
555 | mlog(0, "node %u wants to join domain %s\n", query->node_idx, | ||
556 | query->domain); | ||
557 | |||
558 | /* | ||
559 | * If heartbeat doesn't consider the node live, tell it | ||
560 | * to back off and try again. This gives heartbeat a chance | ||
561 | * to catch up. | ||
562 | */ | ||
563 | if (!o2hb_check_node_heartbeating(query->node_idx)) { | ||
564 | mlog(0, "node %u is not in our live map yet\n", | ||
565 | query->node_idx); | ||
566 | |||
567 | response = JOIN_DISALLOW; | ||
568 | goto respond; | ||
569 | } | ||
570 | |||
571 | response = JOIN_OK_NO_MAP; | ||
572 | |||
573 | spin_lock(&dlm_domain_lock); | ||
574 | dlm = __dlm_lookup_domain_full(query->domain, query->name_len); | ||
575 | /* Once the dlm ctxt is marked as leaving then we don't want | ||
576 | * to be put in someone's domain map. */ | ||
577 | if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { | ||
578 | spin_lock(&dlm->spinlock); | ||
579 | |||
580 | if (dlm->dlm_state == DLM_CTXT_NEW && | ||
581 | dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
582 | /*If this is a brand new context and we | ||
583 | * haven't started our join process yet, then | ||
584 | * the other node won the race. */ | ||
585 | response = JOIN_OK_NO_MAP; | ||
586 | } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
587 | /* Disallow parallel joins. */ | ||
588 | response = JOIN_DISALLOW; | ||
589 | } else { | ||
590 | /* Alright we're fully a part of this domain | ||
591 | * so we keep some state as to who's joining | ||
592 | * and indicate to him that needs to be fixed | ||
593 | * up. */ | ||
594 | response = JOIN_OK; | ||
595 | __dlm_set_joining_node(dlm, query->node_idx); | ||
596 | } | ||
597 | |||
598 | spin_unlock(&dlm->spinlock); | ||
599 | } | ||
600 | spin_unlock(&dlm_domain_lock); | ||
601 | |||
602 | respond: | ||
603 | mlog(0, "We respond with %u\n", response); | ||
604 | |||
605 | return response; | ||
606 | } | ||
607 | |||
608 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) | ||
609 | { | ||
610 | struct dlm_assert_joined *assert; | ||
611 | struct dlm_ctxt *dlm = NULL; | ||
612 | |||
613 | assert = (struct dlm_assert_joined *) msg->buf; | ||
614 | |||
615 | mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, | ||
616 | assert->domain); | ||
617 | |||
618 | spin_lock(&dlm_domain_lock); | ||
619 | dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); | ||
620 | /* XXX should we consider no dlm ctxt an error? */ | ||
621 | if (dlm) { | ||
622 | spin_lock(&dlm->spinlock); | ||
623 | |||
624 | /* Alright, this node has officially joined our | ||
625 | * domain. Set him in the map and clean up our | ||
626 | * leftover join state. */ | ||
627 | BUG_ON(dlm->joining_node != assert->node_idx); | ||
628 | set_bit(assert->node_idx, dlm->domain_map); | ||
629 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
630 | |||
631 | __dlm_print_nodes(dlm); | ||
632 | |||
633 | /* notify anything attached to the heartbeat events */ | ||
634 | dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); | ||
635 | |||
636 | spin_unlock(&dlm->spinlock); | ||
637 | } | ||
638 | spin_unlock(&dlm_domain_lock); | ||
639 | |||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
644 | { | ||
645 | struct dlm_cancel_join *cancel; | ||
646 | struct dlm_ctxt *dlm = NULL; | ||
647 | |||
648 | cancel = (struct dlm_cancel_join *) msg->buf; | ||
649 | |||
650 | mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, | ||
651 | cancel->domain); | ||
652 | |||
653 | spin_lock(&dlm_domain_lock); | ||
654 | dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); | ||
655 | |||
656 | if (dlm) { | ||
657 | spin_lock(&dlm->spinlock); | ||
658 | |||
659 | /* Yikes, this guy wants to cancel his join. No | ||
660 | * problem, we simply cleanup our join state. */ | ||
661 | BUG_ON(dlm->joining_node != cancel->node_idx); | ||
662 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
663 | |||
664 | spin_unlock(&dlm->spinlock); | ||
665 | } | ||
666 | spin_unlock(&dlm_domain_lock); | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
671 | static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, | ||
672 | unsigned int node) | ||
673 | { | ||
674 | int status; | ||
675 | struct dlm_cancel_join cancel_msg; | ||
676 | |||
677 | memset(&cancel_msg, 0, sizeof(cancel_msg)); | ||
678 | cancel_msg.node_idx = dlm->node_num; | ||
679 | cancel_msg.name_len = strlen(dlm->name); | ||
680 | memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); | ||
681 | |||
682 | status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
683 | &cancel_msg, sizeof(cancel_msg), node, | ||
684 | NULL); | ||
685 | if (status < 0) { | ||
686 | mlog_errno(status); | ||
687 | goto bail; | ||
688 | } | ||
689 | |||
690 | bail: | ||
691 | return status; | ||
692 | } | ||
693 | |||
694 | /* map_size should be in bytes. */ | ||
695 | static int dlm_send_join_cancels(struct dlm_ctxt *dlm, | ||
696 | unsigned long *node_map, | ||
697 | unsigned int map_size) | ||
698 | { | ||
699 | int status, tmpstat; | ||
700 | unsigned int node; | ||
701 | |||
702 | if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * | ||
703 | sizeof(unsigned long))) { | ||
704 | mlog(ML_ERROR, | ||
705 | "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", | ||
706 | map_size, BITS_TO_LONGS(O2NM_MAX_NODES)); | ||
707 | return -EINVAL; | ||
708 | } | ||
709 | |||
710 | status = 0; | ||
711 | node = -1; | ||
712 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
713 | node + 1)) < O2NM_MAX_NODES) { | ||
714 | if (node == dlm->node_num) | ||
715 | continue; | ||
716 | |||
717 | tmpstat = dlm_send_one_join_cancel(dlm, node); | ||
718 | if (tmpstat) { | ||
719 | mlog(ML_ERROR, "Error return %d cancelling join on " | ||
720 | "node %d\n", tmpstat, node); | ||
721 | if (!status) | ||
722 | status = tmpstat; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | if (status) | ||
727 | mlog_errno(status); | ||
728 | return status; | ||
729 | } | ||
730 | |||
731 | static int dlm_request_join(struct dlm_ctxt *dlm, | ||
732 | int node, | ||
733 | enum dlm_query_join_response *response) | ||
734 | { | ||
735 | int status, retval; | ||
736 | struct dlm_query_join_request join_msg; | ||
737 | |||
738 | mlog(0, "querying node %d\n", node); | ||
739 | |||
740 | memset(&join_msg, 0, sizeof(join_msg)); | ||
741 | join_msg.node_idx = dlm->node_num; | ||
742 | join_msg.name_len = strlen(dlm->name); | ||
743 | memcpy(join_msg.domain, dlm->name, join_msg.name_len); | ||
744 | |||
745 | status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, | ||
746 | sizeof(join_msg), node, &retval); | ||
747 | if (status < 0 && status != -ENOPROTOOPT) { | ||
748 | mlog_errno(status); | ||
749 | goto bail; | ||
750 | } | ||
751 | |||
752 | /* -ENOPROTOOPT from the net code means the other side isn't | ||
753 | listening for our message type -- that's fine, it means | ||
754 | his dlm isn't up, so we can consider him a 'yes' but not | ||
755 | joined into the domain. */ | ||
756 | if (status == -ENOPROTOOPT) { | ||
757 | status = 0; | ||
758 | *response = JOIN_OK_NO_MAP; | ||
759 | } else if (retval == JOIN_DISALLOW || | ||
760 | retval == JOIN_OK || | ||
761 | retval == JOIN_OK_NO_MAP) { | ||
762 | *response = retval; | ||
763 | } else { | ||
764 | status = -EINVAL; | ||
765 | mlog(ML_ERROR, "invalid response %d from node %u\n", retval, | ||
766 | node); | ||
767 | } | ||
768 | |||
769 | mlog(0, "status %d, node %d response is %d\n", status, node, | ||
770 | *response); | ||
771 | |||
772 | bail: | ||
773 | return status; | ||
774 | } | ||
775 | |||
776 | static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, | ||
777 | unsigned int node) | ||
778 | { | ||
779 | int status; | ||
780 | struct dlm_assert_joined assert_msg; | ||
781 | |||
782 | mlog(0, "Sending join assert to node %u\n", node); | ||
783 | |||
784 | memset(&assert_msg, 0, sizeof(assert_msg)); | ||
785 | assert_msg.node_idx = dlm->node_num; | ||
786 | assert_msg.name_len = strlen(dlm->name); | ||
787 | memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); | ||
788 | |||
789 | status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
790 | &assert_msg, sizeof(assert_msg), node, | ||
791 | NULL); | ||
792 | if (status < 0) | ||
793 | mlog_errno(status); | ||
794 | |||
795 | return status; | ||
796 | } | ||
797 | |||
798 | static void dlm_send_join_asserts(struct dlm_ctxt *dlm, | ||
799 | unsigned long *node_map) | ||
800 | { | ||
801 | int status, node, live; | ||
802 | |||
803 | status = 0; | ||
804 | node = -1; | ||
805 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
806 | node + 1)) < O2NM_MAX_NODES) { | ||
807 | if (node == dlm->node_num) | ||
808 | continue; | ||
809 | |||
810 | do { | ||
811 | /* It is very important that this message be | ||
812 | * received so we spin until either the node | ||
813 | * has died or it gets the message. */ | ||
814 | status = dlm_send_one_join_assert(dlm, node); | ||
815 | |||
816 | spin_lock(&dlm->spinlock); | ||
817 | live = test_bit(node, dlm->live_nodes_map); | ||
818 | spin_unlock(&dlm->spinlock); | ||
819 | |||
820 | if (status) { | ||
821 | mlog(ML_ERROR, "Error return %d asserting " | ||
822 | "join on node %d\n", status, node); | ||
823 | |||
824 | /* give us some time between errors... */ | ||
825 | if (live) | ||
826 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
827 | } | ||
828 | } while (status && live); | ||
829 | } | ||
830 | } | ||
831 | |||
832 | struct domain_join_ctxt { | ||
833 | unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
834 | unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
835 | }; | ||
836 | |||
837 | static int dlm_should_restart_join(struct dlm_ctxt *dlm, | ||
838 | struct domain_join_ctxt *ctxt, | ||
839 | enum dlm_query_join_response response) | ||
840 | { | ||
841 | int ret; | ||
842 | |||
843 | if (response == JOIN_DISALLOW) { | ||
844 | mlog(0, "Latest response of disallow -- should restart\n"); | ||
845 | return 1; | ||
846 | } | ||
847 | |||
848 | spin_lock(&dlm->spinlock); | ||
849 | /* For now, we restart the process if the node maps have | ||
850 | * changed at all */ | ||
851 | ret = memcmp(ctxt->live_map, dlm->live_nodes_map, | ||
852 | sizeof(dlm->live_nodes_map)); | ||
853 | spin_unlock(&dlm->spinlock); | ||
854 | |||
855 | if (ret) | ||
856 | mlog(0, "Node maps changed -- should restart\n"); | ||
857 | |||
858 | return ret; | ||
859 | } | ||
860 | |||
861 | static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | ||
862 | { | ||
863 | int status = 0, tmpstat, node; | ||
864 | struct domain_join_ctxt *ctxt; | ||
865 | enum dlm_query_join_response response; | ||
866 | |||
867 | mlog_entry("%p", dlm); | ||
868 | |||
869 | ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); | ||
870 | if (!ctxt) { | ||
871 | status = -ENOMEM; | ||
872 | mlog_errno(status); | ||
873 | goto bail; | ||
874 | } | ||
875 | |||
876 | /* group sem locking should work for us here -- we're already | ||
877 | * registered for heartbeat events so filling this should be | ||
878 | * atomic wrt getting those handlers called. */ | ||
879 | o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); | ||
880 | |||
881 | spin_lock(&dlm->spinlock); | ||
882 | memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); | ||
883 | |||
884 | __dlm_set_joining_node(dlm, dlm->node_num); | ||
885 | |||
886 | spin_unlock(&dlm->spinlock); | ||
887 | |||
888 | node = -1; | ||
889 | while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, | ||
890 | node + 1)) < O2NM_MAX_NODES) { | ||
891 | if (node == dlm->node_num) | ||
892 | continue; | ||
893 | |||
894 | status = dlm_request_join(dlm, node, &response); | ||
895 | if (status < 0) { | ||
896 | mlog_errno(status); | ||
897 | goto bail; | ||
898 | } | ||
899 | |||
900 | /* Ok, either we got a response or the node doesn't have a | ||
901 | * dlm up. */ | ||
902 | if (response == JOIN_OK) | ||
903 | set_bit(node, ctxt->yes_resp_map); | ||
904 | |||
905 | if (dlm_should_restart_join(dlm, ctxt, response)) { | ||
906 | status = -EAGAIN; | ||
907 | goto bail; | ||
908 | } | ||
909 | } | ||
910 | |||
911 | mlog(0, "Yay, done querying nodes!\n"); | ||
912 | |||
913 | /* Yay, everyone agree's we can join the domain. My domain is | ||
914 | * comprised of all nodes who were put in the | ||
915 | * yes_resp_map. Copy that into our domain map and send a join | ||
916 | * assert message to clean up everyone elses state. */ | ||
917 | spin_lock(&dlm->spinlock); | ||
918 | memcpy(dlm->domain_map, ctxt->yes_resp_map, | ||
919 | sizeof(ctxt->yes_resp_map)); | ||
920 | set_bit(dlm->node_num, dlm->domain_map); | ||
921 | spin_unlock(&dlm->spinlock); | ||
922 | |||
923 | dlm_send_join_asserts(dlm, ctxt->yes_resp_map); | ||
924 | |||
925 | /* Joined state *must* be set before the joining node | ||
926 | * information, otherwise the query_join handler may read no | ||
927 | * current joiner but a state of NEW and tell joining nodes | ||
928 | * we're not in the domain. */ | ||
929 | spin_lock(&dlm_domain_lock); | ||
930 | dlm->dlm_state = DLM_CTXT_JOINED; | ||
931 | dlm->num_joins++; | ||
932 | spin_unlock(&dlm_domain_lock); | ||
933 | |||
934 | bail: | ||
935 | spin_lock(&dlm->spinlock); | ||
936 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
937 | if (!status) | ||
938 | __dlm_print_nodes(dlm); | ||
939 | spin_unlock(&dlm->spinlock); | ||
940 | |||
941 | if (ctxt) { | ||
942 | /* Do we need to send a cancel message to any nodes? */ | ||
943 | if (status < 0) { | ||
944 | tmpstat = dlm_send_join_cancels(dlm, | ||
945 | ctxt->yes_resp_map, | ||
946 | sizeof(ctxt->yes_resp_map)); | ||
947 | if (tmpstat < 0) | ||
948 | mlog_errno(tmpstat); | ||
949 | } | ||
950 | kfree(ctxt); | ||
951 | } | ||
952 | |||
953 | mlog(0, "returning %d\n", status); | ||
954 | return status; | ||
955 | } | ||
956 | |||
957 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) | ||
958 | { | ||
959 | o2hb_unregister_callback(&dlm->dlm_hb_up); | ||
960 | o2hb_unregister_callback(&dlm->dlm_hb_down); | ||
961 | o2net_unregister_handler_list(&dlm->dlm_domain_handlers); | ||
962 | } | ||
963 | |||
964 | static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | ||
965 | { | ||
966 | int status; | ||
967 | |||
968 | mlog(0, "registering handlers.\n"); | ||
969 | |||
970 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | ||
971 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | ||
972 | status = o2hb_register_callback(&dlm->dlm_hb_down); | ||
973 | if (status) | ||
974 | goto bail; | ||
975 | |||
976 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | ||
977 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | ||
978 | status = o2hb_register_callback(&dlm->dlm_hb_up); | ||
979 | if (status) | ||
980 | goto bail; | ||
981 | |||
982 | status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, | ||
983 | sizeof(struct dlm_master_request), | ||
984 | dlm_master_request_handler, | ||
985 | dlm, &dlm->dlm_domain_handlers); | ||
986 | if (status) | ||
987 | goto bail; | ||
988 | |||
989 | status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, | ||
990 | sizeof(struct dlm_assert_master), | ||
991 | dlm_assert_master_handler, | ||
992 | dlm, &dlm->dlm_domain_handlers); | ||
993 | if (status) | ||
994 | goto bail; | ||
995 | |||
996 | status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, | ||
997 | sizeof(struct dlm_create_lock), | ||
998 | dlm_create_lock_handler, | ||
999 | dlm, &dlm->dlm_domain_handlers); | ||
1000 | if (status) | ||
1001 | goto bail; | ||
1002 | |||
1003 | status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, | ||
1004 | DLM_CONVERT_LOCK_MAX_LEN, | ||
1005 | dlm_convert_lock_handler, | ||
1006 | dlm, &dlm->dlm_domain_handlers); | ||
1007 | if (status) | ||
1008 | goto bail; | ||
1009 | |||
1010 | status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, | ||
1011 | DLM_UNLOCK_LOCK_MAX_LEN, | ||
1012 | dlm_unlock_lock_handler, | ||
1013 | dlm, &dlm->dlm_domain_handlers); | ||
1014 | if (status) | ||
1015 | goto bail; | ||
1016 | |||
1017 | status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, | ||
1018 | DLM_PROXY_AST_MAX_LEN, | ||
1019 | dlm_proxy_ast_handler, | ||
1020 | dlm, &dlm->dlm_domain_handlers); | ||
1021 | if (status) | ||
1022 | goto bail; | ||
1023 | |||
1024 | status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
1025 | sizeof(struct dlm_exit_domain), | ||
1026 | dlm_exit_domain_handler, | ||
1027 | dlm, &dlm->dlm_domain_handlers); | ||
1028 | if (status) | ||
1029 | goto bail; | ||
1030 | |||
1031 | status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, | ||
1032 | sizeof(struct dlm_migrate_request), | ||
1033 | dlm_migrate_request_handler, | ||
1034 | dlm, &dlm->dlm_domain_handlers); | ||
1035 | if (status) | ||
1036 | goto bail; | ||
1037 | |||
1038 | status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, | ||
1039 | DLM_MIG_LOCKRES_MAX_LEN, | ||
1040 | dlm_mig_lockres_handler, | ||
1041 | dlm, &dlm->dlm_domain_handlers); | ||
1042 | if (status) | ||
1043 | goto bail; | ||
1044 | |||
1045 | status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, | ||
1046 | sizeof(struct dlm_master_requery), | ||
1047 | dlm_master_requery_handler, | ||
1048 | dlm, &dlm->dlm_domain_handlers); | ||
1049 | if (status) | ||
1050 | goto bail; | ||
1051 | |||
1052 | status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, | ||
1053 | sizeof(struct dlm_lock_request), | ||
1054 | dlm_request_all_locks_handler, | ||
1055 | dlm, &dlm->dlm_domain_handlers); | ||
1056 | if (status) | ||
1057 | goto bail; | ||
1058 | |||
1059 | status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, | ||
1060 | sizeof(struct dlm_reco_data_done), | ||
1061 | dlm_reco_data_done_handler, | ||
1062 | dlm, &dlm->dlm_domain_handlers); | ||
1063 | if (status) | ||
1064 | goto bail; | ||
1065 | |||
1066 | status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, | ||
1067 | sizeof(struct dlm_begin_reco), | ||
1068 | dlm_begin_reco_handler, | ||
1069 | dlm, &dlm->dlm_domain_handlers); | ||
1070 | if (status) | ||
1071 | goto bail; | ||
1072 | |||
1073 | status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, | ||
1074 | sizeof(struct dlm_finalize_reco), | ||
1075 | dlm_finalize_reco_handler, | ||
1076 | dlm, &dlm->dlm_domain_handlers); | ||
1077 | if (status) | ||
1078 | goto bail; | ||
1079 | |||
1080 | bail: | ||
1081 | if (status) | ||
1082 | dlm_unregister_domain_handlers(dlm); | ||
1083 | |||
1084 | return status; | ||
1085 | } | ||
1086 | |||
1087 | static int dlm_join_domain(struct dlm_ctxt *dlm) | ||
1088 | { | ||
1089 | int status; | ||
1090 | |||
1091 | BUG_ON(!dlm); | ||
1092 | |||
1093 | mlog(0, "Join domain %s\n", dlm->name); | ||
1094 | |||
1095 | status = dlm_register_domain_handlers(dlm); | ||
1096 | if (status) { | ||
1097 | mlog_errno(status); | ||
1098 | goto bail; | ||
1099 | } | ||
1100 | |||
1101 | status = dlm_launch_thread(dlm); | ||
1102 | if (status < 0) { | ||
1103 | mlog_errno(status); | ||
1104 | goto bail; | ||
1105 | } | ||
1106 | |||
1107 | status = dlm_launch_recovery_thread(dlm); | ||
1108 | if (status < 0) { | ||
1109 | mlog_errno(status); | ||
1110 | goto bail; | ||
1111 | } | ||
1112 | |||
1113 | do { | ||
1114 | unsigned int backoff; | ||
1115 | status = dlm_try_to_join_domain(dlm); | ||
1116 | |||
1117 | /* If we're racing another node to the join, then we | ||
1118 | * need to back off temporarily and let them | ||
1119 | * complete. */ | ||
1120 | if (status == -EAGAIN) { | ||
1121 | if (signal_pending(current)) { | ||
1122 | status = -ERESTARTSYS; | ||
1123 | goto bail; | ||
1124 | } | ||
1125 | |||
1126 | /* | ||
1127 | * <chip> After you! | ||
1128 | * <dale> No, after you! | ||
1129 | * <chip> I insist! | ||
1130 | * <dale> But you first! | ||
1131 | * ... | ||
1132 | */ | ||
1133 | backoff = (unsigned int)(jiffies & 0x3); | ||
1134 | backoff *= DLM_DOMAIN_BACKOFF_MS; | ||
1135 | mlog(0, "backoff %d\n", backoff); | ||
1136 | msleep(backoff); | ||
1137 | } | ||
1138 | } while (status == -EAGAIN); | ||
1139 | |||
1140 | if (status < 0) { | ||
1141 | mlog_errno(status); | ||
1142 | goto bail; | ||
1143 | } | ||
1144 | |||
1145 | status = 0; | ||
1146 | bail: | ||
1147 | wake_up(&dlm_domain_events); | ||
1148 | |||
1149 | if (status) { | ||
1150 | dlm_unregister_domain_handlers(dlm); | ||
1151 | dlm_complete_thread(dlm); | ||
1152 | dlm_complete_recovery_thread(dlm); | ||
1153 | } | ||
1154 | |||
1155 | return status; | ||
1156 | } | ||
1157 | |||
1158 | static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | ||
1159 | u32 key) | ||
1160 | { | ||
1161 | int i; | ||
1162 | struct dlm_ctxt *dlm = NULL; | ||
1163 | |||
1164 | dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); | ||
1165 | if (!dlm) { | ||
1166 | mlog_errno(-ENOMEM); | ||
1167 | goto leave; | ||
1168 | } | ||
1169 | |||
1170 | dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); | ||
1171 | if (dlm->name == NULL) { | ||
1172 | mlog_errno(-ENOMEM); | ||
1173 | kfree(dlm); | ||
1174 | dlm = NULL; | ||
1175 | goto leave; | ||
1176 | } | ||
1177 | |||
1178 | dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); | ||
1179 | if (!dlm->resources) { | ||
1180 | mlog_errno(-ENOMEM); | ||
1181 | kfree(dlm->name); | ||
1182 | kfree(dlm); | ||
1183 | dlm = NULL; | ||
1184 | goto leave; | ||
1185 | } | ||
1186 | memset(dlm->resources, 0, PAGE_SIZE); | ||
1187 | |||
1188 | for (i=0; i<DLM_HASH_SIZE; i++) | ||
1189 | INIT_LIST_HEAD(&dlm->resources[i]); | ||
1190 | |||
1191 | strcpy(dlm->name, domain); | ||
1192 | dlm->key = key; | ||
1193 | dlm->node_num = o2nm_this_node(); | ||
1194 | |||
1195 | spin_lock_init(&dlm->spinlock); | ||
1196 | spin_lock_init(&dlm->master_lock); | ||
1197 | spin_lock_init(&dlm->ast_lock); | ||
1198 | INIT_LIST_HEAD(&dlm->list); | ||
1199 | INIT_LIST_HEAD(&dlm->dirty_list); | ||
1200 | INIT_LIST_HEAD(&dlm->reco.resources); | ||
1201 | INIT_LIST_HEAD(&dlm->reco.received); | ||
1202 | INIT_LIST_HEAD(&dlm->reco.node_data); | ||
1203 | INIT_LIST_HEAD(&dlm->purge_list); | ||
1204 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | ||
1205 | dlm->reco.state = 0; | ||
1206 | |||
1207 | INIT_LIST_HEAD(&dlm->pending_asts); | ||
1208 | INIT_LIST_HEAD(&dlm->pending_basts); | ||
1209 | |||
1210 | mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", | ||
1211 | dlm->recovery_map, &(dlm->recovery_map[0])); | ||
1212 | |||
1213 | memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); | ||
1214 | memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); | ||
1215 | memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); | ||
1216 | |||
1217 | dlm->dlm_thread_task = NULL; | ||
1218 | dlm->dlm_reco_thread_task = NULL; | ||
1219 | init_waitqueue_head(&dlm->dlm_thread_wq); | ||
1220 | init_waitqueue_head(&dlm->dlm_reco_thread_wq); | ||
1221 | init_waitqueue_head(&dlm->reco.event); | ||
1222 | init_waitqueue_head(&dlm->ast_wq); | ||
1223 | init_waitqueue_head(&dlm->migration_wq); | ||
1224 | INIT_LIST_HEAD(&dlm->master_list); | ||
1225 | INIT_LIST_HEAD(&dlm->mle_hb_events); | ||
1226 | |||
1227 | dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1228 | init_waitqueue_head(&dlm->dlm_join_events); | ||
1229 | |||
1230 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
1231 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
1232 | atomic_set(&dlm->local_resources, 0); | ||
1233 | atomic_set(&dlm->remote_resources, 0); | ||
1234 | atomic_set(&dlm->unknown_resources, 0); | ||
1235 | |||
1236 | spin_lock_init(&dlm->work_lock); | ||
1237 | INIT_LIST_HEAD(&dlm->work_list); | ||
1238 | INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm); | ||
1239 | |||
1240 | kref_init(&dlm->dlm_refs); | ||
1241 | dlm->dlm_state = DLM_CTXT_NEW; | ||
1242 | |||
1243 | INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); | ||
1244 | |||
1245 | mlog(0, "context init: refcount %u\n", | ||
1246 | atomic_read(&dlm->dlm_refs.refcount)); | ||
1247 | |||
1248 | leave: | ||
1249 | return dlm; | ||
1250 | } | ||
1251 | |||
1252 | /* | ||
1253 | * dlm_register_domain: one-time setup per "domain" | ||
1254 | */ | ||
1255 | struct dlm_ctxt * dlm_register_domain(const char *domain, | ||
1256 | u32 key) | ||
1257 | { | ||
1258 | int ret; | ||
1259 | struct dlm_ctxt *dlm = NULL; | ||
1260 | struct dlm_ctxt *new_ctxt = NULL; | ||
1261 | |||
1262 | if (strlen(domain) > O2NM_MAX_NAME_LEN) { | ||
1263 | ret = -ENAMETOOLONG; | ||
1264 | mlog(ML_ERROR, "domain name length too long\n"); | ||
1265 | goto leave; | ||
1266 | } | ||
1267 | |||
1268 | if (!o2hb_check_local_node_heartbeating()) { | ||
1269 | mlog(ML_ERROR, "the local node has not been configured, or is " | ||
1270 | "not heartbeating\n"); | ||
1271 | ret = -EPROTO; | ||
1272 | goto leave; | ||
1273 | } | ||
1274 | |||
1275 | mlog(0, "register called for domain \"%s\"\n", domain); | ||
1276 | |||
1277 | retry: | ||
1278 | dlm = NULL; | ||
1279 | if (signal_pending(current)) { | ||
1280 | ret = -ERESTARTSYS; | ||
1281 | mlog_errno(ret); | ||
1282 | goto leave; | ||
1283 | } | ||
1284 | |||
1285 | spin_lock(&dlm_domain_lock); | ||
1286 | |||
1287 | dlm = __dlm_lookup_domain(domain); | ||
1288 | if (dlm) { | ||
1289 | if (dlm->dlm_state != DLM_CTXT_JOINED) { | ||
1290 | spin_unlock(&dlm_domain_lock); | ||
1291 | |||
1292 | mlog(0, "This ctxt is not joined yet!\n"); | ||
1293 | wait_event_interruptible(dlm_domain_events, | ||
1294 | dlm_wait_on_domain_helper( | ||
1295 | domain)); | ||
1296 | goto retry; | ||
1297 | } | ||
1298 | |||
1299 | __dlm_get(dlm); | ||
1300 | dlm->num_joins++; | ||
1301 | |||
1302 | spin_unlock(&dlm_domain_lock); | ||
1303 | |||
1304 | ret = 0; | ||
1305 | goto leave; | ||
1306 | } | ||
1307 | |||
1308 | /* doesn't exist */ | ||
1309 | if (!new_ctxt) { | ||
1310 | spin_unlock(&dlm_domain_lock); | ||
1311 | |||
1312 | new_ctxt = dlm_alloc_ctxt(domain, key); | ||
1313 | if (new_ctxt) | ||
1314 | goto retry; | ||
1315 | |||
1316 | ret = -ENOMEM; | ||
1317 | mlog_errno(ret); | ||
1318 | goto leave; | ||
1319 | } | ||
1320 | |||
1321 | /* a little variable switch-a-roo here... */ | ||
1322 | dlm = new_ctxt; | ||
1323 | new_ctxt = NULL; | ||
1324 | |||
1325 | /* add the new domain */ | ||
1326 | list_add_tail(&dlm->list, &dlm_domains); | ||
1327 | spin_unlock(&dlm_domain_lock); | ||
1328 | |||
1329 | ret = dlm_join_domain(dlm); | ||
1330 | if (ret) { | ||
1331 | mlog_errno(ret); | ||
1332 | dlm_put(dlm); | ||
1333 | goto leave; | ||
1334 | } | ||
1335 | |||
1336 | ret = 0; | ||
1337 | leave: | ||
1338 | if (new_ctxt) | ||
1339 | dlm_free_ctxt_mem(new_ctxt); | ||
1340 | |||
1341 | if (ret < 0) | ||
1342 | dlm = ERR_PTR(ret); | ||
1343 | |||
1344 | return dlm; | ||
1345 | } | ||
1346 | EXPORT_SYMBOL_GPL(dlm_register_domain); | ||
1347 | |||
1348 | static LIST_HEAD(dlm_join_handlers); | ||
1349 | |||
1350 | static void dlm_unregister_net_handlers(void) | ||
1351 | { | ||
1352 | o2net_unregister_handler_list(&dlm_join_handlers); | ||
1353 | } | ||
1354 | |||
1355 | static int dlm_register_net_handlers(void) | ||
1356 | { | ||
1357 | int status = 0; | ||
1358 | |||
1359 | status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, | ||
1360 | sizeof(struct dlm_query_join_request), | ||
1361 | dlm_query_join_handler, | ||
1362 | NULL, &dlm_join_handlers); | ||
1363 | if (status) | ||
1364 | goto bail; | ||
1365 | |||
1366 | status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
1367 | sizeof(struct dlm_assert_joined), | ||
1368 | dlm_assert_joined_handler, | ||
1369 | NULL, &dlm_join_handlers); | ||
1370 | if (status) | ||
1371 | goto bail; | ||
1372 | |||
1373 | status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
1374 | sizeof(struct dlm_cancel_join), | ||
1375 | dlm_cancel_join_handler, | ||
1376 | NULL, &dlm_join_handlers); | ||
1377 | |||
1378 | bail: | ||
1379 | if (status < 0) | ||
1380 | dlm_unregister_net_handlers(); | ||
1381 | |||
1382 | return status; | ||
1383 | } | ||
1384 | |||
/* Domain eviction callback handling.
 *
 * The file system requires notification of node death *before* the
 * dlm completes its recovery work, otherwise it may be able to
 * acquire locks on resources requiring recovery. Since the dlm can
 * evict a node from its domain *before* heartbeat fires, a similar
 * mechanism is required. */
1392 | |||
1393 | /* Eviction is not expected to happen often, so a per-domain lock is | ||
1394 | * not necessary. Eviction callbacks are allowed to sleep for short | ||
1395 | * periods of time. */ | ||
1396 | static DECLARE_RWSEM(dlm_callback_sem); | ||
1397 | |||
1398 | void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, | ||
1399 | int node_num) | ||
1400 | { | ||
1401 | struct list_head *iter; | ||
1402 | struct dlm_eviction_cb *cb; | ||
1403 | |||
1404 | down_read(&dlm_callback_sem); | ||
1405 | list_for_each(iter, &dlm->dlm_eviction_callbacks) { | ||
1406 | cb = list_entry(iter, struct dlm_eviction_cb, ec_item); | ||
1407 | |||
1408 | cb->ec_func(node_num, cb->ec_data); | ||
1409 | } | ||
1410 | up_read(&dlm_callback_sem); | ||
1411 | } | ||
1412 | |||
1413 | void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, | ||
1414 | dlm_eviction_func *f, | ||
1415 | void *data) | ||
1416 | { | ||
1417 | INIT_LIST_HEAD(&cb->ec_item); | ||
1418 | cb->ec_func = f; | ||
1419 | cb->ec_data = data; | ||
1420 | } | ||
1421 | EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); | ||
1422 | |||
1423 | void dlm_register_eviction_cb(struct dlm_ctxt *dlm, | ||
1424 | struct dlm_eviction_cb *cb) | ||
1425 | { | ||
1426 | down_write(&dlm_callback_sem); | ||
1427 | list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); | ||
1428 | up_write(&dlm_callback_sem); | ||
1429 | } | ||
1430 | EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); | ||
1431 | |||
1432 | void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) | ||
1433 | { | ||
1434 | down_write(&dlm_callback_sem); | ||
1435 | list_del_init(&cb->ec_item); | ||
1436 | up_write(&dlm_callback_sem); | ||
1437 | } | ||
1438 | EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); | ||
1439 | |||
1440 | static int __init dlm_init(void) | ||
1441 | { | ||
1442 | int status; | ||
1443 | |||
1444 | dlm_print_version(); | ||
1445 | |||
1446 | status = dlm_init_mle_cache(); | ||
1447 | if (status) | ||
1448 | return -1; | ||
1449 | |||
1450 | status = dlm_register_net_handlers(); | ||
1451 | if (status) { | ||
1452 | dlm_destroy_mle_cache(); | ||
1453 | return -1; | ||
1454 | } | ||
1455 | |||
1456 | return 0; | ||
1457 | } | ||
1458 | |||
1459 | static void __exit dlm_exit (void) | ||
1460 | { | ||
1461 | dlm_unregister_net_handlers(); | ||
1462 | dlm_destroy_mle_cache(); | ||
1463 | } | ||
1464 | |||
1465 | MODULE_AUTHOR("Oracle"); | ||
1466 | MODULE_LICENSE("GPL"); | ||
1467 | |||
1468 | module_init(dlm_init); | ||
1469 | module_exit(dlm_exit); | ||