diff options
Diffstat (limited to 'fs/ocfs2/dlm/dlmdomain.c')
| -rw-r--r-- | fs/ocfs2/dlm/dlmdomain.c | 1469 |
1 files changed, 1469 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000000..da3c22045f89 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
| @@ -0,0 +1,1469 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * dlmdomain.c | ||
| 5 | * | ||
| 6 | * defines domain join / leave apis | ||
| 7 | * | ||
| 8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public | ||
| 12 | * License as published by the Free Software Foundation; either | ||
| 13 | * version 2 of the License, or (at your option) any later version. | ||
| 14 | * | ||
| 15 | * This program is distributed in the hope that it will be useful, | ||
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 18 | * General Public License for more details. | ||
| 19 | * | ||
| 20 | * You should have received a copy of the GNU General Public | ||
| 21 | * License along with this program; if not, write to the | ||
| 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 23 | * Boston, MA 02111-1307, USA. | ||
| 24 | * | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/types.h> | ||
| 29 | #include <linux/slab.h> | ||
| 30 | #include <linux/highmem.h> | ||
| 31 | #include <linux/utsname.h> | ||
| 32 | #include <linux/init.h> | ||
| 33 | #include <linux/spinlock.h> | ||
| 34 | #include <linux/delay.h> | ||
| 35 | #include <linux/err.h> | ||
| 36 | |||
| 37 | #include "cluster/heartbeat.h" | ||
| 38 | #include "cluster/nodemanager.h" | ||
| 39 | #include "cluster/tcp.h" | ||
| 40 | |||
| 41 | #include "dlmapi.h" | ||
| 42 | #include "dlmcommon.h" | ||
| 43 | |||
| 44 | #include "dlmdebug.h" | ||
| 45 | #include "dlmdomain.h" | ||
| 46 | |||
| 47 | #include "dlmver.h" | ||
| 48 | |||
| 49 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) | ||
| 50 | #include "cluster/masklog.h" | ||
| 51 | |||
| 52 | /* | ||
| 53 | * | ||
| 54 | * spinlock lock ordering: if multiple locks are needed, obey this ordering: | ||
| 55 | * dlm_domain_lock | ||
| 56 | * struct dlm_ctxt->spinlock | ||
| 57 | * struct dlm_lock_resource->spinlock | ||
| 58 | * struct dlm_ctxt->master_lock | ||
| 59 | * struct dlm_ctxt->ast_lock | ||
| 60 | * dlm_master_list_entry->spinlock | ||
| 61 | * dlm_lock->spinlock | ||
| 62 | * | ||
| 63 | */ | ||
| 64 | |||
| 65 | spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; /* taken around walks of dlm_domains and accesses to dlm_ctxt->dlm_state */ | ||
| 66 | LIST_HEAD(dlm_domains); /* list of dlm_ctxt structures, linked through dlm_ctxt->list */ | ||
| 67 | static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); /* woken when a context is freed or taken off dlm_domains */ | ||
| 68 | |||
| 69 | #define DLM_DOMAIN_BACKOFF_MS 200 /* msleep() delay before retrying a failed domain message */ | ||
| 70 | |||
| 71 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); /* forward declarations: these four handlers are defined later in this file */ | ||
| 72 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); | ||
| 73 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); | ||
| 74 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); | ||
| 75 | |||
| 76 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); | ||
| 77 | |||
| 78 | void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) | ||
| 79 | { | ||
| 80 | list_del_init(&lockres->list); | ||
| 81 | dlm_lockres_put(lockres); | ||
| 82 | } | ||
| 83 | |||
| 84 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | ||
| 85 | struct dlm_lock_resource *res) | ||
| 86 | { | ||
| 87 | struct list_head *bucket; | ||
| 88 | struct qstr *q; | ||
| 89 | |||
| 90 | assert_spin_locked(&dlm->spinlock); | ||
| 91 | |||
| 92 | q = &res->lockname; | ||
| 93 | q->hash = full_name_hash(q->name, q->len); | ||
| 94 | bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); | ||
| 95 | |||
| 96 | /* get a reference for our hashtable */ | ||
| 97 | dlm_lockres_get(res); | ||
| 98 | |||
| 99 | list_add_tail(&res->list, bucket); | ||
| 100 | } | ||
| 101 | |||
| 102 | struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
| 103 | const char *name, | ||
| 104 | unsigned int len) | ||
| 105 | { | ||
| 106 | unsigned int hash; | ||
| 107 | struct list_head *iter; | ||
| 108 | struct dlm_lock_resource *tmpres=NULL; | ||
| 109 | struct list_head *bucket; | ||
| 110 | |||
| 111 | mlog_entry("%.*s\n", len, name); | ||
| 112 | |||
| 113 | assert_spin_locked(&dlm->spinlock); | ||
| 114 | |||
| 115 | hash = full_name_hash(name, len); | ||
| 116 | |||
| 117 | bucket = &(dlm->resources[hash & DLM_HASH_MASK]); | ||
| 118 | |||
| 119 | /* check for pre-existing lock */ | ||
| 120 | list_for_each(iter, bucket) { | ||
| 121 | tmpres = list_entry(iter, struct dlm_lock_resource, list); | ||
| 122 | if (tmpres->lockname.len == len && | ||
| 123 | memcmp(tmpres->lockname.name, name, len) == 0) { | ||
| 124 | dlm_lockres_get(tmpres); | ||
| 125 | break; | ||
| 126 | } | ||
| 127 | |||
| 128 | tmpres = NULL; | ||
| 129 | } | ||
| 130 | return tmpres; | ||
| 131 | } | ||
| 132 | |||
| 133 | struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
| 134 | const char *name, | ||
| 135 | unsigned int len) | ||
| 136 | { | ||
| 137 | struct dlm_lock_resource *res; | ||
| 138 | |||
| 139 | spin_lock(&dlm->spinlock); | ||
| 140 | res = __dlm_lookup_lockres(dlm, name, len); | ||
| 141 | spin_unlock(&dlm->spinlock); | ||
| 142 | return res; | ||
| 143 | } | ||
| 144 | |||
| 145 | static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) | ||
| 146 | { | ||
| 147 | struct dlm_ctxt *tmp = NULL; | ||
| 148 | struct list_head *iter; | ||
| 149 | |||
| 150 | assert_spin_locked(&dlm_domain_lock); | ||
| 151 | |||
| 152 | /* tmp->name here is always NULL terminated, | ||
| 153 | * but domain may not be! */ | ||
| 154 | list_for_each(iter, &dlm_domains) { | ||
| 155 | tmp = list_entry (iter, struct dlm_ctxt, list); | ||
| 156 | if (strlen(tmp->name) == len && | ||
| 157 | memcmp(tmp->name, domain, len)==0) | ||
| 158 | break; | ||
| 159 | tmp = NULL; | ||
| 160 | } | ||
| 161 | |||
| 162 | return tmp; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* For null terminated domain strings ONLY */ | ||
| 166 | static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) | ||
| 167 | { | ||
| 168 | assert_spin_locked(&dlm_domain_lock); | ||
| 169 | |||
| 170 | return __dlm_lookup_domain_full(domain, strlen(domain)); | ||
| 171 | } | ||
| 172 | |||
| 173 | |||
| 174 | /* returns true on one of two conditions: | ||
| 175 | * 1) the domain does not exist | ||
| 176 | * 2) the domain exists and it's state is "joined" */ | ||
| 177 | static int dlm_wait_on_domain_helper(const char *domain) | ||
| 178 | { | ||
| 179 | int ret = 0; | ||
| 180 | struct dlm_ctxt *tmp = NULL; | ||
| 181 | |||
| 182 | spin_lock(&dlm_domain_lock); | ||
| 183 | |||
| 184 | tmp = __dlm_lookup_domain(domain); | ||
| 185 | if (!tmp) | ||
| 186 | ret = 1; | ||
| 187 | else if (tmp->dlm_state == DLM_CTXT_JOINED) | ||
| 188 | ret = 1; | ||
| 189 | |||
| 190 | spin_unlock(&dlm_domain_lock); | ||
| 191 | return ret; | ||
| 192 | } | ||
| 193 | |||
| 194 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | ||
| 195 | { | ||
| 196 | if (dlm->resources) | ||
| 197 | free_page((unsigned long) dlm->resources); | ||
| 198 | |||
| 199 | if (dlm->name) | ||
| 200 | kfree(dlm->name); | ||
| 201 | |||
| 202 | kfree(dlm); | ||
| 203 | } | ||
| 204 | |||
| 205 | /* A little strange - this function will be called while holding | ||
| 206 | * dlm_domain_lock and is expected to be holding it on the way out. We | ||
| 207 | * will however drop and reacquire it multiple times */ | ||
| 208 | static void dlm_ctxt_release(struct kref *kref) | ||
| 209 | { | ||
| 210 | struct dlm_ctxt *dlm; | ||
| 211 | |||
| 212 | dlm = container_of(kref, struct dlm_ctxt, dlm_refs); | ||
| 213 | |||
| 214 | BUG_ON(dlm->num_joins); | ||
| 215 | BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); | ||
| 216 | |||
| 217 | /* we may still be in the list if we hit an error during join. */ | ||
| 218 | list_del_init(&dlm->list); | ||
| 219 | |||
| 220 | spin_unlock(&dlm_domain_lock); | ||
| 221 | |||
| 222 | mlog(0, "freeing memory from domain %s\n", dlm->name); | ||
| 223 | |||
| 224 | wake_up(&dlm_domain_events); | ||
| 225 | |||
| 226 | dlm_free_ctxt_mem(dlm); | ||
| 227 | |||
| 228 | spin_lock(&dlm_domain_lock); | ||
| 229 | } | ||
| 230 | |||
| 231 | void dlm_put(struct dlm_ctxt *dlm) | ||
| 232 | { | ||
| 233 | spin_lock(&dlm_domain_lock); | ||
| 234 | kref_put(&dlm->dlm_refs, dlm_ctxt_release); | ||
| 235 | spin_unlock(&dlm_domain_lock); | ||
| 236 | } | ||
| 237 | |||
| 238 | static void __dlm_get(struct dlm_ctxt *dlm) | ||
| 239 | { | ||
| 240 | kref_get(&dlm->dlm_refs); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* given a questionable reference to a dlm object, gets a reference if | ||
| 244 | * it can find it in the list, otherwise returns NULL in which case | ||
| 245 | * you shouldn't trust your pointer. */ | ||
| 246 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) | ||
| 247 | { | ||
| 248 | struct list_head *iter; | ||
| 249 | struct dlm_ctxt *target = NULL; | ||
| 250 | |||
| 251 | spin_lock(&dlm_domain_lock); | ||
| 252 | |||
| 253 | list_for_each(iter, &dlm_domains) { | ||
| 254 | target = list_entry (iter, struct dlm_ctxt, list); | ||
| 255 | |||
| 256 | if (target == dlm) { | ||
| 257 | __dlm_get(target); | ||
| 258 | break; | ||
| 259 | } | ||
| 260 | |||
| 261 | target = NULL; | ||
| 262 | } | ||
| 263 | |||
| 264 | spin_unlock(&dlm_domain_lock); | ||
| 265 | |||
| 266 | return target; | ||
| 267 | } | ||
| 268 | |||
| 269 | int dlm_domain_fully_joined(struct dlm_ctxt *dlm) | ||
| 270 | { | ||
| 271 | int ret; | ||
| 272 | |||
| 273 | spin_lock(&dlm_domain_lock); | ||
| 274 | ret = (dlm->dlm_state == DLM_CTXT_JOINED) || | ||
| 275 | (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); | ||
| 276 | spin_unlock(&dlm_domain_lock); | ||
| 277 | |||
| 278 | return ret; | ||
| 279 | } | ||
| 280 | |||
| 281 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | ||
| 282 | { | ||
| 283 | dlm_unregister_domain_handlers(dlm); | ||
| 284 | dlm_complete_thread(dlm); | ||
| 285 | dlm_complete_recovery_thread(dlm); | ||
| 286 | |||
| 287 | /* We've left the domain. Now we can take ourselves out of the | ||
| 288 | * list and allow the kref stuff to help us free the | ||
| 289 | * memory. */ | ||
| 290 | spin_lock(&dlm_domain_lock); | ||
| 291 | list_del_init(&dlm->list); | ||
| 292 | spin_unlock(&dlm_domain_lock); | ||
| 293 | |||
| 294 | /* Wake up anyone waiting for us to remove this domain */ | ||
| 295 | wake_up(&dlm_domain_events); | ||
| 296 | } | ||
| 297 | |||
| 298 | static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) | ||
| 299 | { | ||
| 300 | int i; | ||
| 301 | struct dlm_lock_resource *res; | ||
| 302 | |||
| 303 | mlog(0, "Migrating locks from domain %s\n", dlm->name); | ||
| 304 | restart: | ||
| 305 | spin_lock(&dlm->spinlock); | ||
| 306 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
| 307 | while (!list_empty(&dlm->resources[i])) { | ||
| 308 | res = list_entry(dlm->resources[i].next, | ||
| 309 | struct dlm_lock_resource, list); | ||
| 310 | /* need reference when manually grabbing lockres */ | ||
| 311 | dlm_lockres_get(res); | ||
| 312 | /* this should unhash the lockres | ||
| 313 | * and exit with dlm->spinlock */ | ||
| 314 | mlog(0, "purging res=%p\n", res); | ||
| 315 | if (dlm_lockres_is_dirty(dlm, res)) { | ||
| 316 | /* HACK! this should absolutely go. | ||
| 317 | * need to figure out why some empty | ||
| 318 | * lockreses are still marked dirty */ | ||
| 319 | mlog(ML_ERROR, "lockres %.*s dirty!\n", | ||
| 320 | res->lockname.len, res->lockname.name); | ||
| 321 | |||
| 322 | spin_unlock(&dlm->spinlock); | ||
| 323 | dlm_kick_thread(dlm, res); | ||
| 324 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | ||
| 325 | dlm_lockres_put(res); | ||
| 326 | goto restart; | ||
| 327 | } | ||
| 328 | dlm_purge_lockres(dlm, res); | ||
| 329 | dlm_lockres_put(res); | ||
| 330 | } | ||
| 331 | } | ||
| 332 | spin_unlock(&dlm->spinlock); | ||
| 333 | |||
| 334 | mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); | ||
| 335 | } | ||
| 336 | |||
| 337 | static int dlm_no_joining_node(struct dlm_ctxt *dlm) | ||
| 338 | { | ||
| 339 | int ret; | ||
| 340 | |||
| 341 | spin_lock(&dlm->spinlock); | ||
| 342 | ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; | ||
| 343 | spin_unlock(&dlm->spinlock); | ||
| 344 | |||
| 345 | return ret; | ||
| 346 | } | ||
| 347 | |||
| 348 | static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) | ||
| 349 | { | ||
| 350 | /* Yikes, a double spinlock! I need domain_lock for the dlm | ||
| 351 | * state and the dlm spinlock for join state... Sorry! */ | ||
| 352 | again: | ||
| 353 | spin_lock(&dlm_domain_lock); | ||
| 354 | spin_lock(&dlm->spinlock); | ||
| 355 | |||
| 356 | if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
| 357 | mlog(0, "Node %d is joining, we wait on it.\n", | ||
| 358 | dlm->joining_node); | ||
| 359 | spin_unlock(&dlm->spinlock); | ||
| 360 | spin_unlock(&dlm_domain_lock); | ||
| 361 | |||
| 362 | wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); | ||
| 363 | goto again; | ||
| 364 | } | ||
| 365 | |||
| 366 | dlm->dlm_state = DLM_CTXT_LEAVING; | ||
| 367 | spin_unlock(&dlm->spinlock); | ||
| 368 | spin_unlock(&dlm_domain_lock); | ||
| 369 | } | ||
| 370 | |||
| 371 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) | ||
| 372 | { | ||
| 373 | int node = -1; | ||
| 374 | |||
| 375 | assert_spin_locked(&dlm->spinlock); | ||
| 376 | |||
| 377 | mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); | ||
| 378 | |||
| 379 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
| 380 | node + 1)) < O2NM_MAX_NODES) { | ||
| 381 | mlog(ML_NOTICE, " node %d\n", node); | ||
| 382 | } | ||
| 383 | } | ||
| 384 | |||
| 385 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) | ||
| 386 | { | ||
| 387 | struct dlm_ctxt *dlm = data; | ||
| 388 | unsigned int node; | ||
| 389 | struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; | ||
| 390 | |||
| 391 | mlog_entry("%p %u %p", msg, len, data); | ||
| 392 | |||
| 393 | if (!dlm_grab(dlm)) | ||
| 394 | return 0; | ||
| 395 | |||
| 396 | node = exit_msg->node_idx; | ||
| 397 | |||
| 398 | mlog(0, "Node %u leaves domain %s\n", node, dlm->name); | ||
| 399 | |||
| 400 | spin_lock(&dlm->spinlock); | ||
| 401 | clear_bit(node, dlm->domain_map); | ||
| 402 | __dlm_print_nodes(dlm); | ||
| 403 | |||
| 404 | /* notify anything attached to the heartbeat events */ | ||
| 405 | dlm_hb_event_notify_attached(dlm, node, 0); | ||
| 406 | |||
| 407 | spin_unlock(&dlm->spinlock); | ||
| 408 | |||
| 409 | dlm_put(dlm); | ||
| 410 | |||
| 411 | return 0; | ||
| 412 | } | ||
| 413 | |||
| 414 | static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, | ||
| 415 | unsigned int node) | ||
| 416 | { | ||
| 417 | int status; | ||
| 418 | struct dlm_exit_domain leave_msg; | ||
| 419 | |||
| 420 | mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", | ||
| 421 | node, dlm->name, dlm->node_num); | ||
| 422 | |||
| 423 | memset(&leave_msg, 0, sizeof(leave_msg)); | ||
| 424 | leave_msg.node_idx = dlm->node_num; | ||
| 425 | |||
| 426 | status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
| 427 | &leave_msg, sizeof(leave_msg), node, | ||
| 428 | NULL); | ||
| 429 | |||
| 430 | mlog(0, "status return %d from o2net_send_message\n", status); | ||
| 431 | |||
| 432 | return status; | ||
| 433 | } | ||
| 434 | |||
| 435 | |||
| 436 | static void dlm_leave_domain(struct dlm_ctxt *dlm) | ||
| 437 | { | ||
| 438 | int node, clear_node, status; | ||
| 439 | |||
| 440 | /* At this point we've migrated away all our locks and won't | ||
| 441 | * accept mastership of new ones. The dlm is responsible for | ||
| 442 | * almost nothing now. We make sure not to confuse any joining | ||
| 443 | * nodes and then commence shutdown procedure. */ | ||
| 444 | |||
| 445 | spin_lock(&dlm->spinlock); | ||
| 446 | /* Clear ourselves from the domain map */ | ||
| 447 | clear_bit(dlm->node_num, dlm->domain_map); | ||
| 448 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
| 449 | 0)) < O2NM_MAX_NODES) { | ||
| 450 | /* Drop the dlm spinlock. This is safe wrt the domain_map. | ||
| 451 | * -nodes cannot be added now as the | ||
| 452 | * query_join_handlers knows to respond with OK_NO_MAP | ||
| 453 | * -we catch the right network errors if a node is | ||
| 454 | * removed from the map while we're sending him the | ||
| 455 | * exit message. */ | ||
| 456 | spin_unlock(&dlm->spinlock); | ||
| 457 | |||
| 458 | clear_node = 1; | ||
| 459 | |||
| 460 | status = dlm_send_one_domain_exit(dlm, node); | ||
| 461 | if (status < 0 && | ||
| 462 | status != -ENOPROTOOPT && | ||
| 463 | status != -ENOTCONN) { | ||
| 464 | mlog(ML_NOTICE, "Error %d sending domain exit message " | ||
| 465 | "to node %d\n", status, node); | ||
| 466 | |||
| 467 | /* Not sure what to do here but lets sleep for | ||
| 468 | * a bit in case this was a transient | ||
| 469 | * error... */ | ||
| 470 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
| 471 | clear_node = 0; | ||
| 472 | } | ||
| 473 | |||
| 474 | spin_lock(&dlm->spinlock); | ||
| 475 | /* If we're not clearing the node bit then we intend | ||
| 476 | * to loop back around to try again. */ | ||
| 477 | if (clear_node) | ||
| 478 | clear_bit(node, dlm->domain_map); | ||
| 479 | } | ||
| 480 | spin_unlock(&dlm->spinlock); | ||
| 481 | } | ||
| 482 | |||
| 483 | int dlm_joined(struct dlm_ctxt *dlm) | ||
| 484 | { | ||
| 485 | int ret = 0; | ||
| 486 | |||
| 487 | spin_lock(&dlm_domain_lock); | ||
| 488 | |||
| 489 | if (dlm->dlm_state == DLM_CTXT_JOINED) | ||
| 490 | ret = 1; | ||
| 491 | |||
| 492 | spin_unlock(&dlm_domain_lock); | ||
| 493 | |||
| 494 | return ret; | ||
| 495 | } | ||
| 496 | |||
| 497 | int dlm_shutting_down(struct dlm_ctxt *dlm) | ||
| 498 | { | ||
| 499 | int ret = 0; | ||
| 500 | |||
| 501 | spin_lock(&dlm_domain_lock); | ||
| 502 | |||
| 503 | if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) | ||
| 504 | ret = 1; | ||
| 505 | |||
| 506 | spin_unlock(&dlm_domain_lock); | ||
| 507 | |||
| 508 | return ret; | ||
| 509 | } | ||
| 510 | |||
| 511 | void dlm_unregister_domain(struct dlm_ctxt *dlm) | ||
| 512 | { | ||
| 513 | int leave = 0; | ||
| 514 | |||
| 515 | spin_lock(&dlm_domain_lock); | ||
| 516 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); | ||
| 517 | BUG_ON(!dlm->num_joins); | ||
| 518 | |||
| 519 | dlm->num_joins--; | ||
| 520 | if (!dlm->num_joins) { | ||
| 521 | /* We mark it "in shutdown" now so new register | ||
| 522 | * requests wait until we've completely left the | ||
| 523 | * domain. Don't use DLM_CTXT_LEAVING yet as we still | ||
| 524 | * want new domain joins to communicate with us at | ||
| 525 | * least until we've completed migration of our | ||
| 526 | * resources. */ | ||
| 527 | dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; | ||
| 528 | leave = 1; | ||
| 529 | } | ||
| 530 | spin_unlock(&dlm_domain_lock); | ||
| 531 | |||
| 532 | if (leave) { | ||
| 533 | mlog(0, "shutting down domain %s\n", dlm->name); | ||
| 534 | |||
| 535 | /* We changed dlm state, notify the thread */ | ||
| 536 | dlm_kick_thread(dlm, NULL); | ||
| 537 | |||
| 538 | dlm_migrate_all_locks(dlm); | ||
| 539 | dlm_mark_domain_leaving(dlm); | ||
| 540 | dlm_leave_domain(dlm); | ||
| 541 | dlm_complete_dlm_shutdown(dlm); | ||
| 542 | } | ||
| 543 | dlm_put(dlm); | ||
| 544 | } | ||
| 545 | EXPORT_SYMBOL_GPL(dlm_unregister_domain); | ||
| 546 | |||
| 547 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
| 548 | { | ||
| 549 | struct dlm_query_join_request *query; | ||
| 550 | enum dlm_query_join_response response; | ||
| 551 | struct dlm_ctxt *dlm = NULL; | ||
| 552 | |||
| 553 | query = (struct dlm_query_join_request *) msg->buf; | ||
| 554 | |||
| 555 | mlog(0, "node %u wants to join domain %s\n", query->node_idx, | ||
| 556 | query->domain); | ||
| 557 | |||
| 558 | /* | ||
| 559 | * If heartbeat doesn't consider the node live, tell it | ||
| 560 | * to back off and try again. This gives heartbeat a chance | ||
| 561 | * to catch up. | ||
| 562 | */ | ||
| 563 | if (!o2hb_check_node_heartbeating(query->node_idx)) { | ||
| 564 | mlog(0, "node %u is not in our live map yet\n", | ||
| 565 | query->node_idx); | ||
| 566 | |||
| 567 | response = JOIN_DISALLOW; | ||
| 568 | goto respond; | ||
| 569 | } | ||
| 570 | |||
| 571 | response = JOIN_OK_NO_MAP; | ||
| 572 | |||
| 573 | spin_lock(&dlm_domain_lock); | ||
| 574 | dlm = __dlm_lookup_domain_full(query->domain, query->name_len); | ||
| 575 | /* Once the dlm ctxt is marked as leaving then we don't want | ||
| 576 | * to be put in someone's domain map. */ | ||
| 577 | if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { | ||
| 578 | spin_lock(&dlm->spinlock); | ||
| 579 | |||
| 580 | if (dlm->dlm_state == DLM_CTXT_NEW && | ||
| 581 | dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
| 582 | /*If this is a brand new context and we | ||
| 583 | * haven't started our join process yet, then | ||
| 584 | * the other node won the race. */ | ||
| 585 | response = JOIN_OK_NO_MAP; | ||
| 586 | } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
| 587 | /* Disallow parallel joins. */ | ||
| 588 | response = JOIN_DISALLOW; | ||
| 589 | } else { | ||
| 590 | /* Alright we're fully a part of this domain | ||
| 591 | * so we keep some state as to who's joining | ||
| 592 | * and indicate to him that needs to be fixed | ||
| 593 | * up. */ | ||
| 594 | response = JOIN_OK; | ||
| 595 | __dlm_set_joining_node(dlm, query->node_idx); | ||
| 596 | } | ||
| 597 | |||
| 598 | spin_unlock(&dlm->spinlock); | ||
| 599 | } | ||
| 600 | spin_unlock(&dlm_domain_lock); | ||
| 601 | |||
| 602 | respond: | ||
| 603 | mlog(0, "We respond with %u\n", response); | ||
| 604 | |||
| 605 | return response; | ||
| 606 | } | ||
| 607 | |||
| 608 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) | ||
| 609 | { | ||
| 610 | struct dlm_assert_joined *assert; | ||
| 611 | struct dlm_ctxt *dlm = NULL; | ||
| 612 | |||
| 613 | assert = (struct dlm_assert_joined *) msg->buf; | ||
| 614 | |||
| 615 | mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, | ||
| 616 | assert->domain); | ||
| 617 | |||
| 618 | spin_lock(&dlm_domain_lock); | ||
| 619 | dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); | ||
| 620 | /* XXX should we consider no dlm ctxt an error? */ | ||
| 621 | if (dlm) { | ||
| 622 | spin_lock(&dlm->spinlock); | ||
| 623 | |||
| 624 | /* Alright, this node has officially joined our | ||
| 625 | * domain. Set him in the map and clean up our | ||
| 626 | * leftover join state. */ | ||
| 627 | BUG_ON(dlm->joining_node != assert->node_idx); | ||
| 628 | set_bit(assert->node_idx, dlm->domain_map); | ||
| 629 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
| 630 | |||
| 631 | __dlm_print_nodes(dlm); | ||
| 632 | |||
| 633 | /* notify anything attached to the heartbeat events */ | ||
| 634 | dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); | ||
| 635 | |||
| 636 | spin_unlock(&dlm->spinlock); | ||
| 637 | } | ||
| 638 | spin_unlock(&dlm_domain_lock); | ||
| 639 | |||
| 640 | return 0; | ||
| 641 | } | ||
| 642 | |||
| 643 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
| 644 | { | ||
| 645 | struct dlm_cancel_join *cancel; | ||
| 646 | struct dlm_ctxt *dlm = NULL; | ||
| 647 | |||
| 648 | cancel = (struct dlm_cancel_join *) msg->buf; | ||
| 649 | |||
| 650 | mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, | ||
| 651 | cancel->domain); | ||
| 652 | |||
| 653 | spin_lock(&dlm_domain_lock); | ||
| 654 | dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); | ||
| 655 | |||
| 656 | if (dlm) { | ||
| 657 | spin_lock(&dlm->spinlock); | ||
| 658 | |||
| 659 | /* Yikes, this guy wants to cancel his join. No | ||
| 660 | * problem, we simply cleanup our join state. */ | ||
| 661 | BUG_ON(dlm->joining_node != cancel->node_idx); | ||
| 662 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
| 663 | |||
| 664 | spin_unlock(&dlm->spinlock); | ||
| 665 | } | ||
| 666 | spin_unlock(&dlm_domain_lock); | ||
| 667 | |||
| 668 | return 0; | ||
| 669 | } | ||
| 670 | |||
| 671 | static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, | ||
| 672 | unsigned int node) | ||
| 673 | { | ||
| 674 | int status; | ||
| 675 | struct dlm_cancel_join cancel_msg; | ||
| 676 | |||
| 677 | memset(&cancel_msg, 0, sizeof(cancel_msg)); | ||
| 678 | cancel_msg.node_idx = dlm->node_num; | ||
| 679 | cancel_msg.name_len = strlen(dlm->name); | ||
| 680 | memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); | ||
| 681 | |||
| 682 | status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
| 683 | &cancel_msg, sizeof(cancel_msg), node, | ||
| 684 | NULL); | ||
| 685 | if (status < 0) { | ||
| 686 | mlog_errno(status); | ||
| 687 | goto bail; | ||
| 688 | } | ||
| 689 | |||
| 690 | bail: | ||
| 691 | return status; | ||
| 692 | } | ||
| 693 | |||
| 694 | /* map_size should be in bytes. */ | ||
| 695 | static int dlm_send_join_cancels(struct dlm_ctxt *dlm, | ||
| 696 | unsigned long *node_map, | ||
| 697 | unsigned int map_size) | ||
| 698 | { | ||
| 699 | int status, tmpstat; | ||
| 700 | unsigned int node; | ||
| 701 | |||
| 702 | if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * | ||
| 703 | sizeof(unsigned long))) { | ||
| 704 | mlog(ML_ERROR, | ||
| 705 | "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", | ||
| 706 | map_size, BITS_TO_LONGS(O2NM_MAX_NODES)); | ||
| 707 | return -EINVAL; | ||
| 708 | } | ||
| 709 | |||
| 710 | status = 0; | ||
| 711 | node = -1; | ||
| 712 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
| 713 | node + 1)) < O2NM_MAX_NODES) { | ||
| 714 | if (node == dlm->node_num) | ||
| 715 | continue; | ||
| 716 | |||
| 717 | tmpstat = dlm_send_one_join_cancel(dlm, node); | ||
| 718 | if (tmpstat) { | ||
| 719 | mlog(ML_ERROR, "Error return %d cancelling join on " | ||
| 720 | "node %d\n", tmpstat, node); | ||
| 721 | if (!status) | ||
| 722 | status = tmpstat; | ||
| 723 | } | ||
| 724 | } | ||
| 725 | |||
| 726 | if (status) | ||
| 727 | mlog_errno(status); | ||
| 728 | return status; | ||
| 729 | } | ||
| 730 | |||
| 731 | static int dlm_request_join(struct dlm_ctxt *dlm, | ||
| 732 | int node, | ||
| 733 | enum dlm_query_join_response *response) | ||
| 734 | { | ||
| 735 | int status, retval; | ||
| 736 | struct dlm_query_join_request join_msg; | ||
| 737 | |||
| 738 | mlog(0, "querying node %d\n", node); | ||
| 739 | |||
| 740 | memset(&join_msg, 0, sizeof(join_msg)); | ||
| 741 | join_msg.node_idx = dlm->node_num; | ||
| 742 | join_msg.name_len = strlen(dlm->name); | ||
| 743 | memcpy(join_msg.domain, dlm->name, join_msg.name_len); | ||
| 744 | |||
| 745 | status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, | ||
| 746 | sizeof(join_msg), node, &retval); | ||
| 747 | if (status < 0 && status != -ENOPROTOOPT) { | ||
| 748 | mlog_errno(status); | ||
| 749 | goto bail; | ||
| 750 | } | ||
| 751 | |||
| 752 | /* -ENOPROTOOPT from the net code means the other side isn't | ||
| 753 | listening for our message type -- that's fine, it means | ||
| 754 | his dlm isn't up, so we can consider him a 'yes' but not | ||
| 755 | joined into the domain. */ | ||
| 756 | if (status == -ENOPROTOOPT) { | ||
| 757 | status = 0; | ||
| 758 | *response = JOIN_OK_NO_MAP; | ||
| 759 | } else if (retval == JOIN_DISALLOW || | ||
| 760 | retval == JOIN_OK || | ||
| 761 | retval == JOIN_OK_NO_MAP) { | ||
| 762 | *response = retval; | ||
| 763 | } else { | ||
| 764 | status = -EINVAL; | ||
| 765 | mlog(ML_ERROR, "invalid response %d from node %u\n", retval, | ||
| 766 | node); | ||
| 767 | } | ||
| 768 | |||
| 769 | mlog(0, "status %d, node %d response is %d\n", status, node, | ||
| 770 | *response); | ||
| 771 | |||
| 772 | bail: | ||
| 773 | return status; | ||
| 774 | } | ||
| 775 | |||
| 776 | static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, | ||
| 777 | unsigned int node) | ||
| 778 | { | ||
| 779 | int status; | ||
| 780 | struct dlm_assert_joined assert_msg; | ||
| 781 | |||
| 782 | mlog(0, "Sending join assert to node %u\n", node); | ||
| 783 | |||
| 784 | memset(&assert_msg, 0, sizeof(assert_msg)); | ||
| 785 | assert_msg.node_idx = dlm->node_num; | ||
| 786 | assert_msg.name_len = strlen(dlm->name); | ||
| 787 | memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); | ||
| 788 | |||
| 789 | status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
| 790 | &assert_msg, sizeof(assert_msg), node, | ||
| 791 | NULL); | ||
| 792 | if (status < 0) | ||
| 793 | mlog_errno(status); | ||
| 794 | |||
| 795 | return status; | ||
| 796 | } | ||
| 797 | |||
| 798 | static void dlm_send_join_asserts(struct dlm_ctxt *dlm, | ||
| 799 | unsigned long *node_map) | ||
| 800 | { | ||
| 801 | int status, node, live; | ||
| 802 | |||
| 803 | status = 0; | ||
| 804 | node = -1; | ||
| 805 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
| 806 | node + 1)) < O2NM_MAX_NODES) { | ||
| 807 | if (node == dlm->node_num) | ||
| 808 | continue; | ||
| 809 | |||
| 810 | do { | ||
| 811 | /* It is very important that this message be | ||
| 812 | * received so we spin until either the node | ||
| 813 | * has died or it gets the message. */ | ||
| 814 | status = dlm_send_one_join_assert(dlm, node); | ||
| 815 | |||
| 816 | spin_lock(&dlm->spinlock); | ||
| 817 | live = test_bit(node, dlm->live_nodes_map); | ||
| 818 | spin_unlock(&dlm->spinlock); | ||
| 819 | |||
| 820 | if (status) { | ||
| 821 | mlog(ML_ERROR, "Error return %d asserting " | ||
| 822 | "join on node %d\n", status, node); | ||
| 823 | |||
| 824 | /* give us some time between errors... */ | ||
| 825 | if (live) | ||
| 826 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
| 827 | } | ||
| 828 | } while (status && live); | ||
| 829 | } | ||
| 830 | } | ||
| 831 | |||
| 832 | struct domain_join_ctxt { /* scratch node bitmaps used by one attempt to join a domain */ | ||
| 833 | unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* copy of dlm->live_nodes_map taken when the attempt starts */ | ||
| 834 | unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* bit set for each queried node that answered JOIN_OK */ | ||
| 835 | }; | ||
| 836 | |||
| 837 | static int dlm_should_restart_join(struct dlm_ctxt *dlm, | ||
| 838 | struct domain_join_ctxt *ctxt, | ||
| 839 | enum dlm_query_join_response response) | ||
| 840 | { | ||
| 841 | int ret; | ||
| 842 | |||
| 843 | if (response == JOIN_DISALLOW) { | ||
| 844 | mlog(0, "Latest response of disallow -- should restart\n"); | ||
| 845 | return 1; | ||
| 846 | } | ||
| 847 | |||
| 848 | spin_lock(&dlm->spinlock); | ||
| 849 | /* For now, we restart the process if the node maps have | ||
| 850 | * changed at all */ | ||
| 851 | ret = memcmp(ctxt->live_map, dlm->live_nodes_map, | ||
| 852 | sizeof(dlm->live_nodes_map)); | ||
| 853 | spin_unlock(&dlm->spinlock); | ||
| 854 | |||
| 855 | if (ret) | ||
| 856 | mlog(0, "Node maps changed -- should restart\n"); | ||
| 857 | |||
| 858 | return ret; | ||
| 859 | } | ||
| 860 | |||
| 861 | static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | ||
| 862 | { | ||
| 863 | int status = 0, tmpstat, node; | ||
| 864 | struct domain_join_ctxt *ctxt; | ||
| 865 | enum dlm_query_join_response response; | ||
| 866 | |||
| 867 | mlog_entry("%p", dlm); | ||
| 868 | |||
| 869 | ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); | ||
| 870 | if (!ctxt) { | ||
| 871 | status = -ENOMEM; | ||
| 872 | mlog_errno(status); | ||
| 873 | goto bail; | ||
| 874 | } | ||
| 875 | |||
| 876 | /* group sem locking should work for us here -- we're already | ||
| 877 | * registered for heartbeat events so filling this should be | ||
| 878 | * atomic wrt getting those handlers called. */ | ||
| 879 | o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); | ||
| 880 | |||
| 881 | spin_lock(&dlm->spinlock); | ||
| 882 | memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); | ||
| 883 | |||
| 884 | __dlm_set_joining_node(dlm, dlm->node_num); | ||
| 885 | |||
| 886 | spin_unlock(&dlm->spinlock); | ||
| 887 | |||
| 888 | node = -1; | ||
| 889 | while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, | ||
| 890 | node + 1)) < O2NM_MAX_NODES) { | ||
| 891 | if (node == dlm->node_num) | ||
| 892 | continue; | ||
| 893 | |||
| 894 | status = dlm_request_join(dlm, node, &response); | ||
| 895 | if (status < 0) { | ||
| 896 | mlog_errno(status); | ||
| 897 | goto bail; | ||
| 898 | } | ||
| 899 | |||
| 900 | /* Ok, either we got a response or the node doesn't have a | ||
| 901 | * dlm up. */ | ||
| 902 | if (response == JOIN_OK) | ||
| 903 | set_bit(node, ctxt->yes_resp_map); | ||
| 904 | |||
| 905 | if (dlm_should_restart_join(dlm, ctxt, response)) { | ||
| 906 | status = -EAGAIN; | ||
| 907 | goto bail; | ||
| 908 | } | ||
| 909 | } | ||
| 910 | |||
| 911 | mlog(0, "Yay, done querying nodes!\n"); | ||
| 912 | |||
| 913 | /* Yay, everyone agree's we can join the domain. My domain is | ||
| 914 | * comprised of all nodes who were put in the | ||
| 915 | * yes_resp_map. Copy that into our domain map and send a join | ||
| 916 | * assert message to clean up everyone elses state. */ | ||
| 917 | spin_lock(&dlm->spinlock); | ||
| 918 | memcpy(dlm->domain_map, ctxt->yes_resp_map, | ||
| 919 | sizeof(ctxt->yes_resp_map)); | ||
| 920 | set_bit(dlm->node_num, dlm->domain_map); | ||
| 921 | spin_unlock(&dlm->spinlock); | ||
| 922 | |||
| 923 | dlm_send_join_asserts(dlm, ctxt->yes_resp_map); | ||
| 924 | |||
| 925 | /* Joined state *must* be set before the joining node | ||
| 926 | * information, otherwise the query_join handler may read no | ||
| 927 | * current joiner but a state of NEW and tell joining nodes | ||
| 928 | * we're not in the domain. */ | ||
| 929 | spin_lock(&dlm_domain_lock); | ||
| 930 | dlm->dlm_state = DLM_CTXT_JOINED; | ||
| 931 | dlm->num_joins++; | ||
| 932 | spin_unlock(&dlm_domain_lock); | ||
| 933 | |||
| 934 | bail: | ||
| 935 | spin_lock(&dlm->spinlock); | ||
| 936 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
| 937 | if (!status) | ||
| 938 | __dlm_print_nodes(dlm); | ||
| 939 | spin_unlock(&dlm->spinlock); | ||
| 940 | |||
| 941 | if (ctxt) { | ||
| 942 | /* Do we need to send a cancel message to any nodes? */ | ||
| 943 | if (status < 0) { | ||
| 944 | tmpstat = dlm_send_join_cancels(dlm, | ||
| 945 | ctxt->yes_resp_map, | ||
| 946 | sizeof(ctxt->yes_resp_map)); | ||
| 947 | if (tmpstat < 0) | ||
| 948 | mlog_errno(tmpstat); | ||
| 949 | } | ||
| 950 | kfree(ctxt); | ||
| 951 | } | ||
| 952 | |||
| 953 | mlog(0, "returning %d\n", status); | ||
| 954 | return status; | ||
| 955 | } | ||
| 956 | |||
/*
 * Undo dlm_register_domain_handlers(): unregister both heartbeat
 * callbacks and every network handler on dlm->dlm_domain_handlers.
 */
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
	o2hb_unregister_callback(&dlm->dlm_hb_up);
	o2hb_unregister_callback(&dlm->dlm_hb_down);
	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
| 963 | |||
| 964 | static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | ||
| 965 | { | ||
| 966 | int status; | ||
| 967 | |||
| 968 | mlog(0, "registering handlers.\n"); | ||
| 969 | |||
| 970 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | ||
| 971 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | ||
| 972 | status = o2hb_register_callback(&dlm->dlm_hb_down); | ||
| 973 | if (status) | ||
| 974 | goto bail; | ||
| 975 | |||
| 976 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | ||
| 977 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | ||
| 978 | status = o2hb_register_callback(&dlm->dlm_hb_up); | ||
| 979 | if (status) | ||
| 980 | goto bail; | ||
| 981 | |||
| 982 | status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, | ||
| 983 | sizeof(struct dlm_master_request), | ||
| 984 | dlm_master_request_handler, | ||
| 985 | dlm, &dlm->dlm_domain_handlers); | ||
| 986 | if (status) | ||
| 987 | goto bail; | ||
| 988 | |||
| 989 | status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, | ||
| 990 | sizeof(struct dlm_assert_master), | ||
| 991 | dlm_assert_master_handler, | ||
| 992 | dlm, &dlm->dlm_domain_handlers); | ||
| 993 | if (status) | ||
| 994 | goto bail; | ||
| 995 | |||
| 996 | status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, | ||
| 997 | sizeof(struct dlm_create_lock), | ||
| 998 | dlm_create_lock_handler, | ||
| 999 | dlm, &dlm->dlm_domain_handlers); | ||
| 1000 | if (status) | ||
| 1001 | goto bail; | ||
| 1002 | |||
| 1003 | status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, | ||
| 1004 | DLM_CONVERT_LOCK_MAX_LEN, | ||
| 1005 | dlm_convert_lock_handler, | ||
| 1006 | dlm, &dlm->dlm_domain_handlers); | ||
| 1007 | if (status) | ||
| 1008 | goto bail; | ||
| 1009 | |||
| 1010 | status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, | ||
| 1011 | DLM_UNLOCK_LOCK_MAX_LEN, | ||
| 1012 | dlm_unlock_lock_handler, | ||
| 1013 | dlm, &dlm->dlm_domain_handlers); | ||
| 1014 | if (status) | ||
| 1015 | goto bail; | ||
| 1016 | |||
| 1017 | status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, | ||
| 1018 | DLM_PROXY_AST_MAX_LEN, | ||
| 1019 | dlm_proxy_ast_handler, | ||
| 1020 | dlm, &dlm->dlm_domain_handlers); | ||
| 1021 | if (status) | ||
| 1022 | goto bail; | ||
| 1023 | |||
| 1024 | status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
| 1025 | sizeof(struct dlm_exit_domain), | ||
| 1026 | dlm_exit_domain_handler, | ||
| 1027 | dlm, &dlm->dlm_domain_handlers); | ||
| 1028 | if (status) | ||
| 1029 | goto bail; | ||
| 1030 | |||
| 1031 | status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, | ||
| 1032 | sizeof(struct dlm_migrate_request), | ||
| 1033 | dlm_migrate_request_handler, | ||
| 1034 | dlm, &dlm->dlm_domain_handlers); | ||
| 1035 | if (status) | ||
| 1036 | goto bail; | ||
| 1037 | |||
| 1038 | status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, | ||
| 1039 | DLM_MIG_LOCKRES_MAX_LEN, | ||
| 1040 | dlm_mig_lockres_handler, | ||
| 1041 | dlm, &dlm->dlm_domain_handlers); | ||
| 1042 | if (status) | ||
| 1043 | goto bail; | ||
| 1044 | |||
| 1045 | status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, | ||
| 1046 | sizeof(struct dlm_master_requery), | ||
| 1047 | dlm_master_requery_handler, | ||
| 1048 | dlm, &dlm->dlm_domain_handlers); | ||
| 1049 | if (status) | ||
| 1050 | goto bail; | ||
| 1051 | |||
| 1052 | status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, | ||
| 1053 | sizeof(struct dlm_lock_request), | ||
| 1054 | dlm_request_all_locks_handler, | ||
| 1055 | dlm, &dlm->dlm_domain_handlers); | ||
| 1056 | if (status) | ||
| 1057 | goto bail; | ||
| 1058 | |||
| 1059 | status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, | ||
| 1060 | sizeof(struct dlm_reco_data_done), | ||
| 1061 | dlm_reco_data_done_handler, | ||
| 1062 | dlm, &dlm->dlm_domain_handlers); | ||
| 1063 | if (status) | ||
| 1064 | goto bail; | ||
| 1065 | |||
| 1066 | status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, | ||
| 1067 | sizeof(struct dlm_begin_reco), | ||
| 1068 | dlm_begin_reco_handler, | ||
| 1069 | dlm, &dlm->dlm_domain_handlers); | ||
| 1070 | if (status) | ||
| 1071 | goto bail; | ||
| 1072 | |||
| 1073 | status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, | ||
| 1074 | sizeof(struct dlm_finalize_reco), | ||
| 1075 | dlm_finalize_reco_handler, | ||
| 1076 | dlm, &dlm->dlm_domain_handlers); | ||
| 1077 | if (status) | ||
| 1078 | goto bail; | ||
| 1079 | |||
| 1080 | bail: | ||
| 1081 | if (status) | ||
| 1082 | dlm_unregister_domain_handlers(dlm); | ||
| 1083 | |||
| 1084 | return status; | ||
| 1085 | } | ||
| 1086 | |||
| 1087 | static int dlm_join_domain(struct dlm_ctxt *dlm) | ||
| 1088 | { | ||
| 1089 | int status; | ||
| 1090 | |||
| 1091 | BUG_ON(!dlm); | ||
| 1092 | |||
| 1093 | mlog(0, "Join domain %s\n", dlm->name); | ||
| 1094 | |||
| 1095 | status = dlm_register_domain_handlers(dlm); | ||
| 1096 | if (status) { | ||
| 1097 | mlog_errno(status); | ||
| 1098 | goto bail; | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | status = dlm_launch_thread(dlm); | ||
| 1102 | if (status < 0) { | ||
| 1103 | mlog_errno(status); | ||
| 1104 | goto bail; | ||
| 1105 | } | ||
| 1106 | |||
| 1107 | status = dlm_launch_recovery_thread(dlm); | ||
| 1108 | if (status < 0) { | ||
| 1109 | mlog_errno(status); | ||
| 1110 | goto bail; | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | do { | ||
| 1114 | unsigned int backoff; | ||
| 1115 | status = dlm_try_to_join_domain(dlm); | ||
| 1116 | |||
| 1117 | /* If we're racing another node to the join, then we | ||
| 1118 | * need to back off temporarily and let them | ||
| 1119 | * complete. */ | ||
| 1120 | if (status == -EAGAIN) { | ||
| 1121 | if (signal_pending(current)) { | ||
| 1122 | status = -ERESTARTSYS; | ||
| 1123 | goto bail; | ||
| 1124 | } | ||
| 1125 | |||
| 1126 | /* | ||
| 1127 | * <chip> After you! | ||
| 1128 | * <dale> No, after you! | ||
| 1129 | * <chip> I insist! | ||
| 1130 | * <dale> But you first! | ||
| 1131 | * ... | ||
| 1132 | */ | ||
| 1133 | backoff = (unsigned int)(jiffies & 0x3); | ||
| 1134 | backoff *= DLM_DOMAIN_BACKOFF_MS; | ||
| 1135 | mlog(0, "backoff %d\n", backoff); | ||
| 1136 | msleep(backoff); | ||
| 1137 | } | ||
| 1138 | } while (status == -EAGAIN); | ||
| 1139 | |||
| 1140 | if (status < 0) { | ||
| 1141 | mlog_errno(status); | ||
| 1142 | goto bail; | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | status = 0; | ||
| 1146 | bail: | ||
| 1147 | wake_up(&dlm_domain_events); | ||
| 1148 | |||
| 1149 | if (status) { | ||
| 1150 | dlm_unregister_domain_handlers(dlm); | ||
| 1151 | dlm_complete_thread(dlm); | ||
| 1152 | dlm_complete_recovery_thread(dlm); | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | return status; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | ||
| 1159 | u32 key) | ||
| 1160 | { | ||
| 1161 | int i; | ||
| 1162 | struct dlm_ctxt *dlm = NULL; | ||
| 1163 | |||
| 1164 | dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); | ||
| 1165 | if (!dlm) { | ||
| 1166 | mlog_errno(-ENOMEM); | ||
| 1167 | goto leave; | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); | ||
| 1171 | if (dlm->name == NULL) { | ||
| 1172 | mlog_errno(-ENOMEM); | ||
| 1173 | kfree(dlm); | ||
| 1174 | dlm = NULL; | ||
| 1175 | goto leave; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); | ||
| 1179 | if (!dlm->resources) { | ||
| 1180 | mlog_errno(-ENOMEM); | ||
| 1181 | kfree(dlm->name); | ||
| 1182 | kfree(dlm); | ||
| 1183 | dlm = NULL; | ||
| 1184 | goto leave; | ||
| 1185 | } | ||
| 1186 | memset(dlm->resources, 0, PAGE_SIZE); | ||
| 1187 | |||
| 1188 | for (i=0; i<DLM_HASH_SIZE; i++) | ||
| 1189 | INIT_LIST_HEAD(&dlm->resources[i]); | ||
| 1190 | |||
| 1191 | strcpy(dlm->name, domain); | ||
| 1192 | dlm->key = key; | ||
| 1193 | dlm->node_num = o2nm_this_node(); | ||
| 1194 | |||
| 1195 | spin_lock_init(&dlm->spinlock); | ||
| 1196 | spin_lock_init(&dlm->master_lock); | ||
| 1197 | spin_lock_init(&dlm->ast_lock); | ||
| 1198 | INIT_LIST_HEAD(&dlm->list); | ||
| 1199 | INIT_LIST_HEAD(&dlm->dirty_list); | ||
| 1200 | INIT_LIST_HEAD(&dlm->reco.resources); | ||
| 1201 | INIT_LIST_HEAD(&dlm->reco.received); | ||
| 1202 | INIT_LIST_HEAD(&dlm->reco.node_data); | ||
| 1203 | INIT_LIST_HEAD(&dlm->purge_list); | ||
| 1204 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | ||
| 1205 | dlm->reco.state = 0; | ||
| 1206 | |||
| 1207 | INIT_LIST_HEAD(&dlm->pending_asts); | ||
| 1208 | INIT_LIST_HEAD(&dlm->pending_basts); | ||
| 1209 | |||
| 1210 | mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", | ||
| 1211 | dlm->recovery_map, &(dlm->recovery_map[0])); | ||
| 1212 | |||
| 1213 | memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); | ||
| 1214 | memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); | ||
| 1215 | memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); | ||
| 1216 | |||
| 1217 | dlm->dlm_thread_task = NULL; | ||
| 1218 | dlm->dlm_reco_thread_task = NULL; | ||
| 1219 | init_waitqueue_head(&dlm->dlm_thread_wq); | ||
| 1220 | init_waitqueue_head(&dlm->dlm_reco_thread_wq); | ||
| 1221 | init_waitqueue_head(&dlm->reco.event); | ||
| 1222 | init_waitqueue_head(&dlm->ast_wq); | ||
| 1223 | init_waitqueue_head(&dlm->migration_wq); | ||
| 1224 | INIT_LIST_HEAD(&dlm->master_list); | ||
| 1225 | INIT_LIST_HEAD(&dlm->mle_hb_events); | ||
| 1226 | |||
| 1227 | dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
| 1228 | init_waitqueue_head(&dlm->dlm_join_events); | ||
| 1229 | |||
| 1230 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
| 1231 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
| 1232 | atomic_set(&dlm->local_resources, 0); | ||
| 1233 | atomic_set(&dlm->remote_resources, 0); | ||
| 1234 | atomic_set(&dlm->unknown_resources, 0); | ||
| 1235 | |||
| 1236 | spin_lock_init(&dlm->work_lock); | ||
| 1237 | INIT_LIST_HEAD(&dlm->work_list); | ||
| 1238 | INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm); | ||
| 1239 | |||
| 1240 | kref_init(&dlm->dlm_refs); | ||
| 1241 | dlm->dlm_state = DLM_CTXT_NEW; | ||
| 1242 | |||
| 1243 | INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); | ||
| 1244 | |||
| 1245 | mlog(0, "context init: refcount %u\n", | ||
| 1246 | atomic_read(&dlm->dlm_refs.refcount)); | ||
| 1247 | |||
| 1248 | leave: | ||
| 1249 | return dlm; | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | /* | ||
| 1253 | * dlm_register_domain: one-time setup per "domain" | ||
| 1254 | */ | ||
| 1255 | struct dlm_ctxt * dlm_register_domain(const char *domain, | ||
| 1256 | u32 key) | ||
| 1257 | { | ||
| 1258 | int ret; | ||
| 1259 | struct dlm_ctxt *dlm = NULL; | ||
| 1260 | struct dlm_ctxt *new_ctxt = NULL; | ||
| 1261 | |||
| 1262 | if (strlen(domain) > O2NM_MAX_NAME_LEN) { | ||
| 1263 | ret = -ENAMETOOLONG; | ||
| 1264 | mlog(ML_ERROR, "domain name length too long\n"); | ||
| 1265 | goto leave; | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | if (!o2hb_check_local_node_heartbeating()) { | ||
| 1269 | mlog(ML_ERROR, "the local node has not been configured, or is " | ||
| 1270 | "not heartbeating\n"); | ||
| 1271 | ret = -EPROTO; | ||
| 1272 | goto leave; | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | mlog(0, "register called for domain \"%s\"\n", domain); | ||
| 1276 | |||
| 1277 | retry: | ||
| 1278 | dlm = NULL; | ||
| 1279 | if (signal_pending(current)) { | ||
| 1280 | ret = -ERESTARTSYS; | ||
| 1281 | mlog_errno(ret); | ||
| 1282 | goto leave; | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | spin_lock(&dlm_domain_lock); | ||
| 1286 | |||
| 1287 | dlm = __dlm_lookup_domain(domain); | ||
| 1288 | if (dlm) { | ||
| 1289 | if (dlm->dlm_state != DLM_CTXT_JOINED) { | ||
| 1290 | spin_unlock(&dlm_domain_lock); | ||
| 1291 | |||
| 1292 | mlog(0, "This ctxt is not joined yet!\n"); | ||
| 1293 | wait_event_interruptible(dlm_domain_events, | ||
| 1294 | dlm_wait_on_domain_helper( | ||
| 1295 | domain)); | ||
| 1296 | goto retry; | ||
| 1297 | } | ||
| 1298 | |||
| 1299 | __dlm_get(dlm); | ||
| 1300 | dlm->num_joins++; | ||
| 1301 | |||
| 1302 | spin_unlock(&dlm_domain_lock); | ||
| 1303 | |||
| 1304 | ret = 0; | ||
| 1305 | goto leave; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | /* doesn't exist */ | ||
| 1309 | if (!new_ctxt) { | ||
| 1310 | spin_unlock(&dlm_domain_lock); | ||
| 1311 | |||
| 1312 | new_ctxt = dlm_alloc_ctxt(domain, key); | ||
| 1313 | if (new_ctxt) | ||
| 1314 | goto retry; | ||
| 1315 | |||
| 1316 | ret = -ENOMEM; | ||
| 1317 | mlog_errno(ret); | ||
| 1318 | goto leave; | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | /* a little variable switch-a-roo here... */ | ||
| 1322 | dlm = new_ctxt; | ||
| 1323 | new_ctxt = NULL; | ||
| 1324 | |||
| 1325 | /* add the new domain */ | ||
| 1326 | list_add_tail(&dlm->list, &dlm_domains); | ||
| 1327 | spin_unlock(&dlm_domain_lock); | ||
| 1328 | |||
| 1329 | ret = dlm_join_domain(dlm); | ||
| 1330 | if (ret) { | ||
| 1331 | mlog_errno(ret); | ||
| 1332 | dlm_put(dlm); | ||
| 1333 | goto leave; | ||
| 1334 | } | ||
| 1335 | |||
| 1336 | ret = 0; | ||
| 1337 | leave: | ||
| 1338 | if (new_ctxt) | ||
| 1339 | dlm_free_ctxt_mem(new_ctxt); | ||
| 1340 | |||
| 1341 | if (ret < 0) | ||
| 1342 | dlm = ERR_PTR(ret); | ||
| 1343 | |||
| 1344 | return dlm; | ||
| 1345 | } | ||
| 1346 | EXPORT_SYMBOL_GPL(dlm_register_domain); | ||
| 1347 | |||
/* Network handlers registered by dlm_register_net_handlers(). */
static LIST_HEAD(dlm_join_handlers);
| 1349 | |||
/* Unregister every network handler on the dlm_join_handlers list. */
static void dlm_unregister_net_handlers(void)
{
	o2net_unregister_handler_list(&dlm_join_handlers);
}
| 1354 | |||
| 1355 | static int dlm_register_net_handlers(void) | ||
| 1356 | { | ||
| 1357 | int status = 0; | ||
| 1358 | |||
| 1359 | status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, | ||
| 1360 | sizeof(struct dlm_query_join_request), | ||
| 1361 | dlm_query_join_handler, | ||
| 1362 | NULL, &dlm_join_handlers); | ||
| 1363 | if (status) | ||
| 1364 | goto bail; | ||
| 1365 | |||
| 1366 | status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
| 1367 | sizeof(struct dlm_assert_joined), | ||
| 1368 | dlm_assert_joined_handler, | ||
| 1369 | NULL, &dlm_join_handlers); | ||
| 1370 | if (status) | ||
| 1371 | goto bail; | ||
| 1372 | |||
| 1373 | status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
| 1374 | sizeof(struct dlm_cancel_join), | ||
| 1375 | dlm_cancel_join_handler, | ||
| 1376 | NULL, &dlm_join_handlers); | ||
| 1377 | |||
| 1378 | bail: | ||
| 1379 | if (status < 0) | ||
| 1380 | dlm_unregister_net_handlers(); | ||
| 1381 | |||
| 1382 | return status; | ||
| 1383 | } | ||
| 1384 | |||
| 1385 | /* Domain eviction callback handling. | ||
| 1386 | * | ||
| 1387 | * The file system requires notification of node death *before* the | ||
| 1388 | * dlm completes its recovery work, otherwise it may be able to | ||
| 1389 | * acquire locks on resources requiring recovery. Since the dlm can | ||
| 1390 | * evict a node from its domain *before* heartbeat fires, a similar | ||
| 1391 | * mechanism is required. */ | ||
| 1392 | |||
/* Eviction is not expected to happen often, so a per-domain lock is
 * not necessary.  Eviction callbacks are allowed to sleep for short
 * periods of time.
 *
 * This semaphore is taken for reading while callbacks are being fired
 * and for writing while a callback is added to or removed from a
 * domain's list. */
static DECLARE_RWSEM(dlm_callback_sem);
| 1397 | |||
| 1398 | void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, | ||
| 1399 | int node_num) | ||
| 1400 | { | ||
| 1401 | struct list_head *iter; | ||
| 1402 | struct dlm_eviction_cb *cb; | ||
| 1403 | |||
| 1404 | down_read(&dlm_callback_sem); | ||
| 1405 | list_for_each(iter, &dlm->dlm_eviction_callbacks) { | ||
| 1406 | cb = list_entry(iter, struct dlm_eviction_cb, ec_item); | ||
| 1407 | |||
| 1408 | cb->ec_func(node_num, cb->ec_data); | ||
| 1409 | } | ||
| 1410 | up_read(&dlm_callback_sem); | ||
| 1411 | } | ||
| 1412 | |||
/*
 * Initialize an eviction callback descriptor: @f is the function to call
 * and @data is handed back to it as its second argument.  The descriptor
 * is not attached to any domain here; see dlm_register_eviction_cb().
 */
void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
			   dlm_eviction_func *f,
			   void *data)
{
	INIT_LIST_HEAD(&cb->ec_item);
	cb->ec_func = f;
	cb->ec_data = data;
}
EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
| 1422 | |||
/*
 * Append @cb to @dlm's list of eviction callbacks.  The list is modified
 * with dlm_callback_sem held for writing.
 */
void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
			      struct dlm_eviction_cb *cb)
{
	down_write(&dlm_callback_sem);
	list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
	up_write(&dlm_callback_sem);
}
EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
| 1431 | |||
/*
 * Remove @cb from whatever eviction callback list it is on and
 * reinitialize its list linkage.  Done with dlm_callback_sem held for
 * writing.
 */
void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
{
	down_write(&dlm_callback_sem);
	list_del_init(&cb->ec_item);
	up_write(&dlm_callback_sem);
}
EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
| 1439 | |||
| 1440 | static int __init dlm_init(void) | ||
| 1441 | { | ||
| 1442 | int status; | ||
| 1443 | |||
| 1444 | dlm_print_version(); | ||
| 1445 | |||
| 1446 | status = dlm_init_mle_cache(); | ||
| 1447 | if (status) | ||
| 1448 | return -1; | ||
| 1449 | |||
| 1450 | status = dlm_register_net_handlers(); | ||
| 1451 | if (status) { | ||
| 1452 | dlm_destroy_mle_cache(); | ||
| 1453 | return -1; | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | return 0; | ||
| 1457 | } | ||
| 1458 | |||
/*
 * Module exit point: undo dlm_init() in reverse order by unregistering
 * the join network handlers and then destroying the mle cache.
 */
static void __exit dlm_exit (void)
{
	dlm_unregister_net_handlers();
	dlm_destroy_mle_cache();
}
| 1464 | |||
/* Module metadata. */
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");

/* Hook dlm_init()/dlm_exit() up as the module's load and unload entry points. */
module_init(dlm_init);
module_exit(dlm_exit);
