-rw-r--r--   fs/ocfs2/dlm/dlmast.c      |  12
-rw-r--r--   fs/ocfs2/dlm/dlmcommon.h   |  63
-rw-r--r--   fs/ocfs2/dlm/dlmconvert.c  |  24
-rw-r--r--   fs/ocfs2/dlm/dlmdebug.c    |   6
-rw-r--r--   fs/ocfs2/dlm/dlmdebug.h    |  30
-rw-r--r--   fs/ocfs2/dlm/dlmdomain.c   | 101
-rw-r--r--   fs/ocfs2/dlm/dlmfs.c       |   6
-rw-r--r--   fs/ocfs2/dlm/dlmlock.c     |  68
-rw-r--r--   fs/ocfs2/dlm/dlmmaster.c   | 448
-rw-r--r--   fs/ocfs2/dlm/dlmrecovery.c | 580
-rw-r--r--   fs/ocfs2/dlm/dlmthread.c   |  68
-rw-r--r--   fs/ocfs2/dlm/dlmunlock.c   |  10
-rw-r--r--   fs/ocfs2/dlm/userdlm.c     |   2

13 files changed, 1045 insertions, 373 deletions
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 87ee29cad50b..42775e2bbe2c 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 		     lock->ml.node == dlm->node_num ? "master" :
 		     "remote");
 		memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
-	} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
-		mlog(0, "setting lvb from lockres for %s node\n",
-		     lock->ml.node == dlm->node_num ? "master" : "remote");
-		memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
 	}
+	/* Do nothing for lvb put requests - they should be done in
+	 * place when the lock is downconverted - otherwise we risk
+	 * racing gets and puts which could result in old lvb data
+	 * being propagated. We leave the put flag set and clear it
+	 * here. In the future we might want to clear it at the time
+	 * the put is actually done.
+	 */
 	spin_unlock(&res->spinlock);
 }
 
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f1..9bdc9cf65991 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,17 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL   5     // flush everything every 5 passes
 #define DLM_THREAD_MS                 200   // flush at least every 200 ms
 
-#define DLM_HASH_BUCKETS	(PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_SIZE_DEFAULT	(1 << 14)
+#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
+# define DLM_HASH_PAGES		1
+#else
+# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
+#endif
+#define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
+
+/* Intended to make it easier for us to switch out hash functions */
+#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 
 enum dlm_ast_type {
 	DLM_AST = 0,
@@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
 	return 0;
 }
 
 #define DLM_RECO_STATE_ACTIVE    0x0001
+#define DLM_RECO_STATE_FINALIZE  0x0002
 
 struct dlm_recovery_ctxt
 {
@@ -85,7 +96,7 @@ enum dlm_ctxt_state {
 struct dlm_ctxt
 {
 	struct list_head list;
-	struct hlist_head *lockres_hash;
+	struct hlist_head **lockres_hash;
 	struct list_head dirty_list;
 	struct list_head purge_list;
 	struct list_head pending_asts;
@@ -120,6 +131,7 @@ struct dlm_ctxt
 	struct o2hb_callback_func dlm_hb_down;
 	struct task_struct *dlm_thread_task;
 	struct task_struct *dlm_reco_thread_task;
+	struct workqueue_struct *dlm_worker;
 	wait_queue_head_t dlm_thread_wq;
 	wait_queue_head_t dlm_reco_thread_wq;
 	wait_queue_head_t ast_wq;
@@ -132,6 +144,11 @@ struct dlm_ctxt
 	struct list_head dlm_eviction_callbacks;
 };
 
+static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
+{
+	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -216,20 +233,29 @@ struct dlm_lock_resource
 	/* WARNING: Please see the comment in dlm_init_lockres before
 	 * adding fields here. */
 	struct hlist_node hash_node;
+	struct qstr lockname;
 	struct kref refs;
 
-	/* please keep these next 3 in this order
-	 * some funcs want to iterate over all lists */
+	/*
+	 * Please keep granted, converting, and blocked in this order,
+	 * as some funcs want to iterate over all lists.
+	 *
+	 * All four lists are protected by the hash's reference.
+	 */
 	struct list_head granted;
 	struct list_head converting;
 	struct list_head blocked;
+	struct list_head purge;
 
+	/*
+	 * These two lists require you to hold an additional reference
+	 * while they are on the list.
+	 */
 	struct list_head dirty;
 	struct list_head recovering; // dlm_recovery_ctxt.resources list
 
 	/* unused lock resources have their last_used stamped and are
 	 * put on a list for the dlm thread to run. */
-	struct list_head purge;
 	unsigned long last_used;
 
 	unsigned migration_pending:1;
@@ -238,7 +264,6 @@ struct dlm_lock_resource
 	wait_queue_head_t wq;
 	u8  owner;              //node which owns the lock resource, or unknown
 	u16 state;
-	struct qstr lockname;
 	char lvb[DLM_LVB_LEN];
 };
 
@@ -300,6 +325,15 @@ enum dlm_lockres_list {
 	DLM_BLOCKED_LIST
 };
 
+static inline int dlm_lvb_is_empty(char *lvb)
+{
+	int i;
+	for (i=0; i<DLM_LVB_LEN; i++)
+		if (lvb[i])
+			return 0;
+	return 1;
+}
+
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
@@ -609,7 +643,8 @@ struct dlm_finalize_reco
 {
 	u8 node_idx;
 	u8 dead_node;
-	__be16 pad1;
+	u8 flags;
+	u8 pad1;
 	__be32 pad2;
 };
 
@@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			    struct dlm_lock_resource *res);
 void dlm_purge_lockres(struct dlm_ctxt *dlm,
 		       struct dlm_lock_resource *lockres);
-void dlm_lockres_get(struct dlm_lock_resource *res);
+static inline void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	/* This is called on every lookup, so it might be worth
+	 * inlining. */
+	kref_get(&res->refs);
+}
 void dlm_lockres_put(struct dlm_lock_resource *res);
 void __dlm_unhash_lockres(struct dlm_lock_resource *res);
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 			  struct dlm_lock_resource *res);
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 						const char *name,
-						unsigned int len);
+						unsigned int len,
+						unsigned int hash);
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      const char *name,
 					      unsigned int len);
@@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 
+int __dlm_lockres_unused(struct dlm_lock_resource *res);
 
 static inline const char * dlm_lock_mode_name(int mode)
 {
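
An aside on the arithmetic in the new hash layout (not part of the patch): with a 4 KB PAGE_SIZE and an 8-byte struct hlist_head, both assumptions for this example, DLM_HASH_SIZE_DEFAULT = 1 << 14 gives DLM_HASH_PAGES = 4, DLM_BUCKETS_PER_PAGE = 512, and DLM_HASH_BUCKETS = 2048. The standalone sketch below mirrors the index math in dlm_lockres_hash() to show how a hash value selects a (page, slot) pair; because 2048 = 4 * 512, it is equivalent to taking hash % DLM_HASH_BUCKETS in a flat table.

	/* Standalone demo of the two-level bucket addressing used by
	 * dlm_lockres_hash().  The constants assume PAGE_SIZE == 4096 and
	 * sizeof(struct hlist_head) == 8; both are assumptions for the demo. */
	#include <stdio.h>

	#define DEMO_PAGE_SIZE       4096
	#define DEMO_HLIST_HEAD_SIZE 8
	#define HASH_SIZE_DEFAULT    (1 << 14)
	#define HASH_PAGES           (HASH_SIZE_DEFAULT / DEMO_PAGE_SIZE)      /* 4 */
	#define BUCKETS_PER_PAGE     (DEMO_PAGE_SIZE / DEMO_HLIST_HEAD_SIZE)   /* 512 */
	#define HASH_BUCKETS         (HASH_PAGES * BUCKETS_PER_PAGE)           /* 2048 */

	int main(void)
	{
		unsigned hash = 987654321;  /* e.g. a full_name_hash() result */

		/* exactly the expression in dlm_lockres_hash() */
		unsigned page = (hash / BUCKETS_PER_PAGE) % HASH_PAGES;
		unsigned slot = hash % BUCKETS_PER_PAGE;

		/* dlm_lockres_hash(dlm, hash) would return
		 * dlm->lockres_hash[page] + slot for this value. */
		printf("hash %u -> bucket %u -> page %u, slot %u\n",
		       hash, hash % HASH_BUCKETS, page, slot);
		return 0;
	}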
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 70888b31e751..c764dc8e40a2 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
 	if (lock->ml.node == dlm->node_num)
 		mlog(0, "doing in-place convert for nonlocal lock\n");
 	lock->ml.type = type;
+	if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
+		memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+
 	status = DLM_NORMAL;
 	*call_ast = 1;
 	goto unlock_exit;
@@ -461,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	}
 
 	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL) {
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		goto leave;
+	}
 	list_for_each(iter, &res->granted) {
 		lock = list_entry(iter, struct dlm_lock, list);
 		if (lock->ml.cookie == cnv->cookie &&
@@ -470,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 		}
 		lock = NULL;
 	}
+	if (!lock) {
+		__dlm_print_one_lock_resource(res);
+		list_for_each(iter, &res->granted) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node == cnv->node_idx) {
+				mlog(ML_ERROR, "There is something here "
+				     "for node %u, lock->ml.cookie=%llu, "
+				     "cnv->cookie=%llu\n", cnv->node_idx,
+				     (unsigned long long)lock->ml.cookie,
+				     (unsigned long long)cnv->cookie);
+				break;
+			}
+		}
+		lock = NULL;
+	}
 	spin_unlock(&res->spinlock);
 	if (!lock) {
 		status = DLM_IVLOCKID;
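
Taken together with the dlmast.c hunk above, these two changes give LVB "put" requests a single, race-free home: the lockres LVB is now written only during the downconvert itself, under res->spinlock, instead of again at AST delivery time. A condensed sketch of the resulting path (hypothetical helper name; the real logic is inline in __dlmconvert_master() at its grant: label):

	/* Hypothetical condensation of the post-patch LVB put path.
	 * The caller must hold res->spinlock, as __dlmconvert_master()
	 * does at this point.  Not a drop-in implementation. */
	static void dlm_put_lvb_on_downconvert(struct dlm_lock_resource *res,
					       struct dlm_lock *lock, int new_type)
	{
		lock->ml.type = new_type;			/* in-place convert */
		if (lock->lksb->flags & DLM_LKSB_PUT_LVB)	/* publish caller's LVB */
			memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
		/* dlm_update_lvb() now deliberately ignores DLM_LKSB_PUT_LVB,
		 * so a racing AST cannot re-publish stale LVB contents. */
	}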
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324e..3f6c8d88f7af 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 
 #include "dlmdomain.h"
-#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 
+#if 0
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
 	struct dlm_lock_resource *res;
@@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 
 	spin_lock(&dlm->spinlock);
 	for (i=0; i<DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, iter, bucket, hash_node)
 			dlm_print_one_lock_resource(res);
 	}
 	spin_unlock(&dlm->spinlock);
 }
+#endif /* 0 */
 
 static const char *dlm_errnames[] = {
 	[DLM_NORMAL] =			"DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3ccd..000000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmdebug.h
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef DLMDEBUG_H
-#define DLMDEBUG_H
-
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
-
-#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106fd..ba27c5c5e959 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 
-#include "dlmdebug.h"
 #include "dlmdomain.h"
 
 #include "dlmver.h"
@@ -49,6 +48,33 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
+static void dlm_free_pagevec(void **vec, int pages)
+{
+	while (pages--)
+		free_page((unsigned long)vec[pages]);
+	kfree(vec);
+}
+
+static void **dlm_alloc_pagevec(int pages)
+{
+	void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+	int i;
+
+	if (!vec)
+		return NULL;
+
+	for (i = 0; i < pages; i++)
+		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
+			goto out_free;
+
+	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+	     pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE);
+	return vec;
+out_free:
+	dlm_free_pagevec(vec, i);
+	return NULL;
+}
+
 /*
  *
  * spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 	assert_spin_locked(&dlm->spinlock);
 
 	q = &res->lockname;
-	q->hash = full_name_hash(q->name, q->len);
-	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
+	bucket = dlm_lockres_hash(dlm, q->hash);
 
 	/* get a reference for our hashtable */
 	dlm_lockres_get(res);
@@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 }
 
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 						const char *name,
-						unsigned int len)
+						unsigned int len,
+						unsigned int hash)
 {
-	unsigned int hash;
-	struct hlist_node *iter;
-	struct dlm_lock_resource *tmpres=NULL;
 	struct hlist_head *bucket;
+	struct hlist_node *list;
 
 	mlog_entry("%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
-	hash = full_name_hash(name, len);
-
-	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
-
-	/* check for pre-existing lock */
-	hlist_for_each(iter, bucket) {
-		tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
-		if (tmpres->lockname.len == len &&
-		    memcmp(tmpres->lockname.name, name, len) == 0) {
-			dlm_lockres_get(tmpres);
-			break;
-		}
+	bucket = dlm_lockres_hash(dlm, hash);
 
-		tmpres = NULL;
+	hlist_for_each(list, bucket) {
+		struct dlm_lock_resource *res = hlist_entry(list,
+			struct dlm_lock_resource, hash_node);
+		if (res->lockname.name[0] != name[0])
+			continue;
+		if (unlikely(res->lockname.len != len))
+			continue;
+		if (memcmp(res->lockname.name + 1, name + 1, len - 1))
+			continue;
+		dlm_lockres_get(res);
+		return res;
 	}
-	return tmpres;
+	return NULL;
 }
 
@@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len)
 {
 	struct dlm_lock_resource *res;
+	unsigned int hash = dlm_lockid_hash(name, len);
 
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, name, len);
+	res = __dlm_lookup_lockres(dlm, name, len, hash);
 	spin_unlock(&dlm->spinlock);
 	return res;
 }
@@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
 	if (dlm->lockres_hash)
-		free_page((unsigned long) dlm->lockres_hash);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
 	if (dlm->name)
 		kfree(dlm->name);
@@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
 	return ret;
 }
 
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_worker) {
+		flush_workqueue(dlm->dlm_worker);
+		destroy_workqueue(dlm->dlm_worker);
+		dlm->dlm_worker = NULL;
+	}
+}
+
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
 	dlm_unregister_domain_handlers(dlm);
 	dlm_complete_thread(dlm);
 	dlm_complete_recovery_thread(dlm);
+	dlm_destroy_dlm_worker(dlm);
 
 	/* We've left the domain. Now we can take ourselves out of the
 	 * list and allow the kref stuff to help us free the
@@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 restart:
 	spin_lock(&dlm->spinlock);
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		while (!hlist_empty(&dlm->lockres_hash[i])) {
-			res = hlist_entry(dlm->lockres_hash[i].first,
+		while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
+			res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
 					  struct dlm_lock_resource, hash_node);
 			/* need reference when manually grabbing lockres */
 			dlm_lockres_get(res);
@@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+	if (!dlm->dlm_worker) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	do {
 		unsigned int backoff;
 		status = dlm_try_to_join_domain(dlm);
@@ -1166,6 +1207,7 @@ bail:
 		dlm_unregister_domain_handlers(dlm);
 		dlm_complete_thread(dlm);
 		dlm_complete_recovery_thread(dlm);
+		dlm_destroy_dlm_worker(dlm);
 	}
 
 	return status;
@@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->lockres_hash) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm->name);
@@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	for (i=0; i<DLM_HASH_BUCKETS; i++)
-		INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
 	strcpy(dlm->name, domain);
 	dlm->key = key;
@@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->dlm_thread_task = NULL;
 	dlm->dlm_reco_thread_task = NULL;
+	dlm->dlm_worker = NULL;
 	init_waitqueue_head(&dlm->dlm_thread_wq);
 	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
 	init_waitqueue_head(&dlm->reco.event);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7273d9fa6bab..033ad1701232 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
 	 * doesn't make sense for LVB writes. */
 	file->f_flags &= ~O_APPEND;
 
-	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	fp = kmalloc(sizeof(*fp), GFP_NOFS);
 	if (!fp) {
 		status = -ENOMEM;
 		goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	else
 		readlen = count - *ppos;
 
-	lvb_buf = kmalloc(readlen, GFP_KERNEL);
+	lvb_buf = kmalloc(readlen, GFP_NOFS);
 	if (!lvb_buf)
 		return -ENOMEM;
 
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	else
 		writelen = count - *ppos;
 
-	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	lvb_buf = kmalloc(writelen, GFP_NOFS);
 	if (!lvb_buf)
 		return -ENOMEM;
 
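
A note on the GFP_KERNEL to GFP_NOFS conversions in this and the following files: a GFP_KERNEL allocation may recurse into filesystem writeback to reclaim memory, and in ocfs2 that writeback can itself require DLM operations, deadlocking against an allocation made while on a lock path. GFP_NOFS tells reclaim to stay out of filesystem code. A minimal sketch of the pattern (hypothetical helper; dlm_lockstatus is the lksb type from dlmapi.h):

	/* Hypothetical helper on a lock-acquisition path.  GFP_NOFS matters
	 * here: memory reclaim triggered by this allocation must not
	 * re-enter the filesystem and block on DLM locks the caller may
	 * already hold. */
	static struct dlm_lockstatus *dlm_alloc_lksb_nofs(void)
	{
		return kzalloc(sizeof(struct dlm_lockstatus), GFP_NOFS);
	}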
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 55cda25ae11b..d6f89577e25f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 				      struct dlm_lock *lock, int flags)
 {
 	enum dlm_status status = DLM_DENIED;
+	int lockres_changed = 1;
 
 	mlog_entry("type=%d\n", lock->ml.type);
 	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
 	lock->lock_pending = 0;
 	if (status != DLM_NORMAL) {
-		if (status != DLM_NOTQUEUED)
+		if (status == DLM_RECOVERING &&
+		    dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len)) {
+			/* recovery lock was mastered by dead node.
+			 * we need to have calc_usage shoot down this
+			 * lockres and completely remaster it. */
+			mlog(0, "%s: recovery lock was owned by "
+			     "dead node %u, remaster it now.\n",
+			     dlm->name, res->owner);
+		} else if (status != DLM_NOTQUEUED) {
+			/*
+			 * DO NOT call calc_usage, as this would unhash
+			 * the remote lockres before we ever get to use
+			 * it.  treat as if we never made any change to
+			 * the lockres.
+			 */
+			lockres_changed = 0;
 			dlm_error(status);
+		}
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
 	} else if (dlm_is_recovery_lock(res->lockname.name,
@@ -243,7 +261,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	}
 	spin_unlock(&res->spinlock);
 
-	dlm_lockres_calc_usage(dlm, res);
+	if (lockres_changed)
+		dlm_lockres_calc_usage(dlm, res);
 
 	wake_up(&res->wq);
 	return status;
@@ -280,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 	if (tmpret >= 0) {
 		// successfully sent and received
 		ret = status;  // this is already a dlm_status
+		if (ret == DLM_REJECTED) {
+			mlog(ML_ERROR, "%s:%.*s: BUG.  this is a stale lockres "
+			     "no longer owned by %u.  that node is coming back "
+			     "up currently.\n", dlm->name, create.namelen,
+			     create.name, res->owner);
+			dlm_print_one_lock_resource(res);
+			BUG();
+		}
 	} else {
 		mlog_errno(tmpret);
 		if (dlm_is_host_down(tmpret)) {
@@ -381,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;
 
-	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+	lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
 	if (!lock)
 		return NULL;
 
 	if (!lksb) {
 		/* zero memory only if kernel-allocated */
-		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+		lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
 		if (!lksb) {
 			kfree(lock);
 			return NULL;
@@ -428,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return DLM_REJECTED;
 
-	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
-			"Domain %s not fully joined!\n", dlm->name);
-
 	name = create->name;
 	namelen = create->namelen;
+	status = DLM_REJECTED;
+	if (!dlm_domain_fully_joined(dlm)) {
+		mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
+		     "sending a create_lock message for lock %.*s!\n",
+		     dlm->name, create->node_idx, namelen, name);
+		dlm_error(status);
+		goto leave;
+	}
 
 	status = DLM_IVBUFLEN;
 	if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -668,18 +700,22 @@ retry_lock:
 			msleep(100);
 			/* no waiting for dlm_reco_thread */
 			if (recovery) {
-				if (status == DLM_RECOVERING) {
-					mlog(0, "%s: got RECOVERING "
-					     "for $REOCVERY lock, master "
-					     "was %u\n", dlm->name,
-					     res->owner);
-					dlm_wait_for_node_death(dlm, res->owner,
-							DLM_NODE_DEATH_WAIT_MAX);
-				}
+				if (status != DLM_RECOVERING)
+					goto retry_lock;
+
+				mlog(0, "%s: got RECOVERING "
+				     "for $RECOVERY lock, master "
+				     "was %u\n", dlm->name,
+				     res->owner);
+				/* wait to see the node go down, then
+				 * drop down and allow the lockres to
+				 * get cleaned up. need to remaster. */
+				dlm_wait_for_node_death(dlm, res->owner,
+							DLM_NODE_DEATH_WAIT_MAX);
 			} else {
 				dlm_wait_for_recovery(dlm);
+				goto retry_lock;
 			}
-			goto retry_lock;
 		}
 
 		if (status != DLM_NORMAL) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c13b1f..1b8346dd0572 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -47,7 +47,6 @@ | |||
47 | 47 | ||
48 | #include "dlmapi.h" | 48 | #include "dlmapi.h" |
49 | #include "dlmcommon.h" | 49 | #include "dlmcommon.h" |
50 | #include "dlmdebug.h" | ||
51 | #include "dlmdomain.h" | 50 | #include "dlmdomain.h" |
52 | 51 | ||
53 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) | 52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) |
@@ -74,6 +73,7 @@ struct dlm_master_list_entry | |||
74 | wait_queue_head_t wq; | 73 | wait_queue_head_t wq; |
75 | atomic_t woken; | 74 | atomic_t woken; |
76 | struct kref mle_refs; | 75 | struct kref mle_refs; |
76 | int inuse; | ||
77 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 77 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
78 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 78 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
79 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 79 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
@@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | |||
127 | return 1; | 127 | return 1; |
128 | } | 128 | } |
129 | 129 | ||
130 | #if 0 | 130 | #define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) |
131 | /* Code here is included but defined out as it aids debugging */ | 131 | static void _dlm_print_nodemap(unsigned long *map, const char *mapname) |
132 | { | ||
133 | int i; | ||
134 | printk("%s=[ ", mapname); | ||
135 | for (i=0; i<O2NM_MAX_NODES; i++) | ||
136 | if (test_bit(i, map)) | ||
137 | printk("%d ", i); | ||
138 | printk("]"); | ||
139 | } | ||
132 | 140 | ||
133 | void dlm_print_one_mle(struct dlm_master_list_entry *mle) | 141 | static void dlm_print_one_mle(struct dlm_master_list_entry *mle) |
134 | { | 142 | { |
135 | int i = 0, refs; | 143 | int refs; |
136 | char *type; | 144 | char *type; |
137 | char attached; | 145 | char attached; |
138 | u8 master; | 146 | u8 master; |
139 | unsigned int namelen; | 147 | unsigned int namelen; |
140 | const char *name; | 148 | const char *name; |
141 | struct kref *k; | 149 | struct kref *k; |
150 | unsigned long *maybe = mle->maybe_map, | ||
151 | *vote = mle->vote_map, | ||
152 | *resp = mle->response_map, | ||
153 | *node = mle->node_map; | ||
142 | 154 | ||
143 | k = &mle->mle_refs; | 155 | k = &mle->mle_refs; |
144 | if (mle->type == DLM_MLE_BLOCK) | 156 | if (mle->type == DLM_MLE_BLOCK) |
@@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) | |||
159 | name = mle->u.res->lockname.name; | 171 | name = mle->u.res->lockname.name; |
160 | } | 172 | } |
161 | 173 | ||
162 | mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", | 174 | mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", |
163 | i, type, refs, master, mle->new_master, attached, | 175 | namelen, name, type, refs, master, mle->new_master, attached, |
164 | namelen, namelen, name); | 176 | mle->inuse); |
177 | dlm_print_nodemap(maybe); | ||
178 | printk(", "); | ||
179 | dlm_print_nodemap(vote); | ||
180 | printk(", "); | ||
181 | dlm_print_nodemap(resp); | ||
182 | printk(", "); | ||
183 | dlm_print_nodemap(node); | ||
184 | printk(", "); | ||
185 | printk("\n"); | ||
165 | } | 186 | } |
166 | 187 | ||
188 | #if 0 | ||
189 | /* Code here is included but defined out as it aids debugging */ | ||
190 | |||
167 | static void dlm_dump_mles(struct dlm_ctxt *dlm) | 191 | static void dlm_dump_mles(struct dlm_ctxt *dlm) |
168 | { | 192 | { |
169 | struct dlm_master_list_entry *mle; | 193 | struct dlm_master_list_entry *mle; |
170 | struct list_head *iter; | 194 | struct list_head *iter; |
171 | 195 | ||
172 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); | 196 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); |
173 | mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); | ||
174 | spin_lock(&dlm->master_lock); | 197 | spin_lock(&dlm->master_lock); |
175 | list_for_each(iter, &dlm->master_list) { | 198 | list_for_each(iter, &dlm->master_list) { |
176 | mle = list_entry(iter, struct dlm_master_list_entry, list); | 199 | mle = list_entry(iter, struct dlm_master_list_entry, list); |
@@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, | |||
314 | spin_unlock(&dlm->spinlock); | 337 | spin_unlock(&dlm->spinlock); |
315 | } | 338 | } |
316 | 339 | ||
340 | static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) | ||
341 | { | ||
342 | struct dlm_ctxt *dlm; | ||
343 | dlm = mle->dlm; | ||
344 | |||
345 | assert_spin_locked(&dlm->spinlock); | ||
346 | assert_spin_locked(&dlm->master_lock); | ||
347 | mle->inuse++; | ||
348 | kref_get(&mle->mle_refs); | ||
349 | } | ||
350 | |||
351 | static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) | ||
352 | { | ||
353 | struct dlm_ctxt *dlm; | ||
354 | dlm = mle->dlm; | ||
355 | |||
356 | spin_lock(&dlm->spinlock); | ||
357 | spin_lock(&dlm->master_lock); | ||
358 | mle->inuse--; | ||
359 | __dlm_put_mle(mle); | ||
360 | spin_unlock(&dlm->master_lock); | ||
361 | spin_unlock(&dlm->spinlock); | ||
362 | |||
363 | } | ||
364 | |||
317 | /* remove from list and free */ | 365 | /* remove from list and free */ |
318 | static void __dlm_put_mle(struct dlm_master_list_entry *mle) | 366 | static void __dlm_put_mle(struct dlm_master_list_entry *mle) |
319 | { | 367 | { |
@@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) | |||
322 | 370 | ||
323 | assert_spin_locked(&dlm->spinlock); | 371 | assert_spin_locked(&dlm->spinlock); |
324 | assert_spin_locked(&dlm->master_lock); | 372 | assert_spin_locked(&dlm->master_lock); |
325 | BUG_ON(!atomic_read(&mle->mle_refs.refcount)); | 373 | if (!atomic_read(&mle->mle_refs.refcount)) { |
326 | 374 | /* this may or may not crash, but who cares. | |
327 | kref_put(&mle->mle_refs, dlm_mle_release); | 375 | * it's a BUG. */ |
376 | mlog(ML_ERROR, "bad mle: %p\n", mle); | ||
377 | dlm_print_one_mle(mle); | ||
378 | BUG(); | ||
379 | } else | ||
380 | kref_put(&mle->mle_refs, dlm_mle_release); | ||
328 | } | 381 | } |
329 | 382 | ||
330 | 383 | ||
@@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, | |||
367 | memset(mle->response_map, 0, sizeof(mle->response_map)); | 420 | memset(mle->response_map, 0, sizeof(mle->response_map)); |
368 | mle->master = O2NM_MAX_NODES; | 421 | mle->master = O2NM_MAX_NODES; |
369 | mle->new_master = O2NM_MAX_NODES; | 422 | mle->new_master = O2NM_MAX_NODES; |
423 | mle->inuse = 0; | ||
370 | 424 | ||
371 | if (mle->type == DLM_MLE_MASTER) { | 425 | if (mle->type == DLM_MLE_MASTER) { |
372 | BUG_ON(!res); | 426 | BUG_ON(!res); |
@@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref) | |||
564 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 618 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
565 | res->lockname.name); | 619 | res->lockname.name); |
566 | 620 | ||
621 | if (!hlist_unhashed(&res->hash_node) || | ||
622 | !list_empty(&res->granted) || | ||
623 | !list_empty(&res->converting) || | ||
624 | !list_empty(&res->blocked) || | ||
625 | !list_empty(&res->dirty) || | ||
626 | !list_empty(&res->recovering) || | ||
627 | !list_empty(&res->purge)) { | ||
628 | mlog(ML_ERROR, | ||
629 | "Going to BUG for resource %.*s." | ||
630 | " We're on a list! [%c%c%c%c%c%c%c]\n", | ||
631 | res->lockname.len, res->lockname.name, | ||
632 | !hlist_unhashed(&res->hash_node) ? 'H' : ' ', | ||
633 | !list_empty(&res->granted) ? 'G' : ' ', | ||
634 | !list_empty(&res->converting) ? 'C' : ' ', | ||
635 | !list_empty(&res->blocked) ? 'B' : ' ', | ||
636 | !list_empty(&res->dirty) ? 'D' : ' ', | ||
637 | !list_empty(&res->recovering) ? 'R' : ' ', | ||
638 | !list_empty(&res->purge) ? 'P' : ' '); | ||
639 | |||
640 | dlm_print_one_lock_resource(res); | ||
641 | } | ||
642 | |||
567 | /* By the time we're ready to blow this guy away, we shouldn't | 643 | /* By the time we're ready to blow this guy away, we shouldn't |
568 | * be on any lists. */ | 644 | * be on any lists. */ |
569 | BUG_ON(!hlist_unhashed(&res->hash_node)); | 645 | BUG_ON(!hlist_unhashed(&res->hash_node)); |
@@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref) | |||
579 | kfree(res); | 655 | kfree(res); |
580 | } | 656 | } |
581 | 657 | ||
582 | void dlm_lockres_get(struct dlm_lock_resource *res) | ||
583 | { | ||
584 | kref_get(&res->refs); | ||
585 | } | ||
586 | |||
587 | void dlm_lockres_put(struct dlm_lock_resource *res) | 658 | void dlm_lockres_put(struct dlm_lock_resource *res) |
588 | { | 659 | { |
589 | kref_put(&res->refs, dlm_lockres_release); | 660 | kref_put(&res->refs, dlm_lockres_release); |
@@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
603 | memcpy(qname, name, namelen); | 674 | memcpy(qname, name, namelen); |
604 | 675 | ||
605 | res->lockname.len = namelen; | 676 | res->lockname.len = namelen; |
606 | res->lockname.hash = full_name_hash(name, namelen); | 677 | res->lockname.hash = dlm_lockid_hash(name, namelen); |
607 | 678 | ||
608 | init_waitqueue_head(&res->wq); | 679 | init_waitqueue_head(&res->wq); |
609 | spin_lock_init(&res->spinlock); | 680 | spin_lock_init(&res->spinlock); |
@@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
637 | { | 708 | { |
638 | struct dlm_lock_resource *res; | 709 | struct dlm_lock_resource *res; |
639 | 710 | ||
640 | res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); | 711 | res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); |
641 | if (!res) | 712 | if (!res) |
642 | return NULL; | 713 | return NULL; |
643 | 714 | ||
644 | res->lockname.name = kmalloc(namelen, GFP_KERNEL); | 715 | res->lockname.name = kmalloc(namelen, GFP_NOFS); |
645 | if (!res->lockname.name) { | 716 | if (!res->lockname.name) { |
646 | kfree(res); | 717 | kfree(res); |
647 | return NULL; | 718 | return NULL; |
@@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
677 | int blocked = 0; | 748 | int blocked = 0; |
678 | int ret, nodenum; | 749 | int ret, nodenum; |
679 | struct dlm_node_iter iter; | 750 | struct dlm_node_iter iter; |
680 | unsigned int namelen; | 751 | unsigned int namelen, hash; |
681 | int tries = 0; | 752 | int tries = 0; |
682 | int bit, wait_on_recovery = 0; | 753 | int bit, wait_on_recovery = 0; |
683 | 754 | ||
684 | BUG_ON(!lockid); | 755 | BUG_ON(!lockid); |
685 | 756 | ||
686 | namelen = strlen(lockid); | 757 | namelen = strlen(lockid); |
758 | hash = dlm_lockid_hash(lockid, namelen); | ||
687 | 759 | ||
688 | mlog(0, "get lockres %s (len %d)\n", lockid, namelen); | 760 | mlog(0, "get lockres %s (len %d)\n", lockid, namelen); |
689 | 761 | ||
690 | lookup: | 762 | lookup: |
691 | spin_lock(&dlm->spinlock); | 763 | spin_lock(&dlm->spinlock); |
692 | tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); | 764 | tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); |
693 | if (tmpres) { | 765 | if (tmpres) { |
694 | spin_unlock(&dlm->spinlock); | 766 | spin_unlock(&dlm->spinlock); |
695 | mlog(0, "found in hash!\n"); | 767 | mlog(0, "found in hash!\n"); |
@@ -704,7 +776,7 @@ lookup: | |||
704 | mlog(0, "allocating a new resource\n"); | 776 | mlog(0, "allocating a new resource\n"); |
705 | /* nothing found and we need to allocate one. */ | 777 | /* nothing found and we need to allocate one. */ |
706 | alloc_mle = (struct dlm_master_list_entry *) | 778 | alloc_mle = (struct dlm_master_list_entry *) |
707 | kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); | 779 | kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
708 | if (!alloc_mle) | 780 | if (!alloc_mle) |
709 | goto leave; | 781 | goto leave; |
710 | res = dlm_new_lockres(dlm, lockid, namelen); | 782 | res = dlm_new_lockres(dlm, lockid, namelen); |
@@ -790,10 +862,11 @@ lookup: | |||
790 | * if so, the creator of the BLOCK may try to put the last | 862 | * if so, the creator of the BLOCK may try to put the last |
791 | * ref at this time in the assert master handler, so we | 863 | * ref at this time in the assert master handler, so we |
792 | * need an extra one to keep from a bad ptr deref. */ | 864 | * need an extra one to keep from a bad ptr deref. */ |
793 | dlm_get_mle(mle); | 865 | dlm_get_mle_inuse(mle); |
794 | spin_unlock(&dlm->master_lock); | 866 | spin_unlock(&dlm->master_lock); |
795 | spin_unlock(&dlm->spinlock); | 867 | spin_unlock(&dlm->spinlock); |
796 | 868 | ||
869 | redo_request: | ||
797 | while (wait_on_recovery) { | 870 | while (wait_on_recovery) { |
798 | /* any cluster changes that occurred after dropping the | 871 | /* any cluster changes that occurred after dropping the |
799 | * dlm spinlock would be detectable be a change on the mle, | 872 | * dlm spinlock would be detectable be a change on the mle, |
@@ -812,7 +885,7 @@ lookup: | |||
812 | } | 885 | } |
813 | 886 | ||
814 | dlm_kick_recovery_thread(dlm); | 887 | dlm_kick_recovery_thread(dlm); |
815 | msleep(100); | 888 | msleep(1000); |
816 | dlm_wait_for_recovery(dlm); | 889 | dlm_wait_for_recovery(dlm); |
817 | 890 | ||
818 | spin_lock(&dlm->spinlock); | 891 | spin_lock(&dlm->spinlock); |
@@ -825,13 +898,15 @@ lookup: | |||
825 | } else | 898 | } else |
826 | wait_on_recovery = 0; | 899 | wait_on_recovery = 0; |
827 | spin_unlock(&dlm->spinlock); | 900 | spin_unlock(&dlm->spinlock); |
901 | |||
902 | if (wait_on_recovery) | ||
903 | dlm_wait_for_node_recovery(dlm, bit, 10000); | ||
828 | } | 904 | } |
829 | 905 | ||
830 | /* must wait for lock to be mastered elsewhere */ | 906 | /* must wait for lock to be mastered elsewhere */ |
831 | if (blocked) | 907 | if (blocked) |
832 | goto wait; | 908 | goto wait; |
833 | 909 | ||
834 | redo_request: | ||
835 | ret = -EINVAL; | 910 | ret = -EINVAL; |
836 | dlm_node_iter_init(mle->vote_map, &iter); | 911 | dlm_node_iter_init(mle->vote_map, &iter); |
837 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 912 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
@@ -856,6 +931,7 @@ wait: | |||
856 | /* keep going until the response map includes all nodes */ | 931 | /* keep going until the response map includes all nodes */ |
857 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); | 932 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); |
858 | if (ret < 0) { | 933 | if (ret < 0) { |
934 | wait_on_recovery = 1; | ||
859 | mlog(0, "%s:%.*s: node map changed, redo the " | 935 | mlog(0, "%s:%.*s: node map changed, redo the " |
860 | "master request now, blocked=%d\n", | 936 | "master request now, blocked=%d\n", |
861 | dlm->name, res->lockname.len, | 937 | dlm->name, res->lockname.len, |
@@ -866,7 +942,7 @@ wait: | |||
866 | dlm->name, res->lockname.len, | 942 | dlm->name, res->lockname.len, |
867 | res->lockname.name, blocked); | 943 | res->lockname.name, blocked); |
868 | dlm_print_one_lock_resource(res); | 944 | dlm_print_one_lock_resource(res); |
869 | /* dlm_print_one_mle(mle); */ | 945 | dlm_print_one_mle(mle); |
870 | tries = 0; | 946 | tries = 0; |
871 | } | 947 | } |
872 | goto redo_request; | 948 | goto redo_request; |
@@ -880,7 +956,7 @@ wait: | |||
880 | dlm_mle_detach_hb_events(dlm, mle); | 956 | dlm_mle_detach_hb_events(dlm, mle); |
881 | dlm_put_mle(mle); | 957 | dlm_put_mle(mle); |
882 | /* put the extra ref */ | 958 | /* put the extra ref */ |
883 | dlm_put_mle(mle); | 959 | dlm_put_mle_inuse(mle); |
884 | 960 | ||
885 | wake_waiters: | 961 | wake_waiters: |
886 | spin_lock(&res->spinlock); | 962 | spin_lock(&res->spinlock); |
@@ -921,12 +997,14 @@ recheck: | |||
921 | spin_unlock(&res->spinlock); | 997 | spin_unlock(&res->spinlock); |
922 | /* this will cause the master to re-assert across | 998 | /* this will cause the master to re-assert across |
923 | * the whole cluster, freeing up mles */ | 999 | * the whole cluster, freeing up mles */ |
924 | ret = dlm_do_master_request(mle, res->owner); | 1000 | if (res->owner != dlm->node_num) { |
925 | if (ret < 0) { | 1001 | ret = dlm_do_master_request(mle, res->owner); |
926 | /* give recovery a chance to run */ | 1002 | if (ret < 0) { |
927 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); | 1003 | /* give recovery a chance to run */ |
928 | msleep(500); | 1004 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); |
929 | goto recheck; | 1005 | msleep(500); |
1006 | goto recheck; | ||
1007 | } | ||
930 | } | 1008 | } |
931 | ret = 0; | 1009 | ret = 0; |
932 | goto leave; | 1010 | goto leave; |
@@ -962,6 +1040,12 @@ recheck: | |||
962 | "rechecking now\n", dlm->name, res->lockname.len, | 1040 | "rechecking now\n", dlm->name, res->lockname.len, |
963 | res->lockname.name); | 1041 | res->lockname.name); |
964 | goto recheck; | 1042 | goto recheck; |
1043 | } else { | ||
1044 | if (!voting_done) { | ||
1045 | mlog(0, "map not changed and voting not done " | ||
1046 | "for %s:%.*s\n", dlm->name, res->lockname.len, | ||
1047 | res->lockname.name); | ||
1048 | } | ||
965 | } | 1049 | } |
966 | 1050 | ||
967 | if (m != O2NM_MAX_NODES) { | 1051 | if (m != O2NM_MAX_NODES) { |
@@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | |||
1129 | set_bit(node, mle->vote_map); | 1213 | set_bit(node, mle->vote_map); |
1130 | } else { | 1214 | } else { |
1131 | mlog(ML_ERROR, "node down! %d\n", node); | 1215 | mlog(ML_ERROR, "node down! %d\n", node); |
1132 | |||
1133 | /* if the node wasn't involved in mastery skip it, | ||
1134 | * but clear it out from the maps so that it will | ||
1135 | * not affect mastery of this lockres */ | ||
1136 | clear_bit(node, mle->response_map); | ||
1137 | clear_bit(node, mle->vote_map); | ||
1138 | if (!test_bit(node, mle->maybe_map)) | ||
1139 | goto next; | ||
1140 | |||
1141 | /* if we're already blocked on lock mastery, and the | ||
1142 | * dead node wasn't the expected master, or there is | ||
1143 | * another node in the maybe_map, keep waiting */ | ||
1144 | if (blocked) { | 1216 | if (blocked) { |
1145 | int lowest = find_next_bit(mle->maybe_map, | 1217 | int lowest = find_next_bit(mle->maybe_map, |
1146 | O2NM_MAX_NODES, 0); | 1218 | O2NM_MAX_NODES, 0); |
@@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | |||
1148 | /* act like it was never there */ | 1220 | /* act like it was never there */ |
1149 | clear_bit(node, mle->maybe_map); | 1221 | clear_bit(node, mle->maybe_map); |
1150 | 1222 | ||
1151 | if (node != lowest) | 1223 | if (node == lowest) { |
1152 | goto next; | 1224 | mlog(0, "expected master %u died" |
1153 | 1225 | " while this node was blocked " | |
1154 | mlog(ML_ERROR, "expected master %u died while " | 1226 | "waiting on it!\n", node); |
1155 | "this node was blocked waiting on it!\n", | 1227 | lowest = find_next_bit(mle->maybe_map, |
1156 | node); | 1228 | O2NM_MAX_NODES, |
1157 | lowest = find_next_bit(mle->maybe_map, | 1229 | lowest+1); |
1158 | O2NM_MAX_NODES, | 1230 | if (lowest < O2NM_MAX_NODES) { |
1159 | lowest+1); | 1231 | mlog(0, "%s:%.*s:still " |
1160 | if (lowest < O2NM_MAX_NODES) { | 1232 | "blocked. waiting on %u " |
1161 | mlog(0, "still blocked. waiting " | 1233 | "now\n", dlm->name, |
1162 | "on %u now\n", lowest); | 1234 | res->lockname.len, |
1163 | goto next; | 1235 | res->lockname.name, |
1236 | lowest); | ||
1237 | } else { | ||
1238 | /* mle is an MLE_BLOCK, but | ||
1239 | * there is now nothing left to | ||
1240 | * block on. we need to return | ||
1241 | * all the way back out and try | ||
1242 | * again with an MLE_MASTER. | ||
1243 | * dlm_do_local_recovery_cleanup | ||
1244 | * has already run, so the mle | ||
1245 | * refcount is ok */ | ||
1246 | mlog(0, "%s:%.*s: no " | ||
1247 | "longer blocking. try to " | ||
1248 | "master this here\n", | ||
1249 | dlm->name, | ||
1250 | res->lockname.len, | ||
1251 | res->lockname.name); | ||
1252 | mle->type = DLM_MLE_MASTER; | ||
1253 | mle->u.res = res; | ||
1254 | } | ||
1164 | } | 1255 | } |
1165 | |||
1166 | /* mle is an MLE_BLOCK, but there is now | ||
1167 | * nothing left to block on. we need to return | ||
1168 | * all the way back out and try again with | ||
1169 | * an MLE_MASTER. dlm_do_local_recovery_cleanup | ||
1170 | * has already run, so the mle refcount is ok */ | ||
1171 | mlog(0, "no longer blocking. we can " | ||
1172 | "try to master this here\n"); | ||
1173 | mle->type = DLM_MLE_MASTER; | ||
1174 | memset(mle->maybe_map, 0, | ||
1175 | sizeof(mle->maybe_map)); | ||
1176 | memset(mle->response_map, 0, | ||
1177 | sizeof(mle->maybe_map)); | ||
1178 | memcpy(mle->vote_map, mle->node_map, | ||
1179 | sizeof(mle->node_map)); | ||
1180 | mle->u.res = res; | ||
1181 | set_bit(dlm->node_num, mle->maybe_map); | ||
1182 | |||
1183 | ret = -EAGAIN; | ||
1184 | goto next; | ||
1185 | } | 1256 | } |
1186 | 1257 | ||
1187 | clear_bit(node, mle->maybe_map); | 1258 | /* now blank out everything, as if we had never |
1188 | if (node > dlm->node_num) | 1259 | * contacted anyone */ |
1189 | goto next; | 1260 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); |
1190 | 1261 | memset(mle->response_map, 0, sizeof(mle->response_map)); | |
1191 | mlog(0, "dead node in map!\n"); | 1262 | /* reset the vote_map to the current node_map */ |
1192 | /* yuck. go back and re-contact all nodes | 1263 | memcpy(mle->vote_map, mle->node_map, |
1193 | * in the vote_map, removing this node. */ | 1264 | sizeof(mle->node_map)); |
1194 | memset(mle->response_map, 0, | 1265 | /* put myself into the maybe map */ |
1195 | sizeof(mle->response_map)); | 1266 | if (mle->type != DLM_MLE_BLOCK) |
1267 | set_bit(dlm->node_num, mle->maybe_map); | ||
1196 | } | 1268 | } |
1197 | ret = -EAGAIN; | 1269 | ret = -EAGAIN; |
1198 | next: | ||
1199 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); | 1270 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); |
1200 | } | 1271 | } |
1201 | return ret; | 1272 | return ret; |
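
The restart path above blanks every negotiation bitmap and returns -EAGAIN so mastery begins again from scratch. A minimal userspace sketch of that reset, with illustrative stand-ins for the kernel's mle fields and bitmap helpers:

    #include <limits.h>
    #include <string.h>

    #define MAX_NODES     255
    #define BITS_PER_WORD (sizeof(unsigned long) * CHAR_BIT)
    #define MAP_WORDS     ((MAX_NODES + BITS_PER_WORD - 1) / BITS_PER_WORD)

    struct mle_model {
            unsigned long maybe_map[MAP_WORDS];    /* candidate masters */
            unsigned long response_map[MAP_WORDS]; /* nodes that answered */
            unsigned long vote_map[MAP_WORDS];     /* nodes to re-contact */
            unsigned long node_map[MAP_WORDS];     /* currently live nodes */
            int is_block;                          /* DLM_MLE_BLOCK analogue */
    };

    static void set_node_bit(unsigned long *map, int nr)
    {
            map[nr / BITS_PER_WORD] |= 1UL << (nr % BITS_PER_WORD);
    }

    /* Blank everything as if no node had ever been contacted, rebuild the
     * vote map from the live-node map, and (for a non-BLOCK mle) re-add
     * this node as a mastery candidate -- mirroring the -EAGAIN path. */
    static void restart_mastery(struct mle_model *mle, int this_node)
    {
            memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
            memset(mle->response_map, 0, sizeof(mle->response_map));
            memcpy(mle->vote_map, mle->node_map, sizeof(mle->node_map));
            if (!mle->is_block)
                    set_node_bit(mle->maybe_map, this_node);
    }
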
@@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1316 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; | 1387 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; |
1317 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; | 1388 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; |
1318 | char *name; | 1389 | char *name; |
1319 | unsigned int namelen; | 1390 | unsigned int namelen, hash; |
1320 | int found, ret; | 1391 | int found, ret; |
1321 | int set_maybe; | 1392 | int set_maybe; |
1322 | int dispatch_assert = 0; | 1393 | int dispatch_assert = 0; |
@@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1331 | 1402 | ||
1332 | name = request->name; | 1403 | name = request->name; |
1333 | namelen = request->namelen; | 1404 | namelen = request->namelen; |
1405 | hash = dlm_lockid_hash(name, namelen); | ||
1334 | 1406 | ||
1335 | if (namelen > DLM_LOCKID_NAME_MAX) { | 1407 | if (namelen > DLM_LOCKID_NAME_MAX) { |
1336 | response = DLM_IVBUFLEN; | 1408 | response = DLM_IVBUFLEN; |
@@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1339 | 1411 | ||
1340 | way_up_top: | 1412 | way_up_top: |
1341 | spin_lock(&dlm->spinlock); | 1413 | spin_lock(&dlm->spinlock); |
1342 | res = __dlm_lookup_lockres(dlm, name, namelen); | 1414 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
1343 | if (res) { | 1415 | if (res) { |
1344 | spin_unlock(&dlm->spinlock); | 1416 | spin_unlock(&dlm->spinlock); |
1345 | 1417 | ||
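
dlm_master_request_handler() now hashes the lock name once and hands the value to every lookup, so the way_up_top retry does not rehash. A toy model of the pattern, using FNV-1a purely for illustration (the kernel's dlm_lockid_hash() wraps full_name_hash()):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Toy FNV-1a; illustrative only. */
    static uint32_t lockid_hash(const char *name, size_t len)
    {
            uint32_t h = 2166136261u;
            while (len--) {
                    h ^= (unsigned char)*name++;
                    h *= 16777619u;
            }
            return h;
    }

    struct lockres_model {
            const char *name;
            size_t len;
            struct lockres_model *next;
    };

    static struct lockres_model *lookup(struct lockres_model **bucket,
                                        unsigned int nbuckets,
                                        const char *name, size_t len,
                                        uint32_t hash)
    {
            struct lockres_model *res;

            /* the caller hashes once; re-lookups after dropping the
             * lock (the way_up_top retry) reuse the same value */
            for (res = bucket[hash % nbuckets]; res; res = res->next)
                    if (res->len == len && !memcmp(res->name, name, len))
                            return res;
            return NULL;
    }
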
@@ -1459,21 +1531,18 @@ way_up_top: | |||
1459 | spin_unlock(&dlm->spinlock); | 1531 | spin_unlock(&dlm->spinlock); |
1460 | 1532 | ||
1461 | mle = (struct dlm_master_list_entry *) | 1533 | mle = (struct dlm_master_list_entry *) |
1462 | kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); | 1534 | kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
1463 | if (!mle) { | 1535 | if (!mle) { |
1464 | response = DLM_MASTER_RESP_ERROR; | 1536 | response = DLM_MASTER_RESP_ERROR; |
1465 | mlog_errno(-ENOMEM); | 1537 | mlog_errno(-ENOMEM); |
1466 | goto send_response; | 1538 | goto send_response; |
1467 | } | 1539 | } |
1468 | spin_lock(&dlm->spinlock); | ||
1469 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, | ||
1470 | name, namelen); | ||
1471 | spin_unlock(&dlm->spinlock); | ||
1472 | goto way_up_top; | 1540 | goto way_up_top; |
1473 | } | 1541 | } |
1474 | 1542 | ||
1475 | // mlog(0, "this is second time thru, already allocated, " | 1543 | // mlog(0, "this is second time thru, already allocated, " |
1476 | // "add the block.\n"); | 1544 | // "add the block.\n"); |
1545 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); | ||
1477 | set_bit(request->node_idx, mle->maybe_map); | 1546 | set_bit(request->node_idx, mle->maybe_map); |
1478 | list_add(&mle->list, &dlm->master_list); | 1547 | list_add(&mle->list, &dlm->master_list); |
1479 | response = DLM_MASTER_RESP_NO; | 1548 | response = DLM_MASTER_RESP_NO; |
@@ -1556,6 +1625,8 @@ again: | |||
1556 | dlm_node_iter_init(nodemap, &iter); | 1625 | dlm_node_iter_init(nodemap, &iter); |
1557 | while ((to = dlm_node_iter_next(&iter)) >= 0) { | 1626 | while ((to = dlm_node_iter_next(&iter)) >= 0) { |
1558 | int r = 0; | 1627 | int r = 0; |
1628 | struct dlm_master_list_entry *mle = NULL; | ||
1629 | |||
1559 | mlog(0, "sending assert master to %d (%.*s)\n", to, | 1630 | mlog(0, "sending assert master to %d (%.*s)\n", to, |
1560 | namelen, lockname); | 1631 | namelen, lockname); |
1561 | memset(&assert, 0, sizeof(assert)); | 1632 | memset(&assert, 0, sizeof(assert)); |
@@ -1567,20 +1638,28 @@ again: | |||
1567 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, | 1638 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, |
1568 | &assert, sizeof(assert), to, &r); | 1639 | &assert, sizeof(assert), to, &r); |
1569 | if (tmpret < 0) { | 1640 | if (tmpret < 0) { |
1570 | mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); | 1641 | mlog(0, "assert_master returned %d!\n", tmpret); |
1571 | if (!dlm_is_host_down(tmpret)) { | 1642 | if (!dlm_is_host_down(tmpret)) { |
1572 | mlog(ML_ERROR, "unhandled error!\n"); | 1643 | mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); |
1573 | BUG(); | 1644 | BUG(); |
1574 | } | 1645 | } |
1575 | /* a node died. finish out the rest of the nodes. */ | 1646 | /* a node died. finish out the rest of the nodes. */ |
1576 | mlog(ML_ERROR, "link to %d went down!\n", to); | 1647 | mlog(0, "link to %d went down!\n", to); |
1577 | /* any nonzero status return will do */ | 1648 | /* any nonzero status return will do */ |
1578 | ret = tmpret; | 1649 | ret = tmpret; |
1579 | } else if (r < 0) { | 1650 | } else if (r < 0) { |
1580 | /* ok, something horribly messed. kill thyself. */ | 1651 | /* ok, something horribly messed. kill thyself. */ |
1581 | mlog(ML_ERROR,"during assert master of %.*s to %u, " | 1652 | mlog(ML_ERROR,"during assert master of %.*s to %u, " |
1582 | "got %d.\n", namelen, lockname, to, r); | 1653 | "got %d.\n", namelen, lockname, to, r); |
1583 | dlm_dump_lock_resources(dlm); | 1654 | spin_lock(&dlm->spinlock); |
1655 | spin_lock(&dlm->master_lock); | ||
1656 | if (dlm_find_mle(dlm, &mle, (char *)lockname, | ||
1657 | namelen)) { | ||
1658 | dlm_print_one_mle(mle); | ||
1659 | __dlm_put_mle(mle); | ||
1660 | } | ||
1661 | spin_unlock(&dlm->master_lock); | ||
1662 | spin_unlock(&dlm->spinlock); | ||
1584 | BUG(); | 1663 | BUG(); |
1585 | } else if (r == EAGAIN) { | 1664 | } else if (r == EAGAIN) { |
1586 | mlog(0, "%.*s: node %u create mles on other " | 1665 | mlog(0, "%.*s: node %u create mles on other " |
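
The send loop above treats a host-down error as routine (finish the remaining nodes) and anything else as fatal. A compact sketch of that triage, with an illustrative errno whitelist rather than the kernel's exact dlm_is_host_down() list:

    #include <assert.h>
    #include <errno.h>

    static int host_is_down(int err)
    {
            switch (err) {
            case -EHOSTDOWN:
            case -EHOSTUNREACH:
            case -ETIMEDOUT:
            case -ECONNREFUSED:
            case -ECONNRESET:
            case -ENOTCONN:
                    return 1;
            }
            return 0;
    }

    /* Returns the (nonzero) status for a dead peer so the caller keeps
     * iterating over the remaining nodes; any other failure is a bug. */
    static int triage_send_status(int tmpret)
    {
            if (tmpret >= 0)
                    return 0;
            assert(host_is_down(tmpret)); /* the kernel BUG()s here */
            return tmpret;
    }
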
@@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1612 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; | 1691 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; |
1613 | struct dlm_lock_resource *res = NULL; | 1692 | struct dlm_lock_resource *res = NULL; |
1614 | char *name; | 1693 | char *name; |
1615 | unsigned int namelen; | 1694 | unsigned int namelen, hash; |
1616 | u32 flags; | 1695 | u32 flags; |
1617 | int master_request = 0; | 1696 | int master_request = 0; |
1618 | int ret = 0; | 1697 | int ret = 0; |
@@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1622 | 1701 | ||
1623 | name = assert->name; | 1702 | name = assert->name; |
1624 | namelen = assert->namelen; | 1703 | namelen = assert->namelen; |
1704 | hash = dlm_lockid_hash(name, namelen); | ||
1625 | flags = be32_to_cpu(assert->flags); | 1705 | flags = be32_to_cpu(assert->flags); |
1626 | 1706 | ||
1627 | if (namelen > DLM_LOCKID_NAME_MAX) { | 1707 | if (namelen > DLM_LOCKID_NAME_MAX) { |
@@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1646 | if (bit >= O2NM_MAX_NODES) { | 1726 | if (bit >= O2NM_MAX_NODES) { |
1647 | /* not necessarily an error, though less likely. | 1727 | /* not necessarily an error, though less likely. |
1648 | * could be master just re-asserting. */ | 1728 | * could be master just re-asserting. */ |
1649 | mlog(ML_ERROR, "no bits set in the maybe_map, but %u " | 1729 | mlog(0, "no bits set in the maybe_map, but %u " |
1650 | "is asserting! (%.*s)\n", assert->node_idx, | 1730 | "is asserting! (%.*s)\n", assert->node_idx, |
1651 | namelen, name); | 1731 | namelen, name); |
1652 | } else if (bit != assert->node_idx) { | 1732 | } else if (bit != assert->node_idx) { |
@@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1658 | * number winning the mastery will respond | 1738 | * number winning the mastery will respond |
1659 | * YES to mastery requests, but this node | 1739 | * YES to mastery requests, but this node |
1660 | * had no way of knowing. let it pass. */ | 1740 | * had no way of knowing. let it pass. */ |
1661 | mlog(ML_ERROR, "%u is the lowest node, " | 1741 | mlog(0, "%u is the lowest node, " |
1662 | "%u is asserting. (%.*s) %u must " | 1742 | "%u is asserting. (%.*s) %u must " |
1663 | "have begun after %u won.\n", bit, | 1743 | "have begun after %u won.\n", bit, |
1664 | assert->node_idx, namelen, name, bit, | 1744 | assert->node_idx, namelen, name, bit, |
1665 | assert->node_idx); | 1745 | assert->node_idx); |
1666 | } | 1746 | } |
1667 | } | 1747 | } |
1748 | if (mle->type == DLM_MLE_MIGRATION) { | ||
1749 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { | ||
1750 | mlog(0, "%s:%.*s: got cleanup assert" | ||
1751 | " from %u for migration\n", | ||
1752 | dlm->name, namelen, name, | ||
1753 | assert->node_idx); | ||
1754 | } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { | ||
1755 | mlog(0, "%s:%.*s: got unrelated assert" | ||
1756 | " from %u for migration, ignoring\n", | ||
1757 | dlm->name, namelen, name, | ||
1758 | assert->node_idx); | ||
1759 | __dlm_put_mle(mle); | ||
1760 | spin_unlock(&dlm->master_lock); | ||
1761 | spin_unlock(&dlm->spinlock); | ||
1762 | goto done; | ||
1763 | } | ||
1764 | } | ||
1668 | } | 1765 | } |
1669 | spin_unlock(&dlm->master_lock); | 1766 | spin_unlock(&dlm->master_lock); |
1670 | 1767 | ||
1671 | /* ok everything checks out with the MLE | 1768 | /* ok everything checks out with the MLE |
1672 | * now check to see if there is a lockres */ | 1769 | * now check to see if there is a lockres */ |
1673 | res = __dlm_lookup_lockres(dlm, name, namelen); | 1770 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
1674 | if (res) { | 1771 | if (res) { |
1675 | spin_lock(&res->spinlock); | 1772 | spin_lock(&res->spinlock); |
1676 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 1773 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
@@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1679 | goto kill; | 1776 | goto kill; |
1680 | } | 1777 | } |
1681 | if (!mle) { | 1778 | if (!mle) { |
1682 | if (res->owner != assert->node_idx) { | 1779 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && |
1780 | res->owner != assert->node_idx) { | ||
1683 | mlog(ML_ERROR, "assert_master from " | 1781 | mlog(ML_ERROR, "assert_master from " |
1684 | "%u, but current owner is " | 1782 | "%u, but current owner is " |
1685 | "%u! (%.*s)\n", | 1783 | "%u! (%.*s)\n", |
@@ -1732,6 +1830,7 @@ ok: | |||
1732 | if (mle) { | 1830 | if (mle) { |
1733 | int extra_ref = 0; | 1831 | int extra_ref = 0; |
1734 | int nn = -1; | 1832 | int nn = -1; |
1833 | int rr, err = 0; | ||
1735 | 1834 | ||
1736 | spin_lock(&mle->spinlock); | 1835 | spin_lock(&mle->spinlock); |
1737 | if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) | 1836 | if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) |
@@ -1751,27 +1850,64 @@ ok: | |||
1751 | wake_up(&mle->wq); | 1850 | wake_up(&mle->wq); |
1752 | spin_unlock(&mle->spinlock); | 1851 | spin_unlock(&mle->spinlock); |
1753 | 1852 | ||
1754 | if (mle->type == DLM_MLE_MIGRATION && res) { | 1853 | if (res) { |
1755 | mlog(0, "finishing off migration of lockres %.*s, " | ||
1756 | "from %u to %u\n", | ||
1757 | res->lockname.len, res->lockname.name, | ||
1758 | dlm->node_num, mle->new_master); | ||
1759 | spin_lock(&res->spinlock); | 1854 | spin_lock(&res->spinlock); |
1760 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 1855 | if (mle->type == DLM_MLE_MIGRATION) { |
1761 | dlm_change_lockres_owner(dlm, res, mle->new_master); | 1856 | mlog(0, "finishing off migration of lockres %.*s, " |
1762 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); | 1857 | "from %u to %u\n", |
1858 | res->lockname.len, res->lockname.name, | ||
1859 | dlm->node_num, mle->new_master); | ||
1860 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
1861 | dlm_change_lockres_owner(dlm, res, mle->new_master); | ||
1862 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); | ||
1863 | } else { | ||
1864 | dlm_change_lockres_owner(dlm, res, mle->master); | ||
1865 | } | ||
1763 | spin_unlock(&res->spinlock); | 1866 | spin_unlock(&res->spinlock); |
1764 | } | 1867 | } |
1765 | /* master is known, detach if not already detached */ | 1868 | |
1766 | dlm_mle_detach_hb_events(dlm, mle); | 1869 | /* master is known, detach if not already detached. |
1767 | dlm_put_mle(mle); | 1870 | * ensures that only one assert_master call will happen |
1768 | 1871 | * on this mle. */ | |
1872 | spin_lock(&dlm->spinlock); | ||
1873 | spin_lock(&dlm->master_lock); | ||
1874 | |||
1875 | rr = atomic_read(&mle->mle_refs.refcount); | ||
1876 | if (mle->inuse > 0) { | ||
1877 | if (extra_ref && rr < 3) | ||
1878 | err = 1; | ||
1879 | else if (!extra_ref && rr < 2) | ||
1880 | err = 1; | ||
1881 | } else { | ||
1882 | if (extra_ref && rr < 2) | ||
1883 | err = 1; | ||
1884 | else if (!extra_ref && rr < 1) | ||
1885 | err = 1; | ||
1886 | } | ||
1887 | if (err) { | ||
1888 | mlog(ML_ERROR, "%s:%.*s: got assert master from %u " | ||
1889 | "that will mess up this node, refs=%d, extra=%d, " | ||
1890 | "inuse=%d\n", dlm->name, namelen, name, | ||
1891 | assert->node_idx, rr, extra_ref, mle->inuse); | ||
1892 | dlm_print_one_mle(mle); | ||
1893 | } | ||
1894 | list_del_init(&mle->list); | ||
1895 | __dlm_mle_detach_hb_events(dlm, mle); | ||
1896 | __dlm_put_mle(mle); | ||
1769 | if (extra_ref) { | 1897 | if (extra_ref) { |
1770 | /* the assert master message now balances the extra | 1898 | /* the assert master message now balances the extra |
1771 | * ref given by the master / migration request message. | 1899 | * ref given by the master / migration request message. |
1772 | * if this is the last put, it will be removed | 1900 | * if this is the last put, it will be removed |
1773 | * from the list. */ | 1901 | * from the list. */ |
1774 | dlm_put_mle(mle); | 1902 | __dlm_put_mle(mle); |
1903 | } | ||
1904 | spin_unlock(&dlm->master_lock); | ||
1905 | spin_unlock(&dlm->spinlock); | ||
1906 | } else if (res) { | ||
1907 | if (res->owner != assert->node_idx) { | ||
1908 | mlog(0, "assert_master from %u, but current " | ||
1909 | "owner is %u (%.*s), no mle\n", assert->node_idx, | ||
1910 | res->owner, namelen, name); | ||
1775 | } | 1911 | } |
1776 | } | 1912 | } |
1777 | 1913 | ||
@@ -1788,12 +1924,12 @@ done: | |||
1788 | 1924 | ||
1789 | kill: | 1925 | kill: |
1790 | /* kill the caller! */ | 1926 | /* kill the caller! */ |
1927 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " | ||
1928 | "and killing the other node now! This node is OK and can continue.\n"); | ||
1929 | __dlm_print_one_lock_resource(res); | ||
1791 | spin_unlock(&res->spinlock); | 1930 | spin_unlock(&res->spinlock); |
1792 | spin_unlock(&dlm->spinlock); | 1931 | spin_unlock(&dlm->spinlock); |
1793 | dlm_lockres_put(res); | 1932 | dlm_lockres_put(res); |
1794 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " | ||
1795 | "and killing the other node now! This node is OK and can continue.\n"); | ||
1796 | dlm_dump_lock_resources(dlm); | ||
1797 | dlm_put(dlm); | 1933 | dlm_put(dlm); |
1798 | return -EINVAL; | 1934 | return -EINVAL; |
1799 | } | 1935 | } |
@@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | |||
1803 | int ignore_higher, u8 request_from, u32 flags) | 1939 | int ignore_higher, u8 request_from, u32 flags) |
1804 | { | 1940 | { |
1805 | struct dlm_work_item *item; | 1941 | struct dlm_work_item *item; |
1806 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | 1942 | item = kcalloc(1, sizeof(*item), GFP_NOFS); |
1807 | if (!item) | 1943 | if (!item) |
1808 | return -ENOMEM; | 1944 | return -ENOMEM; |
1809 | 1945 | ||
@@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | |||
1825 | list_add_tail(&item->list, &dlm->work_list); | 1961 | list_add_tail(&item->list, &dlm->work_list); |
1826 | spin_unlock(&dlm->work_lock); | 1962 | spin_unlock(&dlm->work_lock); |
1827 | 1963 | ||
1828 | schedule_work(&dlm->dispatched_work); | 1964 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
1829 | return 0; | 1965 | return 0; |
1830 | } | 1966 | } |
1831 | 1967 | ||
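
dlm_dispatch_assert_master() now queues onto the domain's private dlm_worker instead of the shared kernel workqueue, so dlm work cannot stall behind unrelated system items. A toy single-threaded model of per-domain queueing (names are illustrative):

    #include <stdlib.h>

    struct work_model {
            void (*func)(struct work_model *);
            struct work_model *next;
    };

    struct domain_model {
            /* tail must be initialized to &head before first use */
            struct work_model *head, **tail;
    };

    /* Append to this domain's private queue only; other domains (or
     * the system-wide queue, in the schedule_work() scheme) never
     * contend with it. */
    static int dispatch_work(struct domain_model *dom,
                             void (*func)(struct work_model *))
    {
            struct work_model *item = calloc(1, sizeof(*item));

            if (!item)
                    return -1;      /* the kernel path returns -ENOMEM */
            item->func = func;
            *dom->tail = item;
            dom->tail = &item->next;
            return 0;
    }
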
@@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | |||
1866 | } | 2002 | } |
1867 | } | 2003 | } |
1868 | 2004 | ||
2005 | /* | ||
2006 | * If we're migrating this lock to someone else, we are no | ||
2007 | * longer allowed to assert our own mastery. OTOH, we need to | ||
2008 | * prevent migration from starting while we're still asserting | ||
2009 | * our dominance. The reserved ast delays migration. | ||
2010 | */ | ||
2011 | spin_lock(&res->spinlock); | ||
2012 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
2013 | mlog(0, "Someone asked us to assert mastery, but we're " | ||
2014 | "in the middle of migration. Skipping assert, " | ||
2015 | "the new master will handle that.\n"); | ||
2016 | spin_unlock(&res->spinlock); | ||
2017 | goto put; | ||
2018 | } else | ||
2019 | __dlm_lockres_reserve_ast(res); | ||
2020 | spin_unlock(&res->spinlock); | ||
2021 | |||
1869 | /* this call now finishes out the nodemap | 2022 | /* this call now finishes out the nodemap |
1870 | * even if one or more nodes die */ | 2023 | * even if one or more nodes die */ |
1871 | mlog(0, "worker about to master %.*s here, this=%u\n", | 2024 | mlog(0, "worker about to master %.*s here, this=%u\n", |
@@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | |||
1875 | nodemap, flags); | 2028 | nodemap, flags); |
1876 | if (ret < 0) { | 2029 | if (ret < 0) { |
1877 | /* no need to restart, we are done */ | 2030 | /* no need to restart, we are done */ |
1878 | mlog_errno(ret); | 2031 | if (!dlm_is_host_down(ret)) |
2032 | mlog_errno(ret); | ||
1879 | } | 2033 | } |
1880 | 2034 | ||
2035 | /* Ok, we've asserted ourselves. Let's let migration start. */ | ||
2036 | dlm_lockres_release_ast(dlm, res); | ||
2037 | |||
2038 | put: | ||
1881 | dlm_lockres_put(res); | 2039 | dlm_lockres_put(res); |
1882 | 2040 | ||
1883 | mlog(0, "finished with dlm_assert_master_worker\n"); | 2041 | mlog(0, "finished with dlm_assert_master_worker\n"); |
@@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | |||
1916 | BUG(); | 2074 | BUG(); |
1917 | /* host is down, so answer for that node would be | 2075 | /* host is down, so answer for that node would be |
1918 | * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ | 2076 | * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ |
2077 | ret = 0; | ||
1919 | } | 2078 | } |
1920 | 2079 | ||
1921 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { | 2080 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { |
@@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2016 | */ | 2175 | */ |
2017 | 2176 | ||
2018 | ret = -ENOMEM; | 2177 | ret = -ENOMEM; |
2019 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); | 2178 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); |
2020 | if (!mres) { | 2179 | if (!mres) { |
2021 | mlog_errno(ret); | 2180 | mlog_errno(ret); |
2022 | goto leave; | 2181 | goto leave; |
2023 | } | 2182 | } |
2024 | 2183 | ||
2025 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, | 2184 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, |
2026 | GFP_KERNEL); | 2185 | GFP_NOFS); |
2027 | if (!mle) { | 2186 | if (!mle) { |
2028 | mlog_errno(ret); | 2187 | mlog_errno(ret); |
2029 | goto leave; | 2188 | goto leave; |
@@ -2117,7 +2276,7 @@ fail: | |||
2117 | * take both dlm->spinlock and dlm->master_lock */ | 2276 | * take both dlm->spinlock and dlm->master_lock */ |
2118 | spin_lock(&dlm->spinlock); | 2277 | spin_lock(&dlm->spinlock); |
2119 | spin_lock(&dlm->master_lock); | 2278 | spin_lock(&dlm->master_lock); |
2120 | dlm_get_mle(mle); | 2279 | dlm_get_mle_inuse(mle); |
2121 | spin_unlock(&dlm->master_lock); | 2280 | spin_unlock(&dlm->master_lock); |
2122 | spin_unlock(&dlm->spinlock); | 2281 | spin_unlock(&dlm->spinlock); |
2123 | 2282 | ||
@@ -2134,7 +2293,10 @@ fail: | |||
2134 | /* migration failed, detach and clean up mle */ | 2293 | /* migration failed, detach and clean up mle */ |
2135 | dlm_mle_detach_hb_events(dlm, mle); | 2294 | dlm_mle_detach_hb_events(dlm, mle); |
2136 | dlm_put_mle(mle); | 2295 | dlm_put_mle(mle); |
2137 | dlm_put_mle(mle); | 2296 | dlm_put_mle_inuse(mle); |
2297 | spin_lock(&res->spinlock); | ||
2298 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2299 | spin_unlock(&res->spinlock); | ||
2138 | goto leave; | 2300 | goto leave; |
2139 | } | 2301 | } |
2140 | 2302 | ||
@@ -2164,8 +2326,8 @@ fail: | |||
2164 | /* avoid hang during shutdown when migrating lockres | 2326 | /* avoid hang during shutdown when migrating lockres |
2165 | * to a node which also goes down */ | 2327 | * to a node which also goes down */ |
2166 | if (dlm_is_node_dead(dlm, target)) { | 2328 | if (dlm_is_node_dead(dlm, target)) { |
2167 | mlog(0, "%s:%.*s: expected migration target %u " | 2329 | mlog(0, "%s:%.*s: expected migration " |
2168 | "is no longer up. restarting.\n", | 2330 | "target %u is no longer up, restarting\n", |
2169 | dlm->name, res->lockname.len, | 2331 | dlm->name, res->lockname.len, |
2170 | res->lockname.name, target); | 2332 | res->lockname.name, target); |
2171 | ret = -ERESTARTSYS; | 2333 | ret = -ERESTARTSYS; |
@@ -2175,7 +2337,10 @@ fail: | |||
2175 | /* migration failed, detach and clean up mle */ | 2337 | /* migration failed, detach and clean up mle */ |
2176 | dlm_mle_detach_hb_events(dlm, mle); | 2338 | dlm_mle_detach_hb_events(dlm, mle); |
2177 | dlm_put_mle(mle); | 2339 | dlm_put_mle(mle); |
2178 | dlm_put_mle(mle); | 2340 | dlm_put_mle_inuse(mle); |
2341 | spin_lock(&res->spinlock); | ||
2342 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2343 | spin_unlock(&res->spinlock); | ||
2179 | goto leave; | 2344 | goto leave; |
2180 | } | 2345 | } |
2181 | /* TODO: if node died: stop, clean up, return error */ | 2346 | /* TODO: if node died: stop, clean up, return error */ |
@@ -2191,7 +2356,7 @@ fail: | |||
2191 | 2356 | ||
2192 | /* master is known, detach if not already detached */ | 2357 | /* master is known, detach if not already detached */ |
2193 | dlm_mle_detach_hb_events(dlm, mle); | 2358 | dlm_mle_detach_hb_events(dlm, mle); |
2194 | dlm_put_mle(mle); | 2359 | dlm_put_mle_inuse(mle); |
2195 | ret = 0; | 2360 | ret = 0; |
2196 | 2361 | ||
2197 | dlm_lockres_calc_usage(dlm, res); | 2362 | dlm_lockres_calc_usage(dlm, res); |
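
dlm_get_mle_inuse()/dlm_put_mle_inuse() replace the bare get/put pairs in the migration paths above, so a migration-long hold is distinguishable from transient references — which is what the earlier refcount sanity check relies on. A minimal counter model, with illustrative names:

    struct mle_refs_model {
            int refs;  /* ordinary reference count */
            int inuse; /* long-lived holds, e.g. a migration in flight */
    };

    static void get_mle_inuse(struct mle_refs_model *m)
    {
            m->refs++;  /* keeps the mle allocated... */
            m->inuse++; /* ...and lets sanity checks expect the ref */
    }

    static void put_mle_inuse(struct mle_refs_model *m)
    {
            m->inuse--;
            m->refs--;  /* in the kernel the final put frees the mle */
    }
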
@@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2462 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; | 2627 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; |
2463 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; | 2628 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; |
2464 | const char *name; | 2629 | const char *name; |
2465 | unsigned int namelen; | 2630 | unsigned int namelen, hash; |
2466 | int ret = 0; | 2631 | int ret = 0; |
2467 | 2632 | ||
2468 | if (!dlm_grab(dlm)) | 2633 | if (!dlm_grab(dlm)) |
@@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2470 | 2635 | ||
2471 | name = migrate->name; | 2636 | name = migrate->name; |
2472 | namelen = migrate->namelen; | 2637 | namelen = migrate->namelen; |
2638 | hash = dlm_lockid_hash(name, namelen); | ||
2473 | 2639 | ||
2474 | /* preallocate.. if this fails, abort */ | 2640 | /* preallocate.. if this fails, abort */ |
2475 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, | 2641 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, |
2476 | GFP_KERNEL); | 2642 | GFP_NOFS); |
2477 | 2643 | ||
2478 | if (!mle) { | 2644 | if (!mle) { |
2479 | ret = -ENOMEM; | 2645 | ret = -ENOMEM; |
@@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2482 | 2648 | ||
2483 | /* check for pre-existing lock */ | 2649 | /* check for pre-existing lock */ |
2484 | spin_lock(&dlm->spinlock); | 2650 | spin_lock(&dlm->spinlock); |
2485 | res = __dlm_lookup_lockres(dlm, name, namelen); | 2651 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
2486 | spin_lock(&dlm->master_lock); | 2652 | spin_lock(&dlm->master_lock); |
2487 | 2653 | ||
2488 | if (res) { | 2654 | if (res) { |
@@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | |||
2580 | /* remove it from the list so that only one | 2746 | /* remove it from the list so that only one |
2581 | * mle will be found */ | 2747 | * mle will be found */ |
2582 | list_del_init(&tmp->list); | 2748 | list_del_init(&tmp->list); |
2749 | __dlm_mle_detach_hb_events(dlm, tmp); | ||
2583 | } | 2750 | } |
2584 | spin_unlock(&tmp->spinlock); | 2751 | spin_unlock(&tmp->spinlock); |
2585 | } | 2752 | } |
@@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) | |||
2601 | struct list_head *iter, *iter2; | 2768 | struct list_head *iter, *iter2; |
2602 | struct dlm_master_list_entry *mle; | 2769 | struct dlm_master_list_entry *mle; |
2603 | struct dlm_lock_resource *res; | 2770 | struct dlm_lock_resource *res; |
2771 | unsigned int hash; | ||
2604 | 2772 | ||
2605 | mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); | 2773 | mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); |
2606 | top: | 2774 | top: |
@@ -2640,7 +2808,7 @@ top: | |||
2640 | * may result in the mle being unlinked and | 2808 | * may result in the mle being unlinked and |
2641 | * freed, but there may still be a process | 2809 | * freed, but there may still be a process |
2642 | * waiting in the dlmlock path which is fine. */ | 2810 | * waiting in the dlmlock path which is fine. */ |
2643 | mlog(ML_ERROR, "node %u was expected master\n", | 2811 | mlog(0, "node %u was expected master\n", |
2644 | dead_node); | 2812 | dead_node); |
2645 | atomic_set(&mle->woken, 1); | 2813 | atomic_set(&mle->woken, 1); |
2646 | spin_unlock(&mle->spinlock); | 2814 | spin_unlock(&mle->spinlock); |
@@ -2673,19 +2841,21 @@ top: | |||
2673 | 2841 | ||
2674 | /* remove from the list early. NOTE: unlinking | 2842 | /* remove from the list early. NOTE: unlinking |
2675 | * list_head while in list_for_each_safe */ | 2843 | * list_head while in list_for_each_safe */ |
2844 | __dlm_mle_detach_hb_events(dlm, mle); | ||
2676 | spin_lock(&mle->spinlock); | 2845 | spin_lock(&mle->spinlock); |
2677 | list_del_init(&mle->list); | 2846 | list_del_init(&mle->list); |
2678 | atomic_set(&mle->woken, 1); | 2847 | atomic_set(&mle->woken, 1); |
2679 | spin_unlock(&mle->spinlock); | 2848 | spin_unlock(&mle->spinlock); |
2680 | wake_up(&mle->wq); | 2849 | wake_up(&mle->wq); |
2681 | 2850 | ||
2682 | mlog(0, "node %u died during migration from " | 2851 | mlog(0, "%s: node %u died during migration from " |
2683 | "%u to %u!\n", dead_node, | 2852 | "%u to %u!\n", dlm->name, dead_node, |
2684 | mle->master, mle->new_master); | 2853 | mle->master, mle->new_master); |
2685 | /* if there is a lockres associated with this | 2854 | /* if there is a lockres associated with this |
2686 | * mle, find it and set its owner to UNKNOWN */ | 2855 | * mle, find it and set its owner to UNKNOWN */ |
2856 | hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); | ||
2687 | res = __dlm_lookup_lockres(dlm, mle->u.name.name, | 2857 | res = __dlm_lookup_lockres(dlm, mle->u.name.name, |
2688 | mle->u.name.len); | 2858 | mle->u.name.len, hash); |
2689 | if (res) { | 2859 | if (res) { |
2690 | /* unfortunately if we hit this rare case, our | 2860 | /* unfortunately if we hit this rare case, our |
2691 | * lock ordering is messed. we need to drop | 2861 | * lock ordering is messed. we need to drop |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9962190e7416..da399013516f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void) | |||
115 | return c; | 115 | return c; |
116 | } | 116 | } |
117 | 117 | ||
118 | static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, | ||
119 | u8 dead_node) | ||
120 | { | ||
121 | assert_spin_locked(&dlm->spinlock); | ||
122 | if (dlm->reco.dead_node != dead_node) | ||
123 | mlog(0, "%s: changing dead_node from %u to %u\n", | ||
124 | dlm->name, dlm->reco.dead_node, dead_node); | ||
125 | dlm->reco.dead_node = dead_node; | ||
126 | } | ||
127 | |||
128 | static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, | ||
129 | u8 master) | ||
130 | { | ||
131 | assert_spin_locked(&dlm->spinlock); | ||
132 | mlog(0, "%s: changing new_master from %u to %u\n", | ||
133 | dlm->name, dlm->reco.new_master, master); | ||
134 | dlm->reco.new_master = master; | ||
135 | } | ||
136 | |||
137 | static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) | ||
138 | { | ||
139 | assert_spin_locked(&dlm->spinlock); | ||
140 | clear_bit(dlm->reco.dead_node, dlm->recovery_map); | ||
141 | dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); | ||
142 | dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); | ||
143 | } | ||
144 | |||
118 | static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) | 145 | static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) |
119 | { | 146 | { |
120 | spin_lock(&dlm->spinlock); | 147 | spin_lock(&dlm->spinlock); |
121 | clear_bit(dlm->reco.dead_node, dlm->recovery_map); | 148 | __dlm_reset_recovery(dlm); |
122 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
123 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
124 | spin_unlock(&dlm->spinlock); | 149 | spin_unlock(&dlm->spinlock); |
125 | } | 150 | } |
126 | 151 | ||
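
The new setters funnel every reco-state write through one place that checks the lock and logs the transition. A userspace sketch, with an int flag and printf() standing in for assert_spin_locked() and mlog():

    #include <assert.h>
    #include <stdio.h>

    #define INVALID_NODE 255u

    struct reco_model {
            int locked; /* stand-in for assert_spin_locked() */
            unsigned int dead_node, new_master;
    };

    static void set_reco_dead_node(struct reco_model *r, unsigned int node)
    {
            assert(r->locked); /* every write is lock-checked, logged */
            if (r->dead_node != node)
                    printf("changing dead_node from %u to %u\n",
                           r->dead_node, node);
            r->dead_node = node;
    }

    static void set_reco_master(struct reco_model *r, unsigned int master)
    {
            assert(r->locked);
            printf("changing new_master from %u to %u\n",
                   r->new_master, master);
            r->new_master = master;
    }

    static void reset_recovery(struct reco_model *r)
    {
            assert(r->locked); /* the __ variant expects the lock held */
            set_reco_dead_node(r, INVALID_NODE);
            set_reco_master(r, INVALID_NODE);
    }
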
@@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data) | |||
132 | struct list_head *iter, *iter2; | 157 | struct list_head *iter, *iter2; |
133 | struct dlm_work_item *item; | 158 | struct dlm_work_item *item; |
134 | dlm_workfunc_t *workfunc; | 159 | dlm_workfunc_t *workfunc; |
160 | int tot = 0; | ||
161 | |||
162 | if (!dlm_joined(dlm)) | ||
163 | return; | ||
135 | 164 | ||
136 | spin_lock(&dlm->work_lock); | 165 | spin_lock(&dlm->work_lock); |
137 | list_splice_init(&dlm->work_list, &tmp_list); | 166 | list_splice_init(&dlm->work_list, &tmp_list); |
138 | spin_unlock(&dlm->work_lock); | 167 | spin_unlock(&dlm->work_lock); |
139 | 168 | ||
140 | list_for_each_safe(iter, iter2, &tmp_list) { | 169 | list_for_each_safe(iter, iter2, &tmp_list) { |
170 | tot++; | ||
171 | } | ||
172 | mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); | ||
173 | |||
174 | list_for_each_safe(iter, iter2, &tmp_list) { | ||
141 | item = list_entry(iter, struct dlm_work_item, list); | 175 | item = list_entry(iter, struct dlm_work_item, list); |
142 | workfunc = item->func; | 176 | workfunc = item->func; |
143 | list_del_init(&item->list); | 177 | list_del_init(&item->list); |
@@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) | |||
220 | * | 254 | * |
221 | */ | 255 | */ |
222 | 256 | ||
257 | static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) | ||
258 | { | ||
259 | struct dlm_reco_node_data *ndata; | ||
260 | struct dlm_lock_resource *res; | ||
261 | |||
262 | mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", | ||
263 | dlm->name, dlm->dlm_reco_thread_task->pid, | ||
264 | dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", | ||
265 | dlm->reco.dead_node, dlm->reco.new_master); | ||
266 | |||
267 | list_for_each_entry(ndata, &dlm->reco.node_data, list) { | ||
268 | char *st = "unknown"; | ||
269 | switch (ndata->state) { | ||
270 | case DLM_RECO_NODE_DATA_INIT: | ||
271 | st = "init"; | ||
272 | break; | ||
273 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
274 | st = "requesting"; | ||
275 | break; | ||
276 | case DLM_RECO_NODE_DATA_DEAD: | ||
277 | st = "dead"; | ||
278 | break; | ||
279 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
280 | st = "receiving"; | ||
281 | break; | ||
282 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
283 | st = "requested"; | ||
284 | break; | ||
285 | case DLM_RECO_NODE_DATA_DONE: | ||
286 | st = "done"; | ||
287 | break; | ||
288 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
289 | st = "finalize-sent"; | ||
290 | break; | ||
291 | default: | ||
292 | st = "bad"; | ||
293 | break; | ||
294 | } | ||
295 | mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", | ||
296 | dlm->name, ndata->node_num, st); | ||
297 | } | ||
298 | list_for_each_entry(res, &dlm->reco.resources, recovering) { | ||
299 | mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", | ||
300 | dlm->name, res->lockname.len, res->lockname.name); | ||
301 | } | ||
302 | } | ||
223 | 303 | ||
224 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) | 304 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) |
225 | 305 | ||
@@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) | |||
267 | { | 347 | { |
268 | int dead; | 348 | int dead; |
269 | spin_lock(&dlm->spinlock); | 349 | spin_lock(&dlm->spinlock); |
270 | dead = test_bit(node, dlm->domain_map); | 350 | dead = !test_bit(node, dlm->domain_map); |
271 | spin_unlock(&dlm->spinlock); | 351 | spin_unlock(&dlm->spinlock); |
272 | return dead; | 352 | return dead; |
273 | } | 353 | } |
274 | 354 | ||
355 | /* returns true if node is no longer in the domain | ||
356 | * could be dead or just not joined */ | ||
357 | static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) | ||
358 | { | ||
359 | int recovered; | ||
360 | spin_lock(&dlm->spinlock); | ||
361 | recovered = !test_bit(node, dlm->recovery_map); | ||
362 | spin_unlock(&dlm->spinlock); | ||
363 | return recovered; | ||
364 | } | ||
365 | |||
366 | |||
275 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) | 367 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) |
276 | { | 368 | { |
277 | if (timeout) { | 369 | if (timeout) { |
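
The dlm_is_node_dead() fix flips the polarity: a set bit in domain_map means the node is alive, so the old unnegated test_bit() reported live nodes as dead. Both predicates in miniature, over a plain userspace bitmap:

    #include <limits.h>

    #define BITS_PER_WORD (sizeof(unsigned long) * CHAR_BIT)

    static int test_node_bit(const unsigned long *map, int nr)
    {
            return (int)((map[nr / BITS_PER_WORD]
                          >> (nr % BITS_PER_WORD)) & 1);
    }

    /* Membership in domain_map means "alive", so dead is the
     * negation -- the one-character fix in the hunk above. */
    static int node_is_dead(const unsigned long *domain_map, int node)
    {
            return !test_node_bit(domain_map, node); /* was: no '!' */
    }

    /* Same shape for recovery: still set in recovery_map means the
     * node has not been recovered yet. */
    static int node_is_recovered(const unsigned long *recovery_map,
                                 int node)
    {
            return !test_node_bit(recovery_map, node);
    }
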
@@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) | |||
290 | return 0; | 382 | return 0; |
291 | } | 383 | } |
292 | 384 | ||
385 | int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) | ||
386 | { | ||
387 | if (timeout) { | ||
388 | mlog(0, "%s: waiting %dms for notification of " | ||
389 | "recovery of node %u\n", dlm->name, timeout, node); | ||
390 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
391 | dlm_is_node_recovered(dlm, node), | ||
392 | msecs_to_jiffies(timeout)); | ||
393 | } else { | ||
394 | mlog(0, "%s: waiting indefinitely for notification " | ||
395 | "of recovery of node %u\n", dlm->name, node); | ||
396 | wait_event(dlm->dlm_reco_thread_wq, | ||
397 | dlm_is_node_recovered(dlm, node)); | ||
398 | } | ||
399 | /* for now, return 0 */ | ||
400 | return 0; | ||
401 | } | ||
402 | |||
293 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | 403 | /* callers of the top-level api calls (dlmlock/dlmunlock) should |
294 | * block on the dlm->reco.event when recovery is in progress. | 404 | * block on the dlm->reco.event when recovery is in progress. |
295 | * the dlm recovery thread will set this state when it begins | 405 | * the dlm recovery thread will set this state when it begins |
@@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm) | |||
308 | 418 | ||
309 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm) | 419 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm) |
310 | { | 420 | { |
421 | if (dlm_in_recovery(dlm)) { | ||
422 | mlog(0, "%s: reco thread %d in recovery: " | ||
423 | "state=%d, master=%u, dead=%u\n", | ||
424 | dlm->name, dlm->dlm_reco_thread_task->pid, | ||
425 | dlm->reco.state, dlm->reco.new_master, | ||
426 | dlm->reco.dead_node); | ||
427 | } | ||
311 | wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); | 428 | wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); |
312 | } | 429 | } |
313 | 430 | ||
@@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
341 | mlog(0, "new master %u died while recovering %u!\n", | 458 | mlog(0, "new master %u died while recovering %u!\n", |
342 | dlm->reco.new_master, dlm->reco.dead_node); | 459 | dlm->reco.new_master, dlm->reco.dead_node); |
343 | /* unset the new_master, leave dead_node */ | 460 | /* unset the new_master, leave dead_node */ |
344 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | 461 | dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); |
345 | } | 462 | } |
346 | 463 | ||
347 | /* select a target to recover */ | 464 | /* select a target to recover */ |
@@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
350 | 467 | ||
351 | bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); | 468 | bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); |
352 | if (bit >= O2NM_MAX_NODES || bit < 0) | 469 | if (bit >= O2NM_MAX_NODES || bit < 0) |
353 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | 470 | dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); |
354 | else | 471 | else |
355 | dlm->reco.dead_node = bit; | 472 | dlm_set_reco_dead_node(dlm, bit); |
356 | } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { | 473 | } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { |
357 | /* BUG? */ | 474 | /* BUG? */ |
358 | mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", | 475 | mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", |
359 | dlm->reco.dead_node); | 476 | dlm->reco.dead_node); |
360 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | 477 | dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); |
361 | } | 478 | } |
362 | 479 | ||
363 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | 480 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { |
@@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
366 | /* return to main thread loop and sleep. */ | 483 | /* return to main thread loop and sleep. */ |
367 | return 0; | 484 | return 0; |
368 | } | 485 | } |
369 | mlog(0, "recovery thread found node %u in the recovery map!\n", | 486 | mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", |
487 | dlm->name, dlm->dlm_reco_thread_task->pid, | ||
370 | dlm->reco.dead_node); | 488 | dlm->reco.dead_node); |
371 | spin_unlock(&dlm->spinlock); | 489 | spin_unlock(&dlm->spinlock); |
372 | 490 | ||
@@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
389 | } | 507 | } |
390 | mlog(0, "another node will master this recovery session.\n"); | 508 | mlog(0, "another node will master this recovery session.\n"); |
391 | } | 509 | } |
392 | mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", | 510 | mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", |
393 | dlm->name, dlm->reco.new_master, | 511 | dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, |
394 | dlm->node_num, dlm->reco.dead_node); | 512 | dlm->node_num, dlm->reco.dead_node); |
395 | 513 | ||
396 | /* it is safe to start everything back up here | 514 | /* it is safe to start everything back up here |
@@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
402 | return 0; | 520 | return 0; |
403 | 521 | ||
404 | master_here: | 522 | master_here: |
405 | mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", | 523 | mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", |
524 | dlm->dlm_reco_thread_task->pid, | ||
406 | dlm->name, dlm->reco.dead_node, dlm->node_num); | 525 | dlm->name, dlm->reco.dead_node, dlm->node_num); |
407 | 526 | ||
408 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | 527 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); |
409 | if (status < 0) { | 528 | if (status < 0) { |
529 | /* we should never hit this anymore */ | ||
410 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | 530 | mlog(ML_ERROR, "error %d remastering locks for node %u, " |
411 | "retrying.\n", status, dlm->reco.dead_node); | 531 | "retrying.\n", status, dlm->reco.dead_node); |
412 | /* yield a bit to allow any final network messages | 532 | /* yield a bit to allow any final network messages |
@@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
433 | int destroy = 0; | 553 | int destroy = 0; |
434 | int pass = 0; | 554 | int pass = 0; |
435 | 555 | ||
436 | status = dlm_init_recovery_area(dlm, dead_node); | 556 | do { |
437 | if (status < 0) | 557 | /* we have become recovery master. there is no escaping |
438 | goto leave; | 558 | * this, so just keep trying until we get it. */ |
559 | status = dlm_init_recovery_area(dlm, dead_node); | ||
560 | if (status < 0) { | ||
561 | mlog(ML_ERROR, "%s: failed to alloc recovery area, " | ||
562 | "retrying\n", dlm->name); | ||
563 | msleep(1000); | ||
564 | } | ||
565 | } while (status != 0); | ||
439 | 566 | ||
440 | /* safe to access the node data list without a lock, since this | 567 | /* safe to access the node data list without a lock, since this |
441 | * process is the only one to change the list */ | 568 | * process is the only one to change the list */ |
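
dlm_remaster_locks() now loops on dlm_init_recovery_area() instead of bailing, since a recovery master cannot abandon recovery over a failed allocation. The shape of that loop in a standalone sketch (sleep_ms() is a userspace stand-in for msleep()):

    #include <time.h>

    static void sleep_ms(long ms)
    {
            struct timespec ts = { ms / 1000, (ms % 1000) * 1000000L };
            nanosleep(&ts, NULL);
    }

    /* Once this node is recovery master there is no escaping the job,
     * so an allocation failure is retried forever with a one-second
     * backoff instead of being propagated as an error. */
    static void init_recovery_area_or_die_trying(int (*try_init)(void))
    {
            while (try_init() != 0)
                    sleep_ms(1000);
    }
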
@@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
452 | continue; | 579 | continue; |
453 | } | 580 | } |
454 | 581 | ||
455 | status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); | 582 | do { |
456 | if (status < 0) { | 583 | status = dlm_request_all_locks(dlm, ndata->node_num, |
457 | mlog_errno(status); | 584 | dead_node); |
458 | if (dlm_is_host_down(status)) | 585 | if (status < 0) { |
459 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | 586 | mlog_errno(status); |
460 | else { | 587 | if (dlm_is_host_down(status)) { |
461 | destroy = 1; | 588 | /* node died, ignore it for recovery */ |
462 | goto leave; | 589 | status = 0; |
590 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | ||
591 | /* wait for the domain map to catch up | ||
592 | * with the network state. */ | ||
593 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
594 | dlm_is_node_dead(dlm, | ||
595 | ndata->node_num), | ||
596 | msecs_to_jiffies(1000)); | ||
597 | mlog(0, "waited 1 sec for %u, " | ||
598 | "dead? %s\n", ndata->node_num, | ||
599 | dlm_is_node_dead(dlm, ndata->node_num) ? | ||
600 | "yes" : "no"); | ||
601 | } else { | ||
602 | /* -ENOMEM on the other node */ | ||
603 | mlog(0, "%s: node %u returned " | ||
604 | "%d during recovery, retrying " | ||
605 | "after a short wait\n", | ||
606 | dlm->name, ndata->node_num, | ||
607 | status); | ||
608 | msleep(100); | ||
609 | } | ||
463 | } | 610 | } |
464 | } | 611 | } while (status != 0); |
465 | 612 | ||
466 | switch (ndata->state) { | 613 | switch (ndata->state) { |
467 | case DLM_RECO_NODE_DATA_INIT: | 614 | case DLM_RECO_NODE_DATA_INIT: |
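
The request_all_locks retry above distinguishes two failure classes: a dead peer is marked DLM_RECO_NODE_DATA_DEAD and skipped (once the domain map catches up), while a transient error such as a remote -ENOMEM is retried after a short wait. A sketch of the policy with callback stand-ins for the kernel's send and host-down checks:

    #include <time.h>

    enum peer_state { PEER_OK, PEER_DEAD };

    static void backoff_ms(long ms)
    {
            struct timespec ts = { ms / 1000, (ms % 1000) * 1000000L };
            nanosleep(&ts, NULL);
    }

    static enum peer_state request_locks_from(int (*send_request)(void),
                                              int (*peer_is_down)(int err))
    {
            int status;

            do {
                    status = send_request();
                    if (status < 0) {
                            if (peer_is_down(status))
                                    return PEER_DEAD; /* skip for reco */
                            backoff_ms(100); /* e.g. remote -ENOMEM */
                    }
            } while (status != 0);
            return PEER_OK;
    }
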
@@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
473 | mlog(0, "node %u died after requesting " | 620 | mlog(0, "node %u died after requesting " |
474 | "recovery info for node %u\n", | 621 | "recovery info for node %u\n", |
475 | ndata->node_num, dead_node); | 622 | ndata->node_num, dead_node); |
476 | // start all over | 623 | /* fine. don't need this node's info. |
477 | destroy = 1; | 624 | * continue without it. */ |
478 | status = -EAGAIN; | 625 | break; |
479 | goto leave; | ||
480 | case DLM_RECO_NODE_DATA_REQUESTING: | 626 | case DLM_RECO_NODE_DATA_REQUESTING: |
481 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; | 627 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; |
482 | mlog(0, "now receiving recovery data from " | 628 | mlog(0, "now receiving recovery data from " |
@@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
520 | BUG(); | 666 | BUG(); |
521 | break; | 667 | break; |
522 | case DLM_RECO_NODE_DATA_DEAD: | 668 | case DLM_RECO_NODE_DATA_DEAD: |
523 | mlog(ML_NOTICE, "node %u died after " | 669 | mlog(0, "node %u died after " |
524 | "requesting recovery info for " | 670 | "requesting recovery info for " |
525 | "node %u\n", ndata->node_num, | 671 | "node %u\n", ndata->node_num, |
526 | dead_node); | 672 | dead_node); |
527 | spin_unlock(&dlm_reco_state_lock); | 673 | break; |
528 | // start all over | ||
529 | destroy = 1; | ||
530 | status = -EAGAIN; | ||
531 | /* instead of spinning like crazy here, | ||
532 | * wait for the domain map to catch up | ||
533 | * with the network state. otherwise this | ||
534 | * can be hit hundreds of times before | ||
535 | * the node is really seen as dead. */ | ||
536 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
537 | dlm_is_node_dead(dlm, | ||
538 | ndata->node_num), | ||
539 | msecs_to_jiffies(1000)); | ||
540 | mlog(0, "waited 1 sec for %u, " | ||
541 | "dead? %s\n", ndata->node_num, | ||
542 | dlm_is_node_dead(dlm, ndata->node_num) ? | ||
543 | "yes" : "no"); | ||
544 | goto leave; | ||
545 | case DLM_RECO_NODE_DATA_RECEIVING: | 674 | case DLM_RECO_NODE_DATA_RECEIVING: |
546 | case DLM_RECO_NODE_DATA_REQUESTED: | 675 | case DLM_RECO_NODE_DATA_REQUESTED: |
676 | mlog(0, "%s: node %u still in state %s\n", | ||
677 | dlm->name, ndata->node_num, | ||
678 | ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? | ||
679 | "receiving" : "requested"); | ||
547 | all_nodes_done = 0; | 680 | all_nodes_done = 0; |
548 | break; | 681 | break; |
549 | case DLM_RECO_NODE_DATA_DONE: | 682 | case DLM_RECO_NODE_DATA_DONE: |
683 | mlog(0, "%s: node %u state is done\n", | ||
684 | dlm->name, ndata->node_num); | ||
550 | break; | 685 | break; |
551 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | 686 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: |
687 | mlog(0, "%s: node %u state is finalize\n", | ||
688 | dlm->name, ndata->node_num); | ||
552 | break; | 689 | break; |
553 | } | 690 | } |
554 | } | 691 | } |
@@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
578 | jiffies, dlm->reco.dead_node, | 715 | jiffies, dlm->reco.dead_node, |
579 | dlm->node_num, dlm->reco.new_master); | 716 | dlm->node_num, dlm->reco.new_master); |
580 | destroy = 1; | 717 | destroy = 1; |
581 | status = ret; | 718 | status = 0; |
582 | /* rescan everything marked dirty along the way */ | 719 | /* rescan everything marked dirty along the way */ |
583 | dlm_kick_thread(dlm, NULL); | 720 | dlm_kick_thread(dlm, NULL); |
584 | break; | 721 | break; |
@@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
591 | 728 | ||
592 | } | 729 | } |
593 | 730 | ||
594 | leave: | ||
595 | if (destroy) | 731 | if (destroy) |
596 | dlm_destroy_recovery_area(dlm, dead_node); | 732 | dlm_destroy_recovery_area(dlm, dead_node); |
597 | 733 | ||
@@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | |||
617 | } | 753 | } |
618 | BUG_ON(num == dead_node); | 754 | BUG_ON(num == dead_node); |
619 | 755 | ||
620 | ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); | 756 | ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); |
621 | if (!ndata) { | 757 | if (!ndata) { |
622 | dlm_destroy_recovery_area(dlm, dead_node); | 758 | dlm_destroy_recovery_area(dlm, dead_node); |
623 | return -ENOMEM; | 759 | return -ENOMEM; |
@@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) | |||
691 | if (!dlm_grab(dlm)) | 827 | if (!dlm_grab(dlm)) |
692 | return -EINVAL; | 828 | return -EINVAL; |
693 | 829 | ||
830 | if (lr->dead_node != dlm->reco.dead_node) { | ||
831 | mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " | ||
832 | "dead_node is %u\n", dlm->name, lr->node_idx, | ||
833 | lr->dead_node, dlm->reco.dead_node); | ||
834 | dlm_print_reco_node_status(dlm); | ||
835 | /* this is a hack */ | ||
836 | dlm_put(dlm); | ||
837 | return -ENOMEM; | ||
838 | } | ||
694 | BUG_ON(lr->dead_node != dlm->reco.dead_node); | 839 | BUG_ON(lr->dead_node != dlm->reco.dead_node); |
695 | 840 | ||
696 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | 841 | item = kcalloc(1, sizeof(*item), GFP_NOFS); |
697 | if (!item) { | 842 | if (!item) { |
698 | dlm_put(dlm); | 843 | dlm_put(dlm); |
699 | return -ENOMEM; | 844 | return -ENOMEM; |
700 | } | 845 | } |
701 | 846 | ||
702 | /* this will get freed by dlm_request_all_locks_worker */ | 847 | /* this will get freed by dlm_request_all_locks_worker */ |
703 | buf = (char *) __get_free_page(GFP_KERNEL); | 848 | buf = (char *) __get_free_page(GFP_NOFS); |
704 | if (!buf) { | 849 | if (!buf) { |
705 | kfree(item); | 850 | kfree(item); |
706 | dlm_put(dlm); | 851 | dlm_put(dlm); |
@@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) | |||
715 | spin_lock(&dlm->work_lock); | 860 | spin_lock(&dlm->work_lock); |
716 | list_add_tail(&item->list, &dlm->work_list); | 861 | list_add_tail(&item->list, &dlm->work_list); |
717 | spin_unlock(&dlm->work_lock); | 862 | spin_unlock(&dlm->work_lock); |
718 | schedule_work(&dlm->dispatched_work); | 863 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
719 | 864 | ||
720 | dlm_put(dlm); | 865 | dlm_put(dlm); |
721 | return 0; | 866 | return 0; |
@@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
730 | struct list_head *iter; | 875 | struct list_head *iter; |
731 | int ret; | 876 | int ret; |
732 | u8 dead_node, reco_master; | 877 | u8 dead_node, reco_master; |
878 | int skip_all_done = 0; | ||
733 | 879 | ||
734 | dlm = item->dlm; | 880 | dlm = item->dlm; |
735 | dead_node = item->u.ral.dead_node; | 881 | dead_node = item->u.ral.dead_node; |
736 | reco_master = item->u.ral.reco_master; | 882 | reco_master = item->u.ral.reco_master; |
737 | mres = (struct dlm_migratable_lockres *)data; | 883 | mres = (struct dlm_migratable_lockres *)data; |
738 | 884 | ||
885 | mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", | ||
886 | dlm->name, dead_node, reco_master); | ||
887 | |||
739 | if (dead_node != dlm->reco.dead_node || | 888 | if (dead_node != dlm->reco.dead_node || |
740 | reco_master != dlm->reco.new_master) { | 889 | reco_master != dlm->reco.new_master) { |
741 | /* show extra debug info if the recovery state is messed */ | 890 | /* worker could have been created before the recovery master |
742 | mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " | 891 | * died. if so, do not continue, but do not error. */ |
743 | "request(dead=%u, master=%u)\n", | 892 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { |
744 | dlm->name, dlm->reco.dead_node, dlm->reco.new_master, | 893 | mlog(ML_NOTICE, "%s: will not send recovery state, " |
745 | dead_node, reco_master); | 894 | "recovery master %u died, thread=(dead=%u,mas=%u)" |
746 | mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " | 895 | " current=(dead=%u,mas=%u)\n", dlm->name, |
747 | "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", | 896 | reco_master, dead_node, reco_master, |
748 | dlm->name, mres->lockname_len, mres->lockname, mres->master, | 897 | dlm->reco.dead_node, dlm->reco.new_master); |
749 | mres->num_locks, mres->total_locks, mres->flags, | 898 | } else { |
750 | dlm_get_lock_cookie_node(mres->ml[0].cookie), | 899 | mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " |
751 | dlm_get_lock_cookie_seq(mres->ml[0].cookie), | 900 | "master=%u), request(dead=%u, master=%u)\n", |
752 | mres->ml[0].list, mres->ml[0].flags, | 901 | dlm->name, dlm->reco.dead_node, |
753 | mres->ml[0].type, mres->ml[0].convert_type, | 902 | dlm->reco.new_master, dead_node, reco_master); |
754 | mres->ml[0].highest_blocked, mres->ml[0].node); | 903 | } |
755 | BUG(); | 904 | goto leave; |
756 | } | 905 | } |
757 | BUG_ON(dead_node != dlm->reco.dead_node); | ||
758 | BUG_ON(reco_master != dlm->reco.new_master); | ||
759 | 906 | ||
760 | /* lock resources should have already been moved to the | 907 | /* lock resources should have already been moved to the |
761 | * dlm->reco.resources list. now move items from that list | 908 | * dlm->reco.resources list. now move items from that list |
@@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
766 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); | 913 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); |
767 | 914 | ||
768 | /* now we can begin blasting lockreses without the dlm lock */ | 915 | /* now we can begin blasting lockreses without the dlm lock */ |
916 | |||
917 | /* any errors returned will be due to the new_master dying, | ||
918 | * the dlm_reco_thread should detect this */ | ||
769 | list_for_each(iter, &resources) { | 919 | list_for_each(iter, &resources) { |
770 | res = list_entry (iter, struct dlm_lock_resource, recovering); | 920 | res = list_entry (iter, struct dlm_lock_resource, recovering); |
771 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | 921 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, |
772 | DLM_MRES_RECOVERY); | 922 | DLM_MRES_RECOVERY); |
773 | if (ret < 0) | 923 | if (ret < 0) { |
774 | mlog_errno(ret); | 924 | mlog(ML_ERROR, "%s: node %u went down while sending " |
925 | "recovery state for dead node %u, ret=%d\n", dlm->name, | ||
926 | reco_master, dead_node, ret); | ||
927 | skip_all_done = 1; | ||
928 | break; | ||
929 | } | ||
775 | } | 930 | } |
776 | 931 | ||
777 | /* move the resources back to the list */ | 932 | /* move the resources back to the list */ |
@@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
779 | list_splice_init(&resources, &dlm->reco.resources); | 934 | list_splice_init(&resources, &dlm->reco.resources); |
780 | spin_unlock(&dlm->spinlock); | 935 | spin_unlock(&dlm->spinlock); |
781 | 936 | ||
782 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); | 937 | if (!skip_all_done) { |
783 | if (ret < 0) | 938 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); |
784 | mlog_errno(ret); | 939 | if (ret < 0) { |
785 | 940 | mlog(ML_ERROR, "%s: node %u went down while sending " | |
941 | "recovery all-done for dead node %u, ret=%d\n", | ||
942 | dlm->name, reco_master, dead_node, ret); | ||
943 | } | ||
944 | } | ||
945 | leave: | ||
786 | free_page((unsigned long)data); | 946 | free_page((unsigned long)data); |
787 | } | 947 | } |
788 | 948 | ||
@@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | |||
801 | 961 | ||
802 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | 962 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, |
803 | sizeof(done_msg), send_to, &tmpret); | 963 | sizeof(done_msg), send_to, &tmpret); |
804 | /* negative status is ignored by the caller */ | 964 | if (ret < 0) { |
805 | if (ret >= 0) | 965 | if (!dlm_is_host_down(ret)) { |
966 | mlog_errno(ret); | ||
967 | mlog(ML_ERROR, "%s: unknown error sending data-done " | ||
968 | "to %u\n", dlm->name, send_to); | ||
969 | BUG(); | ||
970 | } | ||
971 | } else | ||
806 | ret = tmpret; | 972 | ret = tmpret; |
807 | return ret; | 973 | return ret; |
808 | } | 974 | } |
@@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) | |||
822 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " | 988 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " |
823 | "node_idx=%u, this node=%u\n", done->dead_node, | 989 | "node_idx=%u, this node=%u\n", done->dead_node, |
824 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | 990 | dlm->reco.dead_node, done->node_idx, dlm->node_num); |
825 | BUG_ON(done->dead_node != dlm->reco.dead_node); | 991 | |
992 | mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), | ||
993 | "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " | ||
994 | "node_idx=%u, this node=%u\n", done->dead_node, | ||
995 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | ||
826 | 996 | ||
827 | spin_lock(&dlm_reco_state_lock); | 997 | spin_lock(&dlm_reco_state_lock); |
828 | list_for_each(iter, &dlm->reco.node_data) { | 998 | list_for_each(iter, &dlm->reco.node_data) { |
@@ -1021,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, | |||
1021 | ml->type == LKM_PRMODE) { | 1191 | ml->type == LKM_PRMODE) { |
1022 | /* if it is already set, this had better be a PR | 1192 | /* if it is already set, this had better be a PR |
1023 | * and it has to match */ | 1193 | * and it has to match */ |
1024 | if (mres->lvb[0] && (ml->type == LKM_EXMODE || | 1194 | if (!dlm_lvb_is_empty(mres->lvb) && |
1025 | memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { | 1195 | (ml->type == LKM_EXMODE || |
1196 | memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { | ||
1026 | mlog(ML_ERROR, "mismatched lvbs!\n"); | 1197 | mlog(ML_ERROR, "mismatched lvbs!\n"); |
1027 | __dlm_print_one_lock_resource(lock->lockres); | 1198 | __dlm_print_one_lock_resource(lock->lockres); |
1028 | BUG(); | 1199 | BUG(); |
@@ -1081,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
1081 | * we must send it immediately. */ | 1252 | * we must send it immediately. */ |
1082 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, | 1253 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, |
1083 | res, total_locks); | 1254 | res, total_locks); |
1084 | if (ret < 0) { | 1255 | if (ret < 0) |
1085 | // TODO | 1256 | goto error; |
1086 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg " | ||
1087 | "returned %d, TODO\n", ret); | ||
1088 | BUG(); | ||
1089 | } | ||
1090 | } | 1257 | } |
1091 | } | 1258 | } |
1092 | /* flush any remaining locks */ | 1259 | /* flush any remaining locks */ |
1093 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); | 1260 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); |
1094 | if (ret < 0) { | 1261 | if (ret < 0) |
1095 | // TODO | 1262 | goto error; |
1096 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " | 1263 | return ret; |
1097 | "TODO\n", ret); | 1264 | |
1265 | error: | ||
1266 | mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", | ||
1267 | dlm->name, ret); | ||
1268 | if (!dlm_is_host_down(ret)) | ||
1098 | BUG(); | 1269 | BUG(); |
1099 | } | 1270 | mlog(0, "%s: node %u went down while sending %s " |
1271 | "lockres %.*s\n", dlm->name, send_to, | ||
1272 | flags & DLM_MRES_RECOVERY ? "recovery" : "migration", | ||
1273 | res->lockname.len, res->lockname.name); | ||
1100 | return ret; | 1274 | return ret; |
1101 | } | 1275 | } |
1102 | 1276 | ||
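The hunk above collapses two inline TODO/BUG() blocks in dlm_send_one_lockres() into one shared error label that panics only when the failure is not a host-down, since a dead target is handled by restarting recovery. A minimal userspace sketch of that single-exit shape (hypothetical names, not the kernel implementation):

#include <stdio.h>
#include <stdlib.h>

#define HOST_DOWN (-112)              /* stand-in for -EHOSTDOWN */

static int host_down(int err) { return err == HOST_DOWN; }

/* pretend to flush one batch of locks to the target node */
static int send_batch(int which, int simulate_err)
{
    printf("sending batch %d\n", which);
    return simulate_err;
}

static int send_one_lockres(int simulate_err)
{
    int ret;

    ret = send_batch(1, 0);            /* "mres full" flush */
    if (ret < 0)
        goto error;
    ret = send_batch(2, simulate_err); /* flush remaining locks */
    if (ret < 0)
        goto error;
    return ret;

error:
    fprintf(stderr, "send failed: %d\n", ret);
    if (!host_down(ret))
        abort();                      /* kernel code BUG()s here */
    /* host death: recovery restarts this work; just report it */
    return ret;
}

int main(void)
{
    send_one_lockres(0);
    send_one_lockres(HOST_DOWN);
    return 0;
}
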
@@ -1144,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1144 | mlog(0, "all done flag. all lockres data received!\n"); | 1318 | mlog(0, "all done flag. all lockres data received!\n"); |
1145 | 1319 | ||
1146 | ret = -ENOMEM; | 1320 | ret = -ENOMEM; |
1147 | buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); | 1321 | buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); |
1148 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | 1322 | item = kcalloc(1, sizeof(*item), GFP_NOFS); |
1149 | if (!buf || !item) | 1323 | if (!buf || !item) |
1150 | goto leave; | 1324 | goto leave; |
1151 | 1325 | ||
@@ -1236,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1236 | spin_lock(&dlm->work_lock); | 1410 | spin_lock(&dlm->work_lock); |
1237 | list_add_tail(&item->list, &dlm->work_list); | 1411 | list_add_tail(&item->list, &dlm->work_list); |
1238 | spin_unlock(&dlm->work_lock); | 1412 | spin_unlock(&dlm->work_lock); |
1239 | schedule_work(&dlm->dispatched_work); | 1413 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
1240 | 1414 | ||
1241 | leave: | 1415 | leave: |
1242 | dlm_put(dlm); | 1416 | dlm_put(dlm); |
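
Queueing dispatched work on dlm->dlm_worker instead of the shared kernel workqueue gives each domain a queue it can flush and destroy on its own. A toy single-threaded sketch of per-context queueing follows; all names are hypothetical, and a real workqueue runs work concurrently.

#include <stdio.h>

struct work_item {
    struct work_item *next;
    void (*fn)(struct work_item *);
};

struct dlm_ctxt_sim {
    const char *name;
    struct work_item *head, *tail;   /* the per-domain "workqueue" */
};

static void queue_work_sim(struct dlm_ctxt_sim *dlm, struct work_item *w)
{
    w->next = NULL;
    if (dlm->tail)
        dlm->tail->next = w;
    else
        dlm->head = w;
    dlm->tail = w;
}

/* draining touches only this domain's work, so tearing down one
 * domain never waits on unrelated system-wide work */
static void flush_workqueue_sim(struct dlm_ctxt_sim *dlm)
{
    printf("flushing %s\n", dlm->name);
    while (dlm->head) {
        struct work_item *w = dlm->head;
        dlm->head = w->next;
        if (!dlm->head)
            dlm->tail = NULL;
        w->fn(w);
    }
}

static void run_one(struct work_item *w)
{
    (void)w;
    printf("dispatched work ran\n");
}

int main(void)
{
    struct dlm_ctxt_sim dlm = { "domain0", NULL, NULL };
    struct work_item w = { NULL, run_one };

    queue_work_sim(&dlm, &w);
    flush_workqueue_sim(&dlm);       /* safe at domain teardown */
    return 0;
}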
@@ -1404,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1404 | struct dlm_ctxt *dlm = data; | 1578 | struct dlm_ctxt *dlm = data; |
1405 | struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; | 1579 | struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; |
1406 | struct dlm_lock_resource *res = NULL; | 1580 | struct dlm_lock_resource *res = NULL; |
1581 | unsigned int hash; | ||
1407 | int master = DLM_LOCK_RES_OWNER_UNKNOWN; | 1582 | int master = DLM_LOCK_RES_OWNER_UNKNOWN; |
1408 | u32 flags = DLM_ASSERT_MASTER_REQUERY; | 1583 | u32 flags = DLM_ASSERT_MASTER_REQUERY; |
1409 | 1584 | ||
@@ -1413,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1413 | return master; | 1588 | return master; |
1414 | } | 1589 | } |
1415 | 1590 | ||
1591 | hash = dlm_lockid_hash(req->name, req->namelen); | ||
1592 | |||
1416 | spin_lock(&dlm->spinlock); | 1593 | spin_lock(&dlm->spinlock); |
1417 | res = __dlm_lookup_lockres(dlm, req->name, req->namelen); | 1594 | res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); |
1418 | if (res) { | 1595 | if (res) { |
1419 | spin_lock(&res->spinlock); | 1596 | spin_lock(&res->spinlock); |
1420 | master = res->owner; | 1597 | master = res->owner; |
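
The requery handler now hashes the lockid once, before taking dlm->spinlock, and passes the result to __dlm_lookup_lockres() so nothing is recomputed under the lock. A small sketch of that shape, using FNV-1a purely as a stand-in for full_name_hash() (hypothetical names):

#include <stdio.h>
#include <string.h>

#define NBUCKETS 64

struct res { const char *name; struct res *next; };
static struct res *buckets[NBUCKETS];

/* FNV-1a; stands in for the kernel's full_name_hash() */
static unsigned int lockid_hash(const char *n, size_t len)
{
    unsigned int h = 2166136261u;
    while (len--) { h ^= (unsigned char)*n++; h *= 16777619u; }
    return h;
}

static struct res *lookup(const char *name, unsigned int hash)
{
    struct res *r;
    /* caller precomputed hash outside the "lock"; we only walk */
    for (r = buckets[hash % NBUCKETS]; r; r = r->next)
        if (!strcmp(r->name, name))
            return r;
    return NULL;
}

int main(void)
{
    static struct res r = { "M0000000000000000", NULL };
    unsigned int h = lockid_hash(r.name, strlen(r.name));

    buckets[h % NBUCKETS] = &r;
    printf("found: %s\n", lookup(r.name, h) ? "yes" : "no");
    return 0;
}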
@@ -1481,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1481 | struct dlm_lock *newlock = NULL; | 1658 | struct dlm_lock *newlock = NULL; |
1482 | struct dlm_lockstatus *lksb = NULL; | 1659 | struct dlm_lockstatus *lksb = NULL; |
1483 | int ret = 0; | 1660 | int ret = 0; |
1484 | int i; | 1661 | int i, bad; |
1485 | struct list_head *iter; | 1662 | struct list_head *iter; |
1486 | struct dlm_lock *lock = NULL; | 1663 | struct dlm_lock *lock = NULL; |
1487 | 1664 | ||
@@ -1550,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1550 | } | 1727 | } |
1551 | lksb->flags |= (ml->flags & | 1728 | lksb->flags |= (ml->flags & |
1552 | (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | 1729 | (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); |
1553 | 1730 | ||
1554 | if (mres->lvb[0]) { | 1731 | if (ml->type == LKM_NLMODE) |
1732 | goto skip_lvb; | ||
1733 | |||
1734 | if (!dlm_lvb_is_empty(mres->lvb)) { | ||
1555 | if (lksb->flags & DLM_LKSB_PUT_LVB) { | 1735 | if (lksb->flags & DLM_LKSB_PUT_LVB) { |
1556 | /* other node was trying to update | 1736 | /* other node was trying to update |
1557 | * lvb when node died. recreate the | 1737 | * lvb when node died. recreate the |
1558 | * lksb with the updated lvb. */ | 1738 | * lksb with the updated lvb. */ |
1559 | memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); | 1739 | memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); |
1740 | /* the lock resource lvb update must happen | ||
1741 | * NOW, before the spinlock is dropped. | ||
1742 | * we no longer wait for the AST to update | ||
1743 | * the lvb. */ | ||
1744 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | ||
1560 | } else { | 1745 | } else { |
1561 | /* otherwise, the node is sending its | 1746 | /* otherwise, the node is sending its |
1562 | * most recent valid lvb info */ | 1747 | * most recent valid lvb info */ |
1563 | BUG_ON(ml->type != LKM_EXMODE && | 1748 | BUG_ON(ml->type != LKM_EXMODE && |
1564 | ml->type != LKM_PRMODE); | 1749 | ml->type != LKM_PRMODE); |
1565 | if (res->lvb[0] && (ml->type == LKM_EXMODE || | 1750 | if (!dlm_lvb_is_empty(res->lvb) && |
1566 | memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { | 1751 | (ml->type == LKM_EXMODE || |
1567 | mlog(ML_ERROR, "received bad lvb!\n"); | 1752 | memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { |
1568 | __dlm_print_one_lock_resource(res); | 1753 | int i; |
1569 | BUG(); | 1754 | mlog(ML_ERROR, "%s:%.*s: received bad " |
1755 | "lvb! type=%d\n", dlm->name, | ||
1756 | res->lockname.len, | ||
1757 | res->lockname.name, ml->type); | ||
1758 | printk("lockres lvb=["); | ||
1759 | for (i=0; i<DLM_LVB_LEN; i++) | ||
1760 | printk("%02x", res->lvb[i]); | ||
1761 | printk("]\nmigrated lvb=["); | ||
1762 | for (i=0; i<DLM_LVB_LEN; i++) | ||
1763 | printk("%02x", mres->lvb[i]); | ||
1764 | printk("]\n"); | ||
1765 | dlm_print_one_lock_resource(res); | ||
1766 | BUG(); | ||
1570 | } | 1767 | } |
1571 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | 1768 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); |
1572 | } | 1769 | } |
1573 | } | 1770 | } |
1574 | 1771 | skip_lvb: | |
1575 | 1772 | ||
1576 | /* NOTE: | 1773 | /* NOTE: |
1577 | * wrt lock queue ordering and recovery: | 1774 | * wrt lock queue ordering and recovery: |
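
Three LVB behaviors change above: NLMODE locks skip LVB handling entirely, a PUT_LVB from the dead node is copied into the lockres immediately under the spinlock rather than waiting for an AST, and a mismatch against a non-empty local LVB now dumps both values before BUG()ing. A sketch of the mismatch check and hex dump, assuming 64-byte LVBs as with DLM_LVB_LEN (hypothetical names):

#include <stdio.h>
#include <string.h>

#define LVB_LEN 64

static int lvb_is_empty(const unsigned char *lvb)
{
    static const unsigned char zero[LVB_LEN];
    return memcmp(lvb, zero, LVB_LEN) == 0;
}

static void dump(const char *tag, const unsigned char *lvb)
{
    int i;
    printf("%s lvb=[", tag);
    for (i = 0; i < LVB_LEN; i++)
        printf("%02x", lvb[i]);
    printf("]\n");
}

/* returns 0 if the migrated LVB may overwrite the local one;
 * a non-empty local LVB must match, and an EX-mode sender
 * implies the local copy should have been empty */
static int check_lvb(int exmode, const unsigned char *local,
                     const unsigned char *mig)
{
    if (!lvb_is_empty(local) &&
        (exmode || memcmp(local, mig, LVB_LEN))) {
        dump("lockres ", local);
        dump("migrated", mig);
        return -1;               /* the kernel BUG()s here */
    }
    return 0;
}

int main(void)
{
    unsigned char a[LVB_LEN] = { 0 }, b[LVB_LEN] = { 0 };

    printf("empty local -> %d\n", check_lvb(0, a, b));
    a[0] = 1;
    b[0] = 1;
    printf("matching PR -> %d\n", check_lvb(0, a, b));
    b[0] = 2;
    printf("mismatch    -> %d\n", check_lvb(0, a, b));
    return 0;
}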
@@ -1589,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1589 | * relative to each other, but clearly *not* | 1786 | * relative to each other, but clearly *not* |
1590 | * preserved relative to locks from other nodes. | 1787 | * preserved relative to locks from other nodes. |
1591 | */ | 1788 | */ |
1789 | bad = 0; | ||
1592 | spin_lock(&res->spinlock); | 1790 | spin_lock(&res->spinlock); |
1593 | dlm_lock_get(newlock); | 1791 | list_for_each_entry(lock, queue, list) { |
1594 | list_add_tail(&newlock->list, queue); | 1792 | if (lock->ml.cookie == ml->cookie) { |
1793 | u64 c = lock->ml.cookie; | ||
1794 | mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " | ||
1795 | "exists on this lockres!\n", dlm->name, | ||
1796 | res->lockname.len, res->lockname.name, | ||
1797 | dlm_get_lock_cookie_node(c), | ||
1798 | dlm_get_lock_cookie_seq(c)); | ||
1799 | |||
1800 | mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " | ||
1801 | "node=%u, cookie=%u:%llu, queue=%d\n", | ||
1802 | ml->type, ml->convert_type, ml->node, | ||
1803 | dlm_get_lock_cookie_node(ml->cookie), | ||
1804 | dlm_get_lock_cookie_seq(ml->cookie), | ||
1805 | ml->list); | ||
1806 | |||
1807 | __dlm_print_one_lock_resource(res); | ||
1808 | bad = 1; | ||
1809 | break; | ||
1810 | } | ||
1811 | } | ||
1812 | if (!bad) { | ||
1813 | dlm_lock_get(newlock); | ||
1814 | list_add_tail(&newlock->list, queue); | ||
1815 | } | ||
1595 | spin_unlock(&res->spinlock); | 1816 | spin_unlock(&res->spinlock); |
1596 | } | 1817 | } |
1597 | mlog(0, "done running all the locks\n"); | 1818 | mlog(0, "done running all the locks\n"); |
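
Rather than blindly list_add_tail()ing each migrated lock, the receiver now scans the target queue and rejects, with diagnostics, any lock whose cookie is already present, since a node:sequence cookie must be unique on a lockres. A compact sketch of that guard, assuming the cookie packs the node number into its high bits as the kernel's helpers suggest (hypothetical names):

#include <stdio.h>

#define MAX_LOCKS 8

struct lock { unsigned long long cookie; };

struct queue {
    struct lock *locks[MAX_LOCKS];
    int n;
};

static unsigned cookie_node(unsigned long long c)
{
    return (unsigned)(c >> 56);
}

static unsigned long long cookie_seq(unsigned long long c)
{
    return c & ((1ULL << 56) - 1);
}

/* returns 0 on success, -1 if the cookie is already queued */
static int add_unless_dup(struct queue *q, struct lock *nl)
{
    int i;
    for (i = 0; i < q->n; i++) {
        if (q->locks[i]->cookie == nl->cookie) {
            fprintf(stderr, "lock %u:%llu already exists!\n",
                    cookie_node(nl->cookie), cookie_seq(nl->cookie));
            return -1;           /* kernel logs and skips the add */
        }
    }
    q->locks[q->n++] = nl;
    return 0;
}

int main(void)
{
    struct queue q = { {0}, 0 };
    struct lock a = { (2ULL << 56) | 7 };

    printf("first add  -> %d\n", add_unless_dup(&q, &a));
    printf("second add -> %d\n", add_unless_dup(&q, &a));
    return 0;
}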
@@ -1615,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
1615 | struct dlm_lock *lock; | 1836 | struct dlm_lock *lock; |
1616 | 1837 | ||
1617 | res->state |= DLM_LOCK_RES_RECOVERING; | 1838 | res->state |= DLM_LOCK_RES_RECOVERING; |
1618 | if (!list_empty(&res->recovering)) | 1839 | if (!list_empty(&res->recovering)) { |
1840 | mlog(0, | ||
1841 | "Recovering res %s:%.*s, is already on recovery list!\n", | ||
1842 | dlm->name, res->lockname.len, res->lockname.name); | ||
1619 | list_del_init(&res->recovering); | 1843 | list_del_init(&res->recovering); |
1844 | } | ||
1845 | /* We need to hold a reference while on the recovery list */ | ||
1846 | dlm_lockres_get(res); | ||
1620 | list_add_tail(&res->recovering, &dlm->reco.resources); | 1847 | list_add_tail(&res->recovering, &dlm->reco.resources); |
1621 | 1848 | ||
1622 | /* find any pending locks and put them back on proper list */ | 1849 | /* find any pending locks and put them back on proper list */ |
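
The recovery list now owns a reference: dlm_lockres_get() precedes the list_add_tail(), and dlm_finish_local_lockres_recovery() drops it when the entry comes off the list, so a lockres cannot be freed while queued. A minimal refcount-follows-list-membership sketch (hypothetical names):

#include <stdio.h>
#include <stdlib.h>

struct res {
    int refs;
    int on_list;
};

static void res_get(struct res *r) { r->refs++; }

static void res_put(struct res *r)
{
    if (--r->refs == 0) {
        printf("freeing res\n");
        free(r);
    }
}

/* taking a list slot takes a reference... */
static void list_add_ref(struct res *r)
{
    res_get(r);
    r->on_list = 1;
}

/* ...and removing the slot drops it, possibly freeing */
static void list_del_ref(struct res *r)
{
    r->on_list = 0;
    res_put(r);
}

int main(void)
{
    struct res *r = malloc(sizeof(*r));

    r->refs = 1;                 /* caller's reference */
    r->on_list = 0;
    list_add_ref(r);
    res_put(r);                  /* caller done; list still holds it */
    list_del_ref(r);             /* last put frees */
    return 0;
}

The dirty_list hunks in dlmthread.c further down apply the same rule to another list.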
@@ -1705,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
1705 | spin_lock(&res->spinlock); | 1932 | spin_lock(&res->spinlock); |
1706 | dlm_change_lockres_owner(dlm, res, new_master); | 1933 | dlm_change_lockres_owner(dlm, res, new_master); |
1707 | res->state &= ~DLM_LOCK_RES_RECOVERING; | 1934 | res->state &= ~DLM_LOCK_RES_RECOVERING; |
1708 | __dlm_dirty_lockres(dlm, res); | 1935 | if (!__dlm_lockres_unused(res)) |
1936 | __dlm_dirty_lockres(dlm, res); | ||
1709 | spin_unlock(&res->spinlock); | 1937 | spin_unlock(&res->spinlock); |
1710 | wake_up(&res->wq); | 1938 | wake_up(&res->wq); |
1939 | dlm_lockres_put(res); | ||
1711 | } | 1940 | } |
1712 | } | 1941 | } |
1713 | 1942 | ||
@@ -1716,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
1716 | * the RECOVERING state and set the owner | 1945 | * the RECOVERING state and set the owner |
1717 | * if necessary */ | 1946 | * if necessary */ |
1718 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 1947 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
1719 | bucket = &(dlm->lockres_hash[i]); | 1948 | bucket = dlm_lockres_hash(dlm, i); |
1720 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { | 1949 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { |
1721 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 1950 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
1722 | if (res->owner == dead_node) { | 1951 | if (res->owner == dead_node) { |
@@ -1740,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
1740 | dlm->name, res->lockname.len, | 1969 | dlm->name, res->lockname.len, |
1741 | res->lockname.name, res->owner); | 1970 | res->lockname.name, res->owner); |
1742 | list_del_init(&res->recovering); | 1971 | list_del_init(&res->recovering); |
1972 | dlm_lockres_put(res); | ||
1743 | } | 1973 | } |
1744 | spin_lock(&res->spinlock); | 1974 | spin_lock(&res->spinlock); |
1745 | dlm_change_lockres_owner(dlm, res, new_master); | 1975 | dlm_change_lockres_owner(dlm, res, new_master); |
1746 | res->state &= ~DLM_LOCK_RES_RECOVERING; | 1976 | res->state &= ~DLM_LOCK_RES_RECOVERING; |
1747 | __dlm_dirty_lockres(dlm, res); | 1977 | if (!__dlm_lockres_unused(res)) |
1978 | __dlm_dirty_lockres(dlm, res); | ||
1748 | spin_unlock(&res->spinlock); | 1979 | spin_unlock(&res->spinlock); |
1749 | wake_up(&res->wq); | 1980 | wake_up(&res->wq); |
1750 | } | 1981 | } |
@@ -1881,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | |||
1881 | * need to be fired as a result. | 2112 | * need to be fired as a result. |
1882 | */ | 2113 | */ |
1883 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 2114 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
1884 | bucket = &(dlm->lockres_hash[i]); | 2115 | bucket = dlm_lockres_hash(dlm, i); |
1885 | hlist_for_each_entry(res, iter, bucket, hash_node) { | 2116 | hlist_for_each_entry(res, iter, bucket, hash_node) { |
1886 | /* always prune any $RECOVERY entries for dead nodes, | 2117 | /* always prune any $RECOVERY entries for dead nodes, |
1887 | * otherwise hangs can occur during later recovery */ | 2118 | * otherwise hangs can occur during later recovery */ |
@@ -1921,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) | |||
1921 | { | 2152 | { |
1922 | assert_spin_locked(&dlm->spinlock); | 2153 | assert_spin_locked(&dlm->spinlock); |
1923 | 2154 | ||
2155 | if (dlm->reco.new_master == idx) { | ||
2156 | mlog(0, "%s: recovery master %d just died\n", | ||
2157 | dlm->name, idx); | ||
2158 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { | ||
2159 | /* finalize1 was reached, so it is safe to clear | ||
2160 | * the new_master and dead_node. that recovery | ||
2161 | * is complete. */ | ||
2162 | mlog(0, "%s: dead master %d had reached " | ||
2163 | "finalize1 state, clearing\n", dlm->name, idx); | ||
2164 | dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; | ||
2165 | __dlm_reset_recovery(dlm); | ||
2166 | } | ||
2167 | } | ||
2168 | |||
1924 | /* check to see if the node is already considered dead */ | 2169 | /* check to see if the node is already considered dead */ |
1925 | if (!test_bit(idx, dlm->live_nodes_map)) { | 2170 | if (!test_bit(idx, dlm->live_nodes_map)) { |
1926 | mlog(0, "for domain %s, node %d is already dead. " | 2171 | mlog(0, "for domain %s, node %d is already dead. " |
@@ -2084,7 +2329,7 @@ again: | |||
2084 | 2329 | ||
2085 | /* set the new_master to this node */ | 2330 | /* set the new_master to this node */ |
2086 | spin_lock(&dlm->spinlock); | 2331 | spin_lock(&dlm->spinlock); |
2087 | dlm->reco.new_master = dlm->node_num; | 2332 | dlm_set_reco_master(dlm, dlm->node_num); |
2088 | spin_unlock(&dlm->spinlock); | 2333 | spin_unlock(&dlm->spinlock); |
2089 | } | 2334 | } |
2090 | 2335 | ||
@@ -2122,6 +2367,10 @@ again: | |||
2122 | mlog(0, "%s: reco master %u is ready to recover %u\n", | 2367 | mlog(0, "%s: reco master %u is ready to recover %u\n", |
2123 | dlm->name, dlm->reco.new_master, dlm->reco.dead_node); | 2368 | dlm->name, dlm->reco.new_master, dlm->reco.dead_node); |
2124 | status = -EEXIST; | 2369 | status = -EEXIST; |
2370 | } else if (ret == DLM_RECOVERING) { | ||
2371 | mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", | ||
2372 | dlm->name, dlm->node_num); | ||
2373 | goto again; | ||
2125 | } else { | 2374 | } else { |
2126 | struct dlm_lock_resource *res; | 2375 | struct dlm_lock_resource *res; |
2127 | 2376 | ||
@@ -2153,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | |||
2153 | 2402 | ||
2154 | mlog_entry("%u\n", dead_node); | 2403 | mlog_entry("%u\n", dead_node); |
2155 | 2404 | ||
2156 | mlog(0, "dead node is %u\n", dead_node); | 2405 | mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); |
2157 | 2406 | ||
2158 | spin_lock(&dlm->spinlock); | 2407 | spin_lock(&dlm->spinlock); |
2159 | dlm_node_iter_init(dlm->domain_map, &iter); | 2408 | dlm_node_iter_init(dlm->domain_map, &iter); |
@@ -2211,6 +2460,14 @@ retry: | |||
2211 | * another ENOMEM */ | 2460 | * another ENOMEM */ |
2212 | msleep(100); | 2461 | msleep(100); |
2213 | goto retry; | 2462 | goto retry; |
2463 | } else if (ret == EAGAIN) { | ||
2464 | mlog(0, "%s: trying to start recovery of node " | ||
2465 | "%u, but node %u is waiting for last recovery " | ||
2466 | "to complete, backoff for a bit\n", dlm->name, | ||
2467 | dead_node, nodenum); | ||
2468 | /* TODO Look into replacing msleep with cond_resched() */ | ||
2469 | msleep(100); | ||
2470 | goto retry; | ||
2214 | } | 2471 | } |
2215 | } | 2472 | } |
2216 | 2473 | ||
@@ -2226,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2226 | if (!dlm_grab(dlm)) | 2483 | if (!dlm_grab(dlm)) |
2227 | return 0; | 2484 | return 0; |
2228 | 2485 | ||
2229 | mlog(0, "node %u wants to recover node %u\n", | 2486 | spin_lock(&dlm->spinlock); |
2230 | br->node_idx, br->dead_node); | 2487 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { |
2488 | mlog(0, "%s: node %u wants to recover node %u (%u:%u) " | ||
2489 | "but this node is in finalize state, waiting on finalize2\n", | ||
2490 | dlm->name, br->node_idx, br->dead_node, | ||
2491 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2492 | spin_unlock(&dlm->spinlock); | ||
2493 | return EAGAIN; | ||
2494 | } | ||
2495 | spin_unlock(&dlm->spinlock); | ||
2496 | |||
2497 | mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", | ||
2498 | dlm->name, br->node_idx, br->dead_node, | ||
2499 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2231 | 2500 | ||
2232 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); | 2501 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); |
2233 | 2502 | ||
@@ -2249,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2249 | "node %u changing it to %u\n", dlm->name, | 2518 | "node %u changing it to %u\n", dlm->name, |
2250 | dlm->reco.dead_node, br->node_idx, br->dead_node); | 2519 | dlm->reco.dead_node, br->node_idx, br->dead_node); |
2251 | } | 2520 | } |
2252 | dlm->reco.new_master = br->node_idx; | 2521 | dlm_set_reco_master(dlm, br->node_idx); |
2253 | dlm->reco.dead_node = br->dead_node; | 2522 | dlm_set_reco_dead_node(dlm, br->dead_node); |
2254 | if (!test_bit(br->dead_node, dlm->recovery_map)) { | 2523 | if (!test_bit(br->dead_node, dlm->recovery_map)) { |
2255 | mlog(0, "recovery master %u sees %u as dead, but this " | 2524 | mlog(0, "recovery master %u sees %u as dead, but this " |
2256 | "node has not yet. marking %u as dead\n", | 2525 | "node has not yet. marking %u as dead\n", |
@@ -2269,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2269 | spin_unlock(&dlm->spinlock); | 2538 | spin_unlock(&dlm->spinlock); |
2270 | 2539 | ||
2271 | dlm_kick_recovery_thread(dlm); | 2540 | dlm_kick_recovery_thread(dlm); |
2541 | |||
2542 | mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", | ||
2543 | dlm->name, br->node_idx, br->dead_node, | ||
2544 | dlm->reco.dead_node, dlm->reco.new_master); | ||
2545 | |||
2272 | dlm_put(dlm); | 2546 | dlm_put(dlm); |
2273 | return 0; | 2547 | return 0; |
2274 | } | 2548 | } |
2275 | 2549 | ||
2550 | #define DLM_FINALIZE_STAGE2 0x01 | ||
2276 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | 2551 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) |
2277 | { | 2552 | { |
2278 | int ret = 0; | 2553 | int ret = 0; |
@@ -2280,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | |||
2280 | struct dlm_node_iter iter; | 2555 | struct dlm_node_iter iter; |
2281 | int nodenum; | 2556 | int nodenum; |
2282 | int status; | 2557 | int status; |
2558 | int stage = 1; | ||
2283 | 2559 | ||
2284 | mlog(0, "finishing recovery for node %s:%u\n", | 2560 | mlog(0, "finishing recovery for node %s:%u, " |
2285 | dlm->name, dlm->reco.dead_node); | 2561 | "stage %d\n", dlm->name, dlm->reco.dead_node, stage); |
2286 | 2562 | ||
2287 | spin_lock(&dlm->spinlock); | 2563 | spin_lock(&dlm->spinlock); |
2288 | dlm_node_iter_init(dlm->domain_map, &iter); | 2564 | dlm_node_iter_init(dlm->domain_map, &iter); |
2289 | spin_unlock(&dlm->spinlock); | 2565 | spin_unlock(&dlm->spinlock); |
2290 | 2566 | ||
2567 | stage2: | ||
2291 | memset(&fr, 0, sizeof(fr)); | 2568 | memset(&fr, 0, sizeof(fr)); |
2292 | fr.node_idx = dlm->node_num; | 2569 | fr.node_idx = dlm->node_num; |
2293 | fr.dead_node = dlm->reco.dead_node; | 2570 | fr.dead_node = dlm->reco.dead_node; |
2571 | if (stage == 2) | ||
2572 | fr.flags |= DLM_FINALIZE_STAGE2; | ||
2294 | 2573 | ||
2295 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 2574 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
2296 | if (nodenum == dlm->node_num) | 2575 | if (nodenum == dlm->node_num) |
2297 | continue; | 2576 | continue; |
2298 | ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, | 2577 | ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, |
2299 | &fr, sizeof(fr), nodenum, &status); | 2578 | &fr, sizeof(fr), nodenum, &status); |
2300 | if (ret >= 0) { | 2579 | if (ret >= 0) |
2301 | ret = status; | 2580 | ret = status; |
2581 | if (ret < 0) { | ||
2582 | mlog_errno(ret); | ||
2302 | if (dlm_is_host_down(ret)) { | 2583 | if (dlm_is_host_down(ret)) { |
2303 | /* this has no effect on this recovery | 2584 | /* this has no effect on this recovery |
2304 | * session, so set the status to zero to | 2585 | * session, so set the status to zero to |
@@ -2306,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | |||
2306 | mlog(ML_ERROR, "node %u went down after this " | 2587 | mlog(ML_ERROR, "node %u went down after this " |
2307 | "node finished recovery.\n", nodenum); | 2588 | "node finished recovery.\n", nodenum); |
2308 | ret = 0; | 2589 | ret = 0; |
2590 | continue; | ||
2309 | } | 2591 | } |
2310 | } | ||
2311 | if (ret < 0) { | ||
2312 | mlog_errno(ret); | ||
2313 | break; | 2592 | break; |
2314 | } | 2593 | } |
2315 | } | 2594 | } |
2595 | if (stage == 1) { | ||
2596 | /* reset the node_iter back to the top and send finalize2 */ | ||
2597 | iter.curnode = -1; | ||
2598 | stage = 2; | ||
2599 | goto stage2; | ||
2600 | } | ||
2316 | 2601 | ||
2317 | return ret; | 2602 | return ret; |
2318 | } | 2603 | } |
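
Finalize is now a two-pass broadcast: after the first loop the node iterator is rewound (iter.curnode = -1) and the message is re-sent with DLM_FINALIZE_STAGE2 set, so every node parks in a finalize1 state before any node fully resets. A sketch of the two-pass send loop (hypothetical names, not the kernel code):

#include <stdio.h>

#define STAGE2_FLAG 0x01
#define NODES 4

struct iter { int cur; };

static int iter_next(struct iter *it)
{
    return ++it->cur < NODES ? it->cur : -1;
}

static void send_finalize(int node, int flags)
{
    printf("-> node %d: finalize%d\n", node,
           (flags & STAGE2_FLAG) ? 2 : 1);
}

int main(void)
{
    struct iter it = { -1 };
    int self = 0, stage = 1, flags = 0, n;

again:
    if (stage == 2)
        flags |= STAGE2_FLAG;
    while ((n = iter_next(&it)) >= 0) {
        if (n == self)
            continue;            /* skip ourselves, as the kernel does */
        send_finalize(n, flags);
    }
    if (stage == 1) {
        it.cur = -1;             /* rewind the iterator for pass two */
        stage = 2;
        goto again;
    }
    return 0;
}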
@@ -2321,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2321 | { | 2606 | { |
2322 | struct dlm_ctxt *dlm = data; | 2607 | struct dlm_ctxt *dlm = data; |
2323 | struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; | 2608 | struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; |
2609 | int stage = 1; | ||
2324 | 2610 | ||
2325 | /* ok to return 0, domain has gone away */ | 2611 | /* ok to return 0, domain has gone away */ |
2326 | if (!dlm_grab(dlm)) | 2612 | if (!dlm_grab(dlm)) |
2327 | return 0; | 2613 | return 0; |
2328 | 2614 | ||
2329 | mlog(0, "node %u finalizing recovery of node %u\n", | 2615 | if (fr->flags & DLM_FINALIZE_STAGE2) |
2330 | fr->node_idx, fr->dead_node); | 2616 | stage = 2; |
2331 | 2617 | ||
2618 | mlog(0, "%s: node %u finalizing recovery stage%d of " | ||
2619 | "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, | ||
2620 | fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); | ||
2621 | |||
2332 | spin_lock(&dlm->spinlock); | 2622 | spin_lock(&dlm->spinlock); |
2333 | 2623 | ||
2334 | if (dlm->reco.new_master != fr->node_idx) { | 2624 | if (dlm->reco.new_master != fr->node_idx) { |
@@ -2344,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2344 | BUG(); | 2634 | BUG(); |
2345 | } | 2635 | } |
2346 | 2636 | ||
2347 | dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); | 2637 | switch (stage) { |
2348 | 2638 | case 1: | |
2349 | spin_unlock(&dlm->spinlock); | 2639 | dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); |
2640 | if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { | ||
2641 | mlog(ML_ERROR, "%s: received finalize1 from " | ||
2642 | "new master %u for dead node %u, but " | ||
2643 | "this node has already received it!\n", | ||
2644 | dlm->name, fr->node_idx, fr->dead_node); | ||
2645 | dlm_print_reco_node_status(dlm); | ||
2646 | BUG(); | ||
2647 | } | ||
2648 | dlm->reco.state |= DLM_RECO_STATE_FINALIZE; | ||
2649 | spin_unlock(&dlm->spinlock); | ||
2650 | break; | ||
2651 | case 2: | ||
2652 | if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { | ||
2653 | mlog(ML_ERROR, "%s: received finalize2 from " | ||
2654 | "new master %u for dead node %u, but " | ||
2655 | "this node did not have finalize1!\n", | ||
2656 | dlm->name, fr->node_idx, fr->dead_node); | ||
2657 | dlm_print_reco_node_status(dlm); | ||
2658 | BUG(); | ||
2659 | } | ||
2660 | dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; | ||
2661 | spin_unlock(&dlm->spinlock); | ||
2662 | dlm_reset_recovery(dlm); | ||
2663 | dlm_kick_recovery_thread(dlm); | ||
2664 | break; | ||
2665 | default: | ||
2666 | BUG(); | ||
2667 | } | ||
2350 | 2668 | ||
2351 | dlm_reset_recovery(dlm); | 2669 | mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", |
2670 | dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); | ||
2352 | 2671 | ||
2353 | dlm_kick_recovery_thread(dlm); | ||
2354 | dlm_put(dlm); | 2672 | dlm_put(dlm); |
2355 | return 0; | 2673 | return 0; |
2356 | } | 2674 | } |
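
On the receiving side this forms a small state machine: finalize1 sets DLM_RECO_STATE_FINALIZE, finalize2 clears it and resets recovery, and each stage BUG()s if it arrives out of order; the __dlm_hb_node_down() hunk earlier uses the same flag to decide whether a dead recovery master's session can safely be reset. A toy model (hypothetical names):

#include <stdio.h>
#include <stdlib.h>

#define RECO_FINALIZE 0x02

static int reco_state;

static void handle_finalize(int stage)
{
    if (stage == 1) {
        if (reco_state & RECO_FINALIZE)
            abort();             /* duplicate finalize1: kernel BUG()s */
        reco_state |= RECO_FINALIZE;
        printf("finalize1: parked, waiting for finalize2\n");
    } else if (stage == 2) {
        if (!(reco_state & RECO_FINALIZE))
            abort();             /* finalize2 without finalize1 */
        reco_state &= ~RECO_FINALIZE;
        printf("finalize2: recovery session reset\n");
    } else {
        abort();
    }
}

/* if the new master dies after finalize1, that recovery is
 * complete enough that the session may be cleared */
static void master_died(void)
{
    if (reco_state & RECO_FINALIZE) {
        reco_state &= ~RECO_FINALIZE;
        printf("dead master had reached finalize1, clearing\n");
    }
}

int main(void)
{
    handle_finalize(1);
    master_died();               /* simulate master death in between */
    return 0;
}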
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 44d3b57ae8a8..0c822f3ffb05 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/inet.h> | 39 | #include <linux/inet.h> |
40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
41 | #include <linux/kthread.h> | 41 | #include <linux/kthread.h> |
42 | #include <linux/delay.h> | ||
42 | 43 | ||
43 | 44 | ||
44 | #include "cluster/heartbeat.h" | 45 | #include "cluster/heartbeat.h" |
@@ -53,6 +54,8 @@ | |||
53 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
54 | 55 | ||
55 | static int dlm_thread(void *data); | 56 | static int dlm_thread(void *data); |
57 | static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, | ||
58 | struct dlm_lock_resource *lockres); | ||
56 | 59 | ||
57 | static void dlm_flush_asts(struct dlm_ctxt *dlm); | 60 | static void dlm_flush_asts(struct dlm_ctxt *dlm); |
58 | 61 | ||
@@ -80,7 +83,7 @@ repeat: | |||
80 | } | 83 | } |
81 | 84 | ||
82 | 85 | ||
83 | static int __dlm_lockres_unused(struct dlm_lock_resource *res) | 86 | int __dlm_lockres_unused(struct dlm_lock_resource *res) |
84 | { | 87 | { |
85 | if (list_empty(&res->granted) && | 88 | if (list_empty(&res->granted) && |
86 | list_empty(&res->converting) && | 89 | list_empty(&res->converting) && |
@@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | |||
103 | assert_spin_locked(&res->spinlock); | 106 | assert_spin_locked(&res->spinlock); |
104 | 107 | ||
105 | if (__dlm_lockres_unused(res)){ | 108 | if (__dlm_lockres_unused(res)){ |
109 | /* For now, just keep any resource we master */ | ||
110 | if (res->owner == dlm->node_num) | ||
111 | { | ||
112 | if (!list_empty(&res->purge)) { | ||
113 | mlog(0, "we master %s:%.*s, but it is on " | ||
114 | "the purge list. Removing\n", | ||
115 | dlm->name, res->lockname.len, | ||
116 | res->lockname.name); | ||
117 | list_del_init(&res->purge); | ||
118 | dlm->purge_count--; | ||
119 | } | ||
120 | return; | ||
121 | } | ||
122 | |||
106 | if (list_empty(&res->purge)) { | 123 | if (list_empty(&res->purge)) { |
107 | mlog(0, "putting lockres %.*s from purge list\n", | 124 | mlog(0, "putting lockres %.*s from purge list\n", |
108 | res->lockname.len, res->lockname.name); | 125 | res->lockname.len, res->lockname.name); |
@@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | |||
110 | res->last_used = jiffies; | 127 | res->last_used = jiffies; |
111 | list_add_tail(&res->purge, &dlm->purge_list); | 128 | list_add_tail(&res->purge, &dlm->purge_list); |
112 | dlm->purge_count++; | 129 | dlm->purge_count++; |
130 | |||
131 | /* if this node is not the owner, there is | ||
132 | * no way to keep track of who the owner could be. | ||
133 | * unhash it to avoid serious problems. */ | ||
134 | if (res->owner != dlm->node_num) { | ||
135 | mlog(0, "%s:%.*s: doing immediate " | ||
136 | "purge of lockres owned by %u\n", | ||
137 | dlm->name, res->lockname.len, | ||
138 | res->lockname.name, res->owner); | ||
139 | |||
140 | dlm_purge_lockres_now(dlm, res); | ||
141 | } | ||
113 | } | 142 | } |
114 | } else if (!list_empty(&res->purge)) { | 143 | } else if (!list_empty(&res->purge)) { |
115 | mlog(0, "removing lockres %.*s from purge list\n", | 144 | mlog(0, "removing lockres %.*s from purge list, " |
116 | res->lockname.len, res->lockname.name); | 145 | "owner=%u\n", res->lockname.len, res->lockname.name, |
146 | res->owner); | ||
117 | 147 | ||
118 | list_del_init(&res->purge); | 148 | list_del_init(&res->purge); |
119 | dlm->purge_count--; | 149 | dlm->purge_count--; |
@@ -165,6 +195,7 @@ again: | |||
165 | } else if (ret < 0) { | 195 | } else if (ret < 0) { |
166 | mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", | 196 | mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", |
167 | lockres->lockname.len, lockres->lockname.name); | 197 | lockres->lockname.len, lockres->lockname.name); |
198 | msleep(100); | ||
168 | goto again; | 199 | goto again; |
169 | } | 200 | } |
170 | 201 | ||
@@ -178,6 +209,24 @@ finish: | |||
178 | __dlm_unhash_lockres(lockres); | 209 | __dlm_unhash_lockres(lockres); |
179 | } | 210 | } |
180 | 211 | ||
212 | /* make an unused lockres go away immediately. | ||
213 | * as soon as the dlm spinlock is dropped, this lockres | ||
214 | * will not be found. kfree still happens on last put. */ | ||
215 | static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, | ||
216 | struct dlm_lock_resource *lockres) | ||
217 | { | ||
218 | assert_spin_locked(&dlm->spinlock); | ||
219 | assert_spin_locked(&lockres->spinlock); | ||
220 | |||
221 | BUG_ON(!__dlm_lockres_unused(lockres)); | ||
222 | |||
223 | if (!list_empty(&lockres->purge)) { | ||
224 | list_del_init(&lockres->purge); | ||
225 | dlm->purge_count--; | ||
226 | } | ||
227 | __dlm_unhash_lockres(lockres); | ||
228 | } | ||
229 | |||
181 | static void dlm_run_purge_list(struct dlm_ctxt *dlm, | 230 | static void dlm_run_purge_list(struct dlm_ctxt *dlm, |
182 | int purge_now) | 231 | int purge_now) |
183 | { | 232 | { |
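
The purge changes amount to a policy split: an unused lockres mastered locally is kept and pulled off the purge list, while an unused lockres mastered elsewhere is unhashed at once via dlm_purge_lockres_now(), because nothing can track who its owner becomes later. A compact sketch of that decision (hypothetical names; the kfree still happens on the last reference put):

#include <stdio.h>

struct res {
    int owner;
    int unused;
    int on_purge_list;
    int hashed;
};

static void calc_usage(struct res *r, int self)
{
    if (!r->unused) {
        r->on_purge_list = 0;    /* busy again: leave the purge list */
        return;
    }
    if (r->owner == self) {
        r->on_purge_list = 0;    /* keep anything we master, for now */
        return;
    }
    /* non-local owner: purge right away rather than track it */
    r->on_purge_list = 0;
    r->hashed = 0;               /* unhash; freed on last ref put */
    printf("immediate purge of lockres owned by %d\n", r->owner);
}

int main(void)
{
    struct res mine   = { 0, 1, 1, 1 };
    struct res remote = { 3, 1, 1, 1 };

    calc_usage(&mine, 0);
    calc_usage(&remote, 0);
    printf("mine hashed=%d, remote hashed=%d\n",
           mine.hashed, remote.hashed);
    return 0;
}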
@@ -420,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
420 | /* don't shuffle secondary queues */ | 469 | /* don't shuffle secondary queues */ |
421 | if ((res->owner == dlm->node_num) && | 470 | if ((res->owner == dlm->node_num) && |
422 | !(res->state & DLM_LOCK_RES_DIRTY)) { | 471 | !(res->state & DLM_LOCK_RES_DIRTY)) { |
472 | /* ref for dirty_list */ | ||
473 | dlm_lockres_get(res); | ||
423 | list_add_tail(&res->dirty, &dlm->dirty_list); | 474 | list_add_tail(&res->dirty, &dlm->dirty_list); |
424 | res->state |= DLM_LOCK_RES_DIRTY; | 475 | res->state |= DLM_LOCK_RES_DIRTY; |
425 | } | 476 | } |
@@ -604,6 +655,8 @@ static int dlm_thread(void *data) | |||
604 | list_del_init(&res->dirty); | 655 | list_del_init(&res->dirty); |
605 | spin_unlock(&res->spinlock); | 656 | spin_unlock(&res->spinlock); |
606 | spin_unlock(&dlm->spinlock); | 657 | spin_unlock(&dlm->spinlock); |
658 | /* Drop dirty_list ref */ | ||
659 | dlm_lockres_put(res); | ||
607 | 660 | ||
608 | /* lockres can be re-dirtied/re-added to the | 661 | /* lockres can be re-dirtied/re-added to the |
609 | * dirty_list in this gap, but that is ok */ | 662 | * dirty_list in this gap, but that is ok */ |
@@ -640,8 +693,9 @@ static int dlm_thread(void *data) | |||
640 | * spinlock and do NOT have the dlm lock. | 693 | * spinlock and do NOT have the dlm lock. |
641 | * safe to reserve/queue asts and run the lists. */ | 694 | * safe to reserve/queue asts and run the lists. */ |
642 | 695 | ||
643 | mlog(0, "calling dlm_shuffle_lists with dlm=%p, " | 696 | mlog(0, "calling dlm_shuffle_lists with dlm=%s, " |
644 | "res=%p\n", dlm, res); | 697 | "res=%.*s\n", dlm->name, |
698 | res->lockname.len, res->lockname.name); | ||
645 | 699 | ||
646 | /* called while holding lockres lock */ | 700 | /* called while holding lockres lock */ |
647 | dlm_shuffle_lists(dlm, res); | 701 | dlm_shuffle_lists(dlm, res); |
@@ -655,6 +709,8 @@ in_progress: | |||
655 | /* if the lock was in-progress, stick | 709 | /* if the lock was in-progress, stick |
656 | * it on the back of the list */ | 710 | * it on the back of the list */ |
657 | if (delay) { | 711 | if (delay) { |
712 | /* ref for dirty_list */ | ||
713 | dlm_lockres_get(res); | ||
658 | spin_lock(&res->spinlock); | 714 | spin_lock(&res->spinlock); |
659 | list_add_tail(&res->dirty, &dlm->dirty_list); | 715 | list_add_tail(&res->dirty, &dlm->dirty_list); |
660 | res->state |= DLM_LOCK_RES_DIRTY; | 716 | res->state |= DLM_LOCK_RES_DIRTY; |
@@ -675,7 +731,7 @@ in_progress: | |||
675 | 731 | ||
676 | /* yield and continue right away if there is more work to do */ | 732 | /* yield and continue right away if there is more work to do */ |
677 | if (!n) { | 733 | if (!n) { |
678 | yield(); | 734 | cond_resched(); |
679 | continue; | 735 | continue; |
680 | } | 736 | } |
681 | 737 | ||
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index ac89c509daf9..b0c3134f4f70 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c | |||
@@ -318,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, | |||
318 | 318 | ||
319 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); | 319 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); |
320 | 320 | ||
321 | if (owner == dlm->node_num) { | ||
322 | /* ended up trying to contact ourself. this means | ||
323 | * that the lockres had been remote but became local | ||
324 | * via a migration. just retry it, now as local */ | ||
325 | mlog(0, "%s:%.*s: this node became the master due to a " | ||
326 | "migration, re-evaluate now\n", dlm->name, | ||
327 | res->lockname.len, res->lockname.name); | ||
328 | return DLM_FORWARD; | ||
329 | } | ||
330 | |||
321 | memset(&unlock, 0, sizeof(unlock)); | 331 | memset(&unlock, 0, sizeof(unlock)); |
322 | unlock.node_idx = dlm->node_num; | 332 | unlock.node_idx = dlm->node_num; |
323 | unlock.flags = cpu_to_be32(flags); | 333 | unlock.flags = cpu_to_be32(flags); |
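
dlm_send_remote_unlock_request() can now discover that the "remote" owner is this node, which happens when the lockres migrated here after the caller looked up the owner; returning DLM_FORWARD tells the caller to re-drive the unlock through the local path. A sketch of that retry contract (hypothetical names):

#include <stdio.h>

enum unlock_status { ST_NORMAL, ST_FORWARD };

/* sketch of the ownership check at the top of the remote path */
static enum unlock_status remote_unlock(int owner, int self)
{
    if (owner == self) {
        /* lockres became local via migration: have caller retry */
        printf("became master via migration, re-evaluate locally\n");
        return ST_FORWARD;
    }
    printf("sending unlock to node %d\n", owner);
    return ST_NORMAL;
}

int main(void)
{
    int self = 2;

    if (remote_unlock(self, self) == ST_FORWARD)
        printf("retrying through the local unlock path\n");
    remote_unlock(1, self);      /* genuinely remote owner */
    return 0;
}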
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 74ca4e5f9765..e641b084b343 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c | |||
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name) | |||
672 | u32 dlm_key; | 672 | u32 dlm_key; |
673 | char *domain; | 673 | char *domain; |
674 | 674 | ||
675 | domain = kmalloc(name->len + 1, GFP_KERNEL); | 675 | domain = kmalloc(name->len + 1, GFP_NOFS); |
676 | if (!domain) { | 676 | if (!domain) { |
677 | mlog_errno(-ENOMEM); | 677 | mlog_errno(-ENOMEM); |
678 | return ERR_PTR(-ENOMEM); | 678 | return ERR_PTR(-ENOMEM); |