Diffstat (limited to 'fs/ocfs2/cluster/quorum.c')
-rw-r--r--   fs/ocfs2/cluster/quorum.c   315
1 file changed, 315 insertions, 0 deletions
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 000000000000..7bba98fbfc15
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
/* -*- mode: c; c-basic-offset: 8; -*-
 *
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2005 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

/* This quorum hack is only here until we transition to some more rational
 * approach that is driven from userspace. Honest. No foolin'.
 *
 * Imagine two nodes lose network connectivity to each other but they're still
 * up and operating in every other way. Presumably a network timeout indicates
 * that a node is broken and should be recovered. They can't both recover each
 * other and both carry on without serialising their access to the file system.
 * They need to decide who is authoritative. Now extend that problem to
 * arbitrary groups of nodes losing connectivity between each other.
 *
 * So we declare that a node which has given up on connecting to a majority
 * of nodes who are still heartbeating will fence itself.
 *
 * There are huge opportunities for races here. After we give up on a node's
 * connection we need to wait long enough to give heartbeat an opportunity
 * to declare the node as truly dead. We also need to be careful with the
 * race between when we see a node start heartbeating and when we connect
 * to it.
 *
 * So nodes that are in this transition put a hold on the quorum decision
 * with a counter. As they fall out of this transition they drop the count
 * and if they're the last, they fire off the decision.
 */
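
/* A typical transition, pieced together from the calls below: when a node
 * first shows up in heartbeat, o2quo_hb_up() takes a hold if we aren't yet
 * connected to it, and o2quo_conn_up() (or o2quo_hb_down()) later drops it.
 * When a connection is lost, o2quo_conn_err() takes a hold for a node that
 * is still heartbeating, and either o2quo_hb_down() or o2quo_hb_still_up()
 * drops it. Only when the last hold is dropped does o2quo_clear_hold()
 * schedule the pending o2quo_make_decision() work. */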
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#include "heartbeat.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_QUORUM
#include "masklog.h"
#include "quorum.h"

static struct o2quo_state {
        spinlock_t qs_lock;
        struct work_struct qs_work;
        int qs_pending;
        int qs_heartbeating;
        unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
        int qs_connected;
        unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
        int qs_holds;
        unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;
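
/* qs_hb_bm tracks the nodes currently seen by heartbeat, qs_conn_bm the
 * nodes we hold network connections to, and qs_hold_bm the nodes that are
 * mid-transition and so holding off a quorum decision. qs_heartbeating,
 * qs_connected and qs_holds mirror the population of those bitmaps, and
 * qs_pending records that qs_work should run once the holds drain. */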

/* this is horribly heavy-handed. It should instead flip the file
 * system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
        /* panic spins with interrupts enabled. with preempt
         * threads can still schedule, etc, etc */
        o2hb_stop_all_regions();
        panic("ocfs2 is very sorry to be fencing this system by panicing\n");
}

/* Indicate that a timeout occurred on a heartbeat region write. The
 * other nodes in the cluster may consider us dead at that time so we
 * want to "fence" ourselves so that we don't scribble on the disk
 * after they think they've recovered us. This can't solve all
 * problems related to writeout after recovery but this hack can at
 * least close some of those gaps. When we have real fencing, this can
 * go away as our node would be fenced externally before other nodes
 * begin recovery. */
void o2quo_disk_timeout(void)
{
        o2quo_fence_self();
}

static void o2quo_make_decision(void *arg)
{
        int quorum;
        int lowest_hb, lowest_reachable = 0, fence = 0;
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
        if (lowest_hb != O2NM_MAX_NODES)
                lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);

        mlog(0, "heartbeating: %d, connected: %d, "
             "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
             qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");

        if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
            qs->qs_heartbeating == 1)
                goto out;

        if (qs->qs_heartbeating & 1) {
                /* the odd numbered cluster case is straightforward --
                 * if we can't talk to the majority we're hosed */
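                /* e.g. with qs_heartbeating == 5 the quorum computed below
                 * is (5 + 1) / 2 == 3, so a node that can only reach two of
                 * the other heartbeating nodes fences itself. */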
                quorum = (qs->qs_heartbeating + 1)/2;
                if (qs->qs_connected < quorum) {
                        mlog(ML_ERROR, "fencing this node because it is "
                             "only connected to %u nodes and %u is needed "
                             "to make a quorum out of %u heartbeating nodes\n",
                             qs->qs_connected, quorum,
                             qs->qs_heartbeating);
                        fence = 1;
                }
        } else {
                /* the even numbered cluster adds the possibility of each half
                 * of the cluster being able to talk amongst themselves.. in
                 * that case we're hosed if we can't talk to the group that has
                 * the lowest numbered node */
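                /* e.g. with qs_heartbeating == 4 the quorum computed below
                 * is 4 / 2 == 2: fewer than two connections means we fence,
                 * and exactly two connections only keeps us going if the
                 * lowest numbered heartbeating node is reachable. */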
                quorum = qs->qs_heartbeating / 2;
                if (qs->qs_connected < quorum) {
                        mlog(ML_ERROR, "fencing this node because it is "
                             "only connected to %u nodes and %u is needed "
                             "to make a quorum out of %u heartbeating nodes\n",
                             qs->qs_connected, quorum,
                             qs->qs_heartbeating);
                        fence = 1;
                }
                else if ((qs->qs_connected == quorum) &&
                         !lowest_reachable) {
                        mlog(ML_ERROR, "fencing this node because it is "
                             "connected to a half-quorum of %u out of %u "
                             "nodes which doesn't include the lowest active "
                             "node %u\n", quorum, qs->qs_heartbeating,
                             lowest_hb);
                        fence = 1;
                }
        }

out:
        spin_unlock(&qs->qs_lock);
        if (fence)
                o2quo_fence_self();
}

static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
        assert_spin_locked(&qs->qs_lock);

        if (!test_and_set_bit(node, qs->qs_hold_bm)) {
                qs->qs_holds++;
                mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
                                "node %u\n", node);
                mlog(0, "node %u, %d total\n", node, qs->qs_holds);
        }
}

static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
        assert_spin_locked(&qs->qs_lock);

        if (test_and_clear_bit(node, qs->qs_hold_bm)) {
                mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
                if (--qs->qs_holds == 0) {
                        if (qs->qs_pending) {
                                qs->qs_pending = 0;
                                schedule_work(&qs->qs_work);
                        }
                }
                mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
                                node, qs->qs_holds);
        }
}

/* as a node comes up we delay the quorum decision until we know the fate of
 * the connection. the hold will be dropped in conn_up or hb_down. it might be
 * perpetuated by conn_err until hb_down. if we already have a conn, we might
 * be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        qs->qs_heartbeating++;
        mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
                        "node %u\n", node);
        mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
        set_bit(node, qs->qs_hb_bm);

        mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

        if (!test_bit(node, qs->qs_conn_bm))
                o2quo_set_hold(qs, node);
        else
                o2quo_clear_hold(qs, node);

        spin_unlock(&qs->qs_lock);
}

/* hb going down releases any holds we might have had due to this node from
 * conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        qs->qs_heartbeating--;
        mlog_bug_on_msg(qs->qs_heartbeating < 0,
                        "node %u, %d heartbeating\n",
                        node, qs->qs_heartbeating);
        mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
        clear_bit(node, qs->qs_hb_bm);

        mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

        o2quo_clear_hold(qs, node);

        spin_unlock(&qs->qs_lock);
}

/* this tells us that we've decided that the node is still heartbeating
 * even though we've lost its conn. it must only be called after conn_err
 * and indicates that we must now make a quorum decision in the future,
 * though we might be doing so after waiting for holds to drain. Here
 * we'll be dropping the hold from conn_err. */
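/* The qs_pending flag set below is consumed in o2quo_clear_hold(): the
 * decision work is only scheduled once qs_holds drains back to zero, which
 * may happen immediately if this was the last outstanding hold. */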
void o2quo_hb_still_up(u8 node)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        mlog(0, "node %u\n", node);

        qs->qs_pending = 1;
        o2quo_clear_hold(qs, node);

        spin_unlock(&qs->qs_lock);
}

/* This is analogous to hb_up. as a node's connection comes up we delay the
 * quorum decision until we see it heartbeating. the hold will be dropped in
 * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
 * it's already heartbeating we might be dropping a hold that hb_up got.
 */
void o2quo_conn_up(u8 node)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        qs->qs_connected++;
        mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
                        "node %u\n", node);
        mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
        set_bit(node, qs->qs_conn_bm);

        mlog(0, "node %u, %d total\n", node, qs->qs_connected);

        if (!test_bit(node, qs->qs_hb_bm))
                o2quo_set_hold(qs, node);
        else
                o2quo_clear_hold(qs, node);

        spin_unlock(&qs->qs_lock);
}

/* we've decided that we won't ever be connecting to the node again. if it's
 * still heartbeating we grab a hold that will delay decisions until either the
 * node stops heartbeating from hb_down or the caller decides that the node is
 * still up and calls still_up */
void o2quo_conn_err(u8 node)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock(&qs->qs_lock);

        if (test_bit(node, qs->qs_conn_bm)) {
                qs->qs_connected--;
                mlog_bug_on_msg(qs->qs_connected < 0,
                                "node %u, connected %d\n",
                                node, qs->qs_connected);

                clear_bit(node, qs->qs_conn_bm);
        }

        mlog(0, "node %u, %d total\n", node, qs->qs_connected);

        if (test_bit(node, qs->qs_hb_bm))
                o2quo_set_hold(qs, node);

        spin_unlock(&qs->qs_lock);
}

void o2quo_init(void)
{
        struct o2quo_state *qs = &o2quo_state;

        spin_lock_init(&qs->qs_lock);
        INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
}

void o2quo_exit(void)
{
        flush_scheduled_work();
}