aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorSunil Mushran <sunil.mushran@oracle.com>2011-07-24 13:33:54 -0400
committerSunil Mushran <sunil.mushran@oracle.com>2011-07-24 13:33:54 -0400
commit6b27f62fc750d85bc6fc3718b3b38ec60edc2d74 (patch)
tree8046b9a0b55ceb282a3b1d9afd35a4c4fb8e1727 /fs/ocfs2
parent3ba169ccec1c5ad0f678e04fd29b990197fdfe79 (diff)
ocfs2/cluster: Cluster up now includes network connections too
The cluster up check only checks to see if the node is heartbeating or not. If yes it continues assuming that the node is connected to all the nodes. But if that is not the case, the cluster join aborts with a stack of errors that are not easy to comprehend. This patch adds the network connect check upfront and prints the nodes that the node is not yet connected to, before aborting. Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c7
-rw-r--r--fs/ocfs2/stack_o2cb.c67
2 files changed, 61 insertions, 13 deletions
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 200d9888ffee..92f2ead0fab6 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -2138,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
2138 goto leave; 2138 goto leave;
2139 } 2139 }
2140 2140
2141 if (!o2hb_check_local_node_heartbeating()) {
2142 mlog(ML_ERROR, "the local node has not been configured, or is "
2143 "not heartbeating\n");
2144 ret = -EPROTO;
2145 goto leave;
2146 }
2147
2148 mlog(0, "register called for domain \"%s\"\n", domain); 2141 mlog(0, "register called for domain \"%s\"\n", domain);
2149 2142
2150retry: 2143retry:
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index be935d89a4e8..94368017edb3 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
28#include "cluster/masklog.h" 28#include "cluster/masklog.h"
29#include "cluster/nodemanager.h" 29#include "cluster/nodemanager.h"
30#include "cluster/heartbeat.h" 30#include "cluster/heartbeat.h"
31#include "cluster/tcp.h"
31 32
32#include "stackglue.h" 33#include "stackglue.h"
33 34
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256} 257}
257 258
258/* 259/*
260 * Check if this node is heartbeating and is connected to all other
261 * heartbeating nodes.
262 */
263static int o2cb_cluster_check(void)
264{
265 u8 node_num;
266 int i;
267 unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268 unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
269
270 node_num = o2nm_this_node();
271 if (node_num == O2NM_MAX_NODES) {
272 printk(KERN_ERR "o2cb: This node has not been configured.\n");
273 return -EINVAL;
274 }
275
276 /*
277 * o2dlm expects o2net sockets to be created. If not, then
278 * dlm_join_domain() fails with a stack of errors which are both cryptic
279 * and incomplete. The idea here is to detect upfront whether we have
280 * managed to connect to all nodes or not. If not, then list the nodes
281 * to allow the user to check the configuration (incorrect IP, firewall,
282 * etc.) Yes, this is racy. But its not the end of the world.
283 */
284#define O2CB_MAP_STABILIZE_COUNT 60
285 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
286 o2hb_fill_node_map(hbmap, sizeof(hbmap));
287 if (!test_bit(node_num, hbmap)) {
288 printk(KERN_ERR "o2cb: %s heartbeat has not been "
289 "started.\n", (o2hb_global_heartbeat_active() ?
290 "Global" : "Local"));
291 return -EINVAL;
292 }
293 o2net_fill_node_map(netmap, sizeof(netmap));
294 /* Force set the current node to allow easy compare */
295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT)
299 msleep(1000);
300 }
301
302 printk(KERN_ERR "o2cb: This node could not connect to nodes:");
303 i = -1;
304 while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
305 i + 1)) < O2NM_MAX_NODES) {
306 if (!test_bit(i, netmap))
307 printk(" %u", i);
308 }
309 printk(".\n");
310
311 return -ENOTCONN;
312}
313
314/*
259 * Called from the dlm when it's about to evict a node. This is how the 315 * Called from the dlm when it's about to evict a node. This is how the
260 * classic stack signals node death. 316 * classic stack signals node death.
261 */ 317 */
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 BUG_ON(conn == NULL); 336 BUG_ON(conn == NULL);
281 BUG_ON(conn->cc_proto == NULL); 337 BUG_ON(conn->cc_proto == NULL);
282 338
283 /* for now we only have one cluster/node, make sure we see it 339 /* Ensure cluster stack is up and all nodes are connected */
284 * in the heartbeat universe */ 340 rc = o2cb_cluster_check();
285 if (!o2hb_check_local_node_heartbeating()) { 341 if (rc) {
286 if (o2hb_global_heartbeat_active()) 342 printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
287 mlog(ML_ERROR, "Global heartbeat not started\n"); 343 "before retrying.\n");
288 rc = -EINVAL;
289 goto out; 344 goto out;
290 } 345 }
291 346