aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/cluster
diff options
context:
space:
mode:
author Tao Ma <tao.ma@oracle.com> 2008-03-05 02:50:12 -0500
committer Mark Fasheh <mfasheh@suse.com> 2008-04-18 11:56:10 -0400
commit 5cc3bf2786f63cceb191c3c02ddd83c6f38a7d64 (patch)
tree a9d7f6fa7d251cff67d6b177835ff1f43d23ab2d /fs/ocfs2/cluster
parent 8f50eb978935431ccbf89b0344efd4ce6a924875 (diff)
ocfs2: Reconnect after idle time out.
Currently, o2net connects to a node on hb_up and disconnects on hb_down and on net timeout. Disconnecting on net timeout is ok, but o2net should then attempt to reconnect. This is because sometimes nodes get overloaded enough that the network connection breaks but the disk hb does not. If we get into that situation, we either fence (unnecessarily) or wait for its disk hb to die (and sometimes hang in the process). So in this updated scheme, when the network disconnects, we keep attempting to reconnect until we succeed or we get a disk hb down event. If the other node is really dead, then we will eventually get a node down event. If not, we should be able to connect again and continue. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Diffstat (limited to 'fs/ocfs2/cluster')
-rw-r--r-- fs/ocfs2/cluster/tcp.c 51
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h 2
2 files changed, 38 insertions, 15 deletions
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b205..4ea4b0a26975 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -399,8 +399,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
399 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); 399 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
400 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); 400 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
401 401
402 /* we won't reconnect after our valid conn goes away for
403 * this hb iteration.. here so it shows up in the logs */
404 if (was_valid && !valid && err == 0) 402 if (was_valid && !valid && err == 0)
405 err = -ENOTCONN; 403 err = -ENOTCONN;
406 404
@@ -430,11 +428,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
430 428
431 if (!was_valid && valid) { 429 if (!was_valid && valid) {
432 o2quo_conn_up(o2net_num_from_nn(nn)); 430 o2quo_conn_up(o2net_num_from_nn(nn));
433 /* this is a bit of a hack. we only try reconnecting
434 * when heartbeating starts until we get a connection.
435 * if that connection then dies we don't try reconnecting.
436 * the only way to start connecting again is to down
437 * heartbeat and bring it back up. */
438 cancel_delayed_work(&nn->nn_connect_expired); 431 cancel_delayed_work(&nn->nn_connect_expired);
439 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 432 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
440 o2nm_this_node() > sc->sc_node->nd_num ? 433 o2nm_this_node() > sc->sc_node->nd_num ?
@@ -457,6 +450,18 @@ static void o2net_set_nn_state(struct o2net_node *nn,
457 delay = 0; 450 delay = 0;
458 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); 451 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
459 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); 452 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
453
454 /*
455 * Delay the expired work after idle timeout.
456 *
457 * We might have lots of failed connection attempts that run
458 * through here but we only cancel the connect_expired work when
459 * a connection attempt succeeds. So only the first enqueue of
460 * the connect_expired work will do anything. The rest will see
461 * that it's already queued and do nothing.
462 */
463 delay += msecs_to_jiffies(o2net_idle_timeout(NULL));
464 queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
460 } 465 }
461 466
462 /* keep track of the nn's sc ref for the caller */ 467 /* keep track of the nn's sc ref for the caller */
@@ -1193,6 +1198,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1193 * shut down already */ 1198 * shut down already */
1194 if (nn->nn_sc == sc) { 1199 if (nn->nn_sc == sc) {
1195 o2net_sc_reset_idle_timer(sc); 1200 o2net_sc_reset_idle_timer(sc);
1201 atomic_set(&nn->nn_timeout, 0);
1196 o2net_set_nn_state(nn, sc, 1, 0); 1202 o2net_set_nn_state(nn, sc, 1, 0);
1197 } 1203 }
1198 spin_unlock(&nn->nn_lock); 1204 spin_unlock(&nn->nn_lock);
@@ -1391,6 +1397,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
1391static void o2net_idle_timer(unsigned long data) 1397static void o2net_idle_timer(unsigned long data)
1392{ 1398{
1393 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1399 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1400 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1394 struct timeval now; 1401 struct timeval now;
1395 1402
1396 do_gettimeofday(&now); 1403 do_gettimeofday(&now);
@@ -1413,6 +1420,12 @@ static void o2net_idle_timer(unsigned long data)
1413 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1420 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1414 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1421 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1415 1422
1423 /*
1424 * Initialize the nn_timeout so that the next connection attempt
1425 * will continue in o2net_start_connect.
1426 */
1427 atomic_set(&nn->nn_timeout, 1);
1428
1416 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1429 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1417} 1430}
1418 1431
@@ -1447,6 +1460,7 @@ static void o2net_start_connect(struct work_struct *work)
1447 struct socket *sock = NULL; 1460 struct socket *sock = NULL;
1448 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1461 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1449 int ret = 0, stop; 1462 int ret = 0, stop;
1463 unsigned int timeout;
1450 1464
1451 /* if we're greater we initiate tx, otherwise we accept */ 1465 /* if we're greater we initiate tx, otherwise we accept */
1452 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1466 if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1480,17 @@ static void o2net_start_connect(struct work_struct *work)
1466 } 1480 }
1467 1481
1468 spin_lock(&nn->nn_lock); 1482 spin_lock(&nn->nn_lock);
1469 /* see if we already have one pending or have given up */ 1483 /*
1470 stop = (nn->nn_sc || nn->nn_persistent_error); 1484 * see if we already have one pending or have given up.
1485 * For nn_timeout, it is set when we close the connection
1486 * because of the idle time out. So it means that we have
1487 * at least connected to that node successfully once,
1488 * now try to connect to it again.
1489 */
1490 timeout = atomic_read(&nn->nn_timeout);
1491 stop = (nn->nn_sc ||
1492 (nn->nn_persistent_error &&
1493 (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
1471 spin_unlock(&nn->nn_lock); 1494 spin_unlock(&nn->nn_lock);
1472 if (stop) 1495 if (stop)
1473 goto out; 1496 goto out;
@@ -1579,6 +1602,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
1579 1602
1580 /* don't reconnect until it's heartbeating again */ 1603 /* don't reconnect until it's heartbeating again */
1581 spin_lock(&nn->nn_lock); 1604 spin_lock(&nn->nn_lock);
1605 atomic_set(&nn->nn_timeout, 0);
1582 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1606 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1583 spin_unlock(&nn->nn_lock); 1607 spin_unlock(&nn->nn_lock);
1584 1608
@@ -1613,17 +1637,12 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1613 (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); 1637 (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
1614 1638
1615 if (node_num != o2nm_this_node()) { 1639 if (node_num != o2nm_this_node()) {
1616 /* heartbeat doesn't work unless a local node number is
1617 * configured and doing so brings up the o2net_wq, so we can
1618 * use it.. */
1619 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1620 msecs_to_jiffies(o2net_idle_timeout(node)));
1621
1622 /* believe it or not, accept and node hearbeating testing 1640 /* believe it or not, accept and node hearbeating testing
1623 * can succeed for this node before we got here.. so 1641 * can succeed for this node before we got here.. so
1624 * only use set_nn_state to clear the persistent error 1642 * only use set_nn_state to clear the persistent error
1625 * if that hasn't already happened */ 1643 * if that hasn't already happened */
1626 spin_lock(&nn->nn_lock); 1644 spin_lock(&nn->nn_lock);
1645 atomic_set(&nn->nn_timeout, 0);
1627 if (nn->nn_persistent_error) 1646 if (nn->nn_persistent_error)
1628 o2net_set_nn_state(nn, NULL, 0, 0); 1647 o2net_set_nn_state(nn, NULL, 0, 0);
1629 spin_unlock(&nn->nn_lock); 1648 spin_unlock(&nn->nn_lock);
@@ -1747,6 +1766,7 @@ static int o2net_accept_one(struct socket *sock)
1747 new_sock = NULL; 1766 new_sock = NULL;
1748 1767
1749 spin_lock(&nn->nn_lock); 1768 spin_lock(&nn->nn_lock);
1769 atomic_set(&nn->nn_timeout, 0);
1750 o2net_set_nn_state(nn, sc, 0, 0); 1770 o2net_set_nn_state(nn, sc, 0, 0);
1751 spin_unlock(&nn->nn_lock); 1771 spin_unlock(&nn->nn_lock);
1752 1772
@@ -1941,6 +1961,7 @@ int o2net_init(void)
1941 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 1961 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1942 struct o2net_node *nn = o2net_nn_from_num(i); 1962 struct o2net_node *nn = o2net_nn_from_num(i);
1943 1963
1964 atomic_set(&nn->nn_timeout, 0);
1944 spin_lock_init(&nn->nn_lock); 1965 spin_lock_init(&nn->nn_lock);
1945 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); 1966 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
1946 INIT_DELAYED_WORK(&nn->nn_connect_expired, 1967 INIT_DELAYED_WORK(&nn->nn_connect_expired,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..b4c5586f46ea 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
95 unsigned nn_sc_valid:1; 95 unsigned nn_sc_valid:1;
96 /* if this is set tx just returns it */ 96 /* if this is set tx just returns it */
97 int nn_persistent_error; 97 int nn_persistent_error;
98 /* It is only set to 1 after the idle time out. */
99 atomic_t nn_timeout;
98 100
99 /* threads waiting for an sc to arrive wait on the wq for generation 101 /* threads waiting for an sc to arrive wait on the wq for generation
100 * to increase. it is increased when a connecting socket succeeds 102 * to increase. it is increased when a connecting socket succeeds