diff options
Diffstat (limited to 'fs/ocfs2/cluster/tcp.c')
-rw-r--r-- | fs/ocfs2/cluster/tcp.c | 165 |
1 files changed, 123 insertions, 42 deletions
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ee50c9610e7f..1e44ad14881a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data); | |||
142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); | 142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); |
143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); | 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); |
144 | 144 | ||
145 | /* | 145 | static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, |
146 | * FIXME: These should use to_o2nm_cluster_from_node(), but we end up | 146 | u32 msgkey, struct task_struct *task, u8 node) |
147 | * losing our parent link to the cluster during shutdown. This can be | 147 | { |
148 | * solved by adding a pre-removal callback to configfs, or passing | 148 | #ifdef CONFIG_DEBUG_FS |
149 | * around the cluster with the node. -jeffm | 149 | INIT_LIST_HEAD(&nst->st_net_debug_item); |
150 | */ | 150 | nst->st_task = task; |
151 | static inline int o2net_reconnect_delay(struct o2nm_node *node) | 151 | nst->st_msg_type = msgtype; |
152 | nst->st_msg_key = msgkey; | ||
153 | nst->st_node = node; | ||
154 | #endif | ||
155 | } | ||
156 | |||
157 | static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | ||
158 | { | ||
159 | #ifdef CONFIG_DEBUG_FS | ||
160 | do_gettimeofday(&nst->st_sock_time); | ||
161 | #endif | ||
162 | } | ||
163 | |||
164 | static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | ||
165 | { | ||
166 | #ifdef CONFIG_DEBUG_FS | ||
167 | do_gettimeofday(&nst->st_send_time); | ||
168 | #endif | ||
169 | } | ||
170 | |||
171 | static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | ||
172 | { | ||
173 | #ifdef CONFIG_DEBUG_FS | ||
174 | do_gettimeofday(&nst->st_status_time); | ||
175 | #endif | ||
176 | } | ||
177 | |||
178 | static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
179 | struct o2net_sock_container *sc) | ||
180 | { | ||
181 | #ifdef CONFIG_DEBUG_FS | ||
182 | nst->st_sc = sc; | ||
183 | #endif | ||
184 | } | ||
185 | |||
186 | static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) | ||
187 | { | ||
188 | #ifdef CONFIG_DEBUG_FS | ||
189 | nst->st_id = msg_id; | ||
190 | #endif | ||
191 | } | ||
192 | |||
193 | static inline int o2net_reconnect_delay(void) | ||
152 | { | 194 | { |
153 | return o2nm_single_cluster->cl_reconnect_delay_ms; | 195 | return o2nm_single_cluster->cl_reconnect_delay_ms; |
154 | } | 196 | } |
155 | 197 | ||
156 | static inline int o2net_keepalive_delay(struct o2nm_node *node) | 198 | static inline int o2net_keepalive_delay(void) |
157 | { | 199 | { |
158 | return o2nm_single_cluster->cl_keepalive_delay_ms; | 200 | return o2nm_single_cluster->cl_keepalive_delay_ms; |
159 | } | 201 | } |
160 | 202 | ||
161 | static inline int o2net_idle_timeout(struct o2nm_node *node) | 203 | static inline int o2net_idle_timeout(void) |
162 | { | 204 | { |
163 | return o2nm_single_cluster->cl_idle_timeout_ms; | 205 | return o2nm_single_cluster->cl_idle_timeout_ms; |
164 | } | 206 | } |
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref) | |||
296 | o2nm_node_put(sc->sc_node); | 338 | o2nm_node_put(sc->sc_node); |
297 | sc->sc_node = NULL; | 339 | sc->sc_node = NULL; |
298 | 340 | ||
341 | o2net_debug_del_sc(sc); | ||
299 | kfree(sc); | 342 | kfree(sc); |
300 | } | 343 | } |
301 | 344 | ||
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) | |||
336 | 379 | ||
337 | ret = sc; | 380 | ret = sc; |
338 | sc->sc_page = page; | 381 | sc->sc_page = page; |
382 | o2net_debug_add_sc(sc); | ||
339 | sc = NULL; | 383 | sc = NULL; |
340 | page = NULL; | 384 | page = NULL; |
341 | 385 | ||
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
399 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); | 443 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); |
400 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); | 444 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); |
401 | 445 | ||
402 | /* we won't reconnect after our valid conn goes away for | ||
403 | * this hb iteration.. here so it shows up in the logs */ | ||
404 | if (was_valid && !valid && err == 0) | 446 | if (was_valid && !valid && err == 0) |
405 | err = -ENOTCONN; | 447 | err = -ENOTCONN; |
406 | 448 | ||
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
430 | 472 | ||
431 | if (!was_valid && valid) { | 473 | if (!was_valid && valid) { |
432 | o2quo_conn_up(o2net_num_from_nn(nn)); | 474 | o2quo_conn_up(o2net_num_from_nn(nn)); |
433 | /* this is a bit of a hack. we only try reconnecting | ||
434 | * when heartbeating starts until we get a connection. | ||
435 | * if that connection then dies we don't try reconnecting. | ||
436 | * the only way to start connecting again is to down | ||
437 | * heartbeat and bring it back up. */ | ||
438 | cancel_delayed_work(&nn->nn_connect_expired); | 475 | cancel_delayed_work(&nn->nn_connect_expired); |
439 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", | 476 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", |
440 | o2nm_this_node() > sc->sc_node->nd_num ? | 477 | o2nm_this_node() > sc->sc_node->nd_num ? |
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
451 | /* delay if we're within a RECONNECT_DELAY of the | 488 | /* delay if we're within a RECONNECT_DELAY of the |
452 | * last attempt */ | 489 | * last attempt */ |
453 | delay = (nn->nn_last_connect_attempt + | 490 | delay = (nn->nn_last_connect_attempt + |
454 | msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) | 491 | msecs_to_jiffies(o2net_reconnect_delay())) |
455 | - jiffies; | 492 | - jiffies; |
456 | if (delay > msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) | 493 | if (delay > msecs_to_jiffies(o2net_reconnect_delay())) |
457 | delay = 0; | 494 | delay = 0; |
458 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); | 495 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); |
459 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); | 496 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); |
497 | |||
498 | /* | ||
499 | * Delay the expired work after idle timeout. | ||
500 | * | ||
501 | * We might have lots of failed connection attempts that run | ||
502 | * through here but we only cancel the connect_expired work when | ||
503 | * a connection attempt succeeds. So only the first enqueue of | ||
504 | * the connect_expired work will do anything. The rest will see | ||
505 | * that it's already queued and do nothing. | ||
506 | */ | ||
507 | delay += msecs_to_jiffies(o2net_idle_timeout()); | ||
508 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); | ||
460 | } | 509 | } |
461 | 510 | ||
462 | /* keep track of the nn's sc ref for the caller */ | 511 | /* keep track of the nn's sc ref for the caller */ |
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
914 | struct o2net_status_wait nsw = { | 963 | struct o2net_status_wait nsw = { |
915 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), | 964 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), |
916 | }; | 965 | }; |
966 | struct o2net_send_tracking nst; | ||
967 | |||
968 | o2net_init_nst(&nst, msg_type, key, current, target_node); | ||
917 | 969 | ||
918 | if (o2net_wq == NULL) { | 970 | if (o2net_wq == NULL) { |
919 | mlog(0, "attempt to tx without o2netd running\n"); | 971 | mlog(0, "attempt to tx without o2netd running\n"); |
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
939 | goto out; | 991 | goto out; |
940 | } | 992 | } |
941 | 993 | ||
994 | o2net_debug_add_nst(&nst); | ||
995 | |||
996 | o2net_set_nst_sock_time(&nst); | ||
997 | |||
942 | ret = wait_event_interruptible(nn->nn_sc_wq, | 998 | ret = wait_event_interruptible(nn->nn_sc_wq, |
943 | o2net_tx_can_proceed(nn, &sc, &error)); | 999 | o2net_tx_can_proceed(nn, &sc, &error)); |
944 | if (!ret && error) | 1000 | if (!ret && error) |
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
946 | if (ret) | 1002 | if (ret) |
947 | goto out; | 1003 | goto out; |
948 | 1004 | ||
1005 | o2net_set_nst_sock_container(&nst, sc); | ||
1006 | |||
949 | veclen = caller_veclen + 1; | 1007 | veclen = caller_veclen + 1; |
950 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); | 1008 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); |
951 | if (vec == NULL) { | 1009 | if (vec == NULL) { |
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
972 | goto out; | 1030 | goto out; |
973 | 1031 | ||
974 | msg->msg_num = cpu_to_be32(nsw.ns_id); | 1032 | msg->msg_num = cpu_to_be32(nsw.ns_id); |
1033 | o2net_set_nst_msg_id(&nst, nsw.ns_id); | ||
1034 | |||
1035 | o2net_set_nst_send_time(&nst); | ||
975 | 1036 | ||
976 | /* finally, convert the message header to network byte-order | 1037 | /* finally, convert the message header to network byte-order |
977 | * and send */ | 1038 | * and send */ |
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
986 | } | 1047 | } |
987 | 1048 | ||
988 | /* wait on other node's handler */ | 1049 | /* wait on other node's handler */ |
1050 | o2net_set_nst_status_time(&nst); | ||
989 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); | 1051 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); |
990 | 1052 | ||
991 | /* Note that we avoid overwriting the callers status return | 1053 | /* Note that we avoid overwriting the callers status return |
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
998 | mlog(0, "woken, returning system status %d, user status %d\n", | 1060 | mlog(0, "woken, returning system status %d, user status %d\n", |
999 | ret, nsw.ns_status); | 1061 | ret, nsw.ns_status); |
1000 | out: | 1062 | out: |
1063 | o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ | ||
1001 | if (sc) | 1064 | if (sc) |
1002 | sc_put(sc); | 1065 | sc_put(sc); |
1003 | if (vec) | 1066 | if (vec) |
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1154 | * but isn't. This can ultimately cause corruption. | 1217 | * but isn't. This can ultimately cause corruption. |
1155 | */ | 1218 | */ |
1156 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != | 1219 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != |
1157 | o2net_idle_timeout(sc->sc_node)) { | 1220 | o2net_idle_timeout()) { |
1158 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " | 1221 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " |
1159 | "%u ms, but we use %u ms locally. disconnecting\n", | 1222 | "%u ms, but we use %u ms locally. disconnecting\n", |
1160 | SC_NODEF_ARGS(sc), | 1223 | SC_NODEF_ARGS(sc), |
1161 | be32_to_cpu(hand->o2net_idle_timeout_ms), | 1224 | be32_to_cpu(hand->o2net_idle_timeout_ms), |
1162 | o2net_idle_timeout(sc->sc_node)); | 1225 | o2net_idle_timeout()); |
1163 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1226 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1164 | return -1; | 1227 | return -1; |
1165 | } | 1228 | } |
1166 | 1229 | ||
1167 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != | 1230 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != |
1168 | o2net_keepalive_delay(sc->sc_node)) { | 1231 | o2net_keepalive_delay()) { |
1169 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " | 1232 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " |
1170 | "%u ms, but we use %u ms locally. disconnecting\n", | 1233 | "%u ms, but we use %u ms locally. disconnecting\n", |
1171 | SC_NODEF_ARGS(sc), | 1234 | SC_NODEF_ARGS(sc), |
1172 | be32_to_cpu(hand->o2net_keepalive_delay_ms), | 1235 | be32_to_cpu(hand->o2net_keepalive_delay_ms), |
1173 | o2net_keepalive_delay(sc->sc_node)); | 1236 | o2net_keepalive_delay()); |
1174 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1237 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1175 | return -1; | 1238 | return -1; |
1176 | } | 1239 | } |
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1193 | * shut down already */ | 1256 | * shut down already */ |
1194 | if (nn->nn_sc == sc) { | 1257 | if (nn->nn_sc == sc) { |
1195 | o2net_sc_reset_idle_timer(sc); | 1258 | o2net_sc_reset_idle_timer(sc); |
1259 | atomic_set(&nn->nn_timeout, 0); | ||
1196 | o2net_set_nn_state(nn, sc, 1, 0); | 1260 | o2net_set_nn_state(nn, sc, 1, 0); |
1197 | } | 1261 | } |
1198 | spin_unlock(&nn->nn_lock); | 1262 | spin_unlock(&nn->nn_lock); |
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void) | |||
1347 | { | 1411 | { |
1348 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( | 1412 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( |
1349 | O2HB_MAX_WRITE_TIMEOUT_MS); | 1413 | O2HB_MAX_WRITE_TIMEOUT_MS); |
1350 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( | 1414 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); |
1351 | o2net_idle_timeout(NULL)); | ||
1352 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( | 1415 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( |
1353 | o2net_keepalive_delay(NULL)); | 1416 | o2net_keepalive_delay()); |
1354 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( | 1417 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( |
1355 | o2net_reconnect_delay(NULL)); | 1418 | o2net_reconnect_delay()); |
1356 | } | 1419 | } |
1357 | 1420 | ||
1358 | /* ------------------------------------------------------------ */ | 1421 | /* ------------------------------------------------------------ */ |
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work) | |||
1391 | static void o2net_idle_timer(unsigned long data) | 1454 | static void o2net_idle_timer(unsigned long data) |
1392 | { | 1455 | { |
1393 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | 1456 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; |
1457 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
1394 | struct timeval now; | 1458 | struct timeval now; |
1395 | 1459 | ||
1396 | do_gettimeofday(&now); | 1460 | do_gettimeofday(&now); |
1397 | 1461 | ||
1398 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " | 1462 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " |
1399 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), | 1463 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), |
1400 | o2net_idle_timeout(sc->sc_node) / 1000, | 1464 | o2net_idle_timeout() / 1000, |
1401 | o2net_idle_timeout(sc->sc_node) % 1000); | 1465 | o2net_idle_timeout() % 1000); |
1402 | mlog(ML_NOTICE, "here are some times that might help debug the " | 1466 | mlog(ML_NOTICE, "here are some times that might help debug the " |
1403 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " | 1467 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " |
1404 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", | 1468 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", |
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data) | |||
1413 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, | 1477 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, |
1414 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); | 1478 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); |
1415 | 1479 | ||
1480 | /* | ||
1481 | * Initialize the nn_timeout so that the next connection attempt | ||
1482 | * will continue in o2net_start_connect. | ||
1483 | */ | ||
1484 | atomic_set(&nn->nn_timeout, 1); | ||
1485 | |||
1416 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | 1486 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); |
1417 | } | 1487 | } |
1418 | 1488 | ||
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) | |||
1420 | { | 1490 | { |
1421 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); | 1491 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); |
1422 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, | 1492 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, |
1423 | msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); | 1493 | msecs_to_jiffies(o2net_keepalive_delay())); |
1424 | do_gettimeofday(&sc->sc_tv_timer); | 1494 | do_gettimeofday(&sc->sc_tv_timer); |
1425 | mod_timer(&sc->sc_idle_timeout, | 1495 | mod_timer(&sc->sc_idle_timeout, |
1426 | jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); | 1496 | jiffies + msecs_to_jiffies(o2net_idle_timeout())); |
1427 | } | 1497 | } |
1428 | 1498 | ||
1429 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) | 1499 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) |
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work) | |||
1447 | struct socket *sock = NULL; | 1517 | struct socket *sock = NULL; |
1448 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; | 1518 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; |
1449 | int ret = 0, stop; | 1519 | int ret = 0, stop; |
1520 | unsigned int timeout; | ||
1450 | 1521 | ||
1451 | /* if we're greater we initiate tx, otherwise we accept */ | 1522 | /* if we're greater we initiate tx, otherwise we accept */ |
1452 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) | 1523 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) |
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work) | |||
1466 | } | 1537 | } |
1467 | 1538 | ||
1468 | spin_lock(&nn->nn_lock); | 1539 | spin_lock(&nn->nn_lock); |
1469 | /* see if we already have one pending or have given up */ | 1540 | /* |
1470 | stop = (nn->nn_sc || nn->nn_persistent_error); | 1541 | * see if we already have one pending or have given up. |
1542 | * For nn_timeout, it is set when we close the connection | ||
1543 | * because of the idle time out. So it means that we have | ||
1544 | * at least connected to that node successfully once, | ||
1545 | * now try to connect to it again. | ||
1546 | */ | ||
1547 | timeout = atomic_read(&nn->nn_timeout); | ||
1548 | stop = (nn->nn_sc || | ||
1549 | (nn->nn_persistent_error && | ||
1550 | (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); | ||
1471 | spin_unlock(&nn->nn_lock); | 1551 | spin_unlock(&nn->nn_lock); |
1472 | if (stop) | 1552 | if (stop) |
1473 | goto out; | 1553 | goto out; |
@@ -1552,12 +1632,11 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1552 | 1632 | ||
1553 | spin_lock(&nn->nn_lock); | 1633 | spin_lock(&nn->nn_lock); |
1554 | if (!nn->nn_sc_valid) { | 1634 | if (!nn->nn_sc_valid) { |
1555 | struct o2nm_node *node = nn->nn_sc->sc_node; | ||
1556 | mlog(ML_ERROR, "no connection established with node %u after " | 1635 | mlog(ML_ERROR, "no connection established with node %u after " |
1557 | "%u.%u seconds, giving up and returning errors.\n", | 1636 | "%u.%u seconds, giving up and returning errors.\n", |
1558 | o2net_num_from_nn(nn), | 1637 | o2net_num_from_nn(nn), |
1559 | o2net_idle_timeout(node) / 1000, | 1638 | o2net_idle_timeout() / 1000, |
1560 | o2net_idle_timeout(node) % 1000); | 1639 | o2net_idle_timeout() % 1000); |
1561 | 1640 | ||
1562 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1641 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
1563 | } | 1642 | } |
@@ -1580,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node) | |||
1580 | 1659 | ||
1581 | /* don't reconnect until it's heartbeating again */ | 1660 | /* don't reconnect until it's heartbeating again */ |
1582 | spin_lock(&nn->nn_lock); | 1661 | spin_lock(&nn->nn_lock); |
1662 | atomic_set(&nn->nn_timeout, 0); | ||
1583 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1663 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
1584 | spin_unlock(&nn->nn_lock); | 1664 | spin_unlock(&nn->nn_lock); |
1585 | 1665 | ||
@@ -1611,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
1611 | 1691 | ||
1612 | /* ensure an immediate connect attempt */ | 1692 | /* ensure an immediate connect attempt */ |
1613 | nn->nn_last_connect_attempt = jiffies - | 1693 | nn->nn_last_connect_attempt = jiffies - |
1614 | (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); | 1694 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); |
1615 | 1695 | ||
1616 | if (node_num != o2nm_this_node()) { | 1696 | if (node_num != o2nm_this_node()) { |
1617 | /* heartbeat doesn't work unless a local node number is | ||
1618 | * configured and doing so brings up the o2net_wq, so we can | ||
1619 | * use it.. */ | ||
1620 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, | ||
1621 | msecs_to_jiffies(o2net_idle_timeout(node))); | ||
1622 | |||
1623 | /* believe it or not, accept and node heartbeating testing | 1697 | /* believe it or not, accept and node heartbeating testing |
1624 | * can succeed for this node before we got here.. so | 1698 | * can succeed for this node before we got here.. so |
1625 | * only use set_nn_state to clear the persistent error | 1699 | * only use set_nn_state to clear the persistent error |
1626 | * if that hasn't already happened */ | 1700 | * if that hasn't already happened */ |
1627 | spin_lock(&nn->nn_lock); | 1701 | spin_lock(&nn->nn_lock); |
1702 | atomic_set(&nn->nn_timeout, 0); | ||
1628 | if (nn->nn_persistent_error) | 1703 | if (nn->nn_persistent_error) |
1629 | o2net_set_nn_state(nn, NULL, 0, 0); | 1704 | o2net_set_nn_state(nn, NULL, 0, 0); |
1630 | spin_unlock(&nn->nn_lock); | 1705 | spin_unlock(&nn->nn_lock); |
@@ -1748,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock) | |||
1748 | new_sock = NULL; | 1823 | new_sock = NULL; |
1749 | 1824 | ||
1750 | spin_lock(&nn->nn_lock); | 1825 | spin_lock(&nn->nn_lock); |
1826 | atomic_set(&nn->nn_timeout, 0); | ||
1751 | o2net_set_nn_state(nn, sc, 0, 0); | 1827 | o2net_set_nn_state(nn, sc, 0, 0); |
1752 | spin_unlock(&nn->nn_lock); | 1828 | spin_unlock(&nn->nn_lock); |
1753 | 1829 | ||
@@ -1923,6 +1999,9 @@ int o2net_init(void) | |||
1923 | 1999 | ||
1924 | o2quo_init(); | 2000 | o2quo_init(); |
1925 | 2001 | ||
2002 | if (o2net_debugfs_init()) | ||
2003 | return -ENOMEM; | ||
2004 | |||
1926 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); | 2005 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); |
1927 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2006 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
1928 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2007 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
@@ -1942,6 +2021,7 @@ int o2net_init(void) | |||
1942 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { | 2021 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { |
1943 | struct o2net_node *nn = o2net_nn_from_num(i); | 2022 | struct o2net_node *nn = o2net_nn_from_num(i); |
1944 | 2023 | ||
2024 | atomic_set(&nn->nn_timeout, 0); | ||
1945 | spin_lock_init(&nn->nn_lock); | 2025 | spin_lock_init(&nn->nn_lock); |
1946 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); | 2026 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); |
1947 | INIT_DELAYED_WORK(&nn->nn_connect_expired, | 2027 | INIT_DELAYED_WORK(&nn->nn_connect_expired, |
@@ -1963,4 +2043,5 @@ void o2net_exit(void) | |||
1963 | kfree(o2net_hand); | 2043 | kfree(o2net_hand); |
1964 | kfree(o2net_keep_req); | 2044 | kfree(o2net_keep_req); |
1965 | kfree(o2net_keep_resp); | 2045 | kfree(o2net_keep_resp); |
2046 | o2net_debugfs_exit(); | ||
1966 | } | 2047 | } |