diff options
author | LUU Duc Canh <canh.d.luu@dektech.com.au> | 2018-09-26 15:00:54 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-09-29 14:45:14 -0400 |
commit | c140eb166d681f66bd7e99fb121357db1a503e7f (patch) | |
tree | 9eda87117d15507035fd046a9f9c3d610f0013fc /net/tipc/node.c | |
parent | 418b9a353a821f5d1787fd310d2af31232e9ff32 (diff) |
tipc: fix failover problem
We see the following scenario:
1) Link endpoint B on node 1 discovers that its peer endpoint is gone.
Since there is a second working link, failover procedure is started.
2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint
A on node 2. The node item 1->2 goes to state FAILINGOVER.
3) Link endpoint A/2 receives the failover, and is supposed to take
down its parallel link endpoint B/2, while producing a FAILOVER
message to send back to A/1.
4) However, B/2 has already been deleted, so no FAILOVER message can
be created.
5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive
any messages that can bring B/1 up again. We are left with a non-
redundant link between node 1 and 2.
We fix this by letting endpoint A/2 build a dummy FAILOVER message
to send back to A/1, so that the situation can be resolved.
Signed-off-by: LUU Duc Canh <canh.d.luu@dektech.com.au>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/tipc/node.c')
-rw-r--r-- | net/tipc/node.c | 11 |
1 files changed, 11 insertions, 0 deletions
diff --git a/net/tipc/node.c b/net/tipc/node.c index 68014f1b6976..b0ee25f1f2e6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c | |||
@@ -111,6 +111,7 @@ struct tipc_node { | |||
111 | int action_flags; | 111 | int action_flags; |
112 | struct list_head list; | 112 | struct list_head list; |
113 | int state; | 113 | int state; |
114 | bool failover_sent; | ||
114 | u16 sync_point; | 115 | u16 sync_point; |
115 | int link_cnt; | 116 | int link_cnt; |
116 | u16 working_links; | 117 | u16 working_links; |
@@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, | |||
680 | *slot0 = bearer_id; | 681 | *slot0 = bearer_id; |
681 | *slot1 = bearer_id; | 682 | *slot1 = bearer_id; |
682 | tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); | 683 | tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); |
684 | n->failover_sent = false; | ||
683 | n->action_flags |= TIPC_NOTIFY_NODE_UP; | 685 | n->action_flags |= TIPC_NOTIFY_NODE_UP; |
684 | tipc_link_set_active(nl, true); | 686 | tipc_link_set_active(nl, true); |
685 | tipc_bcast_add_peer(n->net, nl, xmitq); | 687 | tipc_bcast_add_peer(n->net, nl, xmitq); |
@@ -1615,6 +1617,15 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, | |||
1615 | tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), | 1617 | tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), |
1616 | tipc_link_inputq(l)); | 1618 | tipc_link_inputq(l)); |
1617 | } | 1619 | } |
1620 | /* If parallel link was already down, and this happened before | ||
1621 | * the tunnel link came up, FAILOVER was never sent. Ensure that | ||
1622 | * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. | ||
1623 | */ | ||
1624 | if (n->state != NODE_FAILINGOVER && !n->failover_sent) { | ||
1625 | tipc_link_create_dummy_tnl_msg(l, xmitq); | ||
1626 | n->failover_sent = true; | ||
1627 | } | ||
1628 | |||
1618 | /* If pkts arrive out of order, use lowest calculated syncpt */ | 1629 | /* If pkts arrive out of order, use lowest calculated syncpt */ |
1619 | if (less(syncpt, n->sync_point)) | 1630 | if (less(syncpt, n->sync_point)) |
1620 | n->sync_point = syncpt; | 1631 | n->sync_point = syncpt; |