tipc: add neighbor monitoring framework

TIPC-based clusters are by default set up with full-mesh link connectivity between all nodes. Those links are expected to provide a short failure detection time, by default set to 1500 ms. Because of this, the background load for neighbor monitoring in an N-node cluster increases by a factor of N on each node, while the overall monitoring traffic through the network infrastructure increases at a ~(N * (N - 1)) rate. Experience has shown that such clusters don't scale well beyond ~100 nodes unless we significantly increase failure discovery tolerance. This commit introduces a framework and an algorithm that drastically reduces this background load, while basically maintaining the original failure detection times across the whole cluster. Using this algorithm, background load will now grow at a rate of ~(2 * sqrt(N)) per node, and at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will now have to actively monitor 38 neighbors in a 400-node cluster, instead of 399 as before. This "Overlapping Ring Supervision Algorithm" is completely distributed and employs no centralized or coordinated state. It goes as follows: - Each node makes up a linearly ascending, circular list of all its N known neighbors, based on their TIPC node identity. This algorithm must be the same on all nodes. - The node then selects the next M = sqrt(N) - 1 nodes downstream from itself in the list, and chooses to actively monitor those. This is called its "local monitoring domain". - It creates a domain record describing the monitoring domain, and piggy-backs this in the data area of all neighbor monitoring messages (LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in the cluster eventually (default within 400 ms) will learn about its monitoring domain. - Whenever a node discovers a change in its local domain, e.g., a node has been added or has gone down, it creates and sends out a new version of its domain record to inform all neighbors about the change. 
- A node receiving a domain record from anybody outside its local domain matches this against its own list (which may not look the same), and chooses to not actively monitor those members of the received domain record that are also present in its own list. Instead, it relies on indications from the direct monitoring nodes to learn whether an indirectly monitored node has gone up or down. If a node is indicated lost, the receiving node temporarily activates its own direct monitoring towards that node in order to confirm, or not, that it is actually gone. - Since each node is actively monitoring sqrt(N) downstream neighbors, each node is also actively monitored by the same number of upstream neighbors. This means that all non-direct monitoring nodes normally will receive sqrt(N) indications that a node is gone. - A major drawback with ring monitoring is how it handles failures that cause massive network partitioning. If both a lost node and all its direct monitoring neighbors are inside the lost partition, the nodes in the remaining partition will never receive indications about the loss. To overcome this, each node also chooses to actively monitor some nodes outside its local domain. Those nodes are called remote domain "heads", and are selected in such a way that no node in the cluster will be more than two direct monitoring hops away. Because of this, each node, apart from monitoring the members of its local domain, will also typically monitor sqrt(N) remote head nodes. - As an optimization, local list status, domain status and domain records are marked with a generation number. This saves senders from unnecessarily conveying unaltered domain records, and receivers from performing unneeded re-adaptations of their node monitoring list, such as re-assigning domain heads. - As a measure of caution we have added the possibility to disable the new algorithm through configuration. 
We do this by keeping a threshold value for the cluster size; a cluster that grows beyond this value will switch from full-mesh to ring monitoring, and vice versa when it shrinks below the value. This means that if the threshold is set to a value larger than any anticipated cluster size (the default threshold is 32), the new algorithm is effectively disabled. A patch set for altering the threshold value and for listing the table contents will follow shortly. - This change is fully backwards compatible. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Jon Paul Maloy <jon.maloy@ericsson.com> 2016-06-13 20:46:22 -0400
committer: David S. Miller <davem@davemloft.net> 2016-06-15 17:06:28 -0400
commit: 35c55c9877f8de0ab129fa1a309271d0ecc868b9 (patch)
tree: 5c8011a871be5083f1c36bdf4ca6c1e4168390c3 /net/tipc/node.c
parent: 7889681f4a6c2148e1245604bac751a1cae8f882 (diff)
1 files changed, 12 insertions, 14 deletions
diff --git a/net/tipc/node.c b/net/tipc/node.c
index d6a490f991a4..a3fc0a3f4077 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -40,6 +40,7 @@
 #include "name_distr.h"
 #include "socket.h"
 #include "bcast.h"
+#include "monitor.h"
 #include "discover.h"
 #include "netlink.h"
@@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr)
        return caps;
 }
-/*
- * A trivial power-of-two bitmask technique is used for speed, since this
- * operation is done for every incoming TIPC packet. The number of hash table
- * entries has been chosen so that no hash chain exceeds 8 nodes and will
- * usually be much smaller (typically only a single node).
- */
-static unsigned int tipc_hashfn(u32 addr)
-{
-        return addr & (NODE_HTABLE_SIZE - 1);
-}
 static void tipc_node_kref_release(struct kref *kref)
 {
        struct tipc_node *n = container_of(kref, struct tipc_node, kref);
@@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
        u32 addr = 0;
        u32 flags = n->action_flags;
        u32 link_id = 0;
+        u32 bearer_id;
        struct list_head *publ_list;
        if (likely(!flags)) {
@@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
        addr = n->addr;
        link_id = n->link_id;
+        bearer_id = link_id & 0xffff;
        publ_list = &n->publ_list;
        n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
@@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n)
        if (flags & TIPC_NOTIFY_NODE_UP)
                tipc_named_node_up(net, addr);
-        if (flags & TIPC_NOTIFY_LINK_UP)
+        if (flags & TIPC_NOTIFY_LINK_UP) {
+                tipc_mon_peer_up(net, addr, bearer_id);
                tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
                                     TIPC_NODE_SCOPE, link_id, addr);
+        }
-        if (flags & TIPC_NOTIFY_LINK_DOWN)
+        if (flags & TIPC_NOTIFY_LINK_DOWN) {
+                tipc_mon_peer_down(net, addr, bearer_id);
                tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
                                      link_id, addr);
+        }
 }
 struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
@@ -691,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
        struct tipc_link *l = le->link;
        struct tipc_media_addr *maddr;
        struct sk_buff_head xmitq;
+        int old_bearer_id = bearer_id;
        if (!l)
                return;
@@ -710,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
                tipc_link_fsm_evt(l, LINK_RESET_EVT);
        }
        tipc_node_write_unlock(n);
+        if (delete)
+                tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
        tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
        tipc_sk_rcv(n->net, &le->inputq);
 }
author	Jon Paul Maloy <jon.maloy@ericsson.com>	2016-06-13 20:46:22 -0400
committer	David S. Miller <davem@davemloft.net>	2016-06-15 17:06:28 -0400
commit	35c55c9877f8de0ab129fa1a309271d0ecc868b9 (patch)
tree	5c8011a871be5083f1c36bdf4ca6c1e4168390c3 /net/tipc/node.c
parent	7889681f4a6c2148e1245604bac751a1cae8f882 (diff)

diff --git a/net/tipc/node.c b/net/tipc/node.c index d6a490f991a4..a3fc0a3f4077 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c
@@ -40,6 +40,7 @@
40	#include "name_distr.h"	40	#include "name_distr.h"
41	#include "socket.h"	41	#include "socket.h"
42	#include "bcast.h"	42	#include "bcast.h"
		43	#include "monitor.h"
43	#include "discover.h"	44	#include "discover.h"
44	#include "netlink.h"	45	#include "netlink.h"
45		46
@@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr)
205	return caps;	206	return caps;
206	}	207	}
207		208
208	/*
209	* A trivial power-of-two bitmask technique is used for speed, since this
210	* operation is done for every incoming TIPC packet. The number of hash table
211	* entries has been chosen so that no hash chain exceeds 8 nodes and will
212	* usually be much smaller (typically only a single node).
213	*/
214	static unsigned int tipc_hashfn(u32 addr)
215	{
216	return addr & (NODE_HTABLE_SIZE - 1);
217	}
218
219	static void tipc_node_kref_release(struct kref *kref)	209	static void tipc_node_kref_release(struct kref *kref)
220	{	210	{
221	struct tipc_node *n = container_of(kref, struct tipc_node, kref);	211	struct tipc_node *n = container_of(kref, struct tipc_node, kref);
@@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
279	u32 addr = 0;	269	u32 addr = 0;
280	u32 flags = n->action_flags;	270	u32 flags = n->action_flags;
281	u32 link_id = 0;	271	u32 link_id = 0;
		272	u32 bearer_id;
282	struct list_head *publ_list;	273	struct list_head *publ_list;
283		274
284	if (likely(!flags)) {	275	if (likely(!flags)) {
@@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
288		279
289	addr = n->addr;	280	addr = n->addr;
290	link_id = n->link_id;	281	link_id = n->link_id;
		282	bearer_id = link_id & 0xffff;
291	publ_list = &n->publ_list;	283	publ_list = &n->publ_list;
292		284
293	n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |	285	n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
@@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n)
301	if (flags & TIPC_NOTIFY_NODE_UP)	293	if (flags & TIPC_NOTIFY_NODE_UP)
302	tipc_named_node_up(net, addr);	294	tipc_named_node_up(net, addr);
303		295
304	if (flags & TIPC_NOTIFY_LINK_UP)	296	if (flags & TIPC_NOTIFY_LINK_UP) {
		297	tipc_mon_peer_up(net, addr, bearer_id);
305	tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,	298	tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
306	TIPC_NODE_SCOPE, link_id, addr);	299	TIPC_NODE_SCOPE, link_id, addr);
307		300	}
308	if (flags & TIPC_NOTIFY_LINK_DOWN)	301	if (flags & TIPC_NOTIFY_LINK_DOWN) {
		302	tipc_mon_peer_down(net, addr, bearer_id);
309	tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,	303	tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
310	link_id, addr);	304	link_id, addr);
		305	}
311	}	306	}
312		307
313	struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)	308	struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
@@ -691,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
691	struct tipc_link *l = le->link;	686	struct tipc_link *l = le->link;
692	struct tipc_media_addr *maddr;	687	struct tipc_media_addr *maddr;
693	struct sk_buff_head xmitq;	688	struct sk_buff_head xmitq;
		689	int old_bearer_id = bearer_id;
694		690
695	if (!l)	691	if (!l)
696	return;	692	return;
@@ -710,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
710	tipc_link_fsm_evt(l, LINK_RESET_EVT);	706	tipc_link_fsm_evt(l, LINK_RESET_EVT);
711	}	707	}
712	tipc_node_write_unlock(n);	708	tipc_node_write_unlock(n);
		709	if (delete)
		710	tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
713	tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);	711	tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
714	tipc_sk_rcv(n->net, &le->inputq);	712	tipc_sk_rcv(n->net, &le->inputq);
715	}	713	}