aboutsummaryrefslogtreecommitdiffstats
path: root/net/tipc/link.c
diff options
context:
space:
mode:
authorJon Paul Maloy <jon.maloy@ericsson.com>2016-06-13 20:46:22 -0400
committerDavid S. Miller <davem@davemloft.net>2016-06-15 17:06:28 -0400
commit35c55c9877f8de0ab129fa1a309271d0ecc868b9 (patch)
tree5c8011a871be5083f1c36bdf4ca6c1e4168390c3 /net/tipc/link.c
parent7889681f4a6c2148e1245604bac751a1cae8f882 (diff)
tipc: add neighbor monitoring framework
TIPC-based clusters are by default set up with full-mesh link connectivity between all nodes. Those links are expected to provide a short failure detection time, by default set to 1500 ms. Because of this, the background load for neighbor monitoring in an N-node cluster increases by a factor of N on each node, while the overall monitoring traffic through the network infrastructure increases at a ~(N * (N - 1)) rate. Experience has shown that such clusters don't scale well beyond ~100 nodes unless we significantly increase failure discovery tolerance. This commit introduces a framework and an algorithm that drastically reduces this background load, while basically maintaining the original failure detection times across the whole cluster. Using this algorithm, background load will now grow at a rate of ~(2 * sqrt(N)) per node, and at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will now have to actively monitor 38 neighbors in a 400-node cluster, instead of 399 as before. This "Overlapping Ring Supervision Algorithm" is completely distributed and employs no centralized or coordinated state. It goes as follows: - Each node makes up a linearly ascending, circular list of all its N known neighbors, based on their TIPC node identity. This algorithm must be the same on all nodes. - The node then selects the next M = sqrt(N) - 1 nodes downstream from itself in the list, and chooses to actively monitor those. This is called its "local monitoring domain". - It creates a domain record describing the monitoring domain, and piggy-backs this in the data area of all neighbor monitoring messages (LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in the cluster eventually (default within 400 ms) will learn about its monitoring domain. - Whenever a node discovers a change in its local domain, e.g., a node has been added or has gone down, it creates and sends out a new version of its domain record to inform all neighbors about the change. 
- A node receiving a domain record from anybody outside its local domain matches this against its own list (which may not look the same), and chooses to not actively monitor those members of the received domain record that are also present in its own list. Instead, it relies on indications from the direct monitoring nodes if an indirectly monitored node has gone up or down. If a node is indicated lost, the receiving node temporarily activates its own direct monitoring towards that node in order to confirm, or not, that it is actually gone. - Since each node is actively monitoring sqrt(N) downstream neighbors, each node is also actively monitored by the same number of upstream neighbors. This means that all non-direct monitoring nodes normally will receive sqrt(N) indications that a node is gone. - A major drawback with ring monitoring is how it handles failures that cause massive network partitions. If both a lost node and all its direct monitoring neighbors are inside the lost partition, the nodes in the remaining partition will never receive indications about the loss. To overcome this, each node also chooses to actively monitor some nodes outside its local domain. Those nodes are called remote domain "heads", and are selected in such a way that no node in the cluster will be more than two direct monitoring hops away. Because of this, each node, apart from monitoring the members of its local domain, will also typically monitor sqrt(N) remote head nodes. - As an optimization, local list status, domain status and domain records are marked with a generation number. This saves senders from unnecessarily conveying unaltered domain records, and receivers from performing unneeded re-adaptations of their node monitoring list, such as re-assigning domain heads. - As a measure of caution we have added the possibility to disable the new algorithm through configuration. 
We do this by keeping a threshold value for the cluster size; a cluster that grows beyond this value will switch from full-mesh to ring monitoring, and vice versa when it shrinks below the value. This means that if the threshold is set to a value larger than any anticipated cluster size (default size is 32) the new algorithm is effectively disabled. A patch set for altering the threshold value and for listing the table contents will follow shortly. - This change is fully backwards compatible. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/tipc/link.c')
-rw-r--r--net/tipc/link.c49
1 files changed, 37 insertions, 12 deletions
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a904ccd5a93a..03f8bdf70d8f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -42,6 +42,7 @@
42#include "name_distr.h" 42#include "name_distr.h"
43#include "discover.h" 43#include "discover.h"
44#include "netlink.h" 44#include "netlink.h"
45#include "monitor.h"
45 46
46#include <linux/pkt_sched.h> 47#include <linux/pkt_sched.h>
47 48
@@ -95,6 +96,7 @@ struct tipc_stats {
95 * @pmsg: convenience pointer to "proto_msg" field 96 * @pmsg: convenience pointer to "proto_msg" field
96 * @priority: current link priority 97 * @priority: current link priority
97 * @net_plane: current link network plane ('A' through 'H') 98 * @net_plane: current link network plane ('A' through 'H')
99 * @mon_state: cookie with information needed by link monitor
98 * @backlog_limit: backlog queue congestion thresholds (indexed by importance) 100 * @backlog_limit: backlog queue congestion thresholds (indexed by importance)
99 * @exp_msg_count: # of tunnelled messages expected during link changeover 101 * @exp_msg_count: # of tunnelled messages expected during link changeover
100 * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset 102 * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
@@ -138,6 +140,7 @@ struct tipc_link {
138 char if_name[TIPC_MAX_IF_NAME]; 140 char if_name[TIPC_MAX_IF_NAME];
139 u32 priority; 141 u32 priority;
140 char net_plane; 142 char net_plane;
143 struct tipc_mon_state mon_state;
141 u16 rst_cnt; 144 u16 rst_cnt;
142 145
143 /* Failover/synch */ 146 /* Failover/synch */
@@ -708,18 +711,25 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
708 bool setup = false; 711 bool setup = false;
709 u16 bc_snt = l->bc_sndlink->snd_nxt - 1; 712 u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
710 u16 bc_acked = l->bc_rcvlink->acked; 713 u16 bc_acked = l->bc_rcvlink->acked;
711 714 struct tipc_mon_state *mstate = &l->mon_state;
712 link_profile_stats(l);
713 715
714 switch (l->state) { 716 switch (l->state) {
715 case LINK_ESTABLISHED: 717 case LINK_ESTABLISHED:
716 case LINK_SYNCHING: 718 case LINK_SYNCHING:
717 if (l->silent_intv_cnt > l->abort_limit)
718 return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
719 mtyp = STATE_MSG; 719 mtyp = STATE_MSG;
720 link_profile_stats(l);
721 tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id);
722 if (mstate->reset || (l->silent_intv_cnt > l->abort_limit))
723 return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
720 state = bc_acked != bc_snt; 724 state = bc_acked != bc_snt;
721 probe = l->silent_intv_cnt; 725 state |= l->bc_rcvlink->rcv_unacked;
722 l->silent_intv_cnt++; 726 state |= l->rcv_unacked;
727 state |= !skb_queue_empty(&l->transmq);
728 state |= !skb_queue_empty(&l->deferdq);
729 probe = mstate->probing;
730 probe |= l->silent_intv_cnt;
731 if (probe || mstate->monitoring)
732 l->silent_intv_cnt++;
723 break; 733 break;
724 case LINK_RESET: 734 case LINK_RESET:
725 setup = l->rst_cnt++ <= 4; 735 setup = l->rst_cnt++ <= 4;
@@ -830,6 +840,7 @@ void tipc_link_reset(struct tipc_link *l)
830 l->stats.recv_info = 0; 840 l->stats.recv_info = 0;
831 l->stale_count = 0; 841 l->stale_count = 0;
832 l->bc_peer_is_up = false; 842 l->bc_peer_is_up = false;
843 memset(&l->mon_state, 0, sizeof(l->mon_state));
833 tipc_link_reset_stats(l); 844 tipc_link_reset_stats(l);
834} 845}
835 846
@@ -1238,6 +1249,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1238 struct tipc_msg *hdr; 1249 struct tipc_msg *hdr;
1239 struct sk_buff_head *dfq = &l->deferdq; 1250 struct sk_buff_head *dfq = &l->deferdq;
1240 bool node_up = link_is_up(l->bc_rcvlink); 1251 bool node_up = link_is_up(l->bc_rcvlink);
1252 struct tipc_mon_state *mstate = &l->mon_state;
1253 int dlen = 0;
1254 void *data;
1241 1255
1242 /* Don't send protocol message during reset or link failover */ 1256 /* Don't send protocol message during reset or link failover */
1243 if (tipc_link_is_blocked(l)) 1257 if (tipc_link_is_blocked(l))
@@ -1250,12 +1264,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1250 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; 1264 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
1251 1265
1252 skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, 1266 skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
1253 TIPC_MAX_IF_NAME, l->addr, 1267 tipc_max_domain_size, l->addr,
1254 tipc_own_addr(l->net), 0, 0, 0); 1268 tipc_own_addr(l->net), 0, 0, 0);
1255 if (!skb) 1269 if (!skb)
1256 return; 1270 return;
1257 1271
1258 hdr = buf_msg(skb); 1272 hdr = buf_msg(skb);
1273 data = msg_data(hdr);
1259 msg_set_session(hdr, l->session); 1274 msg_set_session(hdr, l->session);
1260 msg_set_bearer_id(hdr, l->bearer_id); 1275 msg_set_bearer_id(hdr, l->bearer_id);
1261 msg_set_net_plane(hdr, l->net_plane); 1276 msg_set_net_plane(hdr, l->net_plane);
@@ -1271,14 +1286,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
1271 1286
1272 if (mtyp == STATE_MSG) { 1287 if (mtyp == STATE_MSG) {
1273 msg_set_seq_gap(hdr, rcvgap); 1288 msg_set_seq_gap(hdr, rcvgap);
1274 msg_set_size(hdr, INT_H_SIZE);
1275 msg_set_probe(hdr, probe); 1289 msg_set_probe(hdr, probe);
1290 tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id);
1291 msg_set_size(hdr, INT_H_SIZE + dlen);
1292 skb_trim(skb, INT_H_SIZE + dlen);
1276 l->stats.sent_states++; 1293 l->stats.sent_states++;
1277 l->rcv_unacked = 0; 1294 l->rcv_unacked = 0;
1278 } else { 1295 } else {
1279 /* RESET_MSG or ACTIVATE_MSG */ 1296 /* RESET_MSG or ACTIVATE_MSG */
1280 msg_set_max_pkt(hdr, l->advertised_mtu); 1297 msg_set_max_pkt(hdr, l->advertised_mtu);
1281 strcpy(msg_data(hdr), l->if_name); 1298 strcpy(data, l->if_name);
1299 msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME);
1300 skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME);
1282 } 1301 }
1283 if (probe) 1302 if (probe)
1284 l->stats.sent_probes++; 1303 l->stats.sent_probes++;
@@ -1371,7 +1390,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1371 u16 peers_tol = msg_link_tolerance(hdr); 1390 u16 peers_tol = msg_link_tolerance(hdr);
1372 u16 peers_prio = msg_linkprio(hdr); 1391 u16 peers_prio = msg_linkprio(hdr);
1373 u16 rcv_nxt = l->rcv_nxt; 1392 u16 rcv_nxt = l->rcv_nxt;
1393 u16 dlen = msg_data_sz(hdr);
1374 int mtyp = msg_type(hdr); 1394 int mtyp = msg_type(hdr);
1395 void *data;
1375 char *if_name; 1396 char *if_name;
1376 int rc = 0; 1397 int rc = 0;
1377 1398
@@ -1381,6 +1402,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1381 if (tipc_own_addr(l->net) > msg_prevnode(hdr)) 1402 if (tipc_own_addr(l->net) > msg_prevnode(hdr))
1382 l->net_plane = msg_net_plane(hdr); 1403 l->net_plane = msg_net_plane(hdr);
1383 1404
1405 skb_linearize(skb);
1406 hdr = buf_msg(skb);
1407 data = msg_data(hdr);
1408
1384 switch (mtyp) { 1409 switch (mtyp) {
1385 case RESET_MSG: 1410 case RESET_MSG:
1386 1411
@@ -1391,8 +1416,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1391 /* fall thru' */ 1416 /* fall thru' */
1392 1417
1393 case ACTIVATE_MSG: 1418 case ACTIVATE_MSG:
1394 skb_linearize(skb);
1395 hdr = buf_msg(skb);
1396 1419
1397 /* Complete own link name with peer's interface name */ 1420 /* Complete own link name with peer's interface name */
1398 if_name = strrchr(l->name, ':') + 1; 1421 if_name = strrchr(l->name, ':') + 1;
@@ -1400,7 +1423,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1400 break; 1423 break;
1401 if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) 1424 if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
1402 break; 1425 break;
1403 strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); 1426 strncpy(if_name, data, TIPC_MAX_IF_NAME);
1404 1427
1405 /* Update own tolerance if peer indicates a non-zero value */ 1428 /* Update own tolerance if peer indicates a non-zero value */
1406 if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) 1429 if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
@@ -1448,6 +1471,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
1448 rc = TIPC_LINK_UP_EVT; 1471 rc = TIPC_LINK_UP_EVT;
1449 break; 1472 break;
1450 } 1473 }
1474 tipc_mon_rcv(l->net, data, dlen, l->addr,
1475 &l->mon_state, l->bearer_id);
1451 1476
1452 /* Send NACK if peer has sent pkts we haven't received yet */ 1477 /* Send NACK if peer has sent pkts we haven't received yet */
1453 if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) 1478 if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l))