author     Allan Stephens <allan.stephens@windriver.com>   2011-10-27 14:17:53 -0400
committer  Paul Gortmaker <paul.gortmaker@windriver.com>   2012-02-06 16:59:18 -0500
commit     7a54d4a99dcbbfdf1d4550faa19b615091137953 (patch)
tree       34b685bc29547373a43b0f3bd15cf54c07c01971 /net/tipc/bcast.c
parent     b98158e3b36645305363a598d91c544fa31446f1 (diff)
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.

Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so
interoperability with nodes using the "classic" algorithms is maintained.

The revised algorithms are as follows:

1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.

2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious
NACKs. A NACK message can also be generated by the arrival of a single
link state message, if the deferred queue is so big that the current
message gap cannot be the result of "normal" mis-ordering due to the use
of dual LANs (or one LAN using a bonded interface). Since link state
messages typically arrive at different nodes at different times the
problem of multiple nodes issuing identical NACKs simultaneously is
inherently avoided.

3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)

4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.

5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.

Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
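The receiver-side decisions described in points 1) and 2) can be illustrated
with a small standalone sketch. This is not the kernel code: struct bclink_rx,
should_send_ack() and should_send_nack() are made-up names, and plain integer
comparisons stand in for the wraparound-safe less()/less_eq() helpers, but the
TIPC_MIN_LINK_WIN spacing, the oos_state counter and the deferred-queue short
cut mirror what the patch below does.

/*
 * Illustrative sketch only -- a simplified model of the revised ACK/NACK
 * decisions, not the TIPC implementation.  Sequence numbers are treated
 * as plain integers; the kernel uses wraparound-safe comparisons instead.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TIPC_MIN_LINK_WIN 16	/* ACK/NACK spacing used by the patch */

struct bclink_rx {			/* hypothetical receiver state */
	uint32_t last_in;		/* last in-sequence broadcast received */
	uint32_t last_sent;		/* newest broadcast known to exist */
	uint32_t deferred_size;		/* out-of-order messages queued */
	uint32_t oos_state;		/* gap-confirmation state */
};

/*
 * Point 1: explicit ACK every 16 in-sequence messages, staggered by the
 * receiver's absolute network address so that the failure of another node
 * cannot delay it.
 */
static bool should_send_ack(uint32_t seqno, uint32_t own_addr)
{
	return ((seqno - own_addr) % TIPC_MIN_LINK_WIN) == 0;
}

/*
 * Point 2: on each incoming link state message, only NACK a gap that has
 * been seen twice in a row, unless the deferred queue is already too big
 * for the gap to be ordinary dual-LAN mis-ordering.
 */
static bool should_send_nack(struct bclink_rx *rx, uint32_t last_sent)
{
	if (last_sent <= rx->last_in)
		return false;			/* stale link state info */
	if (last_sent > rx->last_sent)
		rx->last_sent = last_sent;
	if (rx->last_sent == rx->last_in)
		return false;			/* fully in sync */

	if (++rx->oos_state == 1) {
		if (rx->deferred_size < TIPC_MIN_LINK_WIN / 2)
			return false;		/* wait for a second sighting */
		rx->oos_state++;		/* huge gap: confirm at once */
	}
	if (rx->oos_state & 0x1)
		return false;			/* NACK recently sent or peeked */

	rx->oos_state++;			/* remember that we NACK now */
	return true;
}

int main(void)
{
	struct bclink_rx rx = { .last_in = 100, .last_sent = 100 };

	printf("node 4 ACKs seqno 116?      %d\n", should_send_ack(116, 4));
	printf("NACK on first gap sighting? %d\n", should_send_nack(&rx, 105));
	printf("NACK on second sighting?    %d\n", should_send_nack(&rx, 105));
	return 0;
}

Built with any C99 compiler, main() reports that node 4 ACKs sequence number
116 immediately (116 - 4 is a multiple of 16), while a small gap is NACKed
only on the second consecutive link state message that shows it.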
Diffstat (limited to 'net/tipc/bcast.c')
-rw-r--r--  net/tipc/bcast.c  226
1 file changed, 78 insertions, 148 deletions
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index facc216c6a92..1f3b1607d9d4 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -157,39 +157,14 @@ u32 tipc_bclink_get_last_sent(void)
 	return bcl->fsm_msg_cnt;
 }
 
-/**
- * bclink_set_gap - set gap according to contents of current deferred pkt queue
- *
- * Called with 'node' locked, bc_lock unlocked
- */
-
-static void bclink_set_gap(struct tipc_node *n_ptr)
-{
-	struct sk_buff *buf = n_ptr->bclink.deferred_head;
-
-	n_ptr->bclink.gap_after = n_ptr->bclink.gap_to =
-		mod(n_ptr->bclink.last_in);
-	if (unlikely(buf != NULL))
-		n_ptr->bclink.gap_to = mod(buf_seqno(buf) - 1);
-}
-
-/**
- * bclink_ack_allowed - test if ACK or NACK message can be sent at this moment
- *
- * This mechanism endeavours to prevent all nodes in network from trying
- * to ACK or NACK at the same time.
- *
- * Note: TIPC uses a different trigger to distribute ACKs than it does to
- * distribute NACKs, but tries to use the same spacing (divide by 16).
- */
-
-static int bclink_ack_allowed(u32 n)
+static void bclink_update_last_sent(struct tipc_node *node, u32 seqno)
 {
-	return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;
+	node->bclink.last_sent = less_eq(node->bclink.last_sent, seqno) ?
+				 seqno : node->bclink.last_sent;
 }
 
 
-/**
+/*
  * tipc_bclink_retransmit_to - get most recent node to request retransmission
  *
  * Called with bc_lock locked
@@ -300,44 +275,56 @@ exit:
 	spin_unlock_bh(&bc_lock);
 }
 
-/**
- * bclink_send_ack - unicast an ACK msg
+/*
+ * tipc_bclink_update_link_state - update broadcast link state
  *
  * tipc_net_lock and node lock set
  */
 
-static void bclink_send_ack(struct tipc_node *n_ptr)
+void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent)
 {
-	struct tipc_link *l_ptr = n_ptr->active_links[n_ptr->addr & 1];
+	struct sk_buff *buf;
 
-	if (l_ptr != NULL)
-		tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
-}
+	/* Ignore "stale" link state info */
 
-/**
- * bclink_send_nack- broadcast a NACK msg
- *
- * tipc_net_lock and node lock set
- */
+	if (less_eq(last_sent, n_ptr->bclink.last_in))
+		return;
 
-static void bclink_send_nack(struct tipc_node *n_ptr)
-{
-	struct sk_buff *buf;
-	struct tipc_msg *msg;
+	/* Update link synchronization state; quit if in sync */
+
+	bclink_update_last_sent(n_ptr, last_sent);
+
+	if (n_ptr->bclink.last_sent == n_ptr->bclink.last_in)
+		return;
+
+	/* Update out-of-sync state; quit if loss is still unconfirmed */
 
-	if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))
+	if ((++n_ptr->bclink.oos_state) == 1) {
+		if (n_ptr->bclink.deferred_size < (TIPC_MIN_LINK_WIN / 2))
+			return;
+		n_ptr->bclink.oos_state++;
+	}
+
+	/* Don't NACK if one has been recently sent (or seen) */
+
+	if (n_ptr->bclink.oos_state & 0x1)
 		return;
 
+	/* Send NACK */
+
 	buf = tipc_buf_acquire(INT_H_SIZE);
 	if (buf) {
-		msg = buf_msg(buf);
+		struct tipc_msg *msg = buf_msg(buf);
+
 		tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
 			      INT_H_SIZE, n_ptr->addr);
 		msg_set_non_seq(msg, 1);
 		msg_set_mc_netid(msg, tipc_net_id);
-		msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in));
-		msg_set_bcgap_after(msg, n_ptr->bclink.gap_after);
-		msg_set_bcgap_to(msg, n_ptr->bclink.gap_to);
+		msg_set_bcast_ack(msg, n_ptr->bclink.last_in);
+		msg_set_bcgap_after(msg, n_ptr->bclink.last_in);
+		msg_set_bcgap_to(msg, n_ptr->bclink.deferred_head
+				 ? buf_seqno(n_ptr->bclink.deferred_head) - 1
+				 : n_ptr->bclink.last_sent);
 		msg_set_bcast_tag(msg, tipc_own_tag);
 
 		spin_lock_bh(&bc_lock);
@@ -346,96 +333,37 @@ static void bclink_send_nack(struct tipc_node *n_ptr)
 		spin_unlock_bh(&bc_lock);
 		buf_discard(buf);
 
-		/*
-		 * Ensure we doesn't send another NACK msg to the node
-		 * until 16 more deferred messages arrive from it
-		 * (i.e. helps prevent all nodes from NACK'ing at same time)
-		 */
-
-		n_ptr->bclink.nack_sync = tipc_own_tag;
+		n_ptr->bclink.oos_state++;
 	}
 }
 
-/**
- * tipc_bclink_check_gap - send a NACK if a sequence gap exists
+/*
+ * bclink_peek_nack - monitor retransmission requests sent by other nodes
  *
- * tipc_net_lock and node lock set
- */
-
-void tipc_bclink_check_gap(struct tipc_node *n_ptr, u32 last_sent)
-{
-	if (!n_ptr->bclink.supported ||
-	    less_eq(last_sent, mod(n_ptr->bclink.last_in)))
-		return;
-
-	bclink_set_gap(n_ptr);
-	if (n_ptr->bclink.gap_after == n_ptr->bclink.gap_to)
-		n_ptr->bclink.gap_to = last_sent;
-	bclink_send_nack(n_ptr);
-}
-
-/**
- * tipc_bclink_peek_nack - process a NACK msg meant for another node
+ * Delay any upcoming NACK by this node if another node has already
+ * requested the first message this node is going to ask for.
  *
  * Only tipc_net_lock set.
  */
 
-static void tipc_bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 gap_to)
+static void bclink_peek_nack(struct tipc_msg *msg)
 {
-	struct tipc_node *n_ptr = tipc_node_find(dest);
-	u32 my_after, my_to;
+	struct tipc_node *n_ptr = tipc_node_find(msg_destnode(msg));
 
-	if (unlikely(!n_ptr || !tipc_node_is_up(n_ptr)))
+	if (unlikely(!n_ptr))
 		return;
+
 	tipc_node_lock(n_ptr);
-	/*
-	 * Modify gap to suppress unnecessary NACKs from this node
-	 */
-	my_after = n_ptr->bclink.gap_after;
-	my_to = n_ptr->bclink.gap_to;
-
-	if (less_eq(gap_after, my_after)) {
-		if (less(my_after, gap_to) && less(gap_to, my_to))
-			n_ptr->bclink.gap_after = gap_to;
-		else if (less_eq(my_to, gap_to))
-			n_ptr->bclink.gap_to = n_ptr->bclink.gap_after;
-	} else if (less_eq(gap_after, my_to)) {
-		if (less_eq(my_to, gap_to))
-			n_ptr->bclink.gap_to = gap_after;
-	} else {
-		/*
-		 * Expand gap if missing bufs not in deferred queue:
-		 */
-		struct sk_buff *buf = n_ptr->bclink.deferred_head;
-		u32 prev = n_ptr->bclink.gap_to;
 
-		for (; buf; buf = buf->next) {
-			u32 seqno = buf_seqno(buf);
+	if (n_ptr->bclink.supported &&
+	    (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) &&
+	    (n_ptr->bclink.last_in == msg_bcgap_after(msg)))
+		n_ptr->bclink.oos_state = 2;
 
-			if (mod(seqno - prev) != 1) {
-				buf = NULL;
-				break;
-			}
-			if (seqno == gap_after)
-				break;
-			prev = seqno;
-		}
-		if (buf == NULL)
-			n_ptr->bclink.gap_to = gap_after;
-	}
-	/*
-	 * Some nodes may send a complementary NACK now:
-	 */
-	if (bclink_ack_allowed(sender_tag + 1)) {
-		if (n_ptr->bclink.gap_to != n_ptr->bclink.gap_after) {
-			bclink_send_nack(n_ptr);
-			bclink_set_gap(n_ptr);
-		}
-	}
 	tipc_node_unlock(n_ptr);
 }
 
-/**
+/*
  * tipc_bclink_send_msg - broadcast a packet to all nodes in cluster
  */
 
@@ -505,10 +433,7 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf)
 			spin_unlock_bh(&bc_lock);
 		} else {
 			tipc_node_unlock(node);
-			tipc_bclink_peek_nack(msg_destnode(msg),
-					      msg_bcast_tag(msg),
-					      msg_bcgap_after(msg),
-					      msg_bcgap_to(msg));
+			bclink_peek_nack(msg);
 		}
 		goto exit;
 	}
@@ -519,16 +444,28 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf)
 	next_in = mod(node->bclink.last_in + 1);
 
 	if (likely(seqno == next_in)) {
+		bclink_update_last_sent(node, seqno);
 receive:
+		node->bclink.last_in = seqno;
+		node->bclink.oos_state = 0;
+
 		spin_lock_bh(&bc_lock);
 		bcl->stats.recv_info++;
-		node->bclink.last_in++;
-		bclink_set_gap(node);
-		if (unlikely(bclink_ack_allowed(seqno))) {
-			bclink_send_ack(node);
+
+		/*
+		 * Unicast an ACK periodically, ensuring that
+		 * all nodes in the cluster don't ACK at the same time
+		 */
+
+		if (((seqno - tipc_own_addr) % TIPC_MIN_LINK_WIN) == 0) {
+			tipc_link_send_proto_msg(
+				node->active_links[node->addr & 1],
+				STATE_MSG, 0, 0, 0, 0, 0);
 			bcl->stats.sent_acks++;
 		}
 
+		/* Deliver message to destination */
+
 		if (likely(msg_isdata(msg))) {
 			spin_unlock_bh(&bc_lock);
 			tipc_node_unlock(node);
@@ -567,9 +504,14 @@ receive:
 	if (unlikely(!tipc_node_is_up(node)))
 		goto unlock;
 
-	if (!node->bclink.deferred_head)
+	if (node->bclink.last_in == node->bclink.last_sent)
 		goto unlock;
 
+	if (!node->bclink.deferred_head) {
+		node->bclink.oos_state = 1;
+		goto unlock;
+	}
+
 	msg = buf_msg(node->bclink.deferred_head);
 	seqno = msg_seqno(msg);
 	next_in = mod(next_in + 1);
@@ -580,31 +522,19 @@ receive:
 
 		buf = node->bclink.deferred_head;
 		node->bclink.deferred_head = buf->next;
+		node->bclink.deferred_size--;
 		goto receive;
 	}
 
 	/* Handle out-of-sequence broadcast message */
 
 	if (less(next_in, seqno)) {
-		u32 gap_after = node->bclink.gap_after;
-		u32 gap_to = node->bclink.gap_to;
-
 		deferred = tipc_link_defer_pkt(&node->bclink.deferred_head,
 					       &node->bclink.deferred_tail,
 					       buf);
-		if (deferred) {
-			node->bclink.nack_sync++;
-			if (seqno == mod(gap_after + 1))
-				node->bclink.gap_after = seqno;
-			else if (less(gap_after, seqno) && less(seqno, gap_to))
-				node->bclink.gap_to = seqno;
-		}
+		node->bclink.deferred_size += deferred;
+		bclink_update_last_sent(node, seqno);
 		buf = NULL;
-		if (bclink_ack_allowed(node->bclink.nack_sync)) {
-			if (gap_to != gap_after)
-				bclink_send_nack(node);
-			bclink_set_gap(node);
-		}
 	} else
 		deferred = 0;
 