aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Kelley <mikelley@microsoft.com>2018-06-05 16:37:51 -0400
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2018-07-03 07:02:28 -0400
commit71b38245acb05a38d2d861792bdd99cd9f6a0f78 (patch)
treea1e2cbc6aa01eba596d3a341fbabc11781de8572
parente9a7fda29a5620d9ac2a750d8e35f5d270096321 (diff)
Drivers: hv: vmbus: Add comments on ring buffer signaling
Add comments describing intricacies of Hyper-V ring buffer signaling code. This information is not in Hyper-V public documents, so include here to capture the knowledge for future coders. There are no code changes in this commit. Signed-off-by: Michael Kelley <mikelley@microsoft.com> Signed-off-by: K. Y. Srinivasan <kys@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--drivers/hv/ring_buffer.c65
-rw-r--r--include/linux/hyperv.h31
2 files changed, 77 insertions, 19 deletions
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index be3c8b10b84a..3e90eb91db45 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -431,7 +431,24 @@ static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
431} 431}
432 432
433/* 433/*
434 * Update host ring buffer after iterating over packets. 434 * Update host ring buffer after iterating over packets. If the host has
435 * stopped queuing new entries because it found the ring buffer full, and
436 * sufficient space is being freed up, signal the host. But be careful to
437 * only signal the host when necessary, both for performance reasons and
438 * because Hyper-V protects itself by throttling guests that signal
439 * inappropriately.
440 *
441 * Determining when to signal is tricky. There are three key data inputs
442 * that must be handled in this order to avoid race conditions:
443 *
444 * 1. Update the read_index
445 * 2. Read the pending_send_sz
446 * 3. Read the current write_index
447 *
448 * The interrupt_mask is not used to determine when to signal. The
449 * interrupt_mask is used only on the guest->host ring buffer when
450 * sending requests to the host. The host does not use it on the host->
451 * guest ring buffer to indicate whether it should be signaled.
435 */ 452 */
436void hv_pkt_iter_close(struct vmbus_channel *channel) 453void hv_pkt_iter_close(struct vmbus_channel *channel)
437{ 454{
@@ -447,22 +464,30 @@ void hv_pkt_iter_close(struct vmbus_channel *channel)
447 start_read_index = rbi->ring_buffer->read_index; 464 start_read_index = rbi->ring_buffer->read_index;
448 rbi->ring_buffer->read_index = rbi->priv_read_index; 465 rbi->ring_buffer->read_index = rbi->priv_read_index;
449 466
467 /*
 468 * Older versions of Hyper-V (before WS2012 and Win8) do not
469 * implement pending_send_sz and simply poll if the host->guest
470 * ring buffer is full. No signaling is needed or expected.
471 */
450 if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz) 472 if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
451 return; 473 return;
452 474
453 /* 475 /*
454 * Issue a full memory barrier before making the signaling decision. 476 * Issue a full memory barrier before making the signaling decision.
455 * Here is the reason for having this barrier: 477 * If reading pending_send_sz were to be reordered and happen
456 * If the reading of the pend_sz (in this function) 478 * before we commit the new read_index, a race could occur. If the
457 * were to be reordered and read before we commit the new read 479 * host were to set the pending_send_sz after we have sampled
458 * index (in the calling function) we could 480 * pending_send_sz, and the ring buffer blocks before we commit the
459 * have a problem. If the host were to set the pending_sz after we
460 * have sampled pending_sz and go to sleep before we commit the
461 * read index, we could miss sending the interrupt. Issue a full 481 * read index, we could miss sending the interrupt. Issue a full
462 * memory barrier to address this. 482 * memory barrier to address this.
463 */ 483 */
464 virt_mb(); 484 virt_mb();
465 485
486 /*
487 * If the pending_send_sz is zero, then the ring buffer is not
 488 * blocked and there is no need to signal. This is by far the
489 * most common case, so exit quickly for best performance.
490 */
466 pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); 491 pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
467 if (!pending_sz) 492 if (!pending_sz)
468 return; 493 return;
@@ -476,14 +501,32 @@ void hv_pkt_iter_close(struct vmbus_channel *channel)
476 bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index); 501 bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);
477 502
478 /* 503 /*
479 * If there was space before we began iteration, 504 * We want to signal the host only if we're transitioning
 480 * then host was not blocked. 505 * from a "not enough free space" state to an "enough free
506 * space" state. For example, it's possible that this function
507 * could run and free up enough space to signal the host, and then
508 * run again and free up additional space before the host has a
509 * chance to clear the pending_send_sz. The 2nd invocation would
510 * be a null transition from "enough free space" to "enough free
511 * space", which doesn't warrant a signal.
512 *
513 * Exactly filling the ring buffer is treated as "not enough
514 * space". The ring buffer always must have at least one byte
515 * empty so the empty and full conditions are distinguishable.
516 * hv_get_bytes_to_write() doesn't fully tell the truth in
517 * this regard.
518 *
519 * So first check if we were in the "enough free space" state
520 * before we began the iteration. If so, the host was not
521 * blocked, and there's no need to signal.
481 */ 522 */
482
483 if (curr_write_sz - bytes_read > pending_sz) 523 if (curr_write_sz - bytes_read > pending_sz)
484 return; 524 return;
485 525
486 /* If pending write will not fit, don't give false hope. */ 526 /*
527 * Similarly, if the new state is "not enough space", then
528 * there's no need to signal.
529 */
487 if (curr_write_sz <= pending_sz) 530 if (curr_write_sz <= pending_sz)
488 return; 531 return;
489 532
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3a3012f57be4..2330f08062c7 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -89,18 +89,33 @@ struct hv_ring_buffer {
89 u32 interrupt_mask; 89 u32 interrupt_mask;
90 90
91 /* 91 /*
92 * Win8 uses some of the reserved bits to implement 92 * WS2012/Win8 and later versions of Hyper-V implement interrupt
93 * interrupt driven flow management. On the send side 93 * driven flow management. The feature bit feat_pending_send_sz
94 * we can request that the receiver interrupt the sender 94 * is set by the host on the host->guest ring buffer, and by the
95 * when the ring transitions from being full to being able 95 * guest on the guest->host ring buffer.
96 * to handle a message of size "pending_send_sz".
97 * 96 *
98 * Add necessary state for this enhancement. 97 * The meaning of the feature bit is a bit complex in that it has
98 * semantics that apply to both ring buffers. If the guest sets
99 * the feature bit in the guest->host ring buffer, the guest is
100 * telling the host that:
101 * 1) It will set the pending_send_sz field in the guest->host ring
102 * buffer when it is waiting for space to become available, and
103 * 2) It will read the pending_send_sz field in the host->guest
104 * ring buffer and interrupt the host when it frees enough space
105 *
106 * Similarly, if the host sets the feature bit in the host->guest
107 * ring buffer, the host is telling the guest that:
108 * 1) It will set the pending_send_sz field in the host->guest ring
109 * buffer when it is waiting for space to become available, and
110 * 2) It will read the pending_send_sz field in the guest->host
111 * ring buffer and interrupt the guest when it frees enough space
112 *
113 * If either the guest or host does not set the feature bit that it
114 * owns, that guest or host must do polling if it encounters a full
115 * ring buffer, and not signal the other end with an interrupt.
99 */ 116 */
100 u32 pending_send_sz; 117 u32 pending_send_sz;
101
102 u32 reserved1[12]; 118 u32 reserved1[12];
103
104 union { 119 union {
105 struct { 120 struct {
106 u32 feat_pending_send_sz:1; 121 u32 feat_pending_send_sz:1;