diff options
author | Michael Kelley <mikelley@microsoft.com> | 2018-06-05 16:37:51 -0400 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2018-07-03 07:02:28 -0400 |
commit | 71b38245acb05a38d2d861792bdd99cd9f6a0f78 (patch) | |
tree | a1e2cbc6aa01eba596d3a341fbabc11781de8572 | |
parent | e9a7fda29a5620d9ac2a750d8e35f5d270096321 (diff) |
Drivers: hv: vmbus: Add comments on ring buffer signaling
Add comments describing intricacies of Hyper-V ring buffer
signaling code. This information is not in Hyper-V public
documents, so include here to capture the knowledge for
future coders.
There are no code changes in this commit.
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | drivers/hv/ring_buffer.c | 65 | ||||
-rw-r--r-- | include/linux/hyperv.h | 31 |
2 files changed, 77 insertions, 19 deletions
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index be3c8b10b84a..3e90eb91db45 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c | |||
@@ -431,7 +431,24 @@ static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi, | |||
431 | } | 431 | } |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * Update host ring buffer after iterating over packets. | 434 | * Update host ring buffer after iterating over packets. If the host has |
435 | * stopped queuing new entries because it found the ring buffer full, and | ||
436 | * sufficient space is being freed up, signal the host. But be careful to | ||
437 | * only signal the host when necessary, both for performance reasons and | ||
438 | * because Hyper-V protects itself by throttling guests that signal | ||
439 | * inappropriately. | ||
440 | * | ||
441 | * Determining when to signal is tricky. There are three key data inputs | ||
442 | * that must be handled in this order to avoid race conditions: | ||
443 | * | ||
444 | * 1. Update the read_index | ||
445 | * 2. Read the pending_send_sz | ||
446 | * 3. Read the current write_index | ||
447 | * | ||
448 | * The interrupt_mask is not used to determine when to signal. The | ||
449 | * interrupt_mask is used only on the guest->host ring buffer when | ||
450 | * sending requests to the host. The host does not use it on the host-> | ||
451 | * guest ring buffer to indicate whether it should be signaled. | ||
435 | */ | 452 | */ |
436 | void hv_pkt_iter_close(struct vmbus_channel *channel) | 453 | void hv_pkt_iter_close(struct vmbus_channel *channel) |
437 | { | 454 | { |
@@ -447,22 +464,30 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) | |||
447 | start_read_index = rbi->ring_buffer->read_index; | 464 | start_read_index = rbi->ring_buffer->read_index; |
448 | rbi->ring_buffer->read_index = rbi->priv_read_index; | 465 | rbi->ring_buffer->read_index = rbi->priv_read_index; |
449 | 466 | ||
467 | /* | ||
468 | * Older versions of Hyper-V (before WS2012 and Win8) do not | ||
469 | * implement pending_send_sz and simply poll if the host->guest | ||
470 | * ring buffer is full. No signaling is needed or expected. | ||
471 | */ | ||
450 | if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz) | 472 | if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz) |
451 | return; | 473 | return; |
452 | 474 | ||
453 | /* | 475 | /* |
454 | * Issue a full memory barrier before making the signaling decision. | 476 | * Issue a full memory barrier before making the signaling decision. |
455 | * Here is the reason for having this barrier: | 477 | * If reading pending_send_sz were to be reordered and happen |
456 | * If the reading of the pend_sz (in this function) | 478 | * before we commit the new read_index, a race could occur. If the |
457 | * were to be reordered and read before we commit the new read | 479 | * host were to set the pending_send_sz after we have sampled |
458 | * index (in the calling function) we could | 480 | * pending_send_sz, and the ring buffer blocks before we commit the |
459 | * have a problem. If the host were to set the pending_sz after we | ||
460 | * have sampled pending_sz and go to sleep before we commit the | ||
461 | * read index, we could miss sending the interrupt. Issue a full | 481 | * read index, we could miss sending the interrupt. Issue a full |
462 | * memory barrier to address this. | 482 | * memory barrier to address this. |
463 | */ | 483 | */ |
464 | virt_mb(); | 484 | virt_mb(); |
465 | 485 | ||
486 | /* | ||
487 | * If the pending_send_sz is zero, then the ring buffer is not | ||
488 | * blocked and there is no need to signal. This is by far the | ||
489 | * most common case, so exit quickly for best performance. | ||
490 | */ | ||
466 | pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); | 491 | pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); |
467 | if (!pending_sz) | 492 | if (!pending_sz) |
468 | return; | 493 | return; |
@@ -476,14 +501,32 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) | |||
476 | bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index); | 501 | bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index); |
477 | 502 | ||
478 | /* | 503 | /* |
479 | * If there was space before we began iteration, | 504 | * We want to signal the host only if we're transitioning |
480 | * then host was not blocked. | 505 | * from a "not enough free space" state to an "enough free |
506 | * space" state. For example, it's possible that this function | ||
507 | * could run and free up enough space to signal the host, and then | ||
508 | * run again and free up additional space before the host has a | ||
509 | * chance to clear the pending_send_sz. The 2nd invocation would | ||
510 | * be a null transition from "enough free space" to "enough free | ||
511 | * space", which doesn't warrant a signal. | ||
512 | * | ||
513 | * Exactly filling the ring buffer is treated as "not enough | ||
514 | * space". The ring buffer always must have at least one byte | ||
515 | * empty so the empty and full conditions are distinguishable. | ||
516 | * hv_get_bytes_to_write() doesn't fully tell the truth in | ||
517 | * this regard. | ||
518 | * | ||
519 | * So first check if we were in the "enough free space" state | ||
520 | * before we began the iteration. If so, the host was not | ||
521 | * blocked, and there's no need to signal. | ||
481 | */ | 522 | */ |
482 | |||
483 | if (curr_write_sz - bytes_read > pending_sz) | 523 | if (curr_write_sz - bytes_read > pending_sz) |
484 | return; | 524 | return; |
485 | 525 | ||
486 | /* If pending write will not fit, don't give false hope. */ | 526 | /* |
527 | * Similarly, if the new state is "not enough space", then | ||
528 | * there's no need to signal. | ||
529 | */ | ||
487 | if (curr_write_sz <= pending_sz) | 530 | if (curr_write_sz <= pending_sz) |
488 | return; | 531 | return; |
489 | 532 | ||
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 3a3012f57be4..2330f08062c7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h | |||
@@ -89,18 +89,33 @@ struct hv_ring_buffer { | |||
89 | u32 interrupt_mask; | 89 | u32 interrupt_mask; |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Win8 uses some of the reserved bits to implement | 92 | * WS2012/Win8 and later versions of Hyper-V implement interrupt |
93 | * interrupt driven flow management. On the send side | 93 | * driven flow management. The feature bit feat_pending_send_sz |
94 | * we can request that the receiver interrupt the sender | 94 | * is set by the host on the host->guest ring buffer, and by the |
95 | * when the ring transitions from being full to being able | 95 | * guest on the guest->host ring buffer. |
96 | * to handle a message of size "pending_send_sz". | ||
97 | * | 96 | * |
98 | * Add necessary state for this enhancement. | 97 | * The meaning of the feature bit is a bit complex in that it has |
98 | * semantics that apply to both ring buffers. If the guest sets | ||
99 | * the feature bit in the guest->host ring buffer, the guest is | ||
100 | * telling the host that: | ||
101 | * 1) It will set the pending_send_sz field in the guest->host ring | ||
102 | * buffer when it is waiting for space to become available, and | ||
103 | * 2) It will read the pending_send_sz field in the host->guest | ||
104 | * ring buffer and interrupt the host when it frees enough space | ||
105 | * | ||
106 | * Similarly, if the host sets the feature bit in the host->guest | ||
107 | * ring buffer, the host is telling the guest that: | ||
108 | * 1) It will set the pending_send_sz field in the host->guest ring | ||
109 | * buffer when it is waiting for space to become available, and | ||
110 | * 2) It will read the pending_send_sz field in the guest->host | ||
111 | * ring buffer and interrupt the guest when it frees enough space | ||
112 | * | ||
113 | * If either the guest or host does not set the feature bit that it | ||
114 | * owns, that guest or host must do polling if it encounters a full | ||
115 | * ring buffer, and not signal the other end with an interrupt. | ||
99 | */ | 116 | */ |
100 | u32 pending_send_sz; | 117 | u32 pending_send_sz; |
101 | |||
102 | u32 reserved1[12]; | 118 | u32 reserved1[12]; |
103 | |||
104 | union { | 119 | union { |
105 | struct { | 120 | struct { |
106 | u32 feat_pending_send_sz:1; | 121 | u32 feat_pending_send_sz:1; |