author     David Matlack <dmatlack@google.com>      2014-08-18 18:46:06 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>      2014-09-03 04:03:41 -0400
commit     ee3d1570b58677885b4552bce8217fda7b226a68
tree       bfc831cf5b242321553fbd6a702085a1916fd04c
parent     00f034a12fdd81210d58116326d92780aac5c238
kvm: fix potentially corrupt mmio cache
vcpu exits and memslot mutations can run concurrently as long as the
vcpu does not acquire the slots mutex. Thus it is theoretically possible
for memslots to change underneath a vcpu that is handling an exit.

If we increment the memslot generation number again after
synchronize_srcu_expedited(), vcpus can safely cache memslot generation
without maintaining a single rcu_dereference through an entire vm exit.
And much of the x86/kvm code does not maintain a single rcu_dereference
of the current memslots during each exit.

We can prevent the following case:

   vcpu (CPU 0)                             | thread (CPU 1)
--------------------------------------------+--------------------------
1  vm exit                                  |
2  srcu_read_unlock(&kvm->srcu)             |
3  decide to cache something based on       |
     old memslots                           |
4                                           | change memslots
                                            | (increments generation)
5                                           | synchronize_srcu(&kvm->srcu);
6  retrieve generation # from new memslots  |
7  tag cache with new memslot generation    |
8  srcu_read_unlock(&kvm->srcu)             |
   ...                                      |
   <action based on cache occurs even       |
    though the caching decision was based   |
    on the old memslots>                    |
   ...                                      |
   <action *continues* to occur until next  |
    memslot generation change, which may    |
    be never>                               |
                                            |

By incrementing the generation after synchronizing with kvm->srcu readers,
we ensure that the generation retrieved in (6) will become invalid soon
after (8).

Keeping the existing increment is not strictly necessary, but we do keep
it and just move it for consistency from update_memslots to
install_new_memslots.  It invalidates old cached MMIOs immediately,
instead of having to wait for the end of synchronize_srcu_expedited,
which makes the code more clearly correct in case CPU 1 is preempted
right after synchronize_srcu() returns.

To avoid halving the generation space in SPTEs, always presume that the
low bit of the generation is zero when reconstructing a generation number
out of an SPTE.  This effectively disables MMIO caching in SPTEs during
the call to synchronize_srcu_expedited.  Using the low bit this way is
somewhat like a seqcount---where the protected thing is a cache, and
instead of retrying we can simply punt if we observe the low bit to be 1.

Cc: stable@vger.kernel.org
Signed-off-by: David Matlack <dmatlack@google.com>
Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
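To make the ordering above concrete, here is a minimal user-space sketch of the
two-step generation bump that the kvm_main.c hunk below implements. It is not
the kernel code: struct fake_memslots, fake_synchronize_readers() and
fake_install_new_memslots() are illustrative stand-ins for kvm_memslots,
synchronize_srcu_expedited() and install_new_memslots().

#include <assert.h>
#include <stdio.h>

struct fake_memslots {
        unsigned long generation;       /* low bit is 1 while an update is in flight */
};

static void fake_synchronize_readers(void)
{
        /* stand-in for synchronize_srcu_expedited(&kvm->srcu) */
}

static void fake_install_new_memslots(struct fake_memslots *old_slots,
                                      struct fake_memslots *new_slots)
{
        /*
         * First bump: the generation goes odd, which tells the MMIO caching
         * code to punt instead of caching while old readers may still run.
         */
        assert(!(old_slots->generation & 1));
        new_slots->generation = old_slots->generation + 1;

        /* ... publish new_slots here, then wait out the existing readers ... */
        fake_synchronize_readers();

        /*
         * Second bump: the generation goes even again, so anything tagged
         * during the update window is immediately stale.
         */
        new_slots->generation++;
}

int main(void)
{
        struct fake_memslots old = { .generation = 4 }, new_slots;

        fake_install_new_memslots(&old, &new_slots);
        printf("generation after update: %lu\n", new_slots.generation);  /* prints 6 */
        return 0;
}

The only point of the sketch is the invariant: the generation is odd exactly
while old readers may still be running, and ends up two higher than before
once they are gone.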
 Documentation/virtual/kvm/mmu.txt | 14 ++++++++++++++
 arch/x86/kvm/mmu.c                | 20 ++++++++++++--------
 virt/kvm/kvm_main.c               | 23 ++++++++++++++++-------
 3 files changed, 42 insertions(+), 15 deletions(-)
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 290894176142..53838d9c6295 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -425,6 +425,20 @@ fault through the slow path.
 Since only 19 bits are used to store generation-number on mmio spte, all
 pages are zapped when there is an overflow.
 
+Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
+times, the last one happening when the generation number is retrieved and
+stored into the MMIO spte.  Thus, the MMIO spte might be created based on
+out-of-date information, but with an up-to-date generation number.
+
+To avoid this, the generation number is incremented again after synchronize_srcu
+returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+memslot update, while some SRCU readers might be using the old copy.  We do not
+want to use an MMIO sptes created with an odd generation number, and we can do
+this without losing a bit in the MMIO spte.  The low bit of the generation
+is not stored in MMIO spte, and presumed zero when it is extracted out of the
+spte.  If KVM is unlucky and creates an MMIO spte while the low bit is 1,
+the next access to the spte will always be a cache miss.
+
 
 Further reading
 ===============
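The paragraphs added to mmu.txt above describe dropping the generation's low
bit when it is packed into an MMIO spte. The user-space sketch below
illustrates that encoding with the same shift/mask values introduced by the
mmu.c hunk that follows; pack_gen() and unpack_gen() are simplified stand-ins
for the kernel's spte packing helpers, not copies of them.

#include <stdint.h>
#include <stdio.h>

#define MMIO_SPTE_GEN_LOW_SHIFT   2
#define MMIO_SPTE_GEN_HIGH_SHIFT  52

#define MMIO_GEN_SHIFT            20
#define MMIO_GEN_LOW_SHIFT        10
#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK             ((1 << MMIO_GEN_SHIFT) - 1)

/* Pack a generation into the spte generation bits; bit 0 is deliberately dropped. */
static uint64_t pack_gen(unsigned int gen)
{
        uint64_t mask;

        mask  = (uint64_t)(gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
        mask |= ((uint64_t)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
        return mask;
}

/* Recover the generation from the spte bits; its low bit always comes back as zero. */
static unsigned int unpack_gen(uint64_t spte)
{
        unsigned int gen;

        gen  = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
        gen |= ((spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT) & MMIO_GEN_MASK;
        return gen;
}

int main(void)
{
        /* An spte created during an update (odd generation) loses bit 0... */
        printf("%u -> %u\n", 7u, unpack_gen(pack_gen(7)));      /* 7 -> 6 */
        /* ...while one created with an even generation round-trips exactly. */
        printf("%u -> %u\n", 8u, unpack_gen(pack_gen(8)));      /* 8 -> 8 */
        return 0;
}

An spte tagged while the low bit is 1 (generation 7 in the example) unpacks
to 6, so it matches neither the in-flight generation 7 nor the post-update
generation 8, and the next access falls back to the slow path.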
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 323c3f5f5c84..96515957ba82 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
 /*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates.  The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
  */
-#define MMIO_SPTE_GEN_LOW_SHIFT   3
+#define MMIO_SPTE_GEN_LOW_SHIFT   2
 #define MMIO_SPTE_GEN_HIGH_SHIFT  52
 
-#define MMIO_GEN_SHIFT            19
-#define MMIO_GEN_LOW_SHIFT        9
-#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT            20
+#define MMIO_GEN_LOW_SHIFT        10
+#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 2)
 #define MMIO_GEN_MASK             ((1 << MMIO_GEN_SHIFT) - 1)
 #define MMIO_MAX_GEN              ((1 << MMIO_GEN_SHIFT) - 1)
 
@@ -4428,7 +4432,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
         * The very rare case: if the generation-number is round,
         * zap all shadow pages.
         */
-       if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+       if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
                printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
                kvm_mmu_invalidate_zap_all_pages(kvm);
        }
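As a hedged illustration of why the wraparound test above changes from
">= MMIO_MAX_GEN" to "== 0": assuming, as in this kernel version, that
kvm_mmu_invalidate_mmio_sptes() is reached from kvm_arch_memslots_updated()
after the second increment, each memslot update has already bumped the
generation twice, so the 20-bit masked value seen here is always even and can
never reach the odd MMIO_MAX_GEN; wraparound instead shows up as the masked
value returning to zero. The sketch below only demonstrates that arithmetic;
current_mmio_generation() is a stand-in for kvm_current_mmio_generation().

#include <stdio.h>

#define MMIO_GEN_SHIFT  20
#define MMIO_GEN_MASK   ((1 << MMIO_GEN_SHIFT) - 1)

/* stand-in for kvm_current_mmio_generation(): mask the slots generation */
static unsigned int current_mmio_generation(unsigned long generation)
{
        return generation & MMIO_GEN_MASK;
}

int main(void)
{
        unsigned long generation = 0;
        unsigned long updates;

        /*
         * Each memslot update now bumps the generation twice (odd while in
         * flight, even afterwards), so after 2^19 updates the 20-bit masked
         * value wraps back to zero and all shadow pages must be zapped.
         */
        for (updates = 0; updates < (1UL << (MMIO_GEN_SHIFT - 1)); updates++)
                generation += 2;

        if (current_mmio_generation(generation) == 0)
                printf("wraparound after %lu updates: zap all shadow pages\n", updates);
        return 0;
}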
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0bfdb673db26..bb8641b5d83b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,8 +95,6 @@ static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new, u64 last_generation);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -695,8 +693,7 @@ static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-                           struct kvm_memory_slot *new,
-                           u64 last_generation)
+                           struct kvm_memory_slot *new)
 {
        if (new) {
                int id = new->id;
@@ -707,8 +704,6 @@ static void update_memslots(struct kvm_memslots *slots,
                if (new->npages != npages)
                        sort_memslots(slots);
        }
-
-       slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -730,10 +725,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
        struct kvm_memslots *old_memslots = kvm->memslots;
 
-       update_memslots(slots, new, kvm->memslots->generation);
+       /*
+        * Set the low bit in the generation, which disables SPTE caching
+        * until the end of synchronize_srcu_expedited.
+        */
+       WARN_ON(old_memslots->generation & 1);
+       slots->generation = old_memslots->generation + 1;
+
+       update_memslots(slots, new);
        rcu_assign_pointer(kvm->memslots, slots);
        synchronize_srcu_expedited(&kvm->srcu);
 
+       /*
+        * Increment the new memslot generation a second time. This prevents
+        * vm exits that race with memslot updates from caching a memslot
+        * generation that will (potentially) be valid forever.
+        */
+       slots->generation++;
+
        kvm_arch_memslots_updated(kvm);
 
        return old_memslots;
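For completeness, here is a reader-side sketch of how a cached generation is
validated, modeled loosely on the MMIO spte check in mmu.c (check_mmio_spte());
mmio_cache_hit() below is a stand-in, not the kernel function. It shows that a
tag created during the odd update window, stored with its low bit cleared,
misses against both the in-flight and the post-update generation.

#include <stdbool.h>
#include <stdio.h>

#define MMIO_GEN_MASK   ((1 << 20) - 1)

/* stand-in for the spte validity test: the cached tag must match exactly */
static bool mmio_cache_hit(unsigned int cached_gen, unsigned long generation)
{
        return cached_gen == (generation & MMIO_GEN_MASK);
}

int main(void)
{
        unsigned long generation = 4;   /* even: no update in flight */
        unsigned int cached_gen;

        generation++;                   /* first bump: update starts, generation is 5 */
        cached_gen = generation & ~1UL; /* an spte tagged now stores 4 (low bit dropped) */
        generation++;                   /* second bump: update done, generation is 6 */

        /* The stale tag matches neither 5 nor 6, so the next access misses. */
        printf("hit: %d\n", mmio_cache_hit(cached_gen, generation));    /* hit: 0 */
        return 0;
}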