diff options
author | Marcelo Tosatti <mtosatti@redhat.com> | 2009-12-23 11:35:21 -0500 |
---|---|---|
committer | Marcelo Tosatti <mtosatti@redhat.com> | 2010-03-01 10:35:44 -0500 |
commit | bc6678a33d9b952981a8e44a4f876c3ad64ca4d8 (patch) | |
tree | e26027179eb0d76f234509145a395dd6e5910074 /virt/kvm/kvm_main.c | |
parent | 3ad26d8139a82b0510b1e0435ee82ae461d33401 (diff) |
KVM: introduce kvm->srcu and convert kvm_set_memory_region to SRCU update
Use two steps for memslot deletion: mark the slot invalid (which stops
instantiation of new shadow pages for that slot, but allows destruction),
then instantiate the new empty slot.
Also simplifies kvm_handle_hva locking.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 141 |
1 files changed, 105 insertions, 36 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 87d296d8b270..2bb24a814fdf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/bitops.h> | 44 | #include <linux/bitops.h> |
45 | #include <linux/spinlock.h> | 45 | #include <linux/spinlock.h> |
46 | #include <linux/compat.h> | 46 | #include <linux/compat.h> |
47 | #include <linux/srcu.h> | ||
47 | 48 | ||
48 | #include <asm/processor.h> | 49 | #include <asm/processor.h> |
49 | #include <asm/io.h> | 50 | #include <asm/io.h> |
@@ -213,7 +214,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | |||
213 | unsigned long address) | 214 | unsigned long address) |
214 | { | 215 | { |
215 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | 216 | struct kvm *kvm = mmu_notifier_to_kvm(mn); |
216 | int need_tlb_flush; | 217 | int need_tlb_flush, idx; |
217 | 218 | ||
218 | /* | 219 | /* |
219 | * When ->invalidate_page runs, the linux pte has been zapped | 220 | * When ->invalidate_page runs, the linux pte has been zapped |
@@ -233,10 +234,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | |||
233 | * pte after kvm_unmap_hva returned, without noticing the page | 234 | * pte after kvm_unmap_hva returned, without noticing the page |
234 | * is going to be freed. | 235 | * is going to be freed. |
235 | */ | 236 | */ |
237 | idx = srcu_read_lock(&kvm->srcu); | ||
236 | spin_lock(&kvm->mmu_lock); | 238 | spin_lock(&kvm->mmu_lock); |
237 | kvm->mmu_notifier_seq++; | 239 | kvm->mmu_notifier_seq++; |
238 | need_tlb_flush = kvm_unmap_hva(kvm, address); | 240 | need_tlb_flush = kvm_unmap_hva(kvm, address); |
239 | spin_unlock(&kvm->mmu_lock); | 241 | spin_unlock(&kvm->mmu_lock); |
242 | srcu_read_unlock(&kvm->srcu, idx); | ||
240 | 243 | ||
241 | /* we've to flush the tlb before the pages can be freed */ | 244 | /* we've to flush the tlb before the pages can be freed */ |
242 | if (need_tlb_flush) | 245 | if (need_tlb_flush) |
@@ -250,11 +253,14 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, | |||
250 | pte_t pte) | 253 | pte_t pte) |
251 | { | 254 | { |
252 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | 255 | struct kvm *kvm = mmu_notifier_to_kvm(mn); |
256 | int idx; | ||
253 | 257 | ||
258 | idx = srcu_read_lock(&kvm->srcu); | ||
254 | spin_lock(&kvm->mmu_lock); | 259 | spin_lock(&kvm->mmu_lock); |
255 | kvm->mmu_notifier_seq++; | 260 | kvm->mmu_notifier_seq++; |
256 | kvm_set_spte_hva(kvm, address, pte); | 261 | kvm_set_spte_hva(kvm, address, pte); |
257 | spin_unlock(&kvm->mmu_lock); | 262 | spin_unlock(&kvm->mmu_lock); |
263 | srcu_read_unlock(&kvm->srcu, idx); | ||
258 | } | 264 | } |
259 | 265 | ||
260 | static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | 266 | static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, |
@@ -263,8 +269,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
263 | unsigned long end) | 269 | unsigned long end) |
264 | { | 270 | { |
265 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | 271 | struct kvm *kvm = mmu_notifier_to_kvm(mn); |
266 | int need_tlb_flush = 0; | 272 | int need_tlb_flush = 0, idx; |
267 | 273 | ||
274 | idx = srcu_read_lock(&kvm->srcu); | ||
268 | spin_lock(&kvm->mmu_lock); | 275 | spin_lock(&kvm->mmu_lock); |
269 | /* | 276 | /* |
270 | * The count increase must become visible at unlock time as no | 277 | * The count increase must become visible at unlock time as no |
@@ -275,6 +282,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
275 | for (; start < end; start += PAGE_SIZE) | 282 | for (; start < end; start += PAGE_SIZE) |
276 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | 283 | need_tlb_flush |= kvm_unmap_hva(kvm, start); |
277 | spin_unlock(&kvm->mmu_lock); | 284 | spin_unlock(&kvm->mmu_lock); |
285 | srcu_read_unlock(&kvm->srcu, idx); | ||
278 | 286 | ||
279 | /* we've to flush the tlb before the pages can be freed */ | 287 | /* we've to flush the tlb before the pages can be freed */ |
280 | if (need_tlb_flush) | 288 | if (need_tlb_flush) |
@@ -312,11 +320,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, | |||
312 | unsigned long address) | 320 | unsigned long address) |
313 | { | 321 | { |
314 | struct kvm *kvm = mmu_notifier_to_kvm(mn); | 322 | struct kvm *kvm = mmu_notifier_to_kvm(mn); |
315 | int young; | 323 | int young, idx; |
316 | 324 | ||
325 | idx = srcu_read_lock(&kvm->srcu); | ||
317 | spin_lock(&kvm->mmu_lock); | 326 | spin_lock(&kvm->mmu_lock); |
318 | young = kvm_age_hva(kvm, address); | 327 | young = kvm_age_hva(kvm, address); |
319 | spin_unlock(&kvm->mmu_lock); | 328 | spin_unlock(&kvm->mmu_lock); |
329 | srcu_read_unlock(&kvm->srcu, idx); | ||
320 | 330 | ||
321 | if (young) | 331 | if (young) |
322 | kvm_flush_remote_tlbs(kvm); | 332 | kvm_flush_remote_tlbs(kvm); |
@@ -379,11 +389,15 @@ static struct kvm *kvm_create_vm(void) | |||
379 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 389 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
380 | if (!kvm->memslots) | 390 | if (!kvm->memslots) |
381 | goto out_err; | 391 | goto out_err; |
392 | if (init_srcu_struct(&kvm->srcu)) | ||
393 | goto out_err; | ||
382 | 394 | ||
383 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET | 395 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET |
384 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 396 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
385 | if (!page) | 397 | if (!page) { |
398 | cleanup_srcu_struct(&kvm->srcu); | ||
386 | goto out_err; | 399 | goto out_err; |
400 | } | ||
387 | 401 | ||
388 | kvm->coalesced_mmio_ring = | 402 | kvm->coalesced_mmio_ring = |
389 | (struct kvm_coalesced_mmio_ring *)page_address(page); | 403 | (struct kvm_coalesced_mmio_ring *)page_address(page); |
@@ -391,6 +405,7 @@ static struct kvm *kvm_create_vm(void) | |||
391 | 405 | ||
392 | r = kvm_init_mmu_notifier(kvm); | 406 | r = kvm_init_mmu_notifier(kvm); |
393 | if (r) { | 407 | if (r) { |
408 | cleanup_srcu_struct(&kvm->srcu); | ||
394 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET | 409 | #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET |
395 | put_page(page); | 410 | put_page(page); |
396 | #endif | 411 | #endif |
@@ -480,6 +495,7 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
480 | #else | 495 | #else |
481 | kvm_arch_flush_shadow(kvm); | 496 | kvm_arch_flush_shadow(kvm); |
482 | #endif | 497 | #endif |
498 | cleanup_srcu_struct(&kvm->srcu); | ||
483 | kvm_arch_destroy_vm(kvm); | 499 | kvm_arch_destroy_vm(kvm); |
484 | hardware_disable_all(); | 500 | hardware_disable_all(); |
485 | mmdrop(mm); | 501 | mmdrop(mm); |
@@ -521,12 +537,13 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
521 | struct kvm_userspace_memory_region *mem, | 537 | struct kvm_userspace_memory_region *mem, |
522 | int user_alloc) | 538 | int user_alloc) |
523 | { | 539 | { |
524 | int r; | 540 | int r, flush_shadow = 0; |
525 | gfn_t base_gfn; | 541 | gfn_t base_gfn; |
526 | unsigned long npages; | 542 | unsigned long npages; |
527 | unsigned long i; | 543 | unsigned long i; |
528 | struct kvm_memory_slot *memslot; | 544 | struct kvm_memory_slot *memslot; |
529 | struct kvm_memory_slot old, new; | 545 | struct kvm_memory_slot old, new; |
546 | struct kvm_memslots *slots, *old_memslots; | ||
530 | 547 | ||
531 | r = -EINVAL; | 548 | r = -EINVAL; |
532 | /* General sanity checks */ | 549 | /* General sanity checks */ |
@@ -588,15 +605,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
588 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | 605 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); |
589 | 606 | ||
590 | new.user_alloc = user_alloc; | 607 | new.user_alloc = user_alloc; |
591 | /* | 608 | new.userspace_addr = mem->userspace_addr; |
592 | * hva_to_rmmap() serialzies with the mmu_lock and to be | ||
593 | * safe it has to ignore memslots with !user_alloc && | ||
594 | * !userspace_addr. | ||
595 | */ | ||
596 | if (user_alloc) | ||
597 | new.userspace_addr = mem->userspace_addr; | ||
598 | else | ||
599 | new.userspace_addr = 0; | ||
600 | } | 609 | } |
601 | if (!npages) | 610 | if (!npages) |
602 | goto skip_lpage; | 611 | goto skip_lpage; |
@@ -651,8 +660,9 @@ skip_lpage: | |||
651 | if (!new.dirty_bitmap) | 660 | if (!new.dirty_bitmap) |
652 | goto out_free; | 661 | goto out_free; |
653 | memset(new.dirty_bitmap, 0, dirty_bytes); | 662 | memset(new.dirty_bitmap, 0, dirty_bytes); |
663 | /* destroy any largepage mappings for dirty tracking */ | ||
654 | if (old.npages) | 664 | if (old.npages) |
655 | kvm_arch_flush_shadow(kvm); | 665 | flush_shadow = 1; |
656 | } | 666 | } |
657 | #else /* not defined CONFIG_S390 */ | 667 | #else /* not defined CONFIG_S390 */ |
658 | new.user_alloc = user_alloc; | 668 | new.user_alloc = user_alloc; |
@@ -660,34 +670,72 @@ skip_lpage: | |||
660 | new.userspace_addr = mem->userspace_addr; | 670 | new.userspace_addr = mem->userspace_addr; |
661 | #endif /* not defined CONFIG_S390 */ | 671 | #endif /* not defined CONFIG_S390 */ |
662 | 672 | ||
663 | if (!npages) | 673 | if (!npages) { |
674 | r = -ENOMEM; | ||
675 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | ||
676 | if (!slots) | ||
677 | goto out_free; | ||
678 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | ||
679 | if (mem->slot >= slots->nmemslots) | ||
680 | slots->nmemslots = mem->slot + 1; | ||
681 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; | ||
682 | |||
683 | old_memslots = kvm->memslots; | ||
684 | rcu_assign_pointer(kvm->memslots, slots); | ||
685 | synchronize_srcu_expedited(&kvm->srcu); | ||
686 | /* From this point no new shadow pages pointing to a deleted | ||
687 | * memslot will be created. | ||
688 | * | ||
689 | * validation of sp->gfn happens in: | ||
690 | * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) | ||
691 | * - kvm_is_visible_gfn (mmu_check_roots) | ||
692 | */ | ||
664 | kvm_arch_flush_shadow(kvm); | 693 | kvm_arch_flush_shadow(kvm); |
694 | kfree(old_memslots); | ||
695 | } | ||
665 | 696 | ||
666 | r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); | 697 | r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); |
667 | if (r) | 698 | if (r) |
668 | goto out_free; | 699 | goto out_free; |
669 | 700 | ||
670 | spin_lock(&kvm->mmu_lock); | 701 | #ifdef CONFIG_DMAR |
671 | if (mem->slot >= kvm->memslots->nmemslots) | 702 | /* map the pages in iommu page table */ |
672 | kvm->memslots->nmemslots = mem->slot + 1; | 703 | if (npages) { |
704 | r = kvm_iommu_map_pages(kvm, &new); | ||
705 | if (r) | ||
706 | goto out_free; | ||
707 | } | ||
708 | #endif | ||
673 | 709 | ||
674 | *memslot = new; | 710 | r = -ENOMEM; |
675 | spin_unlock(&kvm->mmu_lock); | 711 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
712 | if (!slots) | ||
713 | goto out_free; | ||
714 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | ||
715 | if (mem->slot >= slots->nmemslots) | ||
716 | slots->nmemslots = mem->slot + 1; | ||
717 | |||
718 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | ||
719 | if (!npages) { | ||
720 | new.rmap = NULL; | ||
721 | new.dirty_bitmap = NULL; | ||
722 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) | ||
723 | new.lpage_info[i] = NULL; | ||
724 | } | ||
725 | |||
726 | slots->memslots[mem->slot] = new; | ||
727 | old_memslots = kvm->memslots; | ||
728 | rcu_assign_pointer(kvm->memslots, slots); | ||
729 | synchronize_srcu_expedited(&kvm->srcu); | ||
676 | 730 | ||
677 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); | 731 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); |
678 | 732 | ||
679 | kvm_free_physmem_slot(&old, npages ? &new : NULL); | 733 | kvm_free_physmem_slot(&old, &new); |
680 | /* Slot deletion case: we have to update the current slot */ | 734 | kfree(old_memslots); |
681 | spin_lock(&kvm->mmu_lock); | 735 | |
682 | if (!npages) | 736 | if (flush_shadow) |
683 | *memslot = old; | 737 | kvm_arch_flush_shadow(kvm); |
684 | spin_unlock(&kvm->mmu_lock); | 738 | |
685 | #ifdef CONFIG_DMAR | ||
686 | /* map the pages in iommu page table */ | ||
687 | r = kvm_iommu_map_pages(kvm, memslot); | ||
688 | if (r) | ||
689 | goto out; | ||
690 | #endif | ||
691 | return 0; | 739 | return 0; |
692 | 740 | ||
693 | out_free: | 741 | out_free: |
@@ -787,7 +835,7 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva); | |||
787 | struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) | 835 | struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) |
788 | { | 836 | { |
789 | int i; | 837 | int i; |
790 | struct kvm_memslots *slots = kvm->memslots; | 838 | struct kvm_memslots *slots = rcu_dereference(kvm->memslots); |
791 | 839 | ||
792 | for (i = 0; i < slots->nmemslots; ++i) { | 840 | for (i = 0; i < slots->nmemslots; ++i) { |
793 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | 841 | struct kvm_memory_slot *memslot = &slots->memslots[i]; |
@@ -809,12 +857,15 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | |||
809 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | 857 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) |
810 | { | 858 | { |
811 | int i; | 859 | int i; |
812 | struct kvm_memslots *slots = kvm->memslots; | 860 | struct kvm_memslots *slots = rcu_dereference(kvm->memslots); |
813 | 861 | ||
814 | gfn = unalias_gfn(kvm, gfn); | 862 | gfn = unalias_gfn(kvm, gfn); |
815 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | 863 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { |
816 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | 864 | struct kvm_memory_slot *memslot = &slots->memslots[i]; |
817 | 865 | ||
866 | if (memslot->flags & KVM_MEMSLOT_INVALID) | ||
867 | continue; | ||
868 | |||
818 | if (gfn >= memslot->base_gfn | 869 | if (gfn >= memslot->base_gfn |
819 | && gfn < memslot->base_gfn + memslot->npages) | 870 | && gfn < memslot->base_gfn + memslot->npages) |
820 | return 1; | 871 | return 1; |
@@ -823,13 +874,31 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | |||
823 | } | 874 | } |
824 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); | 875 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); |
825 | 876 | ||
877 | int memslot_id(struct kvm *kvm, gfn_t gfn) | ||
878 | { | ||
879 | int i; | ||
880 | struct kvm_memslots *slots = rcu_dereference(kvm->memslots); | ||
881 | struct kvm_memory_slot *memslot = NULL; | ||
882 | |||
883 | gfn = unalias_gfn(kvm, gfn); | ||
884 | for (i = 0; i < slots->nmemslots; ++i) { | ||
885 | memslot = &slots->memslots[i]; | ||
886 | |||
887 | if (gfn >= memslot->base_gfn | ||
888 | && gfn < memslot->base_gfn + memslot->npages) | ||
889 | break; | ||
890 | } | ||
891 | |||
892 | return memslot - slots->memslots; | ||
893 | } | ||
894 | |||
826 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | 895 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) |
827 | { | 896 | { |
828 | struct kvm_memory_slot *slot; | 897 | struct kvm_memory_slot *slot; |
829 | 898 | ||
830 | gfn = unalias_gfn(kvm, gfn); | 899 | gfn = unalias_gfn(kvm, gfn); |
831 | slot = gfn_to_memslot_unaliased(kvm, gfn); | 900 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
832 | if (!slot) | 901 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) |
833 | return bad_hva(); | 902 | return bad_hva(); |
834 | return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); | 903 | return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); |
835 | } | 904 | } |