aboutsummaryrefslogtreecommitdiffstats
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-10-04 12:30:33 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-04 12:30:33 -0400
commitecefbd94b834fa32559d854646d777c56749ef1c (patch)
treeca8958900ad9e208a8e5fb7704f1b66dc76131b4 /virt/kvm/kvm_main.c
parentce57e981f2b996aaca2031003b3f866368307766 (diff)
parent3d11df7abbff013b811d5615320580cd5d9d7d31 (diff)
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity: "Highlights of the changes for this release include support for vfio level triggered interrupts, improved big real mode support on older Intels, a streamlined guest page table walker, guest APIC speedups, PIO optimizations, better overcommit handling, and read-only memory." * tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits) KVM: s390: Fix vcpu_load handling in interrupt code KVM: x86: Fix guest debug across vcpu INIT reset KVM: Add resampling irqfds for level triggered interrupts KVM: optimize apic interrupt delivery KVM: MMU: Eliminate pointless temporary 'ac' KVM: MMU: Avoid access/dirty update loop if all is well KVM: MMU: Eliminate eperm temporary KVM: MMU: Optimize is_last_gpte() KVM: MMU: Simplify walk_addr_generic() loop KVM: MMU: Optimize pte permission checks KVM: MMU: Update accessed and dirty bits after guest pagetable walk KVM: MMU: Move gpte_access() out of paging_tmpl.h KVM: MMU: Optimize gpte_access() slightly KVM: MMU: Push clean gpte write protection out of gpte_access() KVM: clarify kvmclock documentation KVM: make processes waiting on vcpu mutex killable KVM: SVM: Make use of asm.h KVM: VMX: Make use of asm.h KVM: VMX: Make lto-friendly KVM: x86: lapic: Clean up find_highest_vector() and count_vectors() ... Conflicts: arch/s390/include/asm/processor.h arch/x86/kvm/i8259.c
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--virt/kvm/kvm_main.c541
1 files changed, 327 insertions, 214 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d617f69131d7..c353b4599cec 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
100 100
101static bool largepages_enabled = true; 101static bool largepages_enabled = true;
102 102
103static struct page *hwpoison_page; 103bool kvm_is_mmio_pfn(pfn_t pfn)
104static pfn_t hwpoison_pfn;
105
106struct page *fault_page;
107pfn_t fault_pfn;
108
109inline int kvm_is_mmio_pfn(pfn_t pfn)
110{ 104{
111 if (pfn_valid(pfn)) { 105 if (pfn_valid(pfn)) {
112 int reserved; 106 int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
137/* 131/*
138 * Switches to specified vcpu, until a matching vcpu_put() 132 * Switches to specified vcpu, until a matching vcpu_put()
139 */ 133 */
140void vcpu_load(struct kvm_vcpu *vcpu) 134int vcpu_load(struct kvm_vcpu *vcpu)
141{ 135{
142 int cpu; 136 int cpu;
143 137
144 mutex_lock(&vcpu->mutex); 138 if (mutex_lock_killable(&vcpu->mutex))
139 return -EINTR;
145 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
146 /* The thread running this VCPU changed. */ 141 /* The thread running this VCPU changed. */
147 struct pid *oldpid = vcpu->pid; 142 struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
154 preempt_notifier_register(&vcpu->preempt_notifier); 149 preempt_notifier_register(&vcpu->preempt_notifier);
155 kvm_arch_vcpu_load(vcpu, cpu); 150 kvm_arch_vcpu_load(vcpu, cpu);
156 put_cpu(); 151 put_cpu();
152 return 0;
157} 153}
158 154
159void vcpu_put(struct kvm_vcpu *vcpu) 155void vcpu_put(struct kvm_vcpu *vcpu)
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
236 } 232 }
237 vcpu->run = page_address(page); 233 vcpu->run = page_address(page);
238 234
235 kvm_vcpu_set_in_spin_loop(vcpu, false);
236 kvm_vcpu_set_dy_eligible(vcpu, false);
237
239 r = kvm_arch_vcpu_init(vcpu); 238 r = kvm_arch_vcpu_init(vcpu);
240 if (r < 0) 239 if (r < 0)
241 goto fail_free_run; 240 goto fail_free_run;
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
332 * count is also read inside the mmu_lock critical section. 331 * count is also read inside the mmu_lock critical section.
333 */ 332 */
334 kvm->mmu_notifier_count++; 333 kvm->mmu_notifier_count++;
335 for (; start < end; start += PAGE_SIZE) 334 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
336 need_tlb_flush |= kvm_unmap_hva(kvm, start);
337 need_tlb_flush |= kvm->tlbs_dirty; 335 need_tlb_flush |= kvm->tlbs_dirty;
338 /* we've to flush the tlb before the pages can be freed */ 336 /* we've to flush the tlb before the pages can be freed */
339 if (need_tlb_flush) 337 if (need_tlb_flush)
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
412 int idx; 410 int idx;
413 411
414 idx = srcu_read_lock(&kvm->srcu); 412 idx = srcu_read_lock(&kvm->srcu);
415 kvm_arch_flush_shadow(kvm); 413 kvm_arch_flush_shadow_all(kvm);
416 srcu_read_unlock(&kvm->srcu, idx); 414 srcu_read_unlock(&kvm->srcu, idx);
417} 415}
418 416
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
551static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 549static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
552 struct kvm_memory_slot *dont) 550 struct kvm_memory_slot *dont)
553{ 551{
554 if (!dont || free->rmap != dont->rmap)
555 vfree(free->rmap);
556
557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 552 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
558 kvm_destroy_dirty_bitmap(free); 553 kvm_destroy_dirty_bitmap(free);
559 554
560 kvm_arch_free_memslot(free, dont); 555 kvm_arch_free_memslot(free, dont);
561 556
562 free->npages = 0; 557 free->npages = 0;
563 free->rmap = NULL;
564} 558}
565 559
566void kvm_free_physmem(struct kvm *kvm) 560void kvm_free_physmem(struct kvm *kvm)
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
590#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 584#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
591 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 585 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
592#else 586#else
593 kvm_arch_flush_shadow(kvm); 587 kvm_arch_flush_shadow_all(kvm);
594#endif 588#endif
595 kvm_arch_destroy_vm(kvm); 589 kvm_arch_destroy_vm(kvm);
596 kvm_free_physmem(kvm); 590 kvm_free_physmem(kvm);
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
686 slots->generation++; 680 slots->generation++;
687} 681}
688 682
683static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
684{
685 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
686
687#ifdef KVM_CAP_READONLY_MEM
688 valid_flags |= KVM_MEM_READONLY;
689#endif
690
691 if (mem->flags & ~valid_flags)
692 return -EINVAL;
693
694 return 0;
695}
696
689/* 697/*
690 * Allocate some memory and give it an address in the guest physical address 698 * Allocate some memory and give it an address in the guest physical address
691 * space. 699 * space.
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
706 struct kvm_memory_slot old, new; 714 struct kvm_memory_slot old, new;
707 struct kvm_memslots *slots, *old_memslots; 715 struct kvm_memslots *slots, *old_memslots;
708 716
717 r = check_memory_region_flags(mem);
718 if (r)
719 goto out;
720
709 r = -EINVAL; 721 r = -EINVAL;
710 /* General sanity checks */ 722 /* General sanity checks */
711 if (mem->memory_size & (PAGE_SIZE - 1)) 723 if (mem->memory_size & (PAGE_SIZE - 1))
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
769 if (npages && !old.npages) { 781 if (npages && !old.npages) {
770 new.user_alloc = user_alloc; 782 new.user_alloc = user_alloc;
771 new.userspace_addr = mem->userspace_addr; 783 new.userspace_addr = mem->userspace_addr;
772#ifndef CONFIG_S390 784
773 new.rmap = vzalloc(npages * sizeof(*new.rmap));
774 if (!new.rmap)
775 goto out_free;
776#endif /* not defined CONFIG_S390 */
777 if (kvm_arch_create_memslot(&new, npages)) 785 if (kvm_arch_create_memslot(&new, npages))
778 goto out_free; 786 goto out_free;
779 } 787 }
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
785 /* destroy any largepage mappings for dirty tracking */ 793 /* destroy any largepage mappings for dirty tracking */
786 } 794 }
787 795
788 if (!npages) { 796 if (!npages || base_gfn != old.base_gfn) {
789 struct kvm_memory_slot *slot; 797 struct kvm_memory_slot *slot;
790 798
791 r = -ENOMEM; 799 r = -ENOMEM;
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
801 old_memslots = kvm->memslots; 809 old_memslots = kvm->memslots;
802 rcu_assign_pointer(kvm->memslots, slots); 810 rcu_assign_pointer(kvm->memslots, slots);
803 synchronize_srcu_expedited(&kvm->srcu); 811 synchronize_srcu_expedited(&kvm->srcu);
804 /* From this point no new shadow pages pointing to a deleted 812 /* From this point no new shadow pages pointing to a deleted,
805 * memslot will be created. 813 * or moved, memslot will be created.
806 * 814 *
807 * validation of sp->gfn happens in: 815 * validation of sp->gfn happens in:
808 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 816 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
809 * - kvm_is_visible_gfn (mmu_check_roots) 817 * - kvm_is_visible_gfn (mmu_check_roots)
810 */ 818 */
811 kvm_arch_flush_shadow(kvm); 819 kvm_arch_flush_shadow_memslot(kvm, slot);
812 kfree(old_memslots); 820 kfree(old_memslots);
813 } 821 }
814 822
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
832 840
833 /* actual memory is freed via old in kvm_free_physmem_slot below */ 841 /* actual memory is freed via old in kvm_free_physmem_slot below */
834 if (!npages) { 842 if (!npages) {
835 new.rmap = NULL;
836 new.dirty_bitmap = NULL; 843 new.dirty_bitmap = NULL;
837 memset(&new.arch, 0, sizeof(new.arch)); 844 memset(&new.arch, 0, sizeof(new.arch));
838 } 845 }
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
844 851
845 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 852 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
846 853
847 /*
848 * If the new memory slot is created, we need to clear all
849 * mmio sptes.
850 */
851 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
852 kvm_arch_flush_shadow(kvm);
853
854 kvm_free_physmem_slot(&old, &new); 854 kvm_free_physmem_slot(&old, &new);
855 kfree(old_memslots); 855 kfree(old_memslots);
856 856
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void)
932} 932}
933EXPORT_SYMBOL_GPL(kvm_disable_largepages); 933EXPORT_SYMBOL_GPL(kvm_disable_largepages);
934 934
935int is_error_page(struct page *page)
936{
937 return page == bad_page || page == hwpoison_page || page == fault_page;
938}
939EXPORT_SYMBOL_GPL(is_error_page);
940
941int is_error_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
944}
945EXPORT_SYMBOL_GPL(is_error_pfn);
946
947int is_hwpoison_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn;
950}
951EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
952
953int is_fault_pfn(pfn_t pfn)
954{
955 return pfn == fault_pfn;
956}
957EXPORT_SYMBOL_GPL(is_fault_pfn);
958
959int is_noslot_pfn(pfn_t pfn)
960{
961 return pfn == bad_pfn;
962}
963EXPORT_SYMBOL_GPL(is_noslot_pfn);
964
965int is_invalid_pfn(pfn_t pfn)
966{
967 return pfn == hwpoison_pfn || pfn == fault_pfn;
968}
969EXPORT_SYMBOL_GPL(is_invalid_pfn);
970
971static inline unsigned long bad_hva(void)
972{
973 return PAGE_OFFSET;
974}
975
976int kvm_is_error_hva(unsigned long addr)
977{
978 return addr == bad_hva();
979}
980EXPORT_SYMBOL_GPL(kvm_is_error_hva);
981
982struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 935struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
983{ 936{
984 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 937 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +974,62 @@ out:
1021 return size; 974 return size;
1022} 975}
1023 976
1024static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 977static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1025 gfn_t *nr_pages) 978{
979 return slot->flags & KVM_MEM_READONLY;
980}
981
982static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
983 gfn_t *nr_pages, bool write)
1026{ 984{
1027 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 985 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1028 return bad_hva(); 986 return KVM_HVA_ERR_BAD;
987
988 if (memslot_is_readonly(slot) && write)
989 return KVM_HVA_ERR_RO_BAD;
1029 990
1030 if (nr_pages) 991 if (nr_pages)
1031 *nr_pages = slot->npages - (gfn - slot->base_gfn); 992 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1032 993
1033 return gfn_to_hva_memslot(slot, gfn); 994 return __gfn_to_hva_memslot(slot, gfn);
1034} 995}
1035 996
997static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
998 gfn_t *nr_pages)
999{
1000 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1001}
1002
1003unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1004 gfn_t gfn)
1005{
1006 return gfn_to_hva_many(slot, gfn, NULL);
1007}
1008EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1009
1036unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1010unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1037{ 1011{
1038 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1012 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1039} 1013}
1040EXPORT_SYMBOL_GPL(gfn_to_hva); 1014EXPORT_SYMBOL_GPL(gfn_to_hva);
1041 1015
1042static pfn_t get_fault_pfn(void) 1016/*
1017 * The hva returned by this function is only allowed to be read.
1018 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
1019 */
1020static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
1021{
1022 return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
1023}
1024
1025static int kvm_read_hva(void *data, void __user *hva, int len)
1043{ 1026{
1044 get_page(fault_page); 1027 return __copy_from_user(data, hva, len);
1045 return fault_pfn; 1028}
1029
1030static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
1031{
1032 return __copy_from_user_inatomic(data, hva, len);
1046} 1033}
1047 1034
1048int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1035int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1065 return rc == -EHWPOISON; 1052 return rc == -EHWPOISON;
1066} 1053}
1067 1054
1068static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, 1055/*
1069 bool *async, bool write_fault, bool *writable) 1056 * The atomic path to get the writable pfn which will be stored in @pfn,
1057 * true indicates success, otherwise false is returned.
1058 */
1059static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1060 bool write_fault, bool *writable, pfn_t *pfn)
1070{ 1061{
1071 struct page *page[1]; 1062 struct page *page[1];
1072 int npages = 0; 1063 int npages;
1073 pfn_t pfn;
1074 1064
1075 /* we can do it either atomically or asynchronously, not both */ 1065 if (!(async || atomic))
1076 BUG_ON(atomic && async); 1066 return false;
1077 1067
1078 BUG_ON(!write_fault && !writable); 1068 /*
1069 * Fast pin a writable pfn only if it is a write fault request
1070 * or the caller allows to map a writable pfn for a read fault
1071 * request.
1072 */
1073 if (!(write_fault || writable))
1074 return false;
1079 1075
1080 if (writable) 1076 npages = __get_user_pages_fast(addr, 1, 1, page);
1081 *writable = true; 1077 if (npages == 1) {
1078 *pfn = page_to_pfn(page[0]);
1082 1079
1083 if (atomic || async) 1080 if (writable)
1084 npages = __get_user_pages_fast(addr, 1, 1, page); 1081 *writable = true;
1082 return true;
1083 }
1085 1084
1086 if (unlikely(npages != 1) && !atomic) { 1085 return false;
1087 might_sleep(); 1086}
1088 1087
1089 if (writable) 1088/*
1090 *writable = write_fault; 1089 * The slow path to get the pfn of the specified host virtual address,
1090 * 1 indicates success, -errno is returned if error is detected.
1091 */
1092static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1093 bool *writable, pfn_t *pfn)
1094{
1095 struct page *page[1];
1096 int npages = 0;
1091 1097
1092 if (async) { 1098 might_sleep();
1093 down_read(&current->mm->mmap_sem); 1099
1094 npages = get_user_page_nowait(current, current->mm, 1100 if (writable)
1095 addr, write_fault, page); 1101 *writable = write_fault;
1096 up_read(&current->mm->mmap_sem); 1102
1097 } else 1103 if (async) {
1098 npages = get_user_pages_fast(addr, 1, write_fault, 1104 down_read(&current->mm->mmap_sem);
1099 page); 1105 npages = get_user_page_nowait(current, current->mm,
1100 1106 addr, write_fault, page);
1101 /* map read fault as writable if possible */ 1107 up_read(&current->mm->mmap_sem);
1102 if (unlikely(!write_fault) && npages == 1) { 1108 } else
1103 struct page *wpage[1]; 1109 npages = get_user_pages_fast(addr, 1, write_fault,
1104 1110 page);
1105 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1111 if (npages != 1)
1106 if (npages == 1) { 1112 return npages;
1107 *writable = true; 1113
1108 put_page(page[0]); 1114 /* map read fault as writable if possible */
1109 page[0] = wpage[0]; 1115 if (unlikely(!write_fault) && writable) {
1110 } 1116 struct page *wpage[1];
1111 npages = 1; 1117
1118 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1119 if (npages == 1) {
1120 *writable = true;
1121 put_page(page[0]);
1122 page[0] = wpage[0];
1112 } 1123 }
1124
1125 npages = 1;
1113 } 1126 }
1127 *pfn = page_to_pfn(page[0]);
1128 return npages;
1129}
1114 1130
1115 if (unlikely(npages != 1)) { 1131static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1116 struct vm_area_struct *vma; 1132{
1133 if (unlikely(!(vma->vm_flags & VM_READ)))
1134 return false;
1117 1135
1118 if (atomic) 1136 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1119 return get_fault_pfn(); 1137 return false;
1120 1138
1121 down_read(&current->mm->mmap_sem); 1139 return true;
1122 if (npages == -EHWPOISON || 1140}
1123 (!async && check_user_page_hwpoison(addr))) {
1124 up_read(&current->mm->mmap_sem);
1125 get_page(hwpoison_page);
1126 return page_to_pfn(hwpoison_page);
1127 }
1128 1141
1129 vma = find_vma_intersection(current->mm, addr, addr+1); 1142/*
1130 1143 * Pin guest page in memory and return its pfn.
1131 if (vma == NULL) 1144 * @addr: host virtual address which maps memory to the guest
1132 pfn = get_fault_pfn(); 1145 * @atomic: whether this function can sleep
1133 else if ((vma->vm_flags & VM_PFNMAP)) { 1146 * @async: whether this function needs to wait for IO to complete if the
1134 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1147 * host page is not in memory
1135 vma->vm_pgoff; 1148 * @write_fault: whether we should get a writable host page
1136 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1149 * @writable: whether it allows to map a writable host page for !@write_fault
1137 } else { 1150 *
1138 if (async && (vma->vm_flags & VM_WRITE)) 1151 * The function will map a writable host page for these two cases:
1139 *async = true; 1152 * 1): @write_fault = true
1140 pfn = get_fault_pfn(); 1153 * 2): @write_fault = false && @writable, @writable will tell the caller
1141 } 1154 * whether the mapping is writable.
1142 up_read(&current->mm->mmap_sem); 1155 */
1143 } else 1156static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1144 pfn = page_to_pfn(page[0]); 1157 bool write_fault, bool *writable)
1158{
1159 struct vm_area_struct *vma;
1160 pfn_t pfn = 0;
1161 int npages;
1162
1163 /* we can do it either atomically or asynchronously, not both */
1164 BUG_ON(atomic && async);
1145 1165
1166 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
1167 return pfn;
1168
1169 if (atomic)
1170 return KVM_PFN_ERR_FAULT;
1171
1172 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1173 if (npages == 1)
1174 return pfn;
1175
1176 down_read(&current->mm->mmap_sem);
1177 if (npages == -EHWPOISON ||
1178 (!async && check_user_page_hwpoison(addr))) {
1179 pfn = KVM_PFN_ERR_HWPOISON;
1180 goto exit;
1181 }
1182
1183 vma = find_vma_intersection(current->mm, addr, addr + 1);
1184
1185 if (vma == NULL)
1186 pfn = KVM_PFN_ERR_FAULT;
1187 else if ((vma->vm_flags & VM_PFNMAP)) {
1188 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1189 vma->vm_pgoff;
1190 BUG_ON(!kvm_is_mmio_pfn(pfn));
1191 } else {
1192 if (async && vma_is_valid(vma, write_fault))
1193 *async = true;
1194 pfn = KVM_PFN_ERR_FAULT;
1195 }
1196exit:
1197 up_read(&current->mm->mmap_sem);
1146 return pfn; 1198 return pfn;
1147} 1199}
1148 1200
1149pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1201static pfn_t
1202__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1203 bool *async, bool write_fault, bool *writable)
1150{ 1204{
1151 return hva_to_pfn(kvm, addr, true, NULL, true, NULL); 1205 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1206
1207 if (addr == KVM_HVA_ERR_RO_BAD)
1208 return KVM_PFN_ERR_RO_FAULT;
1209
1210 if (kvm_is_error_hva(addr))
1211 return KVM_PFN_ERR_BAD;
1212
1213 /* Do not map writable pfn in the readonly memslot. */
1214 if (writable && memslot_is_readonly(slot)) {
1215 *writable = false;
1216 writable = NULL;
1217 }
1218
1219 return hva_to_pfn(addr, atomic, async, write_fault,
1220 writable);
1152} 1221}
1153EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1154 1222
1155static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1223static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1156 bool write_fault, bool *writable) 1224 bool write_fault, bool *writable)
1157{ 1225{
1158 unsigned long addr; 1226 struct kvm_memory_slot *slot;
1159 1227
1160 if (async) 1228 if (async)
1161 *async = false; 1229 *async = false;
1162 1230
1163 addr = gfn_to_hva(kvm, gfn); 1231 slot = gfn_to_memslot(kvm, gfn);
1164 if (kvm_is_error_hva(addr)) {
1165 get_page(bad_page);
1166 return page_to_pfn(bad_page);
1167 }
1168 1232
1169 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); 1233 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
1234 writable);
1170} 1235}
1171 1236
1172pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1237pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,12 +1260,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1195} 1260}
1196EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1261EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1197 1262
1198pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1263pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1199 struct kvm_memory_slot *slot, gfn_t gfn) 1264{
1265 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1266}
1267
1268pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1200{ 1269{
1201 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1270 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1202 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1203} 1271}
1272EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1204 1273
1205int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1274int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1206 int nr_pages) 1275 int nr_pages)
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1219} 1288}
1220EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1289EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1221 1290
1291static struct page *kvm_pfn_to_page(pfn_t pfn)
1292{
1293 if (is_error_pfn(pfn))
1294 return KVM_ERR_PTR_BAD_PAGE;
1295
1296 if (kvm_is_mmio_pfn(pfn)) {
1297 WARN_ON(1);
1298 return KVM_ERR_PTR_BAD_PAGE;
1299 }
1300
1301 return pfn_to_page(pfn);
1302}
1303
1222struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1304struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1223{ 1305{
1224 pfn_t pfn; 1306 pfn_t pfn;
1225 1307
1226 pfn = gfn_to_pfn(kvm, gfn); 1308 pfn = gfn_to_pfn(kvm, gfn);
1227 if (!kvm_is_mmio_pfn(pfn))
1228 return pfn_to_page(pfn);
1229
1230 WARN_ON(kvm_is_mmio_pfn(pfn));
1231 1309
1232 get_page(bad_page); 1310 return kvm_pfn_to_page(pfn);
1233 return bad_page;
1234} 1311}
1235 1312
1236EXPORT_SYMBOL_GPL(gfn_to_page); 1313EXPORT_SYMBOL_GPL(gfn_to_page);
1237 1314
1238void kvm_release_page_clean(struct page *page) 1315void kvm_release_page_clean(struct page *page)
1239{ 1316{
1317 WARN_ON(is_error_page(page));
1318
1240 kvm_release_pfn_clean(page_to_pfn(page)); 1319 kvm_release_pfn_clean(page_to_pfn(page));
1241} 1320}
1242EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1321EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1243 1322
1244void kvm_release_pfn_clean(pfn_t pfn) 1323void kvm_release_pfn_clean(pfn_t pfn)
1245{ 1324{
1325 WARN_ON(is_error_pfn(pfn));
1326
1246 if (!kvm_is_mmio_pfn(pfn)) 1327 if (!kvm_is_mmio_pfn(pfn))
1247 put_page(pfn_to_page(pfn)); 1328 put_page(pfn_to_page(pfn));
1248} 1329}
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1250 1331
1251void kvm_release_page_dirty(struct page *page) 1332void kvm_release_page_dirty(struct page *page)
1252{ 1333{
1334 WARN_ON(is_error_page(page));
1335
1253 kvm_release_pfn_dirty(page_to_pfn(page)); 1336 kvm_release_pfn_dirty(page_to_pfn(page));
1254} 1337}
1255EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1338EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1305 int r; 1388 int r;
1306 unsigned long addr; 1389 unsigned long addr;
1307 1390
1308 addr = gfn_to_hva(kvm, gfn); 1391 addr = gfn_to_hva_read(kvm, gfn);
1309 if (kvm_is_error_hva(addr)) 1392 if (kvm_is_error_hva(addr))
1310 return -EFAULT; 1393 return -EFAULT;
1311 r = __copy_from_user(data, (void __user *)addr + offset, len); 1394 r = kvm_read_hva(data, (void __user *)addr + offset, len);
1312 if (r) 1395 if (r)
1313 return -EFAULT; 1396 return -EFAULT;
1314 return 0; 1397 return 0;
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1343 gfn_t gfn = gpa >> PAGE_SHIFT; 1426 gfn_t gfn = gpa >> PAGE_SHIFT;
1344 int offset = offset_in_page(gpa); 1427 int offset = offset_in_page(gpa);
1345 1428
1346 addr = gfn_to_hva(kvm, gfn); 1429 addr = gfn_to_hva_read(kvm, gfn);
1347 if (kvm_is_error_hva(addr)) 1430 if (kvm_is_error_hva(addr))
1348 return -EFAULT; 1431 return -EFAULT;
1349 pagefault_disable(); 1432 pagefault_disable();
1350 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1433 r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
1351 pagefault_enable(); 1434 pagefault_enable();
1352 if (r) 1435 if (r)
1353 return -EFAULT; 1436 return -EFAULT;
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1580} 1663}
1581EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1664EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1582 1665
1666#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1667/*
1668 * Helper that checks whether a VCPU is eligible for directed yield.
1669 * Most eligible candidate to yield is decided by following heuristics:
1670 *
1671 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
1672 * (preempted lock holder), indicated by @in_spin_loop.
1673 * Set at the beiginning and cleared at the end of interception/PLE handler.
1674 *
1675 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
1676 * chance last time (mostly it has become eligible now since we have probably
1677 * yielded to lockholder in last iteration. This is done by toggling
1678 * @dy_eligible each time a VCPU checked for eligibility.)
1679 *
1680 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
1681 * to preempted lock-holder could result in wrong VCPU selection and CPU
1682 * burning. Giving priority for a potential lock-holder increases lock
1683 * progress.
1684 *
1685 * Since algorithm is based on heuristics, accessing another VCPU data without
1686 * locking does not harm. It may result in trying to yield to same VCPU, fail
1687 * and continue with next VCPU and so on.
1688 */
1689bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1690{
1691 bool eligible;
1692
1693 eligible = !vcpu->spin_loop.in_spin_loop ||
1694 (vcpu->spin_loop.in_spin_loop &&
1695 vcpu->spin_loop.dy_eligible);
1696
1697 if (vcpu->spin_loop.in_spin_loop)
1698 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
1699
1700 return eligible;
1701}
1702#endif
1583void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1703void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1584{ 1704{
1585 struct kvm *kvm = me->kvm; 1705 struct kvm *kvm = me->kvm;
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1589 int pass; 1709 int pass;
1590 int i; 1710 int i;
1591 1711
1712 kvm_vcpu_set_in_spin_loop(me, true);
1592 /* 1713 /*
1593 * We boost the priority of a VCPU that is runnable but not 1714 * We boost the priority of a VCPU that is runnable but not
1594 * currently running, because it got preempted by something 1715 * currently running, because it got preempted by something
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1607 continue; 1728 continue;
1608 if (waitqueue_active(&vcpu->wq)) 1729 if (waitqueue_active(&vcpu->wq))
1609 continue; 1730 continue;
1731 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1732 continue;
1610 if (kvm_vcpu_yield_to(vcpu)) { 1733 if (kvm_vcpu_yield_to(vcpu)) {
1611 kvm->last_boosted_vcpu = i; 1734 kvm->last_boosted_vcpu = i;
1612 yielded = 1; 1735 yielded = 1;
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1614 } 1737 }
1615 } 1738 }
1616 } 1739 }
1740 kvm_vcpu_set_in_spin_loop(me, false);
1741
1742 /* Ensure vcpu is not eligible during next spinloop */
1743 kvm_vcpu_set_dy_eligible(me, false);
1617} 1744}
1618EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1745EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1619 1746
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
1766#endif 1893#endif
1767 1894
1768 1895
1769 vcpu_load(vcpu); 1896 r = vcpu_load(vcpu);
1897 if (r)
1898 return r;
1770 switch (ioctl) { 1899 switch (ioctl) {
1771 case KVM_RUN: 1900 case KVM_RUN:
1772 r = -EINVAL; 1901 r = -EINVAL;
@@ -2094,6 +2223,29 @@ static long kvm_vm_ioctl(struct file *filp,
2094 break; 2223 break;
2095 } 2224 }
2096#endif 2225#endif
2226#ifdef __KVM_HAVE_IRQ_LINE
2227 case KVM_IRQ_LINE_STATUS:
2228 case KVM_IRQ_LINE: {
2229 struct kvm_irq_level irq_event;
2230
2231 r = -EFAULT;
2232 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2233 goto out;
2234
2235 r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2236 if (r)
2237 goto out;
2238
2239 r = -EFAULT;
2240 if (ioctl == KVM_IRQ_LINE_STATUS) {
2241 if (copy_to_user(argp, &irq_event, sizeof irq_event))
2242 goto out;
2243 }
2244
2245 r = 0;
2246 break;
2247 }
2248#endif
2097 default: 2249 default:
2098 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2250 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2099 if (r == -ENOTTY) 2251 if (r == -ENOTTY)
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = {
2698 .resume = kvm_resume, 2850 .resume = kvm_resume,
2699}; 2851};
2700 2852
2701struct page *bad_page;
2702pfn_t bad_pfn;
2703
2704static inline 2853static inline
2705struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2854struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2706{ 2855{
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2732 if (r) 2881 if (r)
2733 goto out_fail; 2882 goto out_fail;
2734 2883
2735 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2736
2737 if (bad_page == NULL) {
2738 r = -ENOMEM;
2739 goto out;
2740 }
2741
2742 bad_pfn = page_to_pfn(bad_page);
2743
2744 hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2745
2746 if (hwpoison_page == NULL) {
2747 r = -ENOMEM;
2748 goto out_free_0;
2749 }
2750
2751 hwpoison_pfn = page_to_pfn(hwpoison_page);
2752
2753 fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2754
2755 if (fault_page == NULL) {
2756 r = -ENOMEM;
2757 goto out_free_0;
2758 }
2759
2760 fault_pfn = page_to_pfn(fault_page);
2761
2762 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2884 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2763 r = -ENOMEM; 2885 r = -ENOMEM;
2764 goto out_free_0; 2886 goto out_free_0;
@@ -2833,12 +2955,6 @@ out_free_1:
2833out_free_0a: 2955out_free_0a:
2834 free_cpumask_var(cpus_hardware_enabled); 2956 free_cpumask_var(cpus_hardware_enabled);
2835out_free_0: 2957out_free_0:
2836 if (fault_page)
2837 __free_page(fault_page);
2838 if (hwpoison_page)
2839 __free_page(hwpoison_page);
2840 __free_page(bad_page);
2841out:
2842 kvm_arch_exit(); 2958 kvm_arch_exit();
2843out_fail: 2959out_fail:
2844 return r; 2960 return r;
@@ -2858,8 +2974,5 @@ void kvm_exit(void)
2858 kvm_arch_hardware_unsetup(); 2974 kvm_arch_hardware_unsetup();
2859 kvm_arch_exit(); 2975 kvm_arch_exit();
2860 free_cpumask_var(cpus_hardware_enabled); 2976 free_cpumask_var(cpus_hardware_enabled);
2861 __free_page(fault_page);
2862 __free_page(hwpoison_page);
2863 __free_page(bad_page);
2864} 2977}
2865EXPORT_SYMBOL_GPL(kvm_exit); 2978EXPORT_SYMBOL_GPL(kvm_exit);