author     Avi Kivity <avi@redhat.com>  2012-07-26 04:54:21 -0400
committer  Avi Kivity <avi@redhat.com>  2012-07-26 04:54:21 -0400
commit     e9bda6f6f902e6b55d9baceb5523468a048cbe56 (patch)
tree       bf09cc165da1197cd34967da0593d08b9a37c0f3
parent     bdc0077af574800d24318b6945cf2344e8dbb050 (diff)
parent     06e48c510aa37f6e791602e6420422ea7071fe94 (diff)
Merge branch 'queue' into next

Merge patches queued during the run-up to the merge window.

* queue: (25 commits)
  KVM: Choose better candidate for directed yield
  KVM: Note down when cpu relax intercepted or pause loop exited
  KVM: Add config to support ple or cpu relax optimzation
  KVM: switch to symbolic name for irq_states size
  KVM: x86: Fix typos in pmu.c
  KVM: x86: Fix typos in lapic.c
  KVM: x86: Fix typos in cpuid.c
  KVM: x86: Fix typos in emulate.c
  KVM: x86: Fix typos in x86.c
  KVM: SVM: Fix typos
  KVM: VMX: Fix typos
  KVM: remove the unused parameter of gfn_to_pfn_memslot
  KVM: remove is_error_hpa
  KVM: make bad_pfn static to kvm_main.c
  KVM: using get_fault_pfn to get the fault pfn
  KVM: MMU: track the refcount when unmap the page
  KVM: x86: remove unnecessary mark_page_dirty
  KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()
  KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
  KVM: MMU: Add memslot parameter to hva handlers
  ...

Signed-off-by: Avi Kivity <avi@redhat.com>
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h |   2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c |  47
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c         |   2
-rw-r--r--  arch/s390/kvm/Kconfig               |   1
-rw-r--r--  arch/x86/include/asm/kvm_host.h     |   3
-rw-r--r--  arch/x86/kvm/Kconfig                |   1
-rw-r--r--  arch/x86/kvm/cpuid.c                |   2
-rw-r--r--  arch/x86/kvm/emulate.c              |  10
-rw-r--r--  arch/x86/kvm/irq.h                  |   2
-rw-r--r--  arch/x86/kvm/lapic.c                |   4
-rw-r--r--  arch/x86/kvm/mmu.c                  | 117
-rw-r--r--  arch/x86/kvm/pmu.c                  |   2
-rw-r--r--  arch/x86/kvm/svm.c                  |   4
-rw-r--r--  arch/x86/kvm/vmx.c                  |   6
-rw-r--r--  arch/x86/kvm/x86.c                  |  26
-rw-r--r--  include/linux/kvm_host.h            |  62
-rw-r--r--  virt/kvm/Kconfig                    |   3
-rw-r--r--  virt/kvm/iommu.c                    |  10
-rw-r--r--  virt/kvm/irq_comm.c                 |   4
-rw-r--r--  virt/kvm/kvm_main.c                 |  84
20 files changed, 277 insertions, 115 deletions
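
Before the per-file diffs, here is a minimal, self-contained userspace sketch (not kernel code; every name and number in it is illustrative only) of the central change of the series: the MMU-notifier invalidate path now makes a single kvm_unmap_hva_range(start, end) call instead of calling the single-page kvm_unmap_hva() once per page.

/*
 * Minimal userspace model (not kernel code) of the range-based notifier path
 * added by this series: one range call replaces a loop of per-page calls.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static int unmap_hva_range(unsigned long start, unsigned long end)
{
	/*
	 * The real code walks every memslot once and unmaps the overlapping
	 * gfn range; here we only report what would be unmapped.
	 */
	printf("unmap hvas [%#lx, %#lx): %lu pages in one pass\n",
	       start, end, (end - start) / PAGE_SIZE);
	return 1;	/* pretend a TLB flush is needed */
}

int main(void)
{
	unsigned long start = 0x7f0000000000UL;
	unsigned long end = start + 16 * PAGE_SIZE;
	int need_tlb_flush;

	/* old shape: for (; start < end; start += PAGE_SIZE) unmap_hva(start); */
	need_tlb_flush = unmap_hva_range(start, end);
	if (need_tlb_flush)
		printf("flush remote TLBs once for the whole range\n");
	return 0;
}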
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12fd7bf5..572ad0141268 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -52,6 +52,8 @@
 
 struct kvm;
 extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+			       unsigned long start, unsigned long end);
 extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f7b058..3c635c0616b0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	goto out_put;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-					 unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+				unsigned long start,
+				unsigned long end,
+				int (*handler)(struct kvm *kvm,
+					       unsigned long *rmapp,
+					       unsigned long gfn))
 {
 	int ret;
 	int retval = 0;
@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
-		unsigned long start = memslot->userspace_addr;
-		unsigned long end;
+		unsigned long hva_start, hva_end;
+		gfn_t gfn, gfn_end;
 
-		end = start + (memslot->npages << PAGE_SHIFT);
-		if (hva >= start && hva < end) {
-			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+			      (memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn, gfn+1, ..., gfn_end-1}.
+		 */
+		gfn = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+		for (; gfn < gfn_end; ++gfn) {
+			gfn_t gfn_offset = gfn - memslot->base_gfn;
 
-			ret = handler(kvm, &memslot->rmap[gfn_offset],
-				      memslot->base_gfn + gfn_offset);
+			ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
 			retval |= ret;
 		}
 	}
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 	return retval;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long gfn))
+{
+	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long gfn)
 {
@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 	return 0;
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	if (kvm->arch.using_mmu_notifiers)
+		kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+	return 0;
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long gfn)
 {
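
The clipping arithmetic used by the new kvm_handle_hva_range() above can be sketched as a small standalone program (not kernel code; the memslot layout and addresses are invented for illustration): the requested [start, end) hva range is intersected with a memslot and converted to a half-open gfn range, with hva_end rounded up so that a partially covered page is still visited.

/* Standalone model (not kernel code) of the hva-range to gfn-range clipping. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define max(a, b)	((a) > (b) ? (a) : (b))
#define min(a, b)	((a) < (b) ? (a) : (b))

struct memslot {
	unsigned long userspace_addr;
	unsigned long npages;
	unsigned long base_gfn;
};

static unsigned long hva_to_gfn_memslot(unsigned long hva, struct memslot *slot)
{
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct memslot slot = { 0x7f0000000000UL, 512, 0x100000UL };
	unsigned long start = 0x7f0000003000UL;		/* inside the slot */
	unsigned long end   = 0x7f0000007800UL;		/* ends mid-page */
	unsigned long hva_start, hva_end, gfn, gfn_end;

	hva_start = max(start, slot.userspace_addr);
	hva_end = min(end, slot.userspace_addr + (slot.npages << PAGE_SHIFT));
	if (hva_start >= hva_end)
		return 0;			/* no overlap with this slot */

	/* Round hva_end up so a partially covered page is still visited. */
	gfn = hva_to_gfn_memslot(hva_start, &slot);
	gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, &slot);
	printf("gfns [%#lx, %#lx): %lu pages\n", gfn, gfn_end, gfn_end - gfn);
	return 0;
}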
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c510fc961302..c8f6c5826742 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	if (likely(!pfnmap)) {
 		unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
-		pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+		pfn = gfn_to_pfn_memslot(slot, gfn);
 		if (is_error_pfn(pfn)) {
 			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
 					(long)gfn);
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 78eb9847008f..a6e2677724e1 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	---help---
 	  Support hosting paravirtualized guest machines using the SIE
 	  virtualization capability on the mainframe. This should work
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09155d64cf7e..48e713188469 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -500,11 +500,11 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_lpage_info {
-	unsigned long rmap_pde;
 	int write_count;
 };
 
 struct kvm_arch_memory_slot {
+	unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1];
 	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
@@ -957,6 +957,7 @@ extern bool kvm_rebooting;
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843ea..45c044f0fff7 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -37,6 +37,7 @@ config KVM
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c..b496da684bd6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	}
 	case 7: {
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capbability word 9 */
+		/* Mask ebx against host capability word 9 */
 		if (index == 0) {
 			entry->ebx &= kvm_supported_word9_x86_features;
 			cpuid_mask(&entry->ebx, 9);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..85b611e13e84 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
 			goto bad;
 	} else {
-		/* exapand-down segment */
+		/* expand-down segment */
 		if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
 			goto bad;
 		lim = desc.d ? 0xffffffff : 0xffff;
@@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	err_code = selector & 0xfffc;
 	err_vec = GP_VECTOR;
 
-	/* can't load system descriptor into segment selecor */
+	/* can't load system descriptor into segment selector */
 	if (seg <= VCPU_SREG_GS && !seg_desc.s)
 		goto exception;
 
@@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
 	/*
-	 * Now load segment descriptors. If fault happenes at this stage
+	 * Now load segment descriptors. If fault happens at this stage
 	 * it is handled in a context of new task
 	 */
 	ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	 *
 	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
 	 * 2. Exception/IRQ/iret: No check is performed
-	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 * 3. jmp/call to TSS: Check against DPL of the TSS
 	 */
 	if (reason == TASK_SWITCH_GATE) {
 		if (idt_index != -1) {
@@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
 
 	/* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used afetr this point */
+	   note that old_tss_sel is not used after this point */
 	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
 		old_tss_sel = 0xffff;
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba33..2d03568e9498 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
 	struct kvm_io_device dev_slave;
 	struct kvm_io_device dev_eclr;
 	void (*ack_notifier)(void *opaque, int irq);
-	unsigned long irq_states[16];
+	unsigned long irq_states[PIC_NUM_PINS];
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39f..fff7173f6a71 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 {
 	unsigned char alignment = offset & 0xf;
 	u32 result;
-	/* this bitmask has a bit cleared for each reserver register */
+	/* this bitmask has a bit cleared for each reserved register */
 	static const u64 rmask = 0x43ff01ffffffe70cULL;
 
 	if ((alignment + len) > 4) {
@@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
-		/* lapic timer in oneshot or peroidic mode */
+		/* lapic timer in oneshot or periodic mode */
 		now = apic->lapic_timer.timer.base->get_time();
 		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
 			* APIC_BUS_CYCLE_NS * apic->divide_count;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..241993443599 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 		return 0;
 
 	pfn = spte_to_pfn(old_spte);
+
+	/*
+	 * KVM does not hold the refcount of the page used by
+	 * kvm mmu, before reclaiming the page, we should
+	 * unmap it from mmu first.
+	 */
+	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
-	struct kvm_lpage_info *linfo;
+	unsigned long idx;
 
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 
-	linfo = lpage_info_slot(gfn, slot, level);
-	return &linfo->rmap_pde;
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
+	return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx];
 }
 
 /*
@@ -1200,7 +1208,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			   unsigned long data)
+			   struct kvm_memory_slot *slot, unsigned long data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1218,7 +1226,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			     unsigned long data)
+			     struct kvm_memory_slot *slot, unsigned long data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1259,43 +1267,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	return 0;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  unsigned long data,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-					 unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+				unsigned long start,
+				unsigned long end,
+				unsigned long data,
+				int (*handler)(struct kvm *kvm,
+					       unsigned long *rmapp,
+					       struct kvm_memory_slot *slot,
+					       unsigned long data))
 {
 	int j;
-	int ret;
-	int retval = 0;
+	int ret = 0;
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
 	kvm_for_each_memslot(memslot, slots) {
-		unsigned long start = memslot->userspace_addr;
-		unsigned long end;
+		unsigned long hva_start, hva_end;
+		gfn_t gfn_start, gfn_end;
 
-		end = start + (memslot->npages << PAGE_SHIFT);
-		if (hva >= start && hva < end) {
-			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-			gfn_t gfn = memslot->base_gfn + gfn_offset;
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+			      (memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+		 */
+		gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+		for (j = PT_PAGE_TABLE_LEVEL;
+		     j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+			unsigned long idx, idx_end;
+			unsigned long *rmapp;
 
-			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-				struct kvm_lpage_info *linfo;
+			/*
+			 * {idx(page_j) | page_j intersects with
+			 * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+			 */
+			idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+			idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
 
-				linfo = lpage_info_slot(gfn, memslot,
-						PT_DIRECTORY_LEVEL + j);
-				ret |= handler(kvm, &linfo->rmap_pde, data);
-			}
-			trace_kvm_age_page(hva, memslot, ret);
-			retval |= ret;
+			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
+
+			for (; idx <= idx_end; ++idx)
+				ret |= handler(kvm, rmapp++, memslot, data);
 		}
 	}
 
-	return retval;
+	return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  unsigned long data,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 struct kvm_memory_slot *slot,
+					 unsigned long data))
+{
+	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1335,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
 }
 
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
 	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 }
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			 unsigned long data)
+			 struct kvm_memory_slot *slot, unsigned long data)
 {
 	u64 *sptep;
 	struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1360,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	 * This has some overhead, but not as much as the cost of swapping
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
-	if (!shadow_accessed_mask)
-		return kvm_unmap_rmapp(kvm, rmapp, data);
+	if (!shadow_accessed_mask) {
+		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+		goto out;
+	}
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1375,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			(unsigned long *)sptep);
 		}
 	}
-
+out:
+	/* @data has hva passed to kvm_age_hva(). */
+	trace_kvm_age_page(data, slot, young);
 	return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			      unsigned long data)
+			      struct kvm_memory_slot *slot, unsigned long data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1379,13 +1420,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+	return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
 }
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2472,14 +2513,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	unsigned long hva;
 
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
-	if (!slot) {
-		get_page(fault_page);
-		return page_to_pfn(fault_page);
-	}
+	if (!slot)
+		return get_fault_pfn();
 
 	hva = gfn_to_hva_memslot(slot, gfn);
 
-	return hva_to_pfn_atomic(vcpu->kvm, hva);
+	return hva_to_pfn_atomic(hva);
 }
 
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
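
The per-level rmap indexing that the reworked x86 kvm_handle_hva_range() relies on can be illustrated with a standalone model (not kernel code; the level shifts mirror the 4K/2M/1G page sizes and the gfn values are made up): for each level, a gfn range collapses to an index range into that level's rmap array, which is why the same rmap_pde entry is no longer visited once per gfn.

/* Standalone model (not kernel code) of gfn_to_index() per page-size level. */
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL	1
#define KVM_NR_PAGE_SIZES	3
/* gfns covered per entry at a level: 1, 512, 512*512 */
#define KVM_HPAGE_GFN_SHIFT(l)	(((l) - PT_PAGE_TABLE_LEVEL) * 9)

static unsigned long gfn_to_index(unsigned long gfn, unsigned long base_gfn,
				  int level)
{
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	unsigned long base_gfn = 0x100000UL;
	unsigned long gfn_start = 0x1003f0UL, gfn_end = 0x100810UL;
	int j;

	for (j = PT_PAGE_TABLE_LEVEL;
	     j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
		unsigned long idx = gfn_to_index(gfn_start, base_gfn, j);
		unsigned long idx_end = gfn_to_index(gfn_end - 1, base_gfn, j);

		/* one handler call per entry in [idx, idx_end] at this level */
		printf("level %d: rmap entries %lu..%lu\n", j, idx, idx_end);
	}
	return 0;
}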
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab0..cfc258a6bf97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
 /*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
  *
  * Copyright 2011 Red Hat, Inc. and/or its affiliates.
  *
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c8..687d0c30e559 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	if (svm->nested.intercept & 1ULL) {
 		/*
 		 * The #vmexit can't be emulated here directly because this
-		 * code path runs with irqs and preemtion disabled. A
+		 * code path runs with irqs and preemption disabled. A
 		 * #vmexit emulation might sleep. Only signal request for
 		 * the #vmexit here.
 		 */
@@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
 	/*
 	 * This function merges the msr permission bitmaps of kvm and the
-	 * nested vmcb. It is omptimized in that it only merges the parts where
+	 * nested vmcb. It is optimized in that it only merges the parts where
 	 * the kvm msr permission bitmap may contain zero bits
 	 */
 	int i;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c39b60707e02..2300e5319ed9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	guest_efer = vmx->vcpu.arch.efer;
 
 	/*
-	 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+	 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
 	 * outside long mode
 	 */
 	ignore_bits = EFER_NX | EFER_SCE;
@@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	 * qemu binaries.
 	 * IA32 arch specifies that at the time of processor reset the
 	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
-	 * is setting it to 0 in the usedland code. This causes invalid guest
+	 * is setting it to 0 in the userland code. This causes invalid guest
 	 * state vmexit when "unrestricted guest" mode is turned on.
 	 * Fix for this setup issue in cpu_reset is being pushed in the qemu
 	 * tree. Newer qemu binaries with that qemu fix would not need this
@@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	if (to_vmx(vcpu)->nested.vmxon &&
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59b59508ff07..3d9d08edbf29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	 * For each generation, we track the original measured
 	 * nanosecond time, offset, and write, so if TSCs are in
 	 * sync, we can match exact offset, and if not, we can match
-	 * exact software computaion in compute_guest_tsc()
+	 * exact software computation in compute_guest_tsc()
 	 *
 	 * These values are tracked in kvm->arch.cur_xxx variables.
 	 */
@@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
 	gpa_t gpa = data & ~0x3f;
 
-	/* Bits 2:5 are resrved, Should be zero */
+	/* Bits 2:5 are reserved, Should be zero */
 	if (data & 0x3c)
 		return 1;
 
@@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		 * Ignore all writes to this no longer documented MSR.
 		 * Writes are only relevant for old K7 processors,
 		 * all pre-dating SVM, but a recommended workaround from
-		 * AMD for these chips. It is possible to speicify the
+		 * AMD for these chips. It is possible to specify the
 		 * affected processor models on the command line, hence
 		 * the need to ignore the workaround.
 		 */
@@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.time_page)
 		return -EINVAL;
 	src->flags |= PVCLOCK_GUEST_STOPPED;
-	mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
 	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	return 0;
 }
@@ -4492,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 
 	/*
 	 * if emulation was due to access to shadowed page table
-	 * and it failed try to unshadow page and re-entetr the
+	 * and it failed try to unshadow page and re-enter the
 	 * guest to let CPU execute the instruction.
 	 */
 	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -5588,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	/*
 	 * We are here if userspace calls get_regs() in the middle of
 	 * instruction emulation. Registers state needs to be copied
-	 * back from emulation context to vcpu. Usrapace shouldn't do
+	 * back from emulation context to vcpu. Userspace shouldn't do
 	 * that usually, but some bad designed PV devices (vmware
 	 * backdoor interface) need this to work
 	 */
@@ -6117,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage)
 	 * as we reset last_host_tsc on all VCPUs to stop this from being
 	 * called multiple times (one for each physical CPU bringup).
 	 *
-	 * Platforms with unnreliable TSCs don't have to deal with this, they
+	 * Platforms with unreliable TSCs don't have to deal with this, they
 	 * will be compensated by the logic in vcpu_load, which sets the TSC to
 	 * catchup mode. This will catchup all VCPUs to real time, but cannot
 	 * guarantee that they stay in perfect synchronization.
@@ -6314,6 +6313,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 	int i;
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) {
+			kvm_kvfree(free->arch.rmap_pde[i]);
+			free->arch.rmap_pde[i] = NULL;
+		}
 		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
 			kvm_kvfree(free->arch.lpage_info[i]);
 			free->arch.lpage_info[i] = NULL;
@@ -6333,6 +6336,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 		lpages = gfn_to_index(slot->base_gfn + npages - 1,
 				      slot->base_gfn, level) + 1;
 
+		slot->arch.rmap_pde[i] =
+			kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i]));
+		if (!slot->arch.rmap_pde[i])
+			goto out_free;
+
 		slot->arch.lpage_info[i] =
 			kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
 		if (!slot->arch.lpage_info[i])
@@ -6361,7 +6369,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 
 out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		kvm_kvfree(slot->arch.rmap_pde[i]);
 		kvm_kvfree(slot->arch.lpage_info[i]);
+		slot->arch.rmap_pde[i] = NULL;
 		slot->arch.lpage_info[i] = NULL;
 	}
 	return -ENOMEM;
@@ -6381,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
-	 *x86 needs to hanlde !user_alloc case.
+	 *x86 needs to handle !user_alloc case.
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
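
The sizing of the new slot->arch.rmap_pde[] arrays allocated above in kvm_arch_create_memslot() follows the same gfn_to_index() arithmetic; here is a standalone sketch (not kernel code; the slot size is invented for illustration) of how many entries each large-page level needs for a given slot:

/* Standalone model (not kernel code) of the per-level rmap_pde sizing. */
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL	1
#define KVM_NR_PAGE_SIZES	3
#define KVM_HPAGE_GFN_SHIFT(l)	(((l) - PT_PAGE_TABLE_LEVEL) * 9)

static unsigned long gfn_to_index(unsigned long gfn, unsigned long base_gfn,
				  int level)
{
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	unsigned long base_gfn = 0x100000UL;
	unsigned long npages = 0x40000UL;	/* a 1 GiB slot of 4K pages */
	int i;

	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		int level = i + 2;		/* the 2M and 1G levels */
		unsigned long lpages =
			gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;

		/* one reverse-map head pointer per large page in the slot */
		printf("level %d: %lu rmap_pde entries\n", level, lpages);
	}
	return 0;
}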
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b70b48b01098..1993eb1cb2cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -183,6 +183,18 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+	/*
+	 * Cpu relax intercept or pause loop exit optimization
+	 * in_spin_loop: set when a vcpu does a pause loop exit
+	 *  or cpu relax intercepted.
+	 * dy_eligible: indicates whether vcpu is eligible for directed yield.
+	 */
+	struct {
+		bool in_spin_loop;
+		bool dy_eligible;
+	} spin_loop;
+#endif
 	struct kvm_vcpu_arch arch;
 };
 
@@ -378,20 +390,11 @@ id_to_memslot(struct kvm_memslots *slots, int id)
 	return slot;
 }
 
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
 extern struct page *bad_page;
-extern struct page *fault_page;
-
-extern pfn_t bad_pfn;
-extern pfn_t fault_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
 int is_noslot_pfn(pfn_t pfn);
 int is_invalid_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
@@ -427,20 +430,20 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t hva_to_pfn_atomic(unsigned long addr);
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
 		       bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-			 struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_release_pfn_dirty(pfn_t);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
 void kvm_set_pfn_accessed(pfn_t pfn);
 void kvm_get_pfn(pfn_t pfn);
+pfn_t get_fault_pfn(void);
 
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
@@ -740,6 +743,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
+{
+	gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+	return slot->base_gfn + gfn_offset;
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 					       gfn_t gfn)
 {
@@ -899,5 +910,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 	}
 }
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+	vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+	vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+	return true;
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a9139..d01b24b72c61 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
 
 config HAVE_KVM_MSI
 	bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+	bool
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf0..c03f1fb26701 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
 				gfn_t base_gfn, unsigned long npages);
 
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
-			   gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+			   unsigned long size)
 {
 	gfn_t end_gfn;
 	pfn_t pfn;
 
-	pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+	pfn = gfn_to_pfn_memslot(slot, gfn);
 	end_gfn = gfn + (size >> PAGE_SHIFT);
 	gfn += 1;
 
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
 		return pfn;
 
 	while (gfn < end_gfn)
-		gfn_to_pfn_memslot(kvm, slot, gfn++);
+		gfn_to_pfn_memslot(slot, gfn++);
 
 	return pfn;
 }
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 	 * Pin all pages we are about to map in memory. This is
 	 * important because we unmap and unpin in 4kb steps later.
 	 */
-	pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+	pfn = kvm_pin_pages(slot, gfn, page_size);
 	if (is_error_pfn(pfn)) {
 		gfn += 1;
 		continue;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 83402d74a767..7118be0f2f2c 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -321,11 +321,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 	switch (ue->u.irqchip.irqchip) {
 	case KVM_IRQCHIP_PIC_MASTER:
 		e->set = kvm_set_pic_irq;
-		max_pin = 16;
+		max_pin = PIC_NUM_PINS;
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
 		e->set = kvm_set_pic_irq;
-		max_pin = 16;
+		max_pin = PIC_NUM_PINS;
 		delta = 8;
 		break;
 	case KVM_IRQCHIP_IOAPIC:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 246852397e30..0014ee99dc7f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,11 +100,14 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
+struct page *bad_page;
+static pfn_t bad_pfn;
+
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-struct page *fault_page;
-pfn_t fault_pfn;
+static struct page *fault_page;
+static pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -236,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	}
 	vcpu->run = page_address(page);
 
+	kvm_vcpu_set_in_spin_loop(vcpu, false);
+	kvm_vcpu_set_dy_eligible(vcpu, false);
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
 		goto fail_free_run;
@@ -332,8 +338,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
-	for (; start < end; start += PAGE_SIZE)
-		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
 	need_tlb_flush |= kvm->tlbs_dirty;
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
@@ -950,12 +955,6 @@ int is_hwpoison_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
-int is_fault_pfn(pfn_t pfn)
-{
-	return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
 int is_noslot_pfn(pfn_t pfn)
 {
 	return pfn == bad_pfn;
@@ -1039,11 +1038,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t get_fault_pfn(void)
+pfn_t get_fault_pfn(void)
 {
 	get_page(fault_page);
 	return fault_pfn;
 }
+EXPORT_SYMBOL_GPL(get_fault_pfn);
 
 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 			 unsigned long start, int write, struct page **page)
@@ -1065,8 +1065,8 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 	return rc == -EHWPOISON;
 }
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-			bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+			bool write_fault, bool *writable)
 {
 	struct page *page[1];
 	int npages = 0;
@@ -1146,9 +1146,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 	return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+pfn_t hva_to_pfn_atomic(unsigned long addr)
 {
-	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+	return hva_to_pfn(addr, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
@@ -1166,7 +1166,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+	return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,11 +1195,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-			 struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+	return hva_to_pfn(addr, false, NULL, true, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -1580,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ * (preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beiginning and cleared at the end of interception/PLE handler.
+ *
+ * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ * chance last time (mostly it has become eligible now since we have probably
+ * yielded to lockholder in last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ * to preempted lock-holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority for a potential lock-holder increases lock
+ * progress.
+ *
+ * Since algorithm is based on heuristics, accessing another VCPU data without
+ * locking does not harm. It may result in trying to yield to same VCPU, fail
+ * and continue with next VCPU and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+	bool eligible;
+
+	eligible = !vcpu->spin_loop.in_spin_loop ||
+			(vcpu->spin_loop.in_spin_loop &&
+			 vcpu->spin_loop.dy_eligible);
+
+	if (vcpu->spin_loop.in_spin_loop)
+		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+	return eligible;
+}
+#endif
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
 	struct kvm *kvm = me->kvm;
@@ -1589,6 +1625,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 	int pass;
 	int i;
 
+	kvm_vcpu_set_in_spin_loop(me, true);
 	/*
 	 * We boost the priority of a VCPU that is runnable but not
 	 * currently running, because it got preempted by something
@@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 				continue;
 			if (waitqueue_active(&vcpu->wq))
 				continue;
+			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+				continue;
 			if (kvm_vcpu_yield_to(vcpu)) {
 				kvm->last_boosted_vcpu = i;
 				yielded = 1;
@@ -1614,6 +1653,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 			}
 		}
 	}
+	kvm_vcpu_set_in_spin_loop(me, false);
+
+	/* Ensure vcpu is not eligible during next spinloop */
+	kvm_vcpu_set_dy_eligible(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
@@ -2697,9 +2740,6 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 };
 
-struct page *bad_page;
-pfn_t bad_pfn;
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
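
Finally, the directed-yield eligibility heuristic added to kvm_main.c above can be exercised in isolation with a small userspace model (not kernel code; the struct below only carries the two new flags): a vcpu that is itself in a spin loop is reported eligible only on every other check, because dy_eligible is toggled on each query, which is what keeps the boosting loop from repeatedly yielding to other spinners instead of the likely lock holder.

/* Standalone model (not kernel code) of the dy_eligible toggle heuristic. */
#include <stdbool.h>
#include <stdio.h>

struct vcpu {
	bool in_spin_loop;
	bool dy_eligible;
};

static bool eligible_for_directed_yield(struct vcpu *v)
{
	/* !in_spin_loop || (in_spin_loop && dy_eligible), simplified */
	bool eligible = !v->in_spin_loop || v->dy_eligible;

	if (v->in_spin_loop)
		v->dy_eligible = !v->dy_eligible;	/* toggle for next check */
	return eligible;
}

int main(void)
{
	struct vcpu idle = { false, false };		/* not spinning */
	struct vcpu spinning = { true, false };		/* in the PLE handler */
	int i;

	printf("idle vcpu eligible: %d\n", eligible_for_directed_yield(&idle));
	for (i = 0; i < 4; i++)
		printf("spinning vcpu, check %d: eligible = %d\n",
		       i, eligible_for_directed_yield(&spinning));
	return 0;
}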