Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:

 - PPC and ARM bugfixes from submaintainers
 - Fix old Windows versions on AMD (recent regression)
 - Fix old Linux versions on processors without EPT
 - Fixes for LAPIC timer optimizations

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (21 commits)
  KVM: nVMX: Fix size checks in vmx_set_nested_state
  KVM: selftests: make hyperv_cpuid test pass on AMD
  KVM: lapic: Check for in-kernel LAPIC before deferencing apic pointer
  KVM: fix KVM_CLEAR_DIRTY_LOG for memory slots of unaligned size
  x86/kvm/mmu: reset MMU context when 32-bit guest switches PAE
  KVM: x86: Whitelist port 0x7e for pre-incrementing %rip
  Documentation: kvm: fix dirty log ioctl arch lists
  KVM: VMX: Move RSB stuffing to before the first RET after VM-Exit
  KVM: arm/arm64: Don't emulate virtual timers on userspace ioctls
  kvm: arm: Skip stage2 huge mappings for unaligned ipa backed by THP
  KVM: arm/arm64: Ensure vcpu target is unset on reset failure
  KVM: lapic: Convert guest TSC to host time domain if necessary
  KVM: lapic: Allow user to disable adaptive tuning of timer advancement
  KVM: lapic: Track lapic timer advance per vCPU
  KVM: lapic: Disable timer advancement if adaptive tuning goes haywire
  x86: kvm: hyper-v: deal with buggy TLB flush requests from WS2012
  KVM: x86: Consider LAPIC TSC-Deadline timer expired if deadline too short
  KVM: PPC: Book3S: Protect memslots while validating user address
  KVM: PPC: Book3S HV: Perserve PSSCR FAKE_SUSPEND bit on guest exit
  KVM: arm/arm64: vgic-v3: Retire pending interrupts on disabling LPIs
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-03 19:49:46 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-03 19:49:46 -0400
commit: aa1be08f52585fe36ecfaf5bddfdc784eb4c94cf (patch)
tree: bb8a647ba51f6990c880234c32c7ffe0cc8ec826
parent: 82463436a7fa40345c6febf0baa4c954af506ca6 (diff)
parent: e8ab8d24b488632d07ce5ddb261f1d454114415b (diff)
23 files changed, 192 insertions, 65 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 67068c47c591..64b38dfcc243 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -321,7 +321,7 @@ cpu's hardware control block.
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 Capability: basic
-Architectures: x86
+Architectures: all
 Type: vm ioctl
 Parameters: struct kvm_dirty_log (in/out)
 Returns: 0 on success, -1 on error
@@ -3810,7 +3810,7 @@ to I/O ports.
 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
 Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
-Architectures: x86
+Architectures: x86, arm, arm64, mips
 Type: vm ioctl
 Parameters: struct kvm_dirty_log (in)
 Returns: 0 on success, -1 on error
@@ -3830,8 +3830,9 @@ The ioctl clears the dirty status of pages in a memory slot, according to
 the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
 field.  Bit 0 of the bitmap corresponds to page "first_page" in the
 memory slot, and num_pages is the size in bits of the input bitmap.
-Both first_page and num_pages must be a multiple of 64.  For each bit
+first_page must be a multiple of 64; num_pages must also be a multiple of
-that is set in the input bitmap, the corresponding page is marked "clean"
+64 unless first_page + num_pages is the size of the memory slot.  For each
+bit that is set in the input bitmap, the corresponding page is marked "clean"
 in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
 (for example via write-protection, or by clearing the dirty bit in
 a page table entry).
@@ -4799,7 +4800,7 @@ and injected exceptions.
 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
-Architectures: all
+Architectures: x86, arm, arm64, mips
 Parameters: args[0] whether feature should be enabled or not
 With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..f100e331e69b 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        if (ret != H_SUCCESS)
                return ret;
+        idx = srcu_read_lock(&vcpu->kvm->srcu);
        ret = kvmppc_tce_validate(stt, tce);
        if (ret != H_SUCCESS)
-                return ret;
+                goto unlock_exit;
        dir = iommu_tce_direction(tce);
-        idx = srcu_read_lock(&vcpu->kvm->srcu);
        if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
                ret = H_PARAMETER;
                goto unlock_exit;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06964350b97a..b2b29d4f9842 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
        vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
        vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
-        mtspr(SPRN_PSSCR, host_psscr);
+        /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+        mtspr(SPRN_PSSCR, host_psscr |
+              (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
        mtspr(SPRN_HFSCR, host_hfscr);
        mtspr(SPRN_CIABR, host_ciabr);
        mtspr(SPRN_DAWR, host_dawr);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a9d03af34030..c79abe7ca093 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -295,6 +295,7 @@ union kvm_mmu_extended_role {
                unsigned int valid:1;
                unsigned int execonly:1;
                unsigned int cr0_pg:1;
+                unsigned int cr4_pae:1;
                unsigned int cr4_pse:1;
                unsigned int cr4_pke:1;
                unsigned int cr4_smap:1;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index dabfcf7c3941..7a0e64ccd6ff 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -381,6 +381,7 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED   (1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED     (1 << 1)
 #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE   (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP    (1 << 3)
 #define KVM_STATE_NESTED_GUEST_MODE     0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING    0x00000002
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 421899f6ad7b..cc24b3a32c44 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
                valid_bank_mask = BIT_ULL(0);
                sparse_banks[0] = flush.processor_mask;
-                all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+                /*
+                 * Work around possible WS2012 bug: it sends hypercalls
+                 * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+                 * while also expecting us to flush something and crashing if
+                 * we don't. Let's treat processor_mask == 0 same as
+                 * HV_FLUSH_ALL_PROCESSORS.
+                 */
+                all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+                        flush.processor_mask == 0;
        } else {
                if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
                                            sizeof(flush_ex))))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9bf70cf84564..bd13fdddbdc4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,7 +70,6 @@
 #define APIC_BROADCAST                  0xFF
 #define X2APIC_BROADCAST                0xFFFFFFFFul
-static bool lapic_timer_advance_adjust_done = false;
 #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1482,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
        return false;
 }
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+        u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+        /*
+         * If the guest TSC is running at a different ratio than the host, then
+         * convert the delay to nanoseconds to achieve an accurate delay.  Note
+         * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+         * always for VMX enabled hardware.
+         */
+        if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+                __delay(min(guest_cycles,
+                        nsec_to_cycles(vcpu, timer_advance_ns)));
+        } else {
+                u64 delay_ns = guest_cycles * 1000000ULL;
+                do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+                ndelay(min_t(u32, delay_ns, timer_advance_ns));
+        }
+}
 void wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
+        u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
        u64 guest_tsc, tsc_deadline, ns;
-        if (!lapic_in_kernel(vcpu))
-                return;
        if (apic->lapic_timer.expired_tscdeadline == 0)
                return;
@@ -1501,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
        guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
        trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
-        /* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
        if (guest_tsc < tsc_deadline)
-                __delay(min(tsc_deadline - guest_tsc,
+                __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
-                        nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
-        if (!lapic_timer_advance_adjust_done) {
+        if (!apic->lapic_timer.timer_advance_adjust_done) {
                /* too early */
                if (guest_tsc < tsc_deadline) {
                        ns = (tsc_deadline - guest_tsc) * 1000000ULL;
                        do_div(ns, vcpu->arch.virtual_tsc_khz);
-                        lapic_timer_advance_ns -= min((unsigned int)ns,
+                        timer_advance_ns -= min((u32)ns,
-                                lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+                                timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
                } else {
                /* too late */
                        ns = (guest_tsc - tsc_deadline) * 1000000ULL;
                        do_div(ns, vcpu->arch.virtual_tsc_khz);
-                        lapic_timer_advance_ns += min((unsigned int)ns,
+                        timer_advance_ns += min((u32)ns,
-                                lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+                                timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
                }
                if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-                        lapic_timer_advance_adjust_done = true;
+                        apic->lapic_timer.timer_advance_adjust_done = true;
+                if (unlikely(timer_advance_ns > 5000)) {
+                        timer_advance_ns = 0;
+                        apic->lapic_timer.timer_advance_adjust_done = true;
+                }
+                apic->lapic_timer.timer_advance_ns = timer_advance_ns;
        }
 }
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
-        u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+        struct kvm_timer *ktimer = &apic->lapic_timer;
+        u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
        u64 ns = 0;
        ktime_t expire;
        struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1542,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
        now = ktime_get();
        guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-        if (likely(tscdeadline > guest_tsc)) {
-                ns = (tscdeadline - guest_tsc) * 1000000ULL;
+        ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                do_div(ns, this_tsc_khz);
+        do_div(ns, this_tsc_khz);
+        if (likely(tscdeadline > guest_tsc) &&
+            likely(ns > apic->lapic_timer.timer_advance_ns)) {
                expire = ktime_add_ns(now, ns);
-                expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+                expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
-                hrtimer_start(&apic->lapic_timer.timer,
+                hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
-                                expire, HRTIMER_MODE_ABS_PINNED);
        } else
                apic_timer_expired(apic);
@@ -2255,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
                return HRTIMER_NORESTART;
 }
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 {
        struct kvm_lapic *apic;
@@ -2279,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_ABS_PINNED);
        apic->lapic_timer.timer.function = apic_timer_fn;
+        if (timer_advance_ns == -1) {
+                apic->lapic_timer.timer_advance_ns = 1000;
+                apic->lapic_timer.timer_advance_adjust_done = false;
+        } else {
+                apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+                apic->lapic_timer.timer_advance_adjust_done = true;
+        }
        /*
         * APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..d6d049ba3045 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,8 +31,10 @@ struct kvm_timer {
        u32 timer_mode_mask;
        u64 tscdeadline;
        u64 expired_tscdeadline;
+        u32 timer_advance_ns;
        atomic_t pending;                       /* accumulated triggered timers */
        bool hv_timer_in_use;
+        bool timer_advance_adjust_done;
 };
 struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
 struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e10962dfc203..d9c7b45d231f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4781,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
        union kvm_mmu_extended_role ext = {0};
        ext.cr0_pg = !!is_paging(vcpu);
+        ext.cr4_pae = !!is_pae(vcpu);
        ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
        ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
        ext.cr4_pse = !!is_pse(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6401eb7ef19c..0c601d079cd2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5423,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
                return ret;
        /* Empty 'VMXON' state is permitted */
-        if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+        if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
                return 0;
        if (kvm_state->vmx.vmcs_pa != -1ull) {
@@ -5467,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
            vmcs12->vmcs_link_pointer != -1ull) {
                struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
-                if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+                if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
                        return -EINVAL;
                if (copy_from_user(shadow_vmcs12,
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 7b272738c576..d4cb1945b2e3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -3,6 +3,7 @@
 #include <asm/asm.h>
 #include <asm/bitsperlong.h>
 #include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
 #define WORD_SIZE (BITS_PER_LONG / 8)
@@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter)
 * referred to by VMCS.HOST_RIP.
 */
 ENTRY(vmx_vmexit)
+#ifdef CONFIG_RETPOLINE
+        ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+        /* Preserve guest's RAX, it's used to stuff the RSB. */
+        push %_ASM_AX
+        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+        FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+        pop %_ASM_AX
+.Lvmexit_skip_rsb:
+#endif
        ret
 ENDPROC(vmx_vmexit)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b4e7d645275a..0c955bb286ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6462,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
-        /* Eliminate branch target predictions from guest mode */
-        vmexit_fill_RSB();
        /* All fields are clean at this point */
        if (static_branch_unlikely(&enable_evmcs))
                current_evmcs->hv_clean_fields |=
@@ -7032,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
        struct vcpu_vmx *vmx;
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+        struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
        if (kvm_mwait_in_guest(vcpu->kvm))
                return -EOPNOTSUPP;
@@ -7040,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
-        lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+        lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+                                                    ktimer->timer_advance_ns);
        if (delta_tsc > lapic_timer_advance_cycles)
                delta_tsc -= lapic_timer_advance_cycles;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0d1fc80ac5a..b5edc8e3ce1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+/*
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
+ * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
+ * adaptive tuning starting from default advancement of 1000ns.  '0' disables
+ * advancement entirely.  Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -6535,6 +6539,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+        vcpu->arch.pio.count = 0;
+        return 1;
+}
 static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.pio.count = 0;
@@ -6551,12 +6561,23 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
        unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
                                            size, port, &val, 1);
+        if (ret)
+                return ret;
-        if (!ret) {
+        /*
+         * Workaround userspace that relies on old KVM behavior of %rip being
+         * incremented prior to exiting to userspace to handle "OUT 0x7e".
+         */
+        if (port == 0x7e &&
+            kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+                vcpu->arch.complete_userspace_io =
+                        complete_fast_pio_out_port_0x7e;
+                kvm_skip_emulated_instruction(vcpu);
+        } else {
                vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
                vcpu->arch.complete_userspace_io = complete_fast_pio_out;
        }
-        return ret;
+        return 0;
 }
 static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
@@ -7873,7 +7894,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
        trace_kvm_entry(vcpu->vcpu_id);
-        if (lapic_timer_advance_ns)
+        if (lapic_in_kernel(vcpu) &&
+            vcpu->arch.apic->lapic_timer.timer_advance_ns)
                wait_lapic_expire(vcpu);
        guest_enter_irqoff();
@@ -9061,7 +9083,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        if (irqchip_in_kernel(vcpu->kvm)) {
                vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
-                r = kvm_create_lapic(vcpu);
+                r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
                if (r < 0)
                        goto fail_mmu_destroy;
        } else
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index aedc5d0d4989..534d3f28bb01 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
 extern unsigned int min_timer_period_us;
-extern unsigned int lapic_timer_advance_ns;
 extern bool enable_vmware_backdoor;
 extern struct static_key kvm_no_apic_vcpu;
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 4715cfba20dc..93f99c6b7d79 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -288,8 +288,11 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 #endif
        max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
        guest_page_size = (1ul << guest_page_shift);
-        /* 1G of guest page sized pages */
+        /*
-        guest_num_pages = (1ul << (30 - guest_page_shift));
+         * A little more than 1G of guest page sized pages.  Cover the
+         * case where the size is not aligned to 64 pages.
+         */
+        guest_num_pages = (1ul << (30 - guest_page_shift)) + 3;
        host_page_size = getpagesize();
        host_num_pages = (guest_num_pages * guest_page_size) / host_page_size +
                         !!((guest_num_pages * guest_page_size) % host_page_size);
@@ -359,7 +362,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
                kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
 #ifdef USE_CLEAR_DIRTY_LOG
                kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
-                                       DIV_ROUND_UP(host_num_pages, 64) * 64);
+                                       host_num_pages);
 #endif
                vm_dirty_log_verify(bmap);
                iteration++;
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index 264425f75806..9a21e912097c 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -141,7 +141,13 @@ int main(int argc, char *argv[])
        free(hv_cpuid_entries);
-        vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
+        rv = _vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
+        if (rv) {
+                fprintf(stderr,
+                        "Enlightened VMCS is unsupported, skip related test\n");
+                goto vm_free;
+        }
        hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
        if (!hv_cpuid_entries)
@@ -151,6 +157,7 @@ int main(int argc, char *argv[])
        free(hv_cpuid_entries);
+vm_free:
        kvm_vm_free(vm);
        return 0;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 3417f2dbc366..7fc272ecae16 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -508,6 +508,14 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
        struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
        /*
+         * Update the timer output so that it is likely to match the
+         * state we're about to restore. If the timer expires between
+         * this point and the register restoration, we'll take the
+         * interrupt anyway.
+         */
+        kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+        /*
         * When using a userspace irqchip with the architected timers and a
         * host interrupt controller that doesn't support an active state, we
         * must still prevent continuously exiting from the guest, and
@@ -730,7 +738,6 @@ static void kvm_timer_init_interrupt(void *info)
 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 {
        struct arch_timer_context *timer;
-        bool level;
        switch (regid) {
        case KVM_REG_ARM_TIMER_CTL:
@@ -758,10 +765,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
                return -1;
        }
-        level = kvm_timer_should_fire(timer);
-        kvm_timer_update_irq(vcpu, level, timer);
-        timer_emulate(timer);
        return 0;
 }
@@ -812,7 +815,7 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
        switch (treg) {
        case TIMER_REG_TVAL:
-                val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval;
+                val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
                break;
        case TIMER_REG_CTL:
@@ -858,7 +861,7 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
 {
        switch (treg) {
        case TIMER_REG_TVAL:
-                timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff;
+                timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val;
                break;
        case TIMER_REG_CTL:
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 99c37384ba7b..f412ebc90610 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -934,7 +934,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                               const struct kvm_vcpu_init *init)
 {
-        unsigned int i;
+        unsigned int i, ret;
        int phys_target = kvm_target_cpu();
        if (init->target != phys_target)
@@ -969,9 +969,14 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
        vcpu->arch.target = phys_target;
        /* Now we know what it is, we can reset it. */
-        return kvm_reset_vcpu(vcpu);
+        ret = kvm_reset_vcpu(vcpu);
-}
+        if (ret) {
+                vcpu->arch.target = -1;
+                bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+        }
+        return ret;
+}
 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
                                         struct kvm_vcpu_init *init)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 27c958306449..a39dcfdbcc65 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1781,8 +1781,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 * Only PMD_SIZE transparent hugepages(THP) are
                 * currently supported. This code will need to be
                 * updated to support other THP sizes.
+                 *
+                 * Make sure the host VA and the guest IPA are sufficiently
+                 * aligned and that the block is contained within the memslot.
                 */
-                if (transparent_hugepage_adjust(&pfn, &fault_ipa))
+                if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+                    transparent_hugepage_adjust(&pfn, &fault_ipa))
                        vma_pagesize = PMD_SIZE;
        }
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 4a12322bf7df..9f4843fe9cda 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -200,6 +200,9 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
        vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+        if (was_enabled && !vgic_cpu->lpis_enabled)
+                vgic_flush_pending_lpis(vcpu);
        if (!was_enabled && vgic_cpu->lpis_enabled)
                vgic_enable_lpis(vcpu);
 }
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 3af69f2a3866..191deccf60bf 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -151,6 +151,27 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
        kfree(irq);
 }
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+        struct vgic_irq *irq, *tmp;
+        unsigned long flags;
+        raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+        list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+                if (irq->intid >= VGIC_MIN_LPI) {
+                        raw_spin_lock(&irq->irq_lock);
+                        list_del(&irq->ap_list);
+                        irq->vcpu = NULL;
+                        raw_spin_unlock(&irq->irq_lock);
+                        vgic_put_irq(vcpu->kvm, irq);
+                }
+        }
+        raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
 {
        WARN_ON(irq_set_irqchip_state(irq->host_irq,
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index a90024718ca4..abeeffabc456 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -238,6 +238,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu);
 bool vgic_has_its(struct kvm *kvm);
 int kvm_vgic_register_its_device(void);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dc8edc97ba85..a704d1f9bd96 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1240,7 +1240,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
-        if ((log->first_page & 63) || (log->num_pages & 63))
+        if (log->first_page & 63)
                return -EINVAL;
        slots = __kvm_memslots(kvm, as_id);
@@ -1253,8 +1253,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
        n = kvm_dirty_bitmap_bytes(memslot);
        if (log->first_page > memslot->npages ||
-            log->num_pages > memslot->npages - log->first_page)
+            log->num_pages > memslot->npages - log->first_page ||
-                        return -EINVAL;
+            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+            return -EINVAL;
        *flush = false;
        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
author	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-03 19:49:46 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-03 19:49:46 -0400
commit	aa1be08f52585fe36ecfaf5bddfdc784eb4c94cf (patch)
tree	bb8a647ba51f6990c880234c32c7ffe0cc8ec826
parent	82463436a7fa40345c6febf0baa4c954af506ca6 (diff)
parent	e8ab8d24b488632d07ce5ddb261f1d454114415b (diff)