aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-06-24 12:36:49 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-06-24 12:36:49 -0400
commit4e241557fc1cb560bd9e77ca1b4a9352732a5427 (patch)
treeda4dbe5e5b3a8792daf9ed7e6bd320c56c86d252
parent08d183e3c1f650b4db1d07d764502116861542fa (diff)
parentf2ae45edbca7ba5324eef01719ede0151dc5cead (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull first batch of KVM updates from Paolo Bonzini: "The bulk of the changes here is for x86. And for once it's not for silicon that no one owns: these are really new features for everyone. Details: - ARM: several features are in progress but missed the 4.2 deadline. So here is just a smattering of bug fixes, plus enabling the VFIO integration. - s390: Some fixes/refactorings/optimizations, plus support for 2GB pages. - x86: * host and guest support for marking kvmclock as a stable scheduler clock. * support for write combining. * support for system management mode, needed for secure boot in guests. * a bunch of cleanups required for the above * support for virtualized performance counters on AMD * legacy PCI device assignment is deprecated and defaults to "n" in Kconfig; VFIO replaces it On top of this there are also bug fixes and eager FPU context loading for FPU-heavy guests. - Common code: Support for multiple address spaces; for now it is used only for x86 SMM but the s390 folks also have plans" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (124 commits) KVM: s390: clear floating interrupt bitmap and parameters KVM: x86/vPMU: Enable PMU handling for AMD PERFCTRn and EVNTSELn MSRs KVM: x86/vPMU: Implement AMD vPMU code for KVM KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch KVM: x86/vPMU: introduce kvm_pmu_msr_idx_to_pmc KVM: x86/vPMU: reorder PMU functions KVM: x86/vPMU: whitespace and stylistic adjustments in PMU code KVM: x86/vPMU: use the new macros to go between PMC, PMU and VCPU KVM: x86/vPMU: introduce pmu.h header KVM: x86/vPMU: rename a few PMU functions KVM: MTRR: do not map huge page for non-consistent range KVM: MTRR: simplify kvm_mtrr_get_guest_memory_type KVM: MTRR: introduce mtrr_for_each_mem_type KVM: MTRR: introduce fixed_mtrr_addr_* functions KVM: MTRR: sort variable MTRRs KVM: MTRR: introduce var_mtrr_range KVM: MTRR: introduce fixed_mtrr_segment table KVM: MTRR: improve kvm_mtrr_get_guest_memory_type KVM: MTRR: do not split 64 bits MSR content KVM: MTRR: clean up mtrr default type ...
-rw-r--r--Documentation/virtual/kvm/api.txt69
-rw-r--r--Documentation/virtual/kvm/mmu.txt6
-rw-r--r--arch/arm/kvm/Kconfig1
-rw-r--r--arch/arm/kvm/Makefile2
-rw-r--r--arch/arm/kvm/arm.c24
-rw-r--r--arch/arm/kvm/interrupts.S10
-rw-r--r--arch/arm/kvm/interrupts_head.S23
-rw-r--r--arch/arm/kvm/mmu.c14
-rw-r--r--arch/arm/kvm/psci.c16
-rw-r--r--arch/arm64/kvm/Kconfig1
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/hyp.S8
-rw-r--r--arch/arm64/kvm/vgic-v2-switch.S3
-rw-r--r--arch/arm64/kvm/vgic-v3-switch.S2
-rw-r--r--arch/mips/include/asm/kvm_host.h2
-rw-r--r--arch/mips/kvm/mips.c13
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h2
-rw-r--r--arch/powerpc/include/asm/kvm_host.h2
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h14
-rw-r--r--arch/powerpc/kvm/book3s.c9
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c15
-rw-r--r--arch/powerpc/kvm/book3s_pr.c11
-rw-r--r--arch/powerpc/kvm/booke.c13
-rw-r--r--arch/powerpc/kvm/powerpc.c9
-rw-r--r--arch/s390/include/asm/kvm_host.h6
-rw-r--r--arch/s390/kernel/entry.S2
-rw-r--r--arch/s390/kvm/intercept.c16
-rw-r--r--arch/s390/kvm/interrupt.c90
-rw-r--r--arch/s390/kvm/kvm-s390.c81
-rw-r--r--arch/s390/kvm/kvm-s390.h25
-rw-r--r--arch/s390/kvm/priv.c8
-rw-r--r--arch/x86/include/asm/kvm_emulate.h9
-rw-r--r--arch/x86/include/asm/kvm_host.h92
-rw-r--r--arch/x86/include/asm/pvclock-abi.h1
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h14
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kernel/kvmclock.c14
-rw-r--r--arch/x86/kvm/Kconfig9
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/cpuid.c13
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c303
-rw-r--r--arch/x86/kvm/ioapic.c9
-rw-r--r--arch/x86/kvm/irq_comm.c14
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/lapic.c59
-rw-r--r--arch/x86/kvm/lapic.h15
-rw-r--r--arch/x86/kvm/mmu.c678
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu_audit.c20
-rw-r--r--arch/x86/kvm/mtrr.c699
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/pmu.c553
-rw-r--r--arch/x86/kvm/pmu.h118
-rw-r--r--arch/x86/kvm/pmu_amd.c207
-rw-r--r--arch/x86/kvm/pmu_intel.c358
-rw-r--r--arch/x86/kvm/svm.c116
-rw-r--r--arch/x86/kvm/trace.h22
-rw-r--r--arch/x86/kvm/vmx.c363
-rw-r--r--arch/x86/kvm/x86.c890
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--include/linux/kvm_host.h96
-rw-r--r--include/linux/kvm_types.h1
-rw-r--r--include/uapi/linux/kvm.h9
-rw-r--r--virt/kvm/arm/vgic-v3-emul.c56
-rw-r--r--virt/kvm/arm/vgic.c7
-rw-r--r--virt/kvm/async_pf.h4
-rw-r--r--virt/kvm/coalesced_mmio.h4
-rw-r--r--virt/kvm/irqchip.c41
-rw-r--r--virt/kvm/kvm_main.c432
72 files changed, 4077 insertions, 1702 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9fa2bf8c3f6f..a7926a90156f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -254,6 +254,11 @@ since the last call to this ioctl. Bit 0 is the first page in the
254memory slot. Ensure the entire structure is cleared to avoid padding 254memory slot. Ensure the entire structure is cleared to avoid padding
255issues. 255issues.
256 256
257If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
258the address space for which you want to return the dirty bitmap.
259They must be less than the value that KVM_CHECK_EXTENSION returns for
260the KVM_CAP_MULTI_ADDRESS_SPACE capability.
261
257 262
2584.9 KVM_SET_MEMORY_ALIAS 2634.9 KVM_SET_MEMORY_ALIAS
259 264
@@ -820,11 +825,21 @@ struct kvm_vcpu_events {
820 } nmi; 825 } nmi;
821 __u32 sipi_vector; 826 __u32 sipi_vector;
822 __u32 flags; 827 __u32 flags;
828 struct {
829 __u8 smm;
830 __u8 pending;
831 __u8 smm_inside_nmi;
832 __u8 latched_init;
833 } smi;
823}; 834};
824 835
825KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that 836Only two fields are defined in the flags field:
826interrupt.shadow contains a valid state. Otherwise, this field is undefined. 837
838- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
839 interrupt.shadow contains a valid state.
827 840
841- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
842 smi contains a valid state.
828 843
8294.32 KVM_SET_VCPU_EVENTS 8444.32 KVM_SET_VCPU_EVENTS
830 845
@@ -841,17 +856,20 @@ vcpu.
841See KVM_GET_VCPU_EVENTS for the data structure. 856See KVM_GET_VCPU_EVENTS for the data structure.
842 857
843Fields that may be modified asynchronously by running VCPUs can be excluded 858Fields that may be modified asynchronously by running VCPUs can be excluded
844from the update. These fields are nmi.pending and sipi_vector. Keep the 859from the update. These fields are nmi.pending, sipi_vector, smi.smm,
845corresponding bits in the flags field cleared to suppress overwriting the 860smi.pending. Keep the corresponding bits in the flags field cleared to
846current in-kernel state. The bits are: 861suppress overwriting the current in-kernel state. The bits are:
847 862
848KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel 863KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
849KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector 864KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
865KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct.
850 866
851If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in 867If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
852the flags field to signal that interrupt.shadow contains a valid state and 868the flags field to signal that interrupt.shadow contains a valid state and
853shall be written into the VCPU. 869shall be written into the VCPU.
854 870
871KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
872
855 873
8564.33 KVM_GET_DEBUGREGS 8744.33 KVM_GET_DEBUGREGS
857 875
@@ -911,6 +929,13 @@ slot. When changing an existing slot, it may be moved in the guest
911physical memory space, or its flags may be modified. It may not be 929physical memory space, or its flags may be modified. It may not be
912resized. Slots may not overlap in guest physical address space. 930resized. Slots may not overlap in guest physical address space.
913 931
932If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot"
933specifies the address space which is being modified. They must be
934less than the value that KVM_CHECK_EXTENSION returns for the
935KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces
936are unrelated; the restriction on overlapping slots only applies within
937each address space.
938
914Memory for the region is taken starting at the address denoted by the 939Memory for the region is taken starting at the address denoted by the
915field userspace_addr, which must point at user addressable memory for 940field userspace_addr, which must point at user addressable memory for
916the entire memory slot size. Any object may back this memory, including 941the entire memory slot size. Any object may back this memory, including
@@ -959,7 +984,8 @@ documentation when it pops into existence).
9594.37 KVM_ENABLE_CAP 9844.37 KVM_ENABLE_CAP
960 985
961Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM 986Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
962Architectures: ppc, s390 987Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
988 mips (only KVM_CAP_ENABLE_CAP), ppc, s390
963Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) 989Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
964Parameters: struct kvm_enable_cap (in) 990Parameters: struct kvm_enable_cap (in)
965Returns: 0 on success; -1 on error 991Returns: 0 on success; -1 on error
@@ -1268,7 +1294,7 @@ The flags bitmap is defined as:
1268 /* the host supports the ePAPR idle hcall 1294 /* the host supports the ePAPR idle hcall
1269 #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) 1295 #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0)
1270 1296
12714.48 KVM_ASSIGN_PCI_DEVICE 12974.48 KVM_ASSIGN_PCI_DEVICE (deprecated)
1272 1298
1273Capability: none 1299Capability: none
1274Architectures: x86 1300Architectures: x86
@@ -1318,7 +1344,7 @@ Errors:
1318 have their standard meanings. 1344 have their standard meanings.
1319 1345
1320 1346
13214.49 KVM_DEASSIGN_PCI_DEVICE 13474.49 KVM_DEASSIGN_PCI_DEVICE (deprecated)
1322 1348
1323Capability: none 1349Capability: none
1324Architectures: x86 1350Architectures: x86
@@ -1337,7 +1363,7 @@ Errors:
1337 Other error conditions may be defined by individual device types or 1363 Other error conditions may be defined by individual device types or
1338 have their standard meanings. 1364 have their standard meanings.
1339 1365
13404.50 KVM_ASSIGN_DEV_IRQ 13664.50 KVM_ASSIGN_DEV_IRQ (deprecated)
1341 1367
1342Capability: KVM_CAP_ASSIGN_DEV_IRQ 1368Capability: KVM_CAP_ASSIGN_DEV_IRQ
1343Architectures: x86 1369Architectures: x86
@@ -1377,7 +1403,7 @@ Errors:
1377 have their standard meanings. 1403 have their standard meanings.
1378 1404
1379 1405
13804.51 KVM_DEASSIGN_DEV_IRQ 14064.51 KVM_DEASSIGN_DEV_IRQ (deprecated)
1381 1407
1382Capability: KVM_CAP_ASSIGN_DEV_IRQ 1408Capability: KVM_CAP_ASSIGN_DEV_IRQ
1383Architectures: x86 1409Architectures: x86
@@ -1451,7 +1477,7 @@ struct kvm_irq_routing_s390_adapter {
1451}; 1477};
1452 1478
1453 1479
14544.53 KVM_ASSIGN_SET_MSIX_NR 14804.53 KVM_ASSIGN_SET_MSIX_NR (deprecated)
1455 1481
1456Capability: none 1482Capability: none
1457Architectures: x86 1483Architectures: x86
@@ -1473,7 +1499,7 @@ struct kvm_assigned_msix_nr {
1473#define KVM_MAX_MSIX_PER_DEV 256 1499#define KVM_MAX_MSIX_PER_DEV 256
1474 1500
1475 1501
14764.54 KVM_ASSIGN_SET_MSIX_ENTRY 15024.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated)
1477 1503
1478Capability: none 1504Capability: none
1479Architectures: x86 1505Architectures: x86
@@ -1629,7 +1655,7 @@ should skip processing the bitmap and just invalidate everything. It must
1629be set to the number of set bits in the bitmap. 1655be set to the number of set bits in the bitmap.
1630 1656
1631 1657
16324.61 KVM_ASSIGN_SET_INTX_MASK 16584.61 KVM_ASSIGN_SET_INTX_MASK (deprecated)
1633 1659
1634Capability: KVM_CAP_PCI_2_3 1660Capability: KVM_CAP_PCI_2_3
1635Architectures: x86 1661Architectures: x86
@@ -2978,6 +3004,16 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
2978and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), 3004and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
2979which is the maximum number of possibly pending cpu-local interrupts. 3005which is the maximum number of possibly pending cpu-local interrupts.
2980 3006
30074.90 KVM_SMI
3008
3009Capability: KVM_CAP_X86_SMM
3010Architectures: x86
3011Type: vcpu ioctl
3012Parameters: none
3013Returns: 0 on success, -1 on error
3014
3015Queues an SMI on the thread's vcpu.
3016
29815. The kvm_run structure 30175. The kvm_run structure
2982------------------------ 3018------------------------
2983 3019
@@ -3013,7 +3049,12 @@ an interrupt can be injected now with KVM_INTERRUPT.
3013The value of the current interrupt flag. Only valid if in-kernel 3049The value of the current interrupt flag. Only valid if in-kernel
3014local APIC is not used. 3050local APIC is not used.
3015 3051
3016 __u8 padding2[2]; 3052 __u16 flags;
3053
3054More architecture-specific flags detailing state of the VCPU that may
3055affect the device's behavior. The only currently defined flag is
3056KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the
3057VCPU is in system management mode.
3017 3058
3018 /* in (pre_kvm_run), out (post_kvm_run) */ 3059 /* in (pre_kvm_run), out (post_kvm_run) */
3019 __u64 cr8; 3060 __u64 cr8;
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index c59bd9bc41ef..3a4d681c3e98 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -173,6 +173,12 @@ Shadow pages contain the following information:
173 Contains the value of cr4.smap && !cr0.wp for which the page is valid 173 Contains the value of cr4.smap && !cr0.wp for which the page is valid
174 (pages for which this is true are different from other pages; see the 174 (pages for which this is true are different from other pages; see the
175 treatment of cr0.wp=0 below). 175 treatment of cr0.wp=0 below).
176 role.smm:
177 Is 1 if the page is valid in system management mode. This field
178 determines which of the kvm_memslots array was used to build this
179 shadow page; it is also used to go back from a struct kvm_mmu_page
180 to a memslot, through the kvm_memslots_for_spte_role macro and
181 __gfn_to_memslot.
176 gfn: 182 gfn:
177 Either the guest page table containing the translations shadowed by this 183 Either the guest page table containing the translations shadowed by this
178 page, or the base page frame for linear translations. See role.direct. 184 page, or the base page frame for linear translations. See role.direct.
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index f1f79d104309..bfb915d05665 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 28 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
29 select SRCU 29 select SRCU
30 select MMU_NOTIFIER 30 select MMU_NOTIFIER
31 select KVM_VFIO
31 select HAVE_KVM_EVENTFD 32 select HAVE_KVM_EVENTFD
32 select HAVE_KVM_IRQFD 33 select HAVE_KVM_IRQFD
33 depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER 34 depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 139e46c08b6e..c5eef02c52ba 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -15,7 +15,7 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
15AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) 15AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
16 16
17KVM := ../../../virt/kvm 17KVM := ../../../virt/kvm
18kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o 18kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
19 19
20obj-y += kvm-arm.o init.o interrupts.o 20obj-y += kvm-arm.o init.o interrupts.o
21obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o 21obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d9631ecddd56..bc738d2b8392 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -171,7 +171,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
171 int r; 171 int r;
172 switch (ext) { 172 switch (ext) {
173 case KVM_CAP_IRQCHIP: 173 case KVM_CAP_IRQCHIP:
174 case KVM_CAP_IRQFD:
175 case KVM_CAP_IOEVENTFD: 174 case KVM_CAP_IOEVENTFD:
176 case KVM_CAP_DEVICE_CTRL: 175 case KVM_CAP_DEVICE_CTRL:
177 case KVM_CAP_USER_MEMORY: 176 case KVM_CAP_USER_MEMORY:
@@ -532,6 +531,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
532 kvm_vgic_flush_hwstate(vcpu); 531 kvm_vgic_flush_hwstate(vcpu);
533 kvm_timer_flush_hwstate(vcpu); 532 kvm_timer_flush_hwstate(vcpu);
534 533
534 preempt_disable();
535 local_irq_disable(); 535 local_irq_disable();
536 536
537 /* 537 /*
@@ -544,6 +544,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
544 544
545 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { 545 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
546 local_irq_enable(); 546 local_irq_enable();
547 preempt_enable();
547 kvm_timer_sync_hwstate(vcpu); 548 kvm_timer_sync_hwstate(vcpu);
548 kvm_vgic_sync_hwstate(vcpu); 549 kvm_vgic_sync_hwstate(vcpu);
549 continue; 550 continue;
@@ -553,14 +554,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
553 * Enter the guest 554 * Enter the guest
554 */ 555 */
555 trace_kvm_entry(*vcpu_pc(vcpu)); 556 trace_kvm_entry(*vcpu_pc(vcpu));
556 kvm_guest_enter(); 557 __kvm_guest_enter();
557 vcpu->mode = IN_GUEST_MODE; 558 vcpu->mode = IN_GUEST_MODE;
558 559
559 ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); 560 ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
560 561
561 vcpu->mode = OUTSIDE_GUEST_MODE; 562 vcpu->mode = OUTSIDE_GUEST_MODE;
562 kvm_guest_exit(); 563 /*
563 trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 564 * Back from guest
565 *************************************************************/
566
564 /* 567 /*
565 * We may have taken a host interrupt in HYP mode (ie 568 * We may have taken a host interrupt in HYP mode (ie
566 * while executing the guest). This interrupt is still 569 * while executing the guest). This interrupt is still
@@ -574,8 +577,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
574 local_irq_enable(); 577 local_irq_enable();
575 578
576 /* 579 /*
577 * Back from guest 580 * We do local_irq_enable() before calling kvm_guest_exit() so
578 *************************************************************/ 581 * that if a timer interrupt hits while running the guest we
582 * account that tick as being spent in the guest. We enable
583 * preemption after calling kvm_guest_exit() so that if we get
584 * preempted we make sure ticks after that is not counted as
585 * guest time.
586 */
587 kvm_guest_exit();
588 trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
589 preempt_enable();
590
579 591
580 kvm_timer_sync_hwstate(vcpu); 592 kvm_timer_sync_hwstate(vcpu);
581 kvm_vgic_sync_hwstate(vcpu); 593 kvm_vgic_sync_hwstate(vcpu);
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 79caf79b304a..f7db3a5d80e3 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -170,13 +170,9 @@ __kvm_vcpu_return:
170 @ Don't trap coprocessor accesses for host kernel 170 @ Don't trap coprocessor accesses for host kernel
171 set_hstr vmexit 171 set_hstr vmexit
172 set_hdcr vmexit 172 set_hdcr vmexit
173 set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11)) 173 set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11)), after_vfp_restore
174 174
175#ifdef CONFIG_VFPv3 175#ifdef CONFIG_VFPv3
176 @ Save floating point registers we if let guest use them.
177 tst r2, #(HCPTR_TCP(10) | HCPTR_TCP(11))
178 bne after_vfp_restore
179
180 @ Switch VFP/NEON hardware state to the host's 176 @ Switch VFP/NEON hardware state to the host's
181 add r7, vcpu, #VCPU_VFP_GUEST 177 add r7, vcpu, #VCPU_VFP_GUEST
182 store_vfp_state r7 178 store_vfp_state r7
@@ -188,6 +184,8 @@ after_vfp_restore:
188 @ Restore FPEXC_EN which we clobbered on entry 184 @ Restore FPEXC_EN which we clobbered on entry
189 pop {r2} 185 pop {r2}
190 VFPFMXR FPEXC, r2 186 VFPFMXR FPEXC, r2
187#else
188after_vfp_restore:
191#endif 189#endif
192 190
193 @ Reset Hyp-role 191 @ Reset Hyp-role
@@ -483,7 +481,7 @@ switch_to_guest_vfp:
483 push {r3-r7} 481 push {r3-r7}
484 482
485 @ NEON/VFP used. Turn on VFP access. 483 @ NEON/VFP used. Turn on VFP access.
486 set_hcptr vmexit, (HCPTR_TCP(10) | HCPTR_TCP(11)) 484 set_hcptr vmtrap, (HCPTR_TCP(10) | HCPTR_TCP(11))
487 485
488 @ Switch VFP/NEON hardware state to the guest's 486 @ Switch VFP/NEON hardware state to the guest's
489 add r7, r0, #VCPU_VFP_HOST 487 add r7, r0, #VCPU_VFP_HOST
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 35e4a3a0c476..702740d37465 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -412,7 +412,6 @@ vcpu .req r0 @ vcpu pointer always in r0
412 add r11, vcpu, #VCPU_VGIC_CPU 412 add r11, vcpu, #VCPU_VGIC_CPU
413 413
414 /* Save all interesting registers */ 414 /* Save all interesting registers */
415 ldr r3, [r2, #GICH_HCR]
416 ldr r4, [r2, #GICH_VMCR] 415 ldr r4, [r2, #GICH_VMCR]
417 ldr r5, [r2, #GICH_MISR] 416 ldr r5, [r2, #GICH_MISR]
418 ldr r6, [r2, #GICH_EISR0] 417 ldr r6, [r2, #GICH_EISR0]
@@ -420,7 +419,6 @@ vcpu .req r0 @ vcpu pointer always in r0
420 ldr r8, [r2, #GICH_ELRSR0] 419 ldr r8, [r2, #GICH_ELRSR0]
421 ldr r9, [r2, #GICH_ELRSR1] 420 ldr r9, [r2, #GICH_ELRSR1]
422 ldr r10, [r2, #GICH_APR] 421 ldr r10, [r2, #GICH_APR]
423ARM_BE8(rev r3, r3 )
424ARM_BE8(rev r4, r4 ) 422ARM_BE8(rev r4, r4 )
425ARM_BE8(rev r5, r5 ) 423ARM_BE8(rev r5, r5 )
426ARM_BE8(rev r6, r6 ) 424ARM_BE8(rev r6, r6 )
@@ -429,7 +427,6 @@ ARM_BE8(rev r8, r8 )
429ARM_BE8(rev r9, r9 ) 427ARM_BE8(rev r9, r9 )
430ARM_BE8(rev r10, r10 ) 428ARM_BE8(rev r10, r10 )
431 429
432 str r3, [r11, #VGIC_V2_CPU_HCR]
433 str r4, [r11, #VGIC_V2_CPU_VMCR] 430 str r4, [r11, #VGIC_V2_CPU_VMCR]
434 str r5, [r11, #VGIC_V2_CPU_MISR] 431 str r5, [r11, #VGIC_V2_CPU_MISR]
435#ifdef CONFIG_CPU_ENDIAN_BE8 432#ifdef CONFIG_CPU_ENDIAN_BE8
@@ -591,8 +588,13 @@ ARM_BE8(rev r6, r6 )
591.endm 588.endm
592 589
593/* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return 590/* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return
594 * (hardware reset value is 0). Keep previous value in r2. */ 591 * (hardware reset value is 0). Keep previous value in r2.
595.macro set_hcptr operation, mask 592 * An ISB is emited on vmexit/vmtrap, but executed on vmexit only if
593 * VFP wasn't already enabled (always executed on vmtrap).
594 * If a label is specified with vmexit, it is branched to if VFP wasn't
595 * enabled.
596 */
597.macro set_hcptr operation, mask, label = none
596 mrc p15, 4, r2, c1, c1, 2 598 mrc p15, 4, r2, c1, c1, 2
597 ldr r3, =\mask 599 ldr r3, =\mask
598 .if \operation == vmentry 600 .if \operation == vmentry
@@ -601,6 +603,17 @@ ARM_BE8(rev r6, r6 )
601 bic r3, r2, r3 @ Don't trap defined coproc-accesses 603 bic r3, r2, r3 @ Don't trap defined coproc-accesses
602 .endif 604 .endif
603 mcr p15, 4, r3, c1, c1, 2 605 mcr p15, 4, r3, c1, c1, 2
606 .if \operation != vmentry
607 .if \operation == vmexit
608 tst r2, #(HCPTR_TCP(10) | HCPTR_TCP(11))
609 beq 1f
610 .endif
611 isb
612 .if \label != none
613 b \label
614 .endif
6151:
616 .endif
604.endm 617.endm
605 618
606/* Configures the HDCR (Hyp Debug Configuration Register) on entry/return 619/* Configures the HDCR (Hyp Debug Configuration Register) on entry/return
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1d5accbd3dcf..7b4201294187 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -691,8 +691,8 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
691 * work. This is not used by the hardware and we have no 691 * work. This is not used by the hardware and we have no
692 * alignment requirement for this allocation. 692 * alignment requirement for this allocation.
693 */ 693 */
694 pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), 694 pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
695 GFP_KERNEL | __GFP_ZERO); 695 GFP_KERNEL | __GFP_ZERO);
696 696
697 if (!pgd) { 697 if (!pgd) {
698 kvm_free_hwpgd(hwpgd); 698 kvm_free_hwpgd(hwpgd);
@@ -1155,7 +1155,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1155 */ 1155 */
1156void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1156void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1157{ 1157{
1158 struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); 1158 struct kvm_memslots *slots = kvm_memslots(kvm);
1159 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1159 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; 1160 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1160 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1161 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1161 1162
@@ -1718,8 +1719,9 @@ out:
1718} 1719}
1719 1720
1720void kvm_arch_commit_memory_region(struct kvm *kvm, 1721void kvm_arch_commit_memory_region(struct kvm *kvm,
1721 struct kvm_userspace_memory_region *mem, 1722 const struct kvm_userspace_memory_region *mem,
1722 const struct kvm_memory_slot *old, 1723 const struct kvm_memory_slot *old,
1724 const struct kvm_memory_slot *new,
1723 enum kvm_mr_change change) 1725 enum kvm_mr_change change)
1724{ 1726{
1725 /* 1727 /*
@@ -1733,7 +1735,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
1733 1735
1734int kvm_arch_prepare_memory_region(struct kvm *kvm, 1736int kvm_arch_prepare_memory_region(struct kvm *kvm,
1735 struct kvm_memory_slot *memslot, 1737 struct kvm_memory_slot *memslot,
1736 struct kvm_userspace_memory_region *mem, 1738 const struct kvm_userspace_memory_region *mem,
1737 enum kvm_mr_change change) 1739 enum kvm_mr_change change)
1738{ 1740{
1739 hva_t hva = mem->userspace_addr; 1741 hva_t hva = mem->userspace_addr;
@@ -1838,7 +1840,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1838 return 0; 1840 return 0;
1839} 1841}
1840 1842
1841void kvm_arch_memslots_updated(struct kvm *kvm) 1843void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
1842{ 1844{
1843} 1845}
1844 1846
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 02fa8eff6ae1..531e922486b2 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -230,10 +230,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
230 case PSCI_0_2_FN64_AFFINITY_INFO: 230 case PSCI_0_2_FN64_AFFINITY_INFO:
231 val = kvm_psci_vcpu_affinity_info(vcpu); 231 val = kvm_psci_vcpu_affinity_info(vcpu);
232 break; 232 break;
233 case PSCI_0_2_FN_MIGRATE:
234 case PSCI_0_2_FN64_MIGRATE:
235 val = PSCI_RET_NOT_SUPPORTED;
236 break;
237 case PSCI_0_2_FN_MIGRATE_INFO_TYPE: 233 case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
238 /* 234 /*
239 * Trusted OS is MP hence does not require migration 235 * Trusted OS is MP hence does not require migration
@@ -242,10 +238,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
242 */ 238 */
243 val = PSCI_0_2_TOS_MP; 239 val = PSCI_0_2_TOS_MP;
244 break; 240 break;
245 case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU:
246 case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
247 val = PSCI_RET_NOT_SUPPORTED;
248 break;
249 case PSCI_0_2_FN_SYSTEM_OFF: 241 case PSCI_0_2_FN_SYSTEM_OFF:
250 kvm_psci_system_off(vcpu); 242 kvm_psci_system_off(vcpu);
251 /* 243 /*
@@ -271,7 +263,8 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
271 ret = 0; 263 ret = 0;
272 break; 264 break;
273 default: 265 default:
274 return -EINVAL; 266 val = PSCI_RET_NOT_SUPPORTED;
267 break;
275 } 268 }
276 269
277 *vcpu_reg(vcpu, 0) = val; 270 *vcpu_reg(vcpu, 0) = val;
@@ -291,12 +284,9 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
291 case KVM_PSCI_FN_CPU_ON: 284 case KVM_PSCI_FN_CPU_ON:
292 val = kvm_psci_vcpu_on(vcpu); 285 val = kvm_psci_vcpu_on(vcpu);
293 break; 286 break;
294 case KVM_PSCI_FN_CPU_SUSPEND: 287 default:
295 case KVM_PSCI_FN_MIGRATE:
296 val = PSCI_RET_NOT_SUPPORTED; 288 val = PSCI_RET_NOT_SUPPORTED;
297 break; 289 break;
298 default:
299 return -EINVAL;
300 } 290 }
301 291
302 *vcpu_reg(vcpu, 0) = val; 292 *vcpu_reg(vcpu, 0) = val;
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 5105e297ed5f..bfffe8f4bd53 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select KVM_ARM_HOST 28 select KVM_ARM_HOST
29 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 29 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
30 select SRCU 30 select SRCU
31 select KVM_VFIO
31 select HAVE_KVM_EVENTFD 32 select HAVE_KVM_EVENTFD
32 select HAVE_KVM_IRQFD 33 select HAVE_KVM_IRQFD
33 ---help--- 34 ---help---
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d5904f876cdb..f90f4aa7f88d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -11,7 +11,7 @@ ARM=../../../arch/arm/kvm
11 11
12obj-$(CONFIG_KVM_ARM_HOST) += kvm.o 12obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
13 13
14kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o 14kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
15kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o 15kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
16kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o 16kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
17 17
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd010e232..519805f71876 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -50,8 +50,8 @@
50 stp x29, lr, [x3, #80] 50 stp x29, lr, [x3, #80]
51 51
52 mrs x19, sp_el0 52 mrs x19, sp_el0
53 mrs x20, elr_el2 // EL1 PC 53 mrs x20, elr_el2 // pc before entering el2
54 mrs x21, spsr_el2 // EL1 pstate 54 mrs x21, spsr_el2 // pstate before entering el2
55 55
56 stp x19, x20, [x3, #96] 56 stp x19, x20, [x3, #96]
57 str x21, [x3, #112] 57 str x21, [x3, #112]
@@ -82,8 +82,8 @@
82 ldr x21, [x3, #16] 82 ldr x21, [x3, #16]
83 83
84 msr sp_el0, x19 84 msr sp_el0, x19
85 msr elr_el2, x20 // EL1 PC 85 msr elr_el2, x20 // pc on return from el2
86 msr spsr_el2, x21 // EL1 pstate 86 msr spsr_el2, x21 // pstate on return from el2
87 87
88 add x3, x2, #CPU_XREG_OFFSET(19) 88 add x3, x2, #CPU_XREG_OFFSET(19)
89 ldp x19, x20, [x3] 89 ldp x19, x20, [x3]
diff --git a/arch/arm64/kvm/vgic-v2-switch.S b/arch/arm64/kvm/vgic-v2-switch.S
index f002fe1c3700..3f000712a85d 100644
--- a/arch/arm64/kvm/vgic-v2-switch.S
+++ b/arch/arm64/kvm/vgic-v2-switch.S
@@ -47,7 +47,6 @@ __save_vgic_v2_state:
47 add x3, x0, #VCPU_VGIC_CPU 47 add x3, x0, #VCPU_VGIC_CPU
48 48
49 /* Save all interesting registers */ 49 /* Save all interesting registers */
50 ldr w4, [x2, #GICH_HCR]
51 ldr w5, [x2, #GICH_VMCR] 50 ldr w5, [x2, #GICH_VMCR]
52 ldr w6, [x2, #GICH_MISR] 51 ldr w6, [x2, #GICH_MISR]
53 ldr w7, [x2, #GICH_EISR0] 52 ldr w7, [x2, #GICH_EISR0]
@@ -55,7 +54,6 @@ __save_vgic_v2_state:
55 ldr w9, [x2, #GICH_ELRSR0] 54 ldr w9, [x2, #GICH_ELRSR0]
56 ldr w10, [x2, #GICH_ELRSR1] 55 ldr w10, [x2, #GICH_ELRSR1]
57 ldr w11, [x2, #GICH_APR] 56 ldr w11, [x2, #GICH_APR]
58CPU_BE( rev w4, w4 )
59CPU_BE( rev w5, w5 ) 57CPU_BE( rev w5, w5 )
60CPU_BE( rev w6, w6 ) 58CPU_BE( rev w6, w6 )
61CPU_BE( rev w7, w7 ) 59CPU_BE( rev w7, w7 )
@@ -64,7 +62,6 @@ CPU_BE( rev w9, w9 )
64CPU_BE( rev w10, w10 ) 62CPU_BE( rev w10, w10 )
65CPU_BE( rev w11, w11 ) 63CPU_BE( rev w11, w11 )
66 64
67 str w4, [x3, #VGIC_V2_CPU_HCR]
68 str w5, [x3, #VGIC_V2_CPU_VMCR] 65 str w5, [x3, #VGIC_V2_CPU_VMCR]
69 str w6, [x3, #VGIC_V2_CPU_MISR] 66 str w6, [x3, #VGIC_V2_CPU_MISR]
70CPU_LE( str w7, [x3, #VGIC_V2_CPU_EISR] ) 67CPU_LE( str w7, [x3, #VGIC_V2_CPU_EISR] )
diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S
index 617a012a0107..3c20730ddff5 100644
--- a/arch/arm64/kvm/vgic-v3-switch.S
+++ b/arch/arm64/kvm/vgic-v3-switch.S
@@ -48,13 +48,11 @@
48 dsb st 48 dsb st
49 49
50 // Save all interesting registers 50 // Save all interesting registers
51 mrs_s x4, ICH_HCR_EL2
52 mrs_s x5, ICH_VMCR_EL2 51 mrs_s x5, ICH_VMCR_EL2
53 mrs_s x6, ICH_MISR_EL2 52 mrs_s x6, ICH_MISR_EL2
54 mrs_s x7, ICH_EISR_EL2 53 mrs_s x7, ICH_EISR_EL2
55 mrs_s x8, ICH_ELSR_EL2 54 mrs_s x8, ICH_ELSR_EL2
56 55
57 str w4, [x3, #VGIC_V3_CPU_HCR]
58 str w5, [x3, #VGIC_V3_CPU_VMCR] 56 str w5, [x3, #VGIC_V3_CPU_VMCR]
59 str w6, [x3, #VGIC_V3_CPU_MISR] 57 str w6, [x3, #VGIC_V3_CPU_MISR]
60 str w7, [x3, #VGIC_V3_CPU_EISR] 58 str w7, [x3, #VGIC_V3_CPU_EISR]
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 4c25823563fe..e8c8d9d0c45f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -839,7 +839,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
839static inline void kvm_arch_sync_events(struct kvm *kvm) {} 839static inline void kvm_arch_sync_events(struct kvm *kvm) {}
840static inline void kvm_arch_free_memslot(struct kvm *kvm, 840static inline void kvm_arch_free_memslot(struct kvm *kvm,
841 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} 841 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
842static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} 842static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
843static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 843static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
844static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 844static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
845 struct kvm_memory_slot *slot) {} 845 struct kvm_memory_slot *slot) {}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bb68e8d520e8..cd4c129ce743 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -198,15 +198,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
198 198
199int kvm_arch_prepare_memory_region(struct kvm *kvm, 199int kvm_arch_prepare_memory_region(struct kvm *kvm,
200 struct kvm_memory_slot *memslot, 200 struct kvm_memory_slot *memslot,
201 struct kvm_userspace_memory_region *mem, 201 const struct kvm_userspace_memory_region *mem,
202 enum kvm_mr_change change) 202 enum kvm_mr_change change)
203{ 203{
204 return 0; 204 return 0;
205} 205}
206 206
207void kvm_arch_commit_memory_region(struct kvm *kvm, 207void kvm_arch_commit_memory_region(struct kvm *kvm,
208 struct kvm_userspace_memory_region *mem, 208 const struct kvm_userspace_memory_region *mem,
209 const struct kvm_memory_slot *old, 209 const struct kvm_memory_slot *old,
210 const struct kvm_memory_slot *new,
210 enum kvm_mr_change change) 211 enum kvm_mr_change change)
211{ 212{
212 unsigned long npages = 0; 213 unsigned long npages = 0;
@@ -393,7 +394,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
393 kvm_mips_deliver_interrupts(vcpu, 394 kvm_mips_deliver_interrupts(vcpu,
394 kvm_read_c0_guest_cause(vcpu->arch.cop0)); 395 kvm_read_c0_guest_cause(vcpu->arch.cop0));
395 396
396 kvm_guest_enter(); 397 __kvm_guest_enter();
397 398
398 /* Disable hardware page table walking while in guest */ 399 /* Disable hardware page table walking while in guest */
399 htw_stop(); 400 htw_stop();
@@ -403,7 +404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
403 /* Re-enable HTW before enabling interrupts */ 404 /* Re-enable HTW before enabling interrupts */
404 htw_start(); 405 htw_start();
405 406
406 kvm_guest_exit(); 407 __kvm_guest_exit();
407 local_irq_enable(); 408 local_irq_enable();
408 409
409 if (vcpu->sigset_active) 410 if (vcpu->sigset_active)
@@ -968,6 +969,7 @@ out:
968/* Get (and clear) the dirty memory log for a memory slot. */ 969/* Get (and clear) the dirty memory log for a memory slot. */
969int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 970int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
970{ 971{
972 struct kvm_memslots *slots;
971 struct kvm_memory_slot *memslot; 973 struct kvm_memory_slot *memslot;
972 unsigned long ga, ga_end; 974 unsigned long ga, ga_end;
973 int is_dirty = 0; 975 int is_dirty = 0;
@@ -982,7 +984,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
982 984
983 /* If nothing is dirty, don't bother messing with page tables. */ 985 /* If nothing is dirty, don't bother messing with page tables. */
984 if (is_dirty) { 986 if (is_dirty) {
985 memslot = &kvm->memslots->memslots[log->slot]; 987 slots = kvm_memslots(kvm);
988 memslot = id_to_memslot(slots, log->slot);
986 989
987 ga = memslot->base_gfn << PAGE_SHIFT; 990 ga = memslot->base_gfn << PAGE_SHIFT;
988 ga_end = ga + (memslot->npages << PAGE_SHIFT); 991 ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 3536d12eb798..2aa79c864e91 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -430,7 +430,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
430 */ 430 */
431static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) 431static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
432{ 432{
433 return rcu_dereference_raw_notrace(kvm->memslots); 433 return rcu_dereference_raw_notrace(kvm->memslots[0]);
434} 434}
435 435
436extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); 436extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a193a13cf08b..d91f65b28e32 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -698,7 +698,7 @@ struct kvm_vcpu_arch {
698static inline void kvm_arch_hardware_disable(void) {} 698static inline void kvm_arch_hardware_disable(void) {}
699static inline void kvm_arch_hardware_unsetup(void) {} 699static inline void kvm_arch_hardware_unsetup(void) {}
700static inline void kvm_arch_sync_events(struct kvm *kvm) {} 700static inline void kvm_arch_sync_events(struct kvm *kvm) {}
701static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} 701static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
702static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 702static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
703static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 703static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
704static inline void kvm_arch_exit(void) {} 704static inline void kvm_arch_exit(void) {}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b8475daad884..c6ef05bd0765 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -182,10 +182,11 @@ extern int kvmppc_core_create_memslot(struct kvm *kvm,
182 unsigned long npages); 182 unsigned long npages);
183extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, 183extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
184 struct kvm_memory_slot *memslot, 184 struct kvm_memory_slot *memslot,
185 struct kvm_userspace_memory_region *mem); 185 const struct kvm_userspace_memory_region *mem);
186extern void kvmppc_core_commit_memory_region(struct kvm *kvm, 186extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
187 struct kvm_userspace_memory_region *mem, 187 const struct kvm_userspace_memory_region *mem,
188 const struct kvm_memory_slot *old); 188 const struct kvm_memory_slot *old,
189 const struct kvm_memory_slot *new);
189extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 190extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
190 struct kvm_ppc_smmu_info *info); 191 struct kvm_ppc_smmu_info *info);
191extern void kvmppc_core_flush_memslot(struct kvm *kvm, 192extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -243,10 +244,11 @@ struct kvmppc_ops {
243 void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot); 244 void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
244 int (*prepare_memory_region)(struct kvm *kvm, 245 int (*prepare_memory_region)(struct kvm *kvm,
245 struct kvm_memory_slot *memslot, 246 struct kvm_memory_slot *memslot,
246 struct kvm_userspace_memory_region *mem); 247 const struct kvm_userspace_memory_region *mem);
247 void (*commit_memory_region)(struct kvm *kvm, 248 void (*commit_memory_region)(struct kvm *kvm,
248 struct kvm_userspace_memory_region *mem, 249 const struct kvm_userspace_memory_region *mem,
249 const struct kvm_memory_slot *old); 250 const struct kvm_memory_slot *old,
251 const struct kvm_memory_slot *new);
250 int (*unmap_hva)(struct kvm *kvm, unsigned long hva); 252 int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
251 int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, 253 int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
252 unsigned long end); 254 unsigned long end);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 453a8a47a467..05ea8fc7f829 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -757,16 +757,17 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
757 757
758int kvmppc_core_prepare_memory_region(struct kvm *kvm, 758int kvmppc_core_prepare_memory_region(struct kvm *kvm,
759 struct kvm_memory_slot *memslot, 759 struct kvm_memory_slot *memslot,
760 struct kvm_userspace_memory_region *mem) 760 const struct kvm_userspace_memory_region *mem)
761{ 761{
762 return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); 762 return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
763} 763}
764 764
765void kvmppc_core_commit_memory_region(struct kvm *kvm, 765void kvmppc_core_commit_memory_region(struct kvm *kvm,
766 struct kvm_userspace_memory_region *mem, 766 const struct kvm_userspace_memory_region *mem,
767 const struct kvm_memory_slot *old) 767 const struct kvm_memory_slot *old,
768 const struct kvm_memory_slot *new)
768{ 769{
769 kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); 770 kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
770} 771}
771 772
772int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 773int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1a4acf8bf4f4..dab68b7af3f2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -650,7 +650,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
650 int srcu_idx; 650 int srcu_idx;
651 651
652 srcu_idx = srcu_read_lock(&kvm->srcu); 652 srcu_idx = srcu_read_lock(&kvm->srcu);
653 slots = kvm->memslots; 653 slots = kvm_memslots(kvm);
654 kvm_for_each_memslot(memslot, slots) { 654 kvm_for_each_memslot(memslot, slots) {
655 /* 655 /*
656 * This assumes it is acceptable to lose reference and 656 * This assumes it is acceptable to lose reference and
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index df81caab7383..68d067ad4222 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2321,6 +2321,7 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
2321static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, 2321static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
2322 struct kvm_dirty_log *log) 2322 struct kvm_dirty_log *log)
2323{ 2323{
2324 struct kvm_memslots *slots;
2324 struct kvm_memory_slot *memslot; 2325 struct kvm_memory_slot *memslot;
2325 int r; 2326 int r;
2326 unsigned long n; 2327 unsigned long n;
@@ -2331,7 +2332,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
2331 if (log->slot >= KVM_USER_MEM_SLOTS) 2332 if (log->slot >= KVM_USER_MEM_SLOTS)
2332 goto out; 2333 goto out;
2333 2334
2334 memslot = id_to_memslot(kvm->memslots, log->slot); 2335 slots = kvm_memslots(kvm);
2336 memslot = id_to_memslot(slots, log->slot);
2335 r = -ENOENT; 2337 r = -ENOENT;
2336 if (!memslot->dirty_bitmap) 2338 if (!memslot->dirty_bitmap)
2337 goto out; 2339 goto out;
@@ -2374,16 +2376,18 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
2374 2376
2375static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, 2377static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
2376 struct kvm_memory_slot *memslot, 2378 struct kvm_memory_slot *memslot,
2377 struct kvm_userspace_memory_region *mem) 2379 const struct kvm_userspace_memory_region *mem)
2378{ 2380{
2379 return 0; 2381 return 0;
2380} 2382}
2381 2383
2382static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, 2384static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
2383 struct kvm_userspace_memory_region *mem, 2385 const struct kvm_userspace_memory_region *mem,
2384 const struct kvm_memory_slot *old) 2386 const struct kvm_memory_slot *old,
2387 const struct kvm_memory_slot *new)
2385{ 2388{
2386 unsigned long npages = mem->memory_size >> PAGE_SHIFT; 2389 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
2390 struct kvm_memslots *slots;
2387 struct kvm_memory_slot *memslot; 2391 struct kvm_memory_slot *memslot;
2388 2392
2389 if (npages && old->npages) { 2393 if (npages && old->npages) {
@@ -2393,7 +2397,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
2393 * since the rmap array starts out as all zeroes, 2397 * since the rmap array starts out as all zeroes,
2394 * i.e. no pages are dirty. 2398 * i.e. no pages are dirty.
2395 */ 2399 */
2396 memslot = id_to_memslot(kvm->memslots, mem->slot); 2400 slots = kvm_memslots(kvm);
2401 memslot = id_to_memslot(slots, mem->slot);
2397 kvmppc_hv_get_dirty_log(kvm, memslot, NULL); 2402 kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
2398 } 2403 }
2399} 2404}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f57383941d03..64891b081ad5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1530,6 +1530,7 @@ out:
1530static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, 1530static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
1531 struct kvm_dirty_log *log) 1531 struct kvm_dirty_log *log)
1532{ 1532{
1533 struct kvm_memslots *slots;
1533 struct kvm_memory_slot *memslot; 1534 struct kvm_memory_slot *memslot;
1534 struct kvm_vcpu *vcpu; 1535 struct kvm_vcpu *vcpu;
1535 ulong ga, ga_end; 1536 ulong ga, ga_end;
@@ -1545,7 +1546,8 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
1545 1546
1546 /* If nothing is dirty, don't bother messing with page tables. */ 1547 /* If nothing is dirty, don't bother messing with page tables. */
1547 if (is_dirty) { 1548 if (is_dirty) {
1548 memslot = id_to_memslot(kvm->memslots, log->slot); 1549 slots = kvm_memslots(kvm);
1550 memslot = id_to_memslot(slots, log->slot);
1549 1551
1550 ga = memslot->base_gfn << PAGE_SHIFT; 1552 ga = memslot->base_gfn << PAGE_SHIFT;
1551 ga_end = ga + (memslot->npages << PAGE_SHIFT); 1553 ga_end = ga + (memslot->npages << PAGE_SHIFT);
@@ -1571,14 +1573,15 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
1571 1573
1572static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, 1574static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
1573 struct kvm_memory_slot *memslot, 1575 struct kvm_memory_slot *memslot,
1574 struct kvm_userspace_memory_region *mem) 1576 const struct kvm_userspace_memory_region *mem)
1575{ 1577{
1576 return 0; 1578 return 0;
1577} 1579}
1578 1580
1579static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, 1581static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
1580 struct kvm_userspace_memory_region *mem, 1582 const struct kvm_userspace_memory_region *mem,
1581 const struct kvm_memory_slot *old) 1583 const struct kvm_memory_slot *old,
1584 const struct kvm_memory_slot *new)
1582{ 1585{
1583 return; 1586 return;
1584} 1587}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6c1316a15a27..cc5842657161 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1004,10 +1004,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
1004 break; 1004 break;
1005 } 1005 }
1006 1006
1007 local_irq_enable();
1008
1009 trace_kvm_exit(exit_nr, vcpu); 1007 trace_kvm_exit(exit_nr, vcpu);
1010 kvm_guest_exit(); 1008 __kvm_guest_exit();
1009
1010 local_irq_enable();
1011 1011
1012 run->exit_reason = KVM_EXIT_UNKNOWN; 1012 run->exit_reason = KVM_EXIT_UNKNOWN;
1013 run->ready_for_interrupt_injection = 1; 1013 run->ready_for_interrupt_injection = 1;
@@ -1784,14 +1784,15 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1784 1784
1785int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1785int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1786 struct kvm_memory_slot *memslot, 1786 struct kvm_memory_slot *memslot,
1787 struct kvm_userspace_memory_region *mem) 1787 const struct kvm_userspace_memory_region *mem)
1788{ 1788{
1789 return 0; 1789 return 0;
1790} 1790}
1791 1791
1792void kvmppc_core_commit_memory_region(struct kvm *kvm, 1792void kvmppc_core_commit_memory_region(struct kvm *kvm,
1793 struct kvm_userspace_memory_region *mem, 1793 const struct kvm_userspace_memory_region *mem,
1794 const struct kvm_memory_slot *old) 1794 const struct kvm_memory_slot *old,
1795 const struct kvm_memory_slot *new)
1795{ 1796{
1796} 1797}
1797 1798
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ac3ddf115f3d..e5dde32fe71f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
115 continue; 115 continue;
116 } 116 }
117 117
118 kvm_guest_enter(); 118 __kvm_guest_enter();
119 return 1; 119 return 1;
120 } 120 }
121 121
@@ -595,18 +595,19 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
595 595
596int kvm_arch_prepare_memory_region(struct kvm *kvm, 596int kvm_arch_prepare_memory_region(struct kvm *kvm,
597 struct kvm_memory_slot *memslot, 597 struct kvm_memory_slot *memslot,
598 struct kvm_userspace_memory_region *mem, 598 const struct kvm_userspace_memory_region *mem,
599 enum kvm_mr_change change) 599 enum kvm_mr_change change)
600{ 600{
601 return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 601 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
602} 602}
603 603
604void kvm_arch_commit_memory_region(struct kvm *kvm, 604void kvm_arch_commit_memory_region(struct kvm *kvm,
605 struct kvm_userspace_memory_region *mem, 605 const struct kvm_userspace_memory_region *mem,
606 const struct kvm_memory_slot *old, 606 const struct kvm_memory_slot *old,
607 const struct kvm_memory_slot *new,
607 enum kvm_mr_change change) 608 enum kvm_mr_change change)
608{ 609{
609 kvmppc_core_commit_memory_region(kvm, mem, old); 610 kvmppc_core_commit_memory_region(kvm, mem, old, new);
610} 611}
611 612
612void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 613void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d01fc588b5c3..3024acbe1f9d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -80,6 +80,7 @@ struct sca_block {
80#define CPUSTAT_MCDS 0x00000100 80#define CPUSTAT_MCDS 0x00000100
81#define CPUSTAT_SM 0x00000080 81#define CPUSTAT_SM 0x00000080
82#define CPUSTAT_IBS 0x00000040 82#define CPUSTAT_IBS 0x00000040
83#define CPUSTAT_GED2 0x00000010
83#define CPUSTAT_G 0x00000008 84#define CPUSTAT_G 0x00000008
84#define CPUSTAT_GED 0x00000004 85#define CPUSTAT_GED 0x00000004
85#define CPUSTAT_J 0x00000002 86#define CPUSTAT_J 0x00000002
@@ -95,7 +96,8 @@ struct kvm_s390_sie_block {
95#define PROG_IN_SIE (1<<0) 96#define PROG_IN_SIE (1<<0)
96 __u32 prog0c; /* 0x000c */ 97 __u32 prog0c; /* 0x000c */
97 __u8 reserved10[16]; /* 0x0010 */ 98 __u8 reserved10[16]; /* 0x0010 */
98#define PROG_BLOCK_SIE 0x00000001 99#define PROG_BLOCK_SIE (1<<0)
100#define PROG_REQUEST (1<<1)
99 atomic_t prog20; /* 0x0020 */ 101 atomic_t prog20; /* 0x0020 */
100 __u8 reserved24[4]; /* 0x0024 */ 102 __u8 reserved24[4]; /* 0x0024 */
101 __u64 cputm; /* 0x0028 */ 103 __u64 cputm; /* 0x0028 */
@@ -634,7 +636,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
634static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 636static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
635static inline void kvm_arch_free_memslot(struct kvm *kvm, 637static inline void kvm_arch_free_memslot(struct kvm *kvm,
636 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} 638 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
637static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} 639static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
638static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 640static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
639static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 641static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
640 struct kvm_memory_slot *slot) {} 642 struct kvm_memory_slot *slot) {}
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 99b44acbfcc7..3238893c9d4f 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -1005,7 +1005,7 @@ ENTRY(sie64a)
1005.Lsie_gmap: 1005.Lsie_gmap:
1006 lg %r14,__SF_EMPTY(%r15) # get control block pointer 1006 lg %r14,__SF_EMPTY(%r15) # get control block pointer
1007 oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now 1007 oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
1008 tm __SIE_PROG20+3(%r14),1 # last exit... 1008 tm __SIE_PROG20+3(%r14),3 # last exit...
1009 jnz .Lsie_done 1009 jnz .Lsie_done
1010 LPP __SF_EMPTY(%r15) # set guest id 1010 LPP __SF_EMPTY(%r15) # set guest id
1011 sie 0(%r14) 1011 sie 0(%r14)
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9e3779e3e496..7365e8a46032 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -241,21 +241,6 @@ static int handle_prog(struct kvm_vcpu *vcpu)
241 return kvm_s390_inject_prog_irq(vcpu, &pgm_info); 241 return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
242} 242}
243 243
244static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
245{
246 int rc, rc2;
247
248 vcpu->stat.exit_instr_and_program++;
249 rc = handle_instruction(vcpu);
250 rc2 = handle_prog(vcpu);
251
252 if (rc == -EOPNOTSUPP)
253 vcpu->arch.sie_block->icptcode = 0x04;
254 if (rc)
255 return rc;
256 return rc2;
257}
258
259/** 244/**
260 * handle_external_interrupt - used for external interruption interceptions 245 * handle_external_interrupt - used for external interruption interceptions
261 * 246 *
@@ -355,7 +340,6 @@ static const intercept_handler_t intercept_funcs[] = {
355 [0x00 >> 2] = handle_noop, 340 [0x00 >> 2] = handle_noop,
356 [0x04 >> 2] = handle_instruction, 341 [0x04 >> 2] = handle_instruction,
357 [0x08 >> 2] = handle_prog, 342 [0x08 >> 2] = handle_prog,
358 [0x0C >> 2] = handle_instruction_and_prog,
359 [0x10 >> 2] = handle_noop, 343 [0x10 >> 2] = handle_noop,
360 [0x14 >> 2] = handle_external_interrupt, 344 [0x14 >> 2] = handle_external_interrupt,
361 [0x18 >> 2] = handle_noop, 345 [0x18 >> 2] = handle_noop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 0d3deef6edff..c98d89708e99 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -134,6 +134,8 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
134 134
135 active_mask = pending_local_irqs(vcpu); 135 active_mask = pending_local_irqs(vcpu);
136 active_mask |= pending_floating_irqs(vcpu); 136 active_mask |= pending_floating_irqs(vcpu);
137 if (!active_mask)
138 return 0;
137 139
138 if (psw_extint_disabled(vcpu)) 140 if (psw_extint_disabled(vcpu))
139 active_mask &= ~IRQ_PEND_EXT_MASK; 141 active_mask &= ~IRQ_PEND_EXT_MASK;
@@ -941,12 +943,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
941 if (cpu_timer_irq_pending(vcpu)) 943 if (cpu_timer_irq_pending(vcpu))
942 set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); 944 set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
943 945
944 do { 946 while ((irqs = deliverable_irqs(vcpu)) && !rc) {
945 irqs = deliverable_irqs(vcpu);
946 /* bits are in the order of interrupt priority */ 947 /* bits are in the order of interrupt priority */
947 irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT); 948 irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
948 if (irq_type == IRQ_PEND_COUNT)
949 break;
950 if (is_ioirq(irq_type)) { 949 if (is_ioirq(irq_type)) {
951 rc = __deliver_io(vcpu, irq_type); 950 rc = __deliver_io(vcpu, irq_type);
952 } else { 951 } else {
@@ -958,9 +957,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
958 } 957 }
959 rc = func(vcpu); 958 rc = func(vcpu);
960 } 959 }
961 if (rc) 960 }
962 break;
963 } while (!rc);
964 961
965 set_intercept_indicators(vcpu); 962 set_intercept_indicators(vcpu);
966 963
@@ -1061,7 +1058,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1061 if (sclp.has_sigpif) 1058 if (sclp.has_sigpif)
1062 return __inject_extcall_sigpif(vcpu, src_id); 1059 return __inject_extcall_sigpif(vcpu, src_id);
1063 1060
1064 if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) 1061 if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
1065 return -EBUSY; 1062 return -EBUSY;
1066 *extcall = irq->u.extcall; 1063 *extcall = irq->u.extcall;
1067 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 1064 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
@@ -1340,12 +1337,54 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1340 return 0; 1337 return 0;
1341} 1338}
1342 1339
1343static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) 1340/*
1341 * Find a destination VCPU for a floating irq and kick it.
1342 */
1343static void __floating_irq_kick(struct kvm *kvm, u64 type)
1344{ 1344{
1345 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1345 struct kvm_s390_local_interrupt *li; 1346 struct kvm_s390_local_interrupt *li;
1347 struct kvm_vcpu *dst_vcpu;
1348 int sigcpu, online_vcpus, nr_tries = 0;
1349
1350 online_vcpus = atomic_read(&kvm->online_vcpus);
1351 if (!online_vcpus)
1352 return;
1353
1354 /* find idle VCPUs first, then round robin */
1355 sigcpu = find_first_bit(fi->idle_mask, online_vcpus);
1356 if (sigcpu == online_vcpus) {
1357 do {
1358 sigcpu = fi->next_rr_cpu;
1359 fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus;
1360 /* avoid endless loops if all vcpus are stopped */
1361 if (nr_tries++ >= online_vcpus)
1362 return;
1363 } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu)));
1364 }
1365 dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
1366
1367 /* make the VCPU drop out of the SIE, or wake it up if sleeping */
1368 li = &dst_vcpu->arch.local_int;
1369 spin_lock(&li->lock);
1370 switch (type) {
1371 case KVM_S390_MCHK:
1372 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
1373 break;
1374 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
1375 atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
1376 break;
1377 default:
1378 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
1379 break;
1380 }
1381 spin_unlock(&li->lock);
1382 kvm_s390_vcpu_wakeup(dst_vcpu);
1383}
1384
1385static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1386{
1346 struct kvm_s390_float_interrupt *fi; 1387 struct kvm_s390_float_interrupt *fi;
1347 struct kvm_vcpu *dst_vcpu = NULL;
1348 int sigcpu;
1349 u64 type = READ_ONCE(inti->type); 1388 u64 type = READ_ONCE(inti->type);
1350 int rc; 1389 int rc;
1351 1390
@@ -1373,32 +1412,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1373 if (rc) 1412 if (rc)
1374 return rc; 1413 return rc;
1375 1414
1376 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); 1415 __floating_irq_kick(kvm, type);
1377 if (sigcpu == KVM_MAX_VCPUS) {
1378 do {
1379 sigcpu = fi->next_rr_cpu++;
1380 if (sigcpu == KVM_MAX_VCPUS)
1381 sigcpu = fi->next_rr_cpu = 0;
1382 } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
1383 }
1384 dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
1385 li = &dst_vcpu->arch.local_int;
1386 spin_lock(&li->lock);
1387 switch (type) {
1388 case KVM_S390_MCHK:
1389 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
1390 break;
1391 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
1392 atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
1393 break;
1394 default:
1395 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
1396 break;
1397 }
1398 spin_unlock(&li->lock);
1399 kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
1400 return 0; 1416 return 0;
1401
1402} 1417}
1403 1418
1404int kvm_s390_inject_vm(struct kvm *kvm, 1419int kvm_s390_inject_vm(struct kvm *kvm,
@@ -1606,6 +1621,9 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
1606 int i; 1621 int i;
1607 1622
1608 spin_lock(&fi->lock); 1623 spin_lock(&fi->lock);
1624 fi->pending_irqs = 0;
1625 memset(&fi->srv_signal, 0, sizeof(fi->srv_signal));
1626 memset(&fi->mchk, 0, sizeof(fi->mchk));
1609 for (i = 0; i < FIRQ_LIST_COUNT; i++) 1627 for (i = 0; i < FIRQ_LIST_COUNT; i++)
1610 clear_irq_list(&fi->lists[i]); 1628 clear_irq_list(&fi->lists[i]);
1611 for (i = 0; i < FIRQ_MAX_COUNT; i++) 1629 for (i = 0; i < FIRQ_MAX_COUNT; i++)
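
The new __floating_irq_kick() factors the destination-CPU choice out of __inject_vm(): prefer an idle VCPU, otherwise round-robin over the online VCPUs, and give up after one full pass so a VM whose VCPUs are all stopped can no longer loop forever. A stand-alone sketch of that selection policy (vcpu_pool, idle_mask and stopped_mask are hypothetical stand-ins, not kernel structures):

#include <stdio.h>

struct vcpu_pool {
	unsigned long idle_mask;	/* bit n set: VCPU n is idle */
	unsigned long stopped_mask;	/* bit n set: VCPU n is stopped */
	int next_rr_cpu;		/* round-robin cursor */
	int online_vcpus;
};

/* Returns the chosen VCPU index, or -1 if nobody can take the interrupt. */
static int pick_floating_irq_target(struct vcpu_pool *p)
{
	int cpu, tries = 0;

	if (!p->online_vcpus)
		return -1;

	/* Prefer an idle VCPU: it can take the interrupt immediately. */
	for (cpu = 0; cpu < p->online_vcpus; cpu++)
		if (p->idle_mask & (1UL << cpu))
			return cpu;

	/*
	 * Otherwise round-robin, giving up after one full pass so that
	 * a VM whose VCPUs are all stopped does not spin forever.
	 */
	do {
		cpu = p->next_rr_cpu;
		p->next_rr_cpu = (p->next_rr_cpu + 1) % p->online_vcpus;
		if (tries++ >= p->online_vcpus)
			return -1;
	} while (p->stopped_mask & (1UL << cpu));

	return cpu;
}

int main(void)
{
	struct vcpu_pool p = {
		.idle_mask = 0, .stopped_mask = 0x1,
		.next_rr_cpu = 0, .online_vcpus = 2,
	};

	printf("target vcpu: %d\n", pick_floating_irq_target(&p));	/* 1 */
	return 0;
}
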
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c4e81b26c1b0..2078f92d15ac 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -36,6 +36,10 @@
36#include "kvm-s390.h" 36#include "kvm-s390.h"
37#include "gaccess.h" 37#include "gaccess.h"
38 38
39#define KMSG_COMPONENT "kvm-s390"
40#undef pr_fmt
41#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
42
39#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
40#include "trace.h" 44#include "trace.h"
41#include "trace-s390.h" 45#include "trace-s390.h"
@@ -110,7 +114,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
110/* upper facilities limit for kvm */ 114/* upper facilities limit for kvm */
111unsigned long kvm_s390_fac_list_mask[] = { 115unsigned long kvm_s390_fac_list_mask[] = {
112 0xffe6fffbfcfdfc40UL, 116 0xffe6fffbfcfdfc40UL,
113 0x005c800000000000UL, 117 0x005e800000000000UL,
114}; 118};
115 119
116unsigned long kvm_s390_fac_list_mask_size(void) 120unsigned long kvm_s390_fac_list_mask_size(void)
@@ -236,6 +240,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
236{ 240{
237 int r; 241 int r;
238 unsigned long n; 242 unsigned long n;
243 struct kvm_memslots *slots;
239 struct kvm_memory_slot *memslot; 244 struct kvm_memory_slot *memslot;
240 int is_dirty = 0; 245 int is_dirty = 0;
241 246
@@ -245,7 +250,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
245 if (log->slot >= KVM_USER_MEM_SLOTS) 250 if (log->slot >= KVM_USER_MEM_SLOTS)
246 goto out; 251 goto out;
247 252
248 memslot = id_to_memslot(kvm->memslots, log->slot); 253 slots = kvm_memslots(kvm);
254 memslot = id_to_memslot(slots, log->slot);
249 r = -ENOENT; 255 r = -ENOENT;
250 if (!memslot->dirty_bitmap) 256 if (!memslot->dirty_bitmap)
251 goto out; 257 goto out;
@@ -454,10 +460,10 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
454 460
455 mutex_lock(&kvm->lock); 461 mutex_lock(&kvm->lock);
456 kvm->arch.epoch = gtod - host_tod; 462 kvm->arch.epoch = gtod - host_tod;
457 kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) { 463 kvm_s390_vcpu_block_all(kvm);
464 kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
458 cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; 465 cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
459 exit_sie(cur_vcpu); 466 kvm_s390_vcpu_unblock_all(kvm);
460 }
461 mutex_unlock(&kvm->lock); 467 mutex_unlock(&kvm->lock);
462 return 0; 468 return 0;
463} 469}
@@ -1311,8 +1317,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1311 1317
1312 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 1318 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
1313 CPUSTAT_SM | 1319 CPUSTAT_SM |
1314 CPUSTAT_STOPPED | 1320 CPUSTAT_STOPPED);
1315 CPUSTAT_GED); 1321
1322 if (test_kvm_facility(vcpu->kvm, 78))
1323 atomic_set_mask(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags);
1324 else if (test_kvm_facility(vcpu->kvm, 8))
1325 atomic_set_mask(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags);
1326
1316 kvm_s390_vcpu_setup_model(vcpu); 1327 kvm_s390_vcpu_setup_model(vcpu);
1317 1328
1318 vcpu->arch.sie_block->ecb = 6; 1329 vcpu->arch.sie_block->ecb = 6;
@@ -1409,16 +1420,28 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
1409 return kvm_s390_vcpu_has_irq(vcpu, 0); 1420 return kvm_s390_vcpu_has_irq(vcpu, 0);
1410} 1421}
1411 1422
1412void s390_vcpu_block(struct kvm_vcpu *vcpu) 1423void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
1413{ 1424{
1414 atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); 1425 atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
1426 exit_sie(vcpu);
1415} 1427}
1416 1428
1417void s390_vcpu_unblock(struct kvm_vcpu *vcpu) 1429void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu)
1418{ 1430{
1419 atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); 1431 atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
1420} 1432}
1421 1433
1434static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
1435{
1436 atomic_set_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
1437 exit_sie(vcpu);
1438}
1439
1440static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
1441{
1442 atomic_clear_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
1443}
1444
1422/* 1445/*
1423 * Kick a guest cpu out of SIE and wait until SIE is not running. 1446 * Kick a guest cpu out of SIE and wait until SIE is not running.
1424 * If the CPU is not running (e.g. waiting as idle) the function will 1447 * If the CPU is not running (e.g. waiting as idle) the function will
@@ -1430,11 +1453,11 @@ void exit_sie(struct kvm_vcpu *vcpu)
1430 cpu_relax(); 1453 cpu_relax();
1431} 1454}
1432 1455
1433/* Kick a guest cpu out of SIE and prevent SIE-reentry */ 1456/* Kick a guest cpu out of SIE to process a request synchronously */
1434void exit_sie_sync(struct kvm_vcpu *vcpu) 1457void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
1435{ 1458{
1436 s390_vcpu_block(vcpu); 1459 kvm_make_request(req, vcpu);
1437 exit_sie(vcpu); 1460 kvm_s390_vcpu_request(vcpu);
1438} 1461}
1439 1462
1440static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) 1463static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
@@ -1447,8 +1470,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
1447 /* match against both prefix pages */ 1470 /* match against both prefix pages */
1448 if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { 1471 if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
1449 VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); 1472 VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
1450 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 1473 kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
1451 exit_sie_sync(vcpu);
1452 } 1474 }
1453 } 1475 }
1454} 1476}
@@ -1720,8 +1742,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
1720 1742
1721static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) 1743static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
1722{ 1744{
1745 if (!vcpu->requests)
1746 return 0;
1723retry: 1747retry:
1724 s390_vcpu_unblock(vcpu); 1748 kvm_s390_vcpu_request_handled(vcpu);
1725 /* 1749 /*
1726 * We use MMU_RELOAD just to re-arm the ipte notifier for the 1750 * We use MMU_RELOAD just to re-arm the ipte notifier for the
1727 * guest prefix page. gmap_ipte_notify will wait on the ptl lock. 1751 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
@@ -1993,12 +2017,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
1993 * As PF_VCPU will be used in fault handler, between 2017 * As PF_VCPU will be used in fault handler, between
1994 * guest_enter and guest_exit should be no uaccess. 2018 * guest_enter and guest_exit should be no uaccess.
1995 */ 2019 */
1996 preempt_disable(); 2020 local_irq_disable();
1997 kvm_guest_enter(); 2021 __kvm_guest_enter();
1998 preempt_enable(); 2022 local_irq_enable();
1999 exit_reason = sie64a(vcpu->arch.sie_block, 2023 exit_reason = sie64a(vcpu->arch.sie_block,
2000 vcpu->run->s.regs.gprs); 2024 vcpu->run->s.regs.gprs);
2001 kvm_guest_exit(); 2025 local_irq_disable();
2026 __kvm_guest_exit();
2027 local_irq_enable();
2002 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 2028 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2003 2029
2004 rc = vcpu_post_run(vcpu, exit_reason); 2030 rc = vcpu_post_run(vcpu, exit_reason);
@@ -2068,7 +2094,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2068 if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { 2094 if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
2069 kvm_s390_vcpu_start(vcpu); 2095 kvm_s390_vcpu_start(vcpu);
2070 } else if (is_vcpu_stopped(vcpu)) { 2096 } else if (is_vcpu_stopped(vcpu)) {
2071 pr_err_ratelimited("kvm-s390: can't run stopped vcpu %d\n", 2097 pr_err_ratelimited("can't run stopped vcpu %d\n",
2072 vcpu->vcpu_id); 2098 vcpu->vcpu_id);
2073 return -EINVAL; 2099 return -EINVAL;
2074 } 2100 }
@@ -2206,8 +2232,7 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
2206static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) 2232static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
2207{ 2233{
2208 kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); 2234 kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
2209 kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu); 2235 kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu);
2210 exit_sie_sync(vcpu);
2211} 2236}
2212 2237
2213static void __disable_ibs_on_all_vcpus(struct kvm *kvm) 2238static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
@@ -2223,8 +2248,7 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
2223static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) 2248static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
2224{ 2249{
2225 kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); 2250 kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
2226 kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu); 2251 kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
2227 exit_sie_sync(vcpu);
2228} 2252}
2229 2253
2230void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) 2254void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
@@ -2563,7 +2587,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2563/* Section: memory related */ 2587/* Section: memory related */
2564int kvm_arch_prepare_memory_region(struct kvm *kvm, 2588int kvm_arch_prepare_memory_region(struct kvm *kvm,
2565 struct kvm_memory_slot *memslot, 2589 struct kvm_memory_slot *memslot,
2566 struct kvm_userspace_memory_region *mem, 2590 const struct kvm_userspace_memory_region *mem,
2567 enum kvm_mr_change change) 2591 enum kvm_mr_change change)
2568{ 2592{
2569 /* A few sanity checks. We can have memory slots which have to be 2593 /* A few sanity checks. We can have memory slots which have to be
@@ -2581,8 +2605,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
2581} 2605}
2582 2606
2583void kvm_arch_commit_memory_region(struct kvm *kvm, 2607void kvm_arch_commit_memory_region(struct kvm *kvm,
2584 struct kvm_userspace_memory_region *mem, 2608 const struct kvm_userspace_memory_region *mem,
2585 const struct kvm_memory_slot *old, 2609 const struct kvm_memory_slot *old,
2610 const struct kvm_memory_slot *new,
2586 enum kvm_mr_change change) 2611 enum kvm_mr_change change)
2587{ 2612{
2588 int rc; 2613 int rc;
@@ -2601,7 +2626,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
2601 rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, 2626 rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
2602 mem->guest_phys_addr, mem->memory_size); 2627 mem->guest_phys_addr, mem->memory_size);
2603 if (rc) 2628 if (rc)
2604 printk(KERN_WARNING "kvm-s390: failed to commit memory region\n"); 2629 pr_warn("failed to commit memory region\n");
2605 return; 2630 return;
2606} 2631}
2607 2632
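
Several call sites above switch from the kvm_make_request() + exit_sie_sync() pair to kvm_s390_sync_request(), which posts the request, raises PROG_REQUEST in the SIE control block and kicks the VCPU; kvm_s390_handle_requests() also gains a cheap early-out when nothing is pending. A toy model of that handshake (the toy_* names and the PROG_REQUEST value are assumptions of this sketch, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

#define PROG_REQUEST	(1u << 1)	/* assumed bit value, for illustration */

struct toy_vcpu {
	atomic_uint prog20;		/* stands in for sie_block->prog20 */
	atomic_ulong requests;		/* stands in for vcpu->requests */
};

/* Stand-in for exit_sie(): signal the CPU and wait until it left SIE. */
static void toy_exit_sie(struct toy_vcpu *v)
{
	(void)v;			/* IPI and busy-wait elided */
}

/* Post a request and force the target out of SIE so it is seen promptly. */
static void toy_sync_request(struct toy_vcpu *v, int req)
{
	atomic_fetch_or(&v->requests, 1UL << req);
	atomic_fetch_or(&v->prog20, PROG_REQUEST);
	toy_exit_sie(v);
}

/* Entry path: cheap early-out before walking the individual request bits. */
static int toy_handle_requests(struct toy_vcpu *v)
{
	if (!atomic_load(&v->requests))
		return 0;
	atomic_fetch_and(&v->prog20, ~PROG_REQUEST);
	/* ... check and clear individual request bits here ... */
	return 0;
}

int main(void)
{
	struct toy_vcpu v;

	atomic_init(&v.prog20, 0);
	atomic_init(&v.requests, 0);
	toy_sync_request(&v, 3);
	printf("requests=%#lx prog20=%#x\n",
	       atomic_load(&v.requests), atomic_load(&v.prog20));
	return toy_handle_requests(&v);
}
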
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ca108b90ae56..c5704786e473 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -211,10 +211,10 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
211int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr); 211int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
212void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); 212void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
213void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); 213void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
214void s390_vcpu_block(struct kvm_vcpu *vcpu); 214void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
215void s390_vcpu_unblock(struct kvm_vcpu *vcpu); 215void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
216void exit_sie(struct kvm_vcpu *vcpu); 216void exit_sie(struct kvm_vcpu *vcpu);
217void exit_sie_sync(struct kvm_vcpu *vcpu); 217void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
218int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); 218int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
219void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); 219void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
220/* is cmma enabled */ 220/* is cmma enabled */
@@ -228,6 +228,25 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
228int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, 228int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
229 struct kvm_s390_pgm_info *pgm_info); 229 struct kvm_s390_pgm_info *pgm_info);
230 230
231static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
232{
233 int i;
234 struct kvm_vcpu *vcpu;
235
236 WARN_ON(!mutex_is_locked(&kvm->lock));
237 kvm_for_each_vcpu(i, vcpu, kvm)
238 kvm_s390_vcpu_block(vcpu);
239}
240
241static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
242{
243 int i;
244 struct kvm_vcpu *vcpu;
245
246 kvm_for_each_vcpu(i, vcpu, kvm)
247 kvm_s390_vcpu_unblock(vcpu);
248}
249
231/** 250/**
232 * kvm_s390_inject_prog_cond - conditionally inject a program check 251 * kvm_s390_inject_prog_cond - conditionally inject a program check
233 * @vcpu: virtual cpu 252 * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d22d8ee1ff9d..ad4242245771 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -698,10 +698,14 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
698 case 0x00001000: 698 case 0x00001000:
699 end = (start + (1UL << 20)) & ~((1UL << 20) - 1); 699 end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
700 break; 700 break;
701 /* We dont support EDAT2
702 case 0x00002000: 701 case 0x00002000:
702 /* only support 2G frame size if EDAT2 is available and we are
703 not in 24-bit addressing mode */
704 if (!test_kvm_facility(vcpu->kvm, 78) ||
705 psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT)
706 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
703 end = (start + (1UL << 31)) & ~((1UL << 31) - 1); 707 end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
704 break;*/ 708 break;
705 default: 709 default:
706 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 710 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
707 } 711 }
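
The 2G case reuses the same frame arithmetic as the 4K and 1M cases: for a power-of-two frame size, (start + size) & ~(size - 1) is the first address past the frame that contains start. A small, compilable illustration (assumes 64-bit unsigned long, as on s390):

#include <stdio.h>

static unsigned long frame_end(unsigned long start, unsigned int shift)
{
	unsigned long size = 1UL << shift;

	return (start + size) & ~(size - 1);
}

int main(void)
{
	/* 2G frame (shift 31), usable once facility 78 (EDAT2) is present */
	printf("%#lx\n", frame_end(0x80001000UL, 31));	/* 0x100000000 */
	/* 1M frame (shift 20) */
	printf("%#lx\n", frame_end(0x80001000UL, 20));	/* 0x80100000 */
	return 0;
}
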
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 57a9d94fe160..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
193 int (*cpl)(struct x86_emulate_ctxt *ctxt); 193 int (*cpl)(struct x86_emulate_ctxt *ctxt);
194 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); 194 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
195 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); 195 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
196 u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
197 void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
196 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); 198 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
197 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); 199 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
198 int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); 200 int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -262,6 +264,11 @@ enum x86emul_mode {
262 X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ 264 X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
263}; 265};
264 266
267/* These match some of the HF_* flags defined in kvm_host.h */
268#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
269#define X86EMUL_SMM_MASK (1 << 6)
270#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
271
265struct x86_emulate_ctxt { 272struct x86_emulate_ctxt {
266 const struct x86_emulate_ops *ops; 273 const struct x86_emulate_ops *ops;
267 274
@@ -273,8 +280,8 @@ struct x86_emulate_ctxt {
273 280
274 /* interruptibility state, as a result of execution of STI or MOV SS */ 281 /* interruptibility state, as a result of execution of STI or MOV SS */
275 int interruptibility; 282 int interruptibility;
283 int emul_flags;
276 284
277 bool guest_mode; /* guest running a nested guest */
278 bool perm_ok; /* do not check permissions if true */ 285 bool perm_ok; /* do not check permissions if true */
279 bool ud; /* inject an #UD if host doesn't support insn */ 286 bool ud; /* inject an #UD if host doesn't support insn */
280 287
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8c0ec3a4a97..c7fa57b529d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -184,23 +184,12 @@ struct kvm_mmu_memory_cache {
184 void *objects[KVM_NR_MEM_OBJS]; 184 void *objects[KVM_NR_MEM_OBJS];
185}; 185};
186 186
187/*
188 * kvm_mmu_page_role, below, is defined as:
189 *
190 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
191 * bits 4:7 - page table level for this shadow (1-4)
192 * bits 8:9 - page table quadrant for 2-level guests
193 * bit 16 - direct mapping of virtual to physical mapping at gfn
194 * used for real mode and two-dimensional paging
195 * bits 17:19 - common access permissions for all ptes in this shadow page
196 */
197union kvm_mmu_page_role { 187union kvm_mmu_page_role {
198 unsigned word; 188 unsigned word;
199 struct { 189 struct {
200 unsigned level:4; 190 unsigned level:4;
201 unsigned cr4_pae:1; 191 unsigned cr4_pae:1;
202 unsigned quadrant:2; 192 unsigned quadrant:2;
203 unsigned pad_for_nice_hex_output:6;
204 unsigned direct:1; 193 unsigned direct:1;
205 unsigned access:3; 194 unsigned access:3;
206 unsigned invalid:1; 195 unsigned invalid:1;
@@ -208,6 +197,15 @@ union kvm_mmu_page_role {
208 unsigned cr0_wp:1; 197 unsigned cr0_wp:1;
209 unsigned smep_andnot_wp:1; 198 unsigned smep_andnot_wp:1;
210 unsigned smap_andnot_wp:1; 199 unsigned smap_andnot_wp:1;
200 unsigned :8;
201
202 /*
203 * This is left at the top of the word so that
204 * kvm_memslots_for_spte_role can extract it with a
205 * simple shift. While there is room, give it a whole
206 * byte so it is also faster to load it from memory.
207 */
208 unsigned smm:8;
211 }; 209 };
212}; 210};
213 211
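
Giving smm a whole byte at the top of the word is what makes the "simple shift" mentioned in the comment work: kvm_memslots_for_spte_role() can recover the address-space index without masking. A stand-alone demo of the layout (a simplified copy of the union, assuming the usual LSB-first bit-field allocation on x86):

#include <assert.h>

union demo_page_role {			/* simplified copy, not the kernel type */
	unsigned word;
	struct {
		unsigned level:4;
		unsigned cr4_pae:1;
		unsigned quadrant:2;
		unsigned direct:1;
		unsigned access:3;
		unsigned invalid:1;
		unsigned nxe:1;
		unsigned cr0_wp:1;
		unsigned smep_andnot_wp:1;
		unsigned smap_andnot_wp:1;
		unsigned :8;		/* pad so smm starts at bit 24 */
		unsigned smm:8;		/* address space: 0 = normal, 1 = SMM */
	};
};

int main(void)
{
	union demo_page_role role;

	role.word = 0;
	role.smm = 1;
	/* smm occupies bits 24..31, so one shift recovers the address space */
	assert(role.word >> 24 == 1);
	return 0;
}
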
@@ -338,12 +336,28 @@ struct kvm_pmu {
338 u64 reprogram_pmi; 336 u64 reprogram_pmi;
339}; 337};
340 338
339struct kvm_pmu_ops;
340
341enum { 341enum {
342 KVM_DEBUGREG_BP_ENABLED = 1, 342 KVM_DEBUGREG_BP_ENABLED = 1,
343 KVM_DEBUGREG_WONT_EXIT = 2, 343 KVM_DEBUGREG_WONT_EXIT = 2,
344 KVM_DEBUGREG_RELOAD = 4, 344 KVM_DEBUGREG_RELOAD = 4,
345}; 345};
346 346
347struct kvm_mtrr_range {
348 u64 base;
349 u64 mask;
350 struct list_head node;
351};
352
353struct kvm_mtrr {
354 struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
355 mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
356 u64 deftype;
357
358 struct list_head head;
359};
360
347struct kvm_vcpu_arch { 361struct kvm_vcpu_arch {
348 /* 362 /*
349 * rip and regs accesses must go through 363 * rip and regs accesses must go through
@@ -368,6 +382,7 @@ struct kvm_vcpu_arch {
368 int32_t apic_arb_prio; 382 int32_t apic_arb_prio;
369 int mp_state; 383 int mp_state;
370 u64 ia32_misc_enable_msr; 384 u64 ia32_misc_enable_msr;
385 u64 smbase;
371 bool tpr_access_reporting; 386 bool tpr_access_reporting;
372 u64 ia32_xss; 387 u64 ia32_xss;
373 388
@@ -471,8 +486,9 @@ struct kvm_vcpu_arch {
471 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 486 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
472 unsigned nmi_pending; /* NMI queued after currently running handler */ 487 unsigned nmi_pending; /* NMI queued after currently running handler */
473 bool nmi_injected; /* Trying to inject an NMI this entry */ 488 bool nmi_injected; /* Trying to inject an NMI this entry */
489 bool smi_pending; /* SMI queued after currently running handler */
474 490
475 struct mtrr_state_type mtrr_state; 491 struct kvm_mtrr mtrr_state;
476 u64 pat; 492 u64 pat;
477 493
478 unsigned switch_db_regs; 494 unsigned switch_db_regs;
@@ -637,6 +653,8 @@ struct kvm_arch {
637 #endif 653 #endif
638 654
639 bool boot_vcpu_runs_old_kvmclock; 655 bool boot_vcpu_runs_old_kvmclock;
656
657 u64 disabled_quirks;
640}; 658};
641 659
642struct kvm_vm_stat { 660struct kvm_vm_stat {
@@ -689,12 +707,13 @@ struct msr_data {
689 707
690struct kvm_lapic_irq { 708struct kvm_lapic_irq {
691 u32 vector; 709 u32 vector;
692 u32 delivery_mode; 710 u16 delivery_mode;
693 u32 dest_mode; 711 u16 dest_mode;
694 u32 level; 712 bool level;
695 u32 trig_mode; 713 u16 trig_mode;
696 u32 shorthand; 714 u32 shorthand;
697 u32 dest_id; 715 u32 dest_id;
716 bool msi_redir_hint;
698}; 717};
699 718
700struct kvm_x86_ops { 719struct kvm_x86_ops {
@@ -706,19 +725,20 @@ struct kvm_x86_ops {
706 int (*hardware_setup)(void); /* __init */ 725 int (*hardware_setup)(void); /* __init */
707 void (*hardware_unsetup)(void); /* __exit */ 726 void (*hardware_unsetup)(void); /* __exit */
708 bool (*cpu_has_accelerated_tpr)(void); 727 bool (*cpu_has_accelerated_tpr)(void);
728 bool (*cpu_has_high_real_mode_segbase)(void);
709 void (*cpuid_update)(struct kvm_vcpu *vcpu); 729 void (*cpuid_update)(struct kvm_vcpu *vcpu);
710 730
711 /* Create, but do not attach this VCPU */ 731 /* Create, but do not attach this VCPU */
712 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 732 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
713 void (*vcpu_free)(struct kvm_vcpu *vcpu); 733 void (*vcpu_free)(struct kvm_vcpu *vcpu);
714 void (*vcpu_reset)(struct kvm_vcpu *vcpu); 734 void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
715 735
716 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 736 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
717 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 737 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
718 void (*vcpu_put)(struct kvm_vcpu *vcpu); 738 void (*vcpu_put)(struct kvm_vcpu *vcpu);
719 739
720 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); 740 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
721 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 741 int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
722 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); 742 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
723 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 743 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
724 void (*get_segment)(struct kvm_vcpu *vcpu, 744 void (*get_segment)(struct kvm_vcpu *vcpu,
@@ -836,6 +856,8 @@ struct kvm_x86_ops {
836 void (*enable_log_dirty_pt_masked)(struct kvm *kvm, 856 void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
837 struct kvm_memory_slot *slot, 857 struct kvm_memory_slot *slot,
838 gfn_t offset, unsigned long mask); 858 gfn_t offset, unsigned long mask);
859 /* pmu operations of sub-arch */
860 const struct kvm_pmu_ops *pmu_ops;
839}; 861};
840 862
841struct kvm_arch_async_pf { 863struct kvm_arch_async_pf {
@@ -871,7 +893,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
871void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 893void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
872 struct kvm_memory_slot *memslot); 894 struct kvm_memory_slot *memslot);
873void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 895void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
874 struct kvm_memory_slot *memslot); 896 const struct kvm_memory_slot *memslot);
875void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 897void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
876 struct kvm_memory_slot *memslot); 898 struct kvm_memory_slot *memslot);
877void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 899void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -882,7 +904,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
882 struct kvm_memory_slot *slot, 904 struct kvm_memory_slot *slot,
883 gfn_t gfn_offset, unsigned long mask); 905 gfn_t gfn_offset, unsigned long mask);
884void kvm_mmu_zap_all(struct kvm *kvm); 906void kvm_mmu_zap_all(struct kvm *kvm);
885void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); 907void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
886unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 908unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
887void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 909void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
888 910
@@ -890,7 +912,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
890 912
891int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 913int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
892 const void *val, int bytes); 914 const void *val, int bytes);
893u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
894 915
895struct kvm_irq_mask_notifier { 916struct kvm_irq_mask_notifier {
896 void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); 917 void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
@@ -938,7 +959,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
938 959
939void kvm_enable_efer_bits(u64); 960void kvm_enable_efer_bits(u64);
940bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); 961bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
941int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 962int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
942int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 963int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
943 964
944struct x86_emulate_ctxt; 965struct x86_emulate_ctxt;
@@ -967,7 +988,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
967void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 988void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
968int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); 989int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
969 990
970int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 991int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
971int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); 992int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
972 993
973unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 994unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
@@ -1110,6 +1131,14 @@ enum {
1110#define HF_NMI_MASK (1 << 3) 1131#define HF_NMI_MASK (1 << 3)
1111#define HF_IRET_MASK (1 << 4) 1132#define HF_IRET_MASK (1 << 4)
1112#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ 1133#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
1134#define HF_SMM_MASK (1 << 6)
1135#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
1136
1137#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
1138#define KVM_ADDRESS_SPACE_NUM 2
1139
1140#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
1141#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
1113 1142
1114/* 1143/*
1115 * Hardware virtualization extension instructions may fault if a 1144 * Hardware virtualization extension instructions may fault if a
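
Together, HF_SMM_MASK, KVM_ADDRESS_SPACE_NUM and kvm_arch_vcpu_memslots_id() give a VCPU that is in SMM its own memslot set, so SMRAM can be backed by different memory than the same guest-physical range outside SMM. A conceptual sketch with simplified stand-in types (the demo_* names are not kernel structures):

#include <stdio.h>

#define DEMO_ADDRESS_SPACE_NUM	2
#define DEMO_HF_SMM_MASK	(1 << 6)

struct demo_memslots {
	int generation;			/* slot contents elided */
};

struct demo_kvm {
	struct demo_memslots *memslots[DEMO_ADDRESS_SPACE_NUM];
};

struct demo_vcpu {
	struct demo_kvm *kvm;
	unsigned int hflags;
};

/* A VCPU in SMM uses memslot set 1; everyone else uses set 0. */
static struct demo_memslots *demo_vcpu_memslots(struct demo_vcpu *vcpu)
{
	int as_id = (vcpu->hflags & DEMO_HF_SMM_MASK) ? 1 : 0;

	return vcpu->kvm->memslots[as_id];
}

int main(void)
{
	struct demo_memslots normal = { 1 }, smram = { 2 };
	struct demo_kvm kvm = { { &normal, &smram } };
	struct demo_vcpu vcpu = { &kvm, DEMO_HF_SMM_MASK };

	printf("memslot set seen in SMM: generation %d\n",
	       demo_vcpu_memslots(&vcpu)->generation);
	return 0;
}
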
@@ -1144,7 +1173,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1144int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1173int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1145int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1174int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
1146int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1175int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
1147void kvm_vcpu_reset(struct kvm_vcpu *vcpu); 1176void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
1148void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); 1177void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
1149void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, 1178void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
1150 unsigned long address); 1179 unsigned long address);
@@ -1168,16 +1197,9 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
1168 1197
1169int kvm_is_in_guest(void); 1198int kvm_is_in_guest(void);
1170 1199
1171void kvm_pmu_init(struct kvm_vcpu *vcpu); 1200int __x86_set_memory_region(struct kvm *kvm,
1172void kvm_pmu_destroy(struct kvm_vcpu *vcpu); 1201 const struct kvm_userspace_memory_region *mem);
1173void kvm_pmu_reset(struct kvm_vcpu *vcpu); 1202int x86_set_memory_region(struct kvm *kvm,
1174void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); 1203 const struct kvm_userspace_memory_region *mem);
1175bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
1176int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
1177int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
1178int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
1179int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
1180void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
1181void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
1182 1204
1183#endif /* _ASM_X86_KVM_HOST_H */ 1205#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6167fd798188..655e07a48f6c 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -41,5 +41,6 @@ struct pvclock_wall_clock {
41 41
42#define PVCLOCK_TSC_STABLE_BIT (1 << 0) 42#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
43#define PVCLOCK_GUEST_STOPPED (1 << 1) 43#define PVCLOCK_GUEST_STOPPED (1 << 1)
44#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
44#endif /* __ASSEMBLY__ */ 45#endif /* __ASSEMBLY__ */
45#endif /* _ASM_X86_PVCLOCK_ABI_H */ 46#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..628954ceede1 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
86 offset = pvclock_get_nsec_offset(src); 86 offset = pvclock_get_nsec_offset(src);
87 ret = src->system_time + offset; 87 ret = src->system_time + offset;
88 ret_flags = src->flags; 88 ret_flags = src->flags;
89 rdtsc_barrier();
90 89
91 *cycles = ret; 90 *cycles = ret;
92 *flags = ret_flags; 91 *flags = ret_flags;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d7dcef58aefa..a4ae82eb82aa 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,8 @@ struct kvm_ioapic_state {
106#define KVM_IRQCHIP_IOAPIC 2 106#define KVM_IRQCHIP_IOAPIC 2
107#define KVM_NR_IRQCHIPS 3 107#define KVM_NR_IRQCHIPS 3
108 108
109#define KVM_RUN_X86_SMM (1 << 0)
110
109/* for KVM_GET_REGS and KVM_SET_REGS */ 111/* for KVM_GET_REGS and KVM_SET_REGS */
110struct kvm_regs { 112struct kvm_regs {
111 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ 113 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
@@ -281,6 +283,7 @@ struct kvm_reinject_control {
281#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 283#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
282#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 284#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
283#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 285#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
286#define KVM_VCPUEVENT_VALID_SMM 0x00000008
284 287
285/* Interrupt shadow states */ 288/* Interrupt shadow states */
286#define KVM_X86_SHADOW_INT_MOV_SS 0x01 289#define KVM_X86_SHADOW_INT_MOV_SS 0x01
@@ -309,7 +312,13 @@ struct kvm_vcpu_events {
309 } nmi; 312 } nmi;
310 __u32 sipi_vector; 313 __u32 sipi_vector;
311 __u32 flags; 314 __u32 flags;
312 __u32 reserved[10]; 315 struct {
316 __u8 smm;
317 __u8 pending;
318 __u8 smm_inside_nmi;
319 __u8 latched_init;
320 } smi;
321 __u32 reserved[9];
313}; 322};
314 323
315/* for KVM_GET/SET_DEBUGREGS */ 324/* for KVM_GET/SET_DEBUGREGS */
@@ -345,4 +354,7 @@ struct kvm_xcrs {
345struct kvm_sync_regs { 354struct kvm_sync_regs {
346}; 355};
347 356
357#define KVM_QUIRK_LINT0_REENABLED (1 << 0)
358#define KVM_QUIRK_CD_NW_CLEARED (1 << 1)
359
348#endif /* _ASM_X86_KVM_H */ 360#endif /* _ASM_X86_KVM_H */
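
With the smi sub-structure and KVM_VCPUEVENT_VALID_SMM in place, userspace can observe and migrate SMM state through the existing KVM_GET/SET_VCPU_EVENTS ioctls. A rough usage sketch (assumes a vcpu_fd obtained via KVM_CREATE_VCPU and uapi headers that already contain this change; error handling trimmed):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void dump_smm_state(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0) {
		perror("KVM_GET_VCPU_EVENTS");
		return;
	}

	printf("in SMM: %d, SMI pending: %d, inside NMI: %d, latched INIT: %d\n",
	       events.smi.smm, events.smi.pending,
	       events.smi.smm_inside_nmi, events.smi.latched_init);
}
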
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1681504e44a4..47190bd399e7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
331 apic_write(APIC_EOI, APIC_EOI_ACK); 331 apic_write(APIC_EOI, APIC_EOI_ACK);
332} 332}
333 333
334void kvm_guest_cpu_init(void) 334static void kvm_guest_cpu_init(void)
335{ 335{
336 if (!kvm_para_available()) 336 if (!kvm_para_available())
337 return; 337 return;
@@ -688,7 +688,7 @@ static inline void spin_time_accum_blocked(u64 start)
688static struct dentry *d_spin_debug; 688static struct dentry *d_spin_debug;
689static struct dentry *d_kvm_debug; 689static struct dentry *d_kvm_debug;
690 690
691struct dentry *kvm_init_debugfs(void) 691static struct dentry *kvm_init_debugfs(void)
692{ 692{
693 d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); 693 d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
694 if (!d_kvm_debug) 694 if (!d_kvm_debug)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 42caaef897c8..49487b488061 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,6 +24,7 @@
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/memblock.h> 26#include <linux/memblock.h>
27#include <linux/sched.h>
27 28
28#include <asm/x86_init.h> 29#include <asm/x86_init.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
@@ -217,8 +218,10 @@ static void kvm_shutdown(void)
217 218
218void __init kvmclock_init(void) 219void __init kvmclock_init(void)
219{ 220{
221 struct pvclock_vcpu_time_info *vcpu_time;
220 unsigned long mem; 222 unsigned long mem;
221 int size; 223 int size, cpu;
224 u8 flags;
222 225
223 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); 226 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
224 227
@@ -264,7 +267,14 @@ void __init kvmclock_init(void)
264 pv_info.name = "KVM"; 267 pv_info.name = "KVM";
265 268
266 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 269 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
267 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 270 pvclock_set_flags(~0);
271
272 cpu = get_cpu();
273 vcpu_time = &hv_clock[cpu].pvti;
274 flags = pvclock_read_flags(vcpu_time);
275 if (flags & PVCLOCK_COUNTS_FROM_ZERO)
276 set_sched_clock_stable();
277 put_cpu();
268} 278}
269 279
270int __init kvm_setup_vsyscall_timeinfo(void) 280int __init kvm_setup_vsyscall_timeinfo(void)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 413a7bf9efbb..d8a1d56276e1 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,15 +86,16 @@ config KVM_MMU_AUDIT
86 auditing of KVM MMU events at runtime. 86 auditing of KVM MMU events at runtime.
87 87
88config KVM_DEVICE_ASSIGNMENT 88config KVM_DEVICE_ASSIGNMENT
89 bool "KVM legacy PCI device assignment support" 89 bool "KVM legacy PCI device assignment support (DEPRECATED)"
90 depends on KVM && PCI && IOMMU_API 90 depends on KVM && PCI && IOMMU_API
91 default y 91 default n
92 ---help--- 92 ---help---
93 Provide support for legacy PCI device assignment through KVM. The 93 Provide support for legacy PCI device assignment through KVM. The
94 kernel now also supports a full featured userspace device driver 94 kernel now also supports a full featured userspace device driver
95 framework through VFIO, which supersedes much of this support. 95 framework through VFIO, which supersedes this support and provides
96 better security.
96 97
97 If unsure, say Y. 98 If unsure, say N.
98 99
99# OK, it's a little counter-intuitive to do this, but it puts it neatly under 100# OK, it's a little counter-intuitive to do this, but it puts it neatly under
100# the virtualization menu. 101# the virtualization menu.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 16e8f962eaad..67d215cb8953 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,10 +12,10 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o 12kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
13 13
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
15 i8254.o ioapic.o irq_comm.o cpuid.o pmu.o 15 i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o
16kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o 16kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
17kvm-intel-y += vmx.o 17kvm-intel-y += vmx.o pmu_intel.o
18kvm-amd-y += svm.o 18kvm-amd-y += svm.o pmu_amd.o
19 19
20obj-$(CONFIG_KVM) += kvm.o 20obj-$(CONFIG_KVM) += kvm.o
21obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 21obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9f705e618af5..64dd46793099 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,12 +16,14 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */
19#include <asm/user.h> 20#include <asm/user.h>
20#include <asm/fpu/xstate.h> 21#include <asm/fpu/xstate.h>
21#include "cpuid.h" 22#include "cpuid.h"
22#include "lapic.h" 23#include "lapic.h"
23#include "mmu.h" 24#include "mmu.h"
24#include "trace.h" 25#include "trace.h"
26#include "pmu.h"
25 27
26static u32 xstate_required_size(u64 xstate_bv, bool compacted) 28static u32 xstate_required_size(u64 xstate_bv, bool compacted)
27{ 29{
@@ -95,7 +97,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
95 if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) 97 if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
96 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 98 best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
97 99
98 vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); 100 vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
99 101
100 /* 102 /*
101 * The existing code assumes virtual address is 48-bit in the canonical 103 * The existing code assumes virtual address is 48-bit in the canonical
@@ -109,7 +111,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
109 /* Update physical-address width */ 111 /* Update physical-address width */
110 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 112 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
111 113
112 kvm_pmu_cpuid_update(vcpu); 114 kvm_pmu_refresh(vcpu);
113 return 0; 115 return 0;
114} 116}
115 117
@@ -413,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 } 415 }
414 break; 416 break;
415 } 417 }
418 case 6: /* Thermal management */
419 entry->eax = 0x4; /* allow ARAT */
420 entry->ebx = 0;
421 entry->ecx = 0;
422 entry->edx = 0;
423 break;
416 case 7: { 424 case 7: {
417 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 425 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
418 /* Mask ebx against host capability word 9 */ 426 /* Mask ebx against host capability word 9 */
@@ -589,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
589 break; 597 break;
590 case 3: /* Processor serial number */ 598 case 3: /* Processor serial number */
591 case 5: /* MONITOR/MWAIT */ 599 case 5: /* MONITOR/MWAIT */
592 case 6: /* Thermal management */
593 case 0xC0000002: 600 case 0xC0000002:
594 case 0xC0000003: 601 case 0xC0000003:
595 case 0xC0000004: 602 case 0xC0000004:
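
Moving leaf 6 out of the zeroed-leaves list and advertising EAX=0x4 exposes ARAT (CPUID.06H:EAX[2], "APIC timer always runs"), which tells the guest its LAPIC timer keeps ticking across deep C-states. A guest-side check, compilable with GCC or Clang:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(6, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("ARAT (always-running APIC timer): %s\n",
	       (eax & (1u << 2)) ? "yes" : "no");
	return 0;
}
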
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 496b3695d3d3..dd05b9cef6ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
70 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); 70 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
71} 71}
72 72
73static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
74{
75 struct kvm_cpuid_entry2 *best;
76
77 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
78 return best && (best->edx & bit(X86_FEATURE_LM));
79}
80
73static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) 81static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
74{ 82{
75 struct kvm_cpuid_entry2 *best; 83 struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0d7a04..e7a4fde5d631 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -25,6 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
27#include <linux/stringify.h> 27#include <linux/stringify.h>
28#include <asm/debugreg.h>
28 29
29#include "x86.h" 30#include "x86.h"
30#include "tss.h" 31#include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
523static inline void 524static inline void
524register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) 525register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
525{ 526{
526 ulong mask; 527 ulong *preg = reg_rmw(ctxt, reg);
527 528
528 if (ctxt->ad_bytes == sizeof(unsigned long)) 529 assign_register(preg, *preg + inc, ctxt->ad_bytes);
529 mask = ~0UL;
530 else
531 mask = ad_mask(ctxt);
532 masked_increment(reg_rmw(ctxt, reg), mask, inc);
533} 530}
534 531
535static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) 532static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2262,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
2262 return rc; 2259 return rc;
2263} 2260}
2264 2261
2262static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
2263{
2264 u32 eax, ebx, ecx, edx;
2265
2266 eax = 0x80000001;
2267 ecx = 0;
2268 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2269 return edx & bit(X86_FEATURE_LM);
2270}
2271
2272#define GET_SMSTATE(type, smbase, offset) \
2273 ({ \
2274 type __val; \
2275 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
2276 sizeof(__val), NULL); \
2277 if (r != X86EMUL_CONTINUE) \
2278 return X86EMUL_UNHANDLEABLE; \
2279 __val; \
2280 })
2281
2282static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
2283{
2284 desc->g = (flags >> 23) & 1;
2285 desc->d = (flags >> 22) & 1;
2286 desc->l = (flags >> 21) & 1;
2287 desc->avl = (flags >> 20) & 1;
2288 desc->p = (flags >> 15) & 1;
2289 desc->dpl = (flags >> 13) & 3;
2290 desc->s = (flags >> 12) & 1;
2291 desc->type = (flags >> 8) & 15;
2292}
2293
2294static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
2295{
2296 struct desc_struct desc;
2297 int offset;
2298 u16 selector;
2299
2300 selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
2301
2302 if (n < 3)
2303 offset = 0x7f84 + n * 12;
2304 else
2305 offset = 0x7f2c + (n - 3) * 12;
2306
2307 set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
2308 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
2309 rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
2310 ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
2311 return X86EMUL_CONTINUE;
2312}
2313
2314static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
2315{
2316 struct desc_struct desc;
2317 int offset;
2318 u16 selector;
2319 u32 base3;
2320
2321 offset = 0x7e00 + n * 16;
2322
2323 selector = GET_SMSTATE(u16, smbase, offset);
2324 rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
2325 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
2326 set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
2327 base3 = GET_SMSTATE(u32, smbase, offset + 12);
2328
2329 ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
2330 return X86EMUL_CONTINUE;
2331}
2332
2333static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
2334 u64 cr0, u64 cr4)
2335{
2336 int bad;
2337
2338 /*
2339 * First enable PAE, long mode needs it before CR0.PG = 1 is set.
2340 * Then enable protected mode. However, PCID cannot be enabled
2341 * if EFER.LMA=0, so set it separately.
2342 */
2343 bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
2344 if (bad)
2345 return X86EMUL_UNHANDLEABLE;
2346
2347 bad = ctxt->ops->set_cr(ctxt, 0, cr0);
2348 if (bad)
2349 return X86EMUL_UNHANDLEABLE;
2350
2351 if (cr4 & X86_CR4_PCIDE) {
2352 bad = ctxt->ops->set_cr(ctxt, 4, cr4);
2353 if (bad)
2354 return X86EMUL_UNHANDLEABLE;
2355 }
2356
2357 return X86EMUL_CONTINUE;
2358}
2359
2360static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
2361{
2362 struct desc_struct desc;
2363 struct desc_ptr dt;
2364 u16 selector;
2365 u32 val, cr0, cr4;
2366 int i;
2367
2368 cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
2369 ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
2370 ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
2371 ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
2372
2373 for (i = 0; i < 8; i++)
2374 *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
2375
2376 val = GET_SMSTATE(u32, smbase, 0x7fcc);
2377 ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
2378 val = GET_SMSTATE(u32, smbase, 0x7fc8);
2379 ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
2380
2381 selector = GET_SMSTATE(u32, smbase, 0x7fc4);
2382 set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
2383 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
2384 rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
2385 ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
2386
2387 selector = GET_SMSTATE(u32, smbase, 0x7fc0);
2388 set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
2389 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
2390 rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
2391 ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
2392
2393 dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
2394 dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
2395 ctxt->ops->set_gdt(ctxt, &dt);
2396
2397 dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
2398 dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
2399 ctxt->ops->set_idt(ctxt, &dt);
2400
2401 for (i = 0; i < 6; i++) {
2402 int r = rsm_load_seg_32(ctxt, smbase, i);
2403 if (r != X86EMUL_CONTINUE)
2404 return r;
2405 }
2406
2407 cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
2408
2409 ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
2410
2411 return rsm_enter_protected_mode(ctxt, cr0, cr4);
2412}
2413
2414static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
2415{
2416 struct desc_struct desc;
2417 struct desc_ptr dt;
2418 u64 val, cr0, cr4;
2419 u32 base3;
2420 u16 selector;
2421 int i;
2422
2423 for (i = 0; i < 16; i++)
2424 *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
2425
2426 ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
2427 ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
2428
2429 val = GET_SMSTATE(u32, smbase, 0x7f68);
2430 ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
2431 val = GET_SMSTATE(u32, smbase, 0x7f60);
2432 ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
2433
2434 cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
2435 ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
2436 cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
2437 ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
2438 val = GET_SMSTATE(u64, smbase, 0x7ed0);
2439 ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
2440
2441 selector = GET_SMSTATE(u32, smbase, 0x7e90);
2442 rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
2443 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
2444 set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
2445 base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
2446 ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
2447
2448 dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
2449 dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
2450 ctxt->ops->set_idt(ctxt, &dt);
2451
2452 selector = GET_SMSTATE(u32, smbase, 0x7e70);
2453 rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
2454 set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
2455 set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
2456 base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
2457 ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
2458
2459 dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
2460 dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
2461 ctxt->ops->set_gdt(ctxt, &dt);
2462
2463 for (i = 0; i < 6; i++) {
2464 int r = rsm_load_seg_64(ctxt, smbase, i);
2465 if (r != X86EMUL_CONTINUE)
2466 return r;
2467 }
2468
2469 return rsm_enter_protected_mode(ctxt, cr0, cr4);
2470}
2471
2472static int em_rsm(struct x86_emulate_ctxt *ctxt)
2473{
2474 unsigned long cr0, cr4, efer;
2475 u64 smbase;
2476 int ret;
2477
2478 if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
2479 return emulate_ud(ctxt);
2480
2481 /*
2482 * Get back to real mode, to prepare a safe state in which to load
2483 * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
2484 * to read_std/write_std are not virtual.
2485 *
2486 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
2487 */
2488 cr0 = ctxt->ops->get_cr(ctxt, 0);
2489 if (cr0 & X86_CR0_PE)
2490 ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
2491 cr4 = ctxt->ops->get_cr(ctxt, 4);
2492 if (cr4 & X86_CR4_PAE)
2493 ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
2494 efer = 0;
2495 ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
2496
2497 smbase = ctxt->ops->get_smbase(ctxt);
2498 if (emulator_has_longmode(ctxt))
2499 ret = rsm_load_state_64(ctxt, smbase + 0x8000);
2500 else
2501 ret = rsm_load_state_32(ctxt, smbase + 0x8000);
2502
2503 if (ret != X86EMUL_CONTINUE) {
2504 /* FIXME: should triple fault */
2505 return X86EMUL_UNHANDLEABLE;
2506 }
2507
2508 if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
2509 ctxt->ops->set_nmi_mask(ctxt, false);
2510
2511 ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
2512 ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
2513 return X86EMUL_CONTINUE;
2514}
2515
2265static void 2516static void
2266setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 2517setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2267 struct desc_struct *cs, struct desc_struct *ss) 2518 struct desc_struct *cs, struct desc_struct *ss)
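
GET_SMSTATE() above is a typed read from the SMM state-save area, which starts at smbase + 0x8000; the 0x7xxx offsets are relative to that base (32-bit CR0, for instance, sits at 0x7ffc). The same idiom applied to a plain in-memory buffer, in the same GNU C statement-expression style (smram and the helper names here are hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SMSTATE(type, buf, offset)					\
	({								\
		type __val;						\
		memcpy(&__val, (buf) + (offset), sizeof(__val));	\
		__val;							\
	})

static uint32_t smram_cr0_32bit(const uint8_t *smram)
{
	/* 32-bit state-save layout: CR0 at offset 0x7ffc from the base */
	return SMSTATE(uint32_t, smram, 0x7ffc);
}

int main(void)
{
	static uint8_t smram[0x8000];	/* pretend dump of smbase+0x8000 .. +0xffff */

	smram[0x7ffc] = 0x33;		/* arbitrary test value */
	printf("CR0 from save area: %#x\n", smram_cr0_32bit(smram));
	return 0;
}
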
@@ -2573,6 +2824,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2573 return true; 2824 return true;
2574} 2825}
2575 2826
2827static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
2828{
2829 /*
2830 * Intel CPUs mask the counter and pointers in quite strange
2831 * manner when ECX is zero due to REP-string optimizations.
2832 */
2833#ifdef CONFIG_X86_64
2834 if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
2835 return;
2836
2837 *reg_write(ctxt, VCPU_REGS_RCX) = 0;
2838
2839 switch (ctxt->b) {
2840 case 0xa4: /* movsb */
2841 case 0xa5: /* movsd/w */
2842 *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
2843 /* fall through */
2844 case 0xaa: /* stosb */
2845 case 0xab: /* stosd/w */
2846 *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
2847 }
2848#endif
2849}
2850
2576static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2851static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2577 struct tss_segment_16 *tss) 2852 struct tss_segment_16 *tss)
2578{ 2853{
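
The string_registers_quirk() addition above works around an Intel peculiarity: when a REP string instruction with a 32-bit address size terminates with ECX == 0, the counter is cleared and RSI/RDI keep only their low 32 bits. A self-contained sketch of that masking, using plain integers instead of the emulator's reg_write()/reg_rmw() accessors:

#include <stdint.h>
#include <stdio.h>

/* Simplified register file: only the three registers the quirk touches. */
struct regs {
        uint64_t rcx, rsi, rdi;
};

/* Mimic the masking applied for a 32-bit-address REP string op on Intel:
 * RCX is cleared and the index registers keep only their low 32 bits. */
static void string_registers_quirk(struct regs *r, int movs)
{
        r->rcx = 0;
        if (movs)
                r->rsi &= (uint32_t)-1;   /* same idiom as the kernel's (u32)-1 */
        r->rdi &= (uint32_t)-1;           /* MOVS and STOS both truncate RDI */
}

int main(void)
{
        struct regs r = {
                .rcx = 0x1234567800000000ULL,
                .rsi = 0xaaaaaaaabbbbbbbbULL,
                .rdi = 0xccccccccddddddddULL,
        };

        string_registers_quirk(&r, 1 /* movs */);
        printf("rcx=%#llx rsi=%#llx rdi=%#llx\n",
               (unsigned long long)r.rcx,
               (unsigned long long)r.rsi,
               (unsigned long long)r.rdi);
        return 0;
}
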
@@ -2849,7 +3124,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2849 ulong old_tss_base = 3124 ulong old_tss_base =
2850 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 3125 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2851 u32 desc_limit; 3126 u32 desc_limit;
2852 ulong desc_addr; 3127 ulong desc_addr, dr7;
2853 3128
2854 /* FIXME: old_tss_base == ~0 ? */ 3129 /* FIXME: old_tss_base == ~0 ? */
2855 3130
@@ -2934,6 +3209,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2934 ret = em_push(ctxt); 3209 ret = em_push(ctxt);
2935 } 3210 }
2936 3211
3212 ops->get_dr(ctxt, 7, &dr7);
3213 ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
3214
2937 return ret; 3215 return ret;
2938} 3216}
2939 3217
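
The get_dr/set_dr pair added to emulator_do_task_switch() mimics what hardware does on a task switch: local breakpoint enables are dropped from DR7 while global enables survive. Assuming the usual debugreg.h values (DR_LOCAL_ENABLE_MASK = 0x55 for L0-L3 and DR_LOCAL_SLOWDOWN = 0x100 for LE), the masking reduces to:

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the usual x86 debugreg.h definitions. */
#define DR_LOCAL_ENABLE_MASK 0x55       /* L0..L3 enable bits */
#define DR_LOCAL_SLOWDOWN    0x100      /* LE bit */

int main(void)
{
        /* DR7 with L0 (bit 0), G1 (bit 3) and LE (bit 8) set. */
        uint64_t dr7 = 0x109;

        /* Same expression as the task-switch hunk above. */
        dr7 &= ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN);

        /* L0 and LE are gone; the global enable G1 remains set. */
        printf("dr7 after task switch: %#llx\n", (unsigned long long)dr7);
        return 0;
}
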
@@ -3840,7 +4118,7 @@ static const struct opcode group5[] = {
3840 F(DstMem | SrcNone | Lock, em_inc), 4118 F(DstMem | SrcNone | Lock, em_inc),
3841 F(DstMem | SrcNone | Lock, em_dec), 4119 F(DstMem | SrcNone | Lock, em_dec),
3842 I(SrcMem | NearBranch, em_call_near_abs), 4120 I(SrcMem | NearBranch, em_call_near_abs),
3843 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 4121 I(SrcMemFAddr | ImplicitOps, em_call_far),
3844 I(SrcMem | NearBranch, em_jmp_abs), 4122 I(SrcMem | NearBranch, em_jmp_abs),
3845 I(SrcMemFAddr | ImplicitOps, em_jmp_far), 4123 I(SrcMemFAddr | ImplicitOps, em_jmp_far),
3846 I(SrcMem | Stack, em_push), D(Undefined), 4124 I(SrcMem | Stack, em_push), D(Undefined),
@@ -4173,7 +4451,7 @@ static const struct opcode twobyte_table[256] = {
4173 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, 4451 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
4174 /* 0xA8 - 0xAF */ 4452 /* 0xA8 - 0xAF */
4175 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4453 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
4176 DI(ImplicitOps, rsm), 4454 II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
4177 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4455 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
4178 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), 4456 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
4179 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 4457 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
@@ -4871,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4871 fetch_possible_mmx_operand(ctxt, &ctxt->dst); 5149 fetch_possible_mmx_operand(ctxt, &ctxt->dst);
4872 } 5150 }
4873 5151
4874 if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { 5152 if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
4875 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5153 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4876 X86_ICPT_PRE_EXCEPT); 5154 X86_ICPT_PRE_EXCEPT);
4877 if (rc != X86EMUL_CONTINUE) 5155 if (rc != X86EMUL_CONTINUE)
@@ -4900,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4900 goto done; 5178 goto done;
4901 } 5179 }
4902 5180
4903 if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { 5181 if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
4904 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5182 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4905 X86_ICPT_POST_EXCEPT); 5183 X86_ICPT_POST_EXCEPT);
4906 if (rc != X86EMUL_CONTINUE) 5184 if (rc != X86EMUL_CONTINUE)
@@ -4910,6 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4910 if (ctxt->rep_prefix && (ctxt->d & String)) { 5188 if (ctxt->rep_prefix && (ctxt->d & String)) {
4911 /* All REP prefixes have the same first termination condition */ 5189 /* All REP prefixes have the same first termination condition */
4912 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { 5190 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
5191 string_registers_quirk(ctxt);
4913 ctxt->eip = ctxt->_eip; 5192 ctxt->eip = ctxt->_eip;
4914 ctxt->eflags &= ~X86_EFLAGS_RF; 5193 ctxt->eflags &= ~X86_EFLAGS_RF;
4915 goto done; 5194 goto done;
@@ -4953,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4953 5232
4954special_insn: 5233special_insn:
4955 5234
4956 if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { 5235 if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
4957 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5236 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4958 X86_ICPT_POST_MEMACCESS); 5237 X86_ICPT_POST_MEMACCESS);
4959 if (rc != X86EMUL_CONTINUE) 5238 if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f03c514..856f79105bb5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
349 irqe.delivery_mode = entry->fields.delivery_mode << 8; 349 irqe.delivery_mode = entry->fields.delivery_mode << 8;
350 irqe.level = 1; 350 irqe.level = 1;
351 irqe.shorthand = 0; 351 irqe.shorthand = 0;
352 irqe.msi_redir_hint = false;
352 353
353 if (irqe.trig_mode == IOAPIC_EDGE_TRIG) 354 if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
354 ioapic->irr_delivered |= 1 << irq; 355 ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
637 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 638 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
638 639
639 cancel_delayed_work_sync(&ioapic->eoi_inject); 640 cancel_delayed_work_sync(&ioapic->eoi_inject);
640 if (ioapic) { 641 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
641 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); 642 kvm->arch.vioapic = NULL;
642 kvm->arch.vioapic = NULL; 643 kfree(ioapic);
643 kfree(ioapic);
644 }
645} 644}
646 645
647int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) 646int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3ac025..9efff9e5b58c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -31,6 +31,8 @@
31 31
32#include "ioapic.h" 32#include "ioapic.h"
33 33
34#include "lapic.h"
35
34static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 36static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
35 struct kvm *kvm, int irq_source_id, int level, 37 struct kvm *kvm, int irq_source_id, int level,
36 bool line_status) 38 bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
48 line_status); 50 line_status);
49} 51}
50 52
51inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
52{
53 return irq->delivery_mode == APIC_DM_LOWEST;
54}
55
56int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 53int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
57 struct kvm_lapic_irq *irq, unsigned long *dest_map) 54 struct kvm_lapic_irq *irq, unsigned long *dest_map)
58{ 55{
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
60 struct kvm_vcpu *vcpu, *lowest = NULL; 57 struct kvm_vcpu *vcpu, *lowest = NULL;
61 58
62 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 59 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
63 kvm_is_dm_lowest_prio(irq)) { 60 kvm_lowest_prio_delivery(irq)) {
64 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 61 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
65 irq->delivery_mode = APIC_DM_FIXED; 62 irq->delivery_mode = APIC_DM_FIXED;
66 } 63 }
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
76 irq->dest_id, irq->dest_mode)) 73 irq->dest_id, irq->dest_mode))
77 continue; 74 continue;
78 75
79 if (!kvm_is_dm_lowest_prio(irq)) { 76 if (!kvm_lowest_prio_delivery(irq)) {
80 if (r < 0) 77 if (r < 0)
81 r = 0; 78 r = 0;
82 r += kvm_apic_set_irq(vcpu, irq, dest_map); 79 r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
106 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; 103 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
107 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; 104 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
108 irq->delivery_mode = e->msi.data & 0x700; 105 irq->delivery_mode = e->msi.data & 0x700;
106 irq->msi_redir_hint = ((e->msi.address_lo
107 & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
109 irq->level = 1; 108 irq->level = 1;
110 irq->shorthand = 0; 109 irq->shorthand = 0;
111 /* TODO Deal with RH bit of MSI message address */
112} 110}
113 111
114int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 112int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 544076c4f44b..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
99 return vcpu->arch.hflags & HF_GUEST_MASK; 99 return vcpu->arch.hflags & HF_GUEST_MASK;
100} 100}
101 101
102static inline bool is_smm(struct kvm_vcpu *vcpu)
103{
104 return vcpu->arch.hflags & HF_SMM_MASK;
105}
106
102#endif 107#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4c7deb4f78a1..36e9de1b4127 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
240 recalculate_apic_map(apic->vcpu->kvm); 240 recalculate_apic_map(apic->vcpu->kvm);
241} 241}
242 242
243static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
244{
245 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
246
247 apic_set_reg(apic, APIC_ID, id << 24);
248 apic_set_reg(apic, APIC_LDR, ldr);
249 recalculate_apic_map(apic->vcpu->kvm);
250}
251
243static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) 252static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
244{ 253{
245 return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); 254 return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
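
kvm_apic_set_x2apic_id() above derives the logical destination register directly from the x2APIC ID: the cluster (ID >> 4) goes in the upper 16 bits and a one-hot member bit (1 << (ID & 0xf)) in the lower 16. A quick stand-alone check of that formula:

#include <stdint.h>
#include <stdio.h>

/* Same derivation as kvm_apic_set_x2apic_id(): cluster in the high half,
 * one-hot member bit in the low half. */
static uint32_t x2apic_ldr(uint8_t id)
{
        return ((uint32_t)(id >> 4) << 16) | (1u << (id & 0xf));
}

int main(void)
{
        uint8_t ids[] = { 0, 5, 16, 33 };

        for (unsigned i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
                printf("x2apic id %3u -> ldr 0x%08x (cluster %u, bit %u)\n",
                       ids[i], x2apic_ldr(ids[i]), ids[i] >> 4, ids[i] & 0xf);
        return 0;
}
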
@@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
728 737
729 dst = map->logical_map[cid]; 738 dst = map->logical_map[cid];
730 739
731 if (irq->delivery_mode == APIC_DM_LOWEST) { 740 if (kvm_lowest_prio_delivery(irq)) {
732 int l = -1; 741 int l = -1;
733 for_each_set_bit(i, &bitmap, 16) { 742 for_each_set_bit(i, &bitmap, 16) {
734 if (!dst[i]) 743 if (!dst[i])
@@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
799 break; 808 break;
800 809
801 case APIC_DM_SMI: 810 case APIC_DM_SMI:
802 apic_debug("Ignoring guest SMI\n"); 811 result = 1;
812 kvm_make_request(KVM_REQ_SMI, vcpu);
813 kvm_vcpu_kick(vcpu);
803 break; 814 break;
804 815
805 case APIC_DM_NMI: 816 case APIC_DM_NMI:
@@ -914,9 +925,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
914 irq.vector = icr_low & APIC_VECTOR_MASK; 925 irq.vector = icr_low & APIC_VECTOR_MASK;
915 irq.delivery_mode = icr_low & APIC_MODE_MASK; 926 irq.delivery_mode = icr_low & APIC_MODE_MASK;
916 irq.dest_mode = icr_low & APIC_DEST_MASK; 927 irq.dest_mode = icr_low & APIC_DEST_MASK;
917 irq.level = icr_low & APIC_INT_ASSERT; 928 irq.level = (icr_low & APIC_INT_ASSERT) != 0;
918 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; 929 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
919 irq.shorthand = icr_low & APIC_SHORT_MASK; 930 irq.shorthand = icr_low & APIC_SHORT_MASK;
931 irq.msi_redir_hint = false;
920 if (apic_x2apic_mode(apic)) 932 if (apic_x2apic_mode(apic))
921 irq.dest_id = icr_high; 933 irq.dest_id = icr_high;
922 else 934 else
@@ -926,10 +938,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
926 938
927 apic_debug("icr_high 0x%x, icr_low 0x%x, " 939 apic_debug("icr_high 0x%x, icr_low 0x%x, "
928 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 940 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
929 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", 941 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
942 "msi_redir_hint 0x%x\n",
930 icr_high, icr_low, irq.shorthand, irq.dest_id, 943 icr_high, icr_low, irq.shorthand, irq.dest_id,
931 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 944 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
932 irq.vector); 945 irq.vector, irq.msi_redir_hint);
933 946
934 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); 947 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
935} 948}
@@ -1541,9 +1554,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1541 1554
1542 if ((old_value ^ value) & X2APIC_ENABLE) { 1555 if ((old_value ^ value) & X2APIC_ENABLE) {
1543 if (value & X2APIC_ENABLE) { 1556 if (value & X2APIC_ENABLE) {
1544 u32 id = kvm_apic_id(apic); 1557 kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
1545 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
1546 kvm_apic_set_ldr(apic, ldr);
1547 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); 1558 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
1548 } else 1559 } else
1549 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); 1560 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
@@ -1562,7 +1573,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1562 1573
1563} 1574}
1564 1575
1565void kvm_lapic_reset(struct kvm_vcpu *vcpu) 1576void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
1566{ 1577{
1567 struct kvm_lapic *apic; 1578 struct kvm_lapic *apic;
1568 int i; 1579 int i;
@@ -1576,19 +1587,22 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1576 /* Stop the timer in case it's a reset to an active apic */ 1587 /* Stop the timer in case it's a reset to an active apic */
1577 hrtimer_cancel(&apic->lapic_timer.timer); 1588 hrtimer_cancel(&apic->lapic_timer.timer);
1578 1589
1579 kvm_apic_set_id(apic, vcpu->vcpu_id); 1590 if (!init_event)
1591 kvm_apic_set_id(apic, vcpu->vcpu_id);
1580 kvm_apic_set_version(apic->vcpu); 1592 kvm_apic_set_version(apic->vcpu);
1581 1593
1582 for (i = 0; i < APIC_LVT_NUM; i++) 1594 for (i = 0; i < APIC_LVT_NUM; i++)
1583 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); 1595 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
1584 apic_update_lvtt(apic); 1596 apic_update_lvtt(apic);
1585 apic_set_reg(apic, APIC_LVT0, 1597 if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
1586 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); 1598 apic_set_reg(apic, APIC_LVT0,
1599 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
1587 1600
1588 apic_set_reg(apic, APIC_DFR, 0xffffffffU); 1601 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
1589 apic_set_spiv(apic, 0xff); 1602 apic_set_spiv(apic, 0xff);
1590 apic_set_reg(apic, APIC_TASKPRI, 0); 1603 apic_set_reg(apic, APIC_TASKPRI, 0);
1591 kvm_apic_set_ldr(apic, 0); 1604 if (!apic_x2apic_mode(apic))
1605 kvm_apic_set_ldr(apic, 0);
1592 apic_set_reg(apic, APIC_ESR, 0); 1606 apic_set_reg(apic, APIC_ESR, 0);
1593 apic_set_reg(apic, APIC_ICR, 0); 1607 apic_set_reg(apic, APIC_ICR, 0);
1594 apic_set_reg(apic, APIC_ICR2, 0); 1608 apic_set_reg(apic, APIC_ICR2, 0);
@@ -1717,7 +1731,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1717 APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); 1731 APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
1718 1732
1719 static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ 1733 static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
1720 kvm_lapic_reset(vcpu); 1734 kvm_lapic_reset(vcpu, false);
1721 kvm_iodevice_init(&apic->dev, &apic_mmio_ops); 1735 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
1722 1736
1723 return 0; 1737 return 0;
@@ -2049,11 +2063,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
2049 if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) 2063 if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
2050 return; 2064 return;
2051 2065
2052 pe = xchg(&apic->pending_events, 0); 2066 /*
2067 * INITs are latched while in SMM. Because an SMM CPU cannot
2068 * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
2069 * and delay processing of INIT until the next RSM.
2070 */
2071 if (is_smm(vcpu)) {
2072 WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
2073 if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
2074 clear_bit(KVM_APIC_SIPI, &apic->pending_events);
2075 return;
2076 }
2053 2077
2078 pe = xchg(&apic->pending_events, 0);
2054 if (test_bit(KVM_APIC_INIT, &pe)) { 2079 if (test_bit(KVM_APIC_INIT, &pe)) {
2055 kvm_lapic_reset(vcpu); 2080 kvm_lapic_reset(vcpu, true);
2056 kvm_vcpu_reset(vcpu); 2081 kvm_vcpu_reset(vcpu, true);
2057 if (kvm_vcpu_is_bsp(apic->vcpu)) 2082 if (kvm_vcpu_is_bsp(apic->vcpu))
2058 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 2083 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2059 else 2084 else
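
The comment added to kvm_apic_accept_events() captures a subtle ordering rule: while the vCPU is in SMM, SIPIs are simply discarded and a pending INIT stays latched until the next RSM. The toy model below walks through that sequence; a plain flag stands in for is_smm() and an ordinary word for the atomic pending_events bitmap, so it illustrates the policy, not the kernel's locking.

#include <stdbool.h>
#include <stdio.h>

/* Bit numbers mirroring KVM_APIC_INIT / KVM_APIC_SIPI. */
#define APIC_EV_INIT 0
#define APIC_EV_SIPI 1

static unsigned long pending_events;
static bool in_smm;

static void accept_events(void)
{
        if (in_smm) {
                /* Eat SIPIs, keep INIT latched until the next RSM. */
                pending_events &= ~(1UL << APIC_EV_SIPI);
                printf("in SMM: SIPI dropped, pending=%#lx\n", pending_events);
                return;
        }

        unsigned long pe = pending_events;      /* stands in for xchg(&..., 0) */
        pending_events = 0;

        if (pe & (1UL << APIC_EV_INIT))
                printf("processing latched INIT\n");
        if (pe & (1UL << APIC_EV_SIPI))
                printf("processing SIPI\n");
}

int main(void)
{
        pending_events = (1UL << APIC_EV_INIT) | (1UL << APIC_EV_SIPI);

        in_smm = true;
        accept_events();        /* INIT stays pending, SIPI is discarded */

        in_smm = false;         /* as if RSM had just completed */
        accept_events();        /* now the latched INIT is delivered */
        return 0;
}
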
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383fc1e7..f2f4e10ab772 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
48int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 48int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
49int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); 49int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
50void kvm_apic_accept_events(struct kvm_vcpu *vcpu); 50void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
51void kvm_lapic_reset(struct kvm_vcpu *vcpu); 51void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
52u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 52u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
53void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 53void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
54void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); 54void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -150,7 +150,18 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
150 150
151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 151static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
152{ 152{
153 return vcpu->arch.apic->pending_events; 153 return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
154}
155
156static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
157{
158 return (irq->delivery_mode == APIC_DM_LOWEST ||
159 irq->msi_redir_hint);
160}
161
162static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
163{
164 return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
154} 165}
155 166
156bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); 167bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b73337634214..f807496b62c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte)
223 return gen; 223 return gen;
224} 224}
225 225
226static unsigned int kvm_current_mmio_generation(struct kvm *kvm) 226static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
227{ 227{
228 return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; 228 return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
229} 229}
230 230
231static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, 231static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
232 unsigned access) 232 unsigned access)
233{ 233{
234 unsigned int gen = kvm_current_mmio_generation(kvm); 234 unsigned int gen = kvm_current_mmio_generation(vcpu);
235 u64 mask = generation_mmio_spte_mask(gen); 235 u64 mask = generation_mmio_spte_mask(gen);
236 236
237 access &= ACC_WRITE_MASK | ACC_USER_MASK; 237 access &= ACC_WRITE_MASK | ACC_USER_MASK;
@@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte)
258 return (spte & ~mask) & ~PAGE_MASK; 258 return (spte & ~mask) & ~PAGE_MASK;
259} 259}
260 260
261static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
262 pfn_t pfn, unsigned access) 262 pfn_t pfn, unsigned access)
263{ 263{
264 if (unlikely(is_noslot_pfn(pfn))) { 264 if (unlikely(is_noslot_pfn(pfn))) {
265 mark_mmio_spte(kvm, sptep, gfn, access); 265 mark_mmio_spte(vcpu, sptep, gfn, access);
266 return true; 266 return true;
267 } 267 }
268 268
269 return false; 269 return false;
270} 270}
271 271
272static bool check_mmio_spte(struct kvm *kvm, u64 spte) 272static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
273{ 273{
274 unsigned int kvm_gen, spte_gen; 274 unsigned int kvm_gen, spte_gen;
275 275
276 kvm_gen = kvm_current_mmio_generation(kvm); 276 kvm_gen = kvm_current_mmio_generation(vcpu);
277 spte_gen = get_mmio_spte_generation(spte); 277 spte_gen = get_mmio_spte_generation(spte);
278 278
279 trace_check_mmio_spte(spte, kvm_gen, spte_gen); 279 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -804,30 +804,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
804 return &slot->arch.lpage_info[level - 2][idx]; 804 return &slot->arch.lpage_info[level - 2][idx];
805} 805}
806 806
807static void account_shadowed(struct kvm *kvm, gfn_t gfn) 807static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
808{ 808{
809 struct kvm_memslots *slots;
809 struct kvm_memory_slot *slot; 810 struct kvm_memory_slot *slot;
810 struct kvm_lpage_info *linfo; 811 struct kvm_lpage_info *linfo;
812 gfn_t gfn;
811 int i; 813 int i;
812 814
813 slot = gfn_to_memslot(kvm, gfn); 815 gfn = sp->gfn;
814 for (i = PT_DIRECTORY_LEVEL; 816 slots = kvm_memslots_for_spte_role(kvm, sp->role);
815 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 817 slot = __gfn_to_memslot(slots, gfn);
818 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
816 linfo = lpage_info_slot(gfn, slot, i); 819 linfo = lpage_info_slot(gfn, slot, i);
817 linfo->write_count += 1; 820 linfo->write_count += 1;
818 } 821 }
819 kvm->arch.indirect_shadow_pages++; 822 kvm->arch.indirect_shadow_pages++;
820} 823}
821 824
822static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 825static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
823{ 826{
827 struct kvm_memslots *slots;
824 struct kvm_memory_slot *slot; 828 struct kvm_memory_slot *slot;
825 struct kvm_lpage_info *linfo; 829 struct kvm_lpage_info *linfo;
830 gfn_t gfn;
826 int i; 831 int i;
827 832
828 slot = gfn_to_memslot(kvm, gfn); 833 gfn = sp->gfn;
829 for (i = PT_DIRECTORY_LEVEL; 834 slots = kvm_memslots_for_spte_role(kvm, sp->role);
830 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 835 slot = __gfn_to_memslot(slots, gfn);
836 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
831 linfo = lpage_info_slot(gfn, slot, i); 837 linfo = lpage_info_slot(gfn, slot, i);
832 linfo->write_count -= 1; 838 linfo->write_count -= 1;
833 WARN_ON(linfo->write_count < 0); 839 WARN_ON(linfo->write_count < 0);
@@ -835,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
835 kvm->arch.indirect_shadow_pages--; 841 kvm->arch.indirect_shadow_pages--;
836} 842}
837 843
838static int has_wrprotected_page(struct kvm *kvm, 844static int has_wrprotected_page(struct kvm_vcpu *vcpu,
839 gfn_t gfn, 845 gfn_t gfn,
840 int level) 846 int level)
841{ 847{
842 struct kvm_memory_slot *slot; 848 struct kvm_memory_slot *slot;
843 struct kvm_lpage_info *linfo; 849 struct kvm_lpage_info *linfo;
844 850
845 slot = gfn_to_memslot(kvm, gfn); 851 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
846 if (slot) { 852 if (slot) {
847 linfo = lpage_info_slot(gfn, slot, level); 853 linfo = lpage_info_slot(gfn, slot, level);
848 return linfo->write_count; 854 return linfo->write_count;
@@ -858,8 +864,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
858 864
859 page_size = kvm_host_page_size(kvm, gfn); 865 page_size = kvm_host_page_size(kvm, gfn);
860 866
861 for (i = PT_PAGE_TABLE_LEVEL; 867 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
862 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
863 if (page_size >= KVM_HPAGE_SIZE(i)) 868 if (page_size >= KVM_HPAGE_SIZE(i))
864 ret = i; 869 ret = i;
865 else 870 else
@@ -875,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
875{ 880{
876 struct kvm_memory_slot *slot; 881 struct kvm_memory_slot *slot;
877 882
878 slot = gfn_to_memslot(vcpu->kvm, gfn); 883 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
879 if (!slot || slot->flags & KVM_MEMSLOT_INVALID || 884 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
880 (no_dirty_log && slot->dirty_bitmap)) 885 (no_dirty_log && slot->dirty_bitmap))
881 slot = NULL; 886 slot = NULL;
@@ -900,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
900 max_level = min(kvm_x86_ops->get_lpage_level(), host_level); 905 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
901 906
902 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 907 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
903 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 908 if (has_wrprotected_page(vcpu, large_gfn, level))
904 break; 909 break;
905 910
906 return level - 1; 911 return level - 1;
@@ -1042,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
1042/* 1047/*
1043 * Take gfn and return the reverse mapping to it. 1048 * Take gfn and return the reverse mapping to it.
1044 */ 1049 */
1045static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 1050static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
1046{ 1051{
1052 struct kvm_memslots *slots;
1047 struct kvm_memory_slot *slot; 1053 struct kvm_memory_slot *slot;
1048 1054
1049 slot = gfn_to_memslot(kvm, gfn); 1055 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1050 return __gfn_to_rmap(gfn, level, slot); 1056 slot = __gfn_to_memslot(slots, gfn);
1057 return __gfn_to_rmap(gfn, sp->role.level, slot);
1051} 1058}
1052 1059
1053static bool rmap_can_add(struct kvm_vcpu *vcpu) 1060static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1065,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1065 1072
1066 sp = page_header(__pa(spte)); 1073 sp = page_header(__pa(spte));
1067 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 1074 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1068 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 1075 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
1069 return pte_list_add(vcpu, spte, rmapp); 1076 return pte_list_add(vcpu, spte, rmapp);
1070} 1077}
1071 1078
@@ -1077,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
1077 1084
1078 sp = page_header(__pa(spte)); 1085 sp = page_header(__pa(spte));
1079 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 1086 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1080 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 1087 rmapp = gfn_to_rmap(kvm, gfn, sp);
1081 pte_list_remove(spte, rmapp); 1088 pte_list_remove(spte, rmapp);
1082} 1089}
1083 1090
@@ -1142,6 +1149,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
1142 return NULL; 1149 return NULL;
1143} 1150}
1144 1151
1152#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
1153 for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
1154 _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
1155 _spte_ = rmap_get_next(_iter_))
1156
1145static void drop_spte(struct kvm *kvm, u64 *sptep) 1157static void drop_spte(struct kvm *kvm, u64 *sptep)
1146{ 1158{
1147 if (mmu_spte_clear_track_bits(sptep)) 1159 if (mmu_spte_clear_track_bits(sptep))
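
The new for_each_rmap_spte() macro folds the old rmap_get_first()/rmap_get_next() boilerplate (and its BUG_ON sanity check) into one for loop, which is what lets the walkers in the following hunks shrink to a line or two each. A simplified, self-contained version of the same macro shape, iterating over a NULL-terminated array rather than KVM's pte_list chains:

#include <stddef.h>
#include <stdio.h>

/* Toy "rmap": a NULL-terminated array of spte pointers.  The real rmap is
 * either a single spte or a chain of pte_list_desc blocks. */
struct rmap_iterator {
        unsigned long **pos;
};

static unsigned long *rmap_get_first(unsigned long **rmap,
                                     struct rmap_iterator *iter)
{
        iter->pos = rmap;
        return *iter->pos;
}

static unsigned long *rmap_get_next(struct rmap_iterator *iter)
{
        return *(++iter->pos);
}

/* Same shape as the kernel macro, minus the BUG_ON. */
#define for_each_rmap_spte(_rmap_, _iter_, _spte_)              \
        for (_spte_ = rmap_get_first(_rmap_, _iter_);           \
             _spte_;                                            \
             _spte_ = rmap_get_next(_iter_))

int main(void)
{
        unsigned long sptes[] = { 0x1000, 0x2000, 0x3000 };
        unsigned long *rmap[] = { &sptes[0], &sptes[1], &sptes[2], NULL };
        struct rmap_iterator iter;
        unsigned long *sptep;

        for_each_rmap_spte(rmap, &iter, sptep)
                printf("spte value %#lx\n", *sptep);
        return 0;
}
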
@@ -1205,12 +1217,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1205 struct rmap_iterator iter; 1217 struct rmap_iterator iter;
1206 bool flush = false; 1218 bool flush = false;
1207 1219
1208 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1220 for_each_rmap_spte(rmapp, &iter, sptep)
1209 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1210
1211 flush |= spte_write_protect(kvm, sptep, pt_protect); 1221 flush |= spte_write_protect(kvm, sptep, pt_protect);
1212 sptep = rmap_get_next(&iter);
1213 }
1214 1222
1215 return flush; 1223 return flush;
1216} 1224}
@@ -1232,12 +1240,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1232 struct rmap_iterator iter; 1240 struct rmap_iterator iter;
1233 bool flush = false; 1241 bool flush = false;
1234 1242
1235 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1243 for_each_rmap_spte(rmapp, &iter, sptep)
1236 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1237
1238 flush |= spte_clear_dirty(kvm, sptep); 1244 flush |= spte_clear_dirty(kvm, sptep);
1239 sptep = rmap_get_next(&iter);
1240 }
1241 1245
1242 return flush; 1246 return flush;
1243} 1247}
@@ -1259,12 +1263,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
1259 struct rmap_iterator iter; 1263 struct rmap_iterator iter;
1260 bool flush = false; 1264 bool flush = false;
1261 1265
1262 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1266 for_each_rmap_spte(rmapp, &iter, sptep)
1263 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1264
1265 flush |= spte_set_dirty(kvm, sptep); 1267 flush |= spte_set_dirty(kvm, sptep);
1266 sptep = rmap_get_next(&iter);
1267 }
1268 1268
1269 return flush; 1269 return flush;
1270} 1270}
@@ -1342,42 +1342,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1342 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1342 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1343} 1343}
1344 1344
1345static bool rmap_write_protect(struct kvm *kvm, u64 gfn) 1345static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1346{ 1346{
1347 struct kvm_memory_slot *slot; 1347 struct kvm_memory_slot *slot;
1348 unsigned long *rmapp; 1348 unsigned long *rmapp;
1349 int i; 1349 int i;
1350 bool write_protected = false; 1350 bool write_protected = false;
1351 1351
1352 slot = gfn_to_memslot(kvm, gfn); 1352 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1353 1353
1354 for (i = PT_PAGE_TABLE_LEVEL; 1354 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1355 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1356 rmapp = __gfn_to_rmap(gfn, i, slot); 1355 rmapp = __gfn_to_rmap(gfn, i, slot);
1357 write_protected |= __rmap_write_protect(kvm, rmapp, true); 1356 write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
1358 } 1357 }
1359 1358
1360 return write_protected; 1359 return write_protected;
1361} 1360}
1362 1361
1363static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1362static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
1364 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1365 unsigned long data)
1366{ 1363{
1367 u64 *sptep; 1364 u64 *sptep;
1368 struct rmap_iterator iter; 1365 struct rmap_iterator iter;
1369 int need_tlb_flush = 0; 1366 bool flush = false;
1370 1367
1371 while ((sptep = rmap_get_first(*rmapp, &iter))) { 1368 while ((sptep = rmap_get_first(*rmapp, &iter))) {
1372 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1369 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1373 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", 1370 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1374 sptep, *sptep, gfn, level);
1375 1371
1376 drop_spte(kvm, sptep); 1372 drop_spte(kvm, sptep);
1377 need_tlb_flush = 1; 1373 flush = true;
1378 } 1374 }
1379 1375
1380 return need_tlb_flush; 1376 return flush;
1377}
1378
1379static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1380 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1381 unsigned long data)
1382{
1383 return kvm_zap_rmapp(kvm, rmapp);
1381} 1384}
1382 1385
1383static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1386static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1397,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1394 WARN_ON(pte_huge(*ptep)); 1397 WARN_ON(pte_huge(*ptep));
1395 new_pfn = pte_pfn(*ptep); 1398 new_pfn = pte_pfn(*ptep);
1396 1399
1397 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1400restart:
1398 BUG_ON(!is_shadow_present_pte(*sptep)); 1401 for_each_rmap_spte(rmapp, &iter, sptep) {
1399 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", 1402 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1400 sptep, *sptep, gfn, level); 1403 sptep, *sptep, gfn, level);
1401 1404
@@ -1403,7 +1406,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1403 1406
1404 if (pte_write(*ptep)) { 1407 if (pte_write(*ptep)) {
1405 drop_spte(kvm, sptep); 1408 drop_spte(kvm, sptep);
1406 sptep = rmap_get_first(*rmapp, &iter); 1409 goto restart;
1407 } else { 1410 } else {
1408 new_spte = *sptep & ~PT64_BASE_ADDR_MASK; 1411 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1409 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1412 new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1417,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1414 1417
1415 mmu_spte_clear_track_bits(sptep); 1418 mmu_spte_clear_track_bits(sptep);
1416 mmu_spte_set(sptep, new_spte); 1419 mmu_spte_set(sptep, new_spte);
1417 sptep = rmap_get_next(&iter);
1418 } 1420 }
1419 } 1421 }
1420 1422
@@ -1424,6 +1426,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1424 return 0; 1426 return 0;
1425} 1427}
1426 1428
1429struct slot_rmap_walk_iterator {
1430 /* input fields. */
1431 struct kvm_memory_slot *slot;
1432 gfn_t start_gfn;
1433 gfn_t end_gfn;
1434 int start_level;
1435 int end_level;
1436
1437 /* output fields. */
1438 gfn_t gfn;
1439 unsigned long *rmap;
1440 int level;
1441
1442 /* private field. */
1443 unsigned long *end_rmap;
1444};
1445
1446static void
1447rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1448{
1449 iterator->level = level;
1450 iterator->gfn = iterator->start_gfn;
1451 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1452 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1453 iterator->slot);
1454}
1455
1456static void
1457slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1458 struct kvm_memory_slot *slot, int start_level,
1459 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1460{
1461 iterator->slot = slot;
1462 iterator->start_level = start_level;
1463 iterator->end_level = end_level;
1464 iterator->start_gfn = start_gfn;
1465 iterator->end_gfn = end_gfn;
1466
1467 rmap_walk_init_level(iterator, iterator->start_level);
1468}
1469
1470static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1471{
1472 return !!iterator->rmap;
1473}
1474
1475static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1476{
1477 if (++iterator->rmap <= iterator->end_rmap) {
1478 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1479 return;
1480 }
1481
1482 if (++iterator->level > iterator->end_level) {
1483 iterator->rmap = NULL;
1484 return;
1485 }
1486
1487 rmap_walk_init_level(iterator, iterator->level);
1488}
1489
1490#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1491 _start_gfn, _end_gfn, _iter_) \
1492 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1493 _end_level_, _start_gfn, _end_gfn); \
1494 slot_rmap_walk_okay(_iter_); \
1495 slot_rmap_walk_next(_iter_))
1496
1427static int kvm_handle_hva_range(struct kvm *kvm, 1497static int kvm_handle_hva_range(struct kvm *kvm,
1428 unsigned long start, 1498 unsigned long start,
1429 unsigned long end, 1499 unsigned long end,
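
slot_rmap_walk_iterator above visits every rmap bucket of a gfn range at each requested page-table level: the inner gfn index advances first, and the level is bumped once the range at that level is exhausted. The sketch below keeps only that init/okay/next structure; the 9-bits-per-level shift matches x86 paging, but the bare integer "buckets" are an assumption in place of the memslot's real rmap arrays.

#include <stdbool.h>
#include <stdio.h>

/* Gfn-to-bucket shift per level, as in KVM_HPAGE_GFN_SHIFT. */
#define LEVEL_SHIFT(level) (((level) - 1) * 9)

struct slot_rmap_walk {
        unsigned long start_gfn, end_gfn;       /* inclusive gfn range */
        int start_level, end_level;

        int level;
        unsigned long idx, end_idx;             /* bucket indices at this level */
};

static void walk_init_level(struct slot_rmap_walk *w, int level)
{
        w->level = level;
        w->idx = w->start_gfn >> LEVEL_SHIFT(level);
        w->end_idx = w->end_gfn >> LEVEL_SHIFT(level);
}

static void walk_init(struct slot_rmap_walk *w, unsigned long start_gfn,
                      unsigned long end_gfn, int start_level, int end_level)
{
        w->start_gfn = start_gfn;
        w->end_gfn = end_gfn;
        w->start_level = start_level;
        w->end_level = end_level;
        walk_init_level(w, start_level);
}

static bool walk_okay(const struct slot_rmap_walk *w)
{
        return w->level <= w->end_level;
}

static void walk_next(struct slot_rmap_walk *w)
{
        if (++w->idx <= w->end_idx)
                return;                         /* more buckets at this level */
        if (++w->level <= w->end_level)
                walk_init_level(w, w->level);   /* move to the coarser level */
}

int main(void)
{
        struct slot_rmap_walk w;

        /* gfns 0..1023, walked at levels 1 (4K) through 3 (1G). */
        for (walk_init(&w, 0, 1023, 1, 3); walk_okay(&w); walk_next(&w))
                if (w.idx == w.end_idx)         /* print just the last bucket */
                        printf("level %d: last bucket index %lu\n",
                               w.level, w.idx);
        return 0;
}
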
@@ -1435,48 +1505,36 @@ static int kvm_handle_hva_range(struct kvm *kvm,
1435 int level, 1505 int level,
1436 unsigned long data)) 1506 unsigned long data))
1437{ 1507{
1438 int j;
1439 int ret = 0;
1440 struct kvm_memslots *slots; 1508 struct kvm_memslots *slots;
1441 struct kvm_memory_slot *memslot; 1509 struct kvm_memory_slot *memslot;
1510 struct slot_rmap_walk_iterator iterator;
1511 int ret = 0;
1512 int i;
1442 1513
1443 slots = kvm_memslots(kvm); 1514 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1444 1515 slots = __kvm_memslots(kvm, i);
1445 kvm_for_each_memslot(memslot, slots) { 1516 kvm_for_each_memslot(memslot, slots) {
1446 unsigned long hva_start, hva_end; 1517 unsigned long hva_start, hva_end;
1447 gfn_t gfn_start, gfn_end; 1518 gfn_t gfn_start, gfn_end;
1448
1449 hva_start = max(start, memslot->userspace_addr);
1450 hva_end = min(end, memslot->userspace_addr +
1451 (memslot->npages << PAGE_SHIFT));
1452 if (hva_start >= hva_end)
1453 continue;
1454 /*
1455 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1456 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1457 */
1458 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1459 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1460
1461 for (j = PT_PAGE_TABLE_LEVEL;
1462 j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
1463 unsigned long idx, idx_end;
1464 unsigned long *rmapp;
1465 gfn_t gfn = gfn_start;
1466 1519
1520 hva_start = max(start, memslot->userspace_addr);
1521 hva_end = min(end, memslot->userspace_addr +
1522 (memslot->npages << PAGE_SHIFT));
1523 if (hva_start >= hva_end)
1524 continue;
1467 /* 1525 /*
1468 * {idx(page_j) | page_j intersects with 1526 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1469 * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. 1527 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1470 */ 1528 */
1471 idx = gfn_to_index(gfn_start, memslot->base_gfn, j); 1529 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1472 idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); 1530 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1473 1531
1474 rmapp = __gfn_to_rmap(gfn_start, j, memslot); 1532 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1475 1533 PT_MAX_HUGEPAGE_LEVEL,
1476 for (; idx <= idx_end; 1534 gfn_start, gfn_end - 1,
1477 ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) 1535 &iterator)
1478 ret |= handler(kvm, rmapp++, memslot, 1536 ret |= handler(kvm, iterator.rmap, memslot,
1479 gfn, j, data); 1537 iterator.gfn, iterator.level, data);
1480 } 1538 }
1481 } 1539 }
1482 1540
@@ -1518,16 +1576,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1518 1576
1519 BUG_ON(!shadow_accessed_mask); 1577 BUG_ON(!shadow_accessed_mask);
1520 1578
1521 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1579 for_each_rmap_spte(rmapp, &iter, sptep)
1522 sptep = rmap_get_next(&iter)) {
1523 BUG_ON(!is_shadow_present_pte(*sptep));
1524
1525 if (*sptep & shadow_accessed_mask) { 1580 if (*sptep & shadow_accessed_mask) {
1526 young = 1; 1581 young = 1;
1527 clear_bit((ffs(shadow_accessed_mask) - 1), 1582 clear_bit((ffs(shadow_accessed_mask) - 1),
1528 (unsigned long *)sptep); 1583 (unsigned long *)sptep);
1529 } 1584 }
1530 } 1585
1531 trace_kvm_age_page(gfn, level, slot, young); 1586 trace_kvm_age_page(gfn, level, slot, young);
1532 return young; 1587 return young;
1533} 1588}
@@ -1548,15 +1603,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1548 if (!shadow_accessed_mask) 1603 if (!shadow_accessed_mask)
1549 goto out; 1604 goto out;
1550 1605
1551 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1606 for_each_rmap_spte(rmapp, &iter, sptep)
1552 sptep = rmap_get_next(&iter)) {
1553 BUG_ON(!is_shadow_present_pte(*sptep));
1554
1555 if (*sptep & shadow_accessed_mask) { 1607 if (*sptep & shadow_accessed_mask) {
1556 young = 1; 1608 young = 1;
1557 break; 1609 break;
1558 } 1610 }
1559 }
1560out: 1611out:
1561 return young; 1612 return young;
1562} 1613}
@@ -1570,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1570 1621
1571 sp = page_header(__pa(spte)); 1622 sp = page_header(__pa(spte));
1572 1623
1573 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 1624 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
1574 1625
1575 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); 1626 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
1576 kvm_flush_remote_tlbs(vcpu->kvm); 1627 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1990,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1990 bool protected = false; 2041 bool protected = false;
1991 2042
1992 for_each_sp(pages, sp, parents, i) 2043 for_each_sp(pages, sp, parents, i)
1993 protected |= rmap_write_protect(vcpu->kvm, sp->gfn); 2044 protected |= rmap_write_protect(vcpu, sp->gfn);
1994 2045
1995 if (protected) 2046 if (protected)
1996 kvm_flush_remote_tlbs(vcpu->kvm); 2047 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2088,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2088 hlist_add_head(&sp->hash_link, 2139 hlist_add_head(&sp->hash_link,
2089 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 2140 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2090 if (!direct) { 2141 if (!direct) {
2091 if (rmap_write_protect(vcpu->kvm, gfn)) 2142 if (rmap_write_protect(vcpu, gfn))
2092 kvm_flush_remote_tlbs(vcpu->kvm); 2143 kvm_flush_remote_tlbs(vcpu->kvm);
2093 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2144 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2094 kvm_sync_pages(vcpu, gfn); 2145 kvm_sync_pages(vcpu, gfn);
2095 2146
2096 account_shadowed(vcpu->kvm, gfn); 2147 account_shadowed(vcpu->kvm, sp);
2097 } 2148 }
2098 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; 2149 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2099 init_shadow_page_table(sp); 2150 init_shadow_page_table(sp);
@@ -2274,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2274 kvm_mmu_unlink_parents(kvm, sp); 2325 kvm_mmu_unlink_parents(kvm, sp);
2275 2326
2276 if (!sp->role.invalid && !sp->role.direct) 2327 if (!sp->role.invalid && !sp->role.direct)
2277 unaccount_shadowed(kvm, sp->gfn); 2328 unaccount_shadowed(kvm, sp);
2278 2329
2279 if (sp->unsync) 2330 if (sp->unsync)
2280 kvm_unlink_unsync_page(kvm, sp); 2331 kvm_unlink_unsync_page(kvm, sp);
@@ -2386,111 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2386} 2437}
2387EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2438EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2388 2439
2389/*
2390 * The function is based on mtrr_type_lookup() in
2391 * arch/x86/kernel/cpu/mtrr/generic.c
2392 */
2393static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
2394 u64 start, u64 end)
2395{
2396 int i;
2397 u64 base, mask;
2398 u8 prev_match, curr_match;
2399 int num_var_ranges = KVM_NR_VAR_MTRR;
2400
2401 if (!mtrr_state->enabled)
2402 return 0xFF;
2403
2404 /* Make end inclusive end, instead of exclusive */
2405 end--;
2406
2407 /* Look in fixed ranges. Just return the type as per start */
2408 if (mtrr_state->have_fixed && (start < 0x100000)) {
2409 int idx;
2410
2411 if (start < 0x80000) {
2412 idx = 0;
2413 idx += (start >> 16);
2414 return mtrr_state->fixed_ranges[idx];
2415 } else if (start < 0xC0000) {
2416 idx = 1 * 8;
2417 idx += ((start - 0x80000) >> 14);
2418 return mtrr_state->fixed_ranges[idx];
2419 } else if (start < 0x1000000) {
2420 idx = 3 * 8;
2421 idx += ((start - 0xC0000) >> 12);
2422 return mtrr_state->fixed_ranges[idx];
2423 }
2424 }
2425
2426 /*
2427 * Look in variable ranges
2428 * Look of multiple ranges matching this address and pick type
2429 * as per MTRR precedence
2430 */
2431 if (!(mtrr_state->enabled & 2))
2432 return mtrr_state->def_type;
2433
2434 prev_match = 0xFF;
2435 for (i = 0; i < num_var_ranges; ++i) {
2436 unsigned short start_state, end_state;
2437
2438 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
2439 continue;
2440
2441 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
2442 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
2443 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
2444 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
2445
2446 start_state = ((start & mask) == (base & mask));
2447 end_state = ((end & mask) == (base & mask));
2448 if (start_state != end_state)
2449 return 0xFE;
2450
2451 if ((start & mask) != (base & mask))
2452 continue;
2453
2454 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
2455 if (prev_match == 0xFF) {
2456 prev_match = curr_match;
2457 continue;
2458 }
2459
2460 if (prev_match == MTRR_TYPE_UNCACHABLE ||
2461 curr_match == MTRR_TYPE_UNCACHABLE)
2462 return MTRR_TYPE_UNCACHABLE;
2463
2464 if ((prev_match == MTRR_TYPE_WRBACK &&
2465 curr_match == MTRR_TYPE_WRTHROUGH) ||
2466 (prev_match == MTRR_TYPE_WRTHROUGH &&
2467 curr_match == MTRR_TYPE_WRBACK)) {
2468 prev_match = MTRR_TYPE_WRTHROUGH;
2469 curr_match = MTRR_TYPE_WRTHROUGH;
2470 }
2471
2472 if (prev_match != curr_match)
2473 return MTRR_TYPE_UNCACHABLE;
2474 }
2475
2476 if (prev_match != 0xFF)
2477 return prev_match;
2478
2479 return mtrr_state->def_type;
2480}
2481
2482u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
2483{
2484 u8 mtrr;
2485
2486 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
2487 (gfn << PAGE_SHIFT) + PAGE_SIZE);
2488 if (mtrr == 0xfe || mtrr == 0xff)
2489 mtrr = MTRR_TYPE_WRBACK;
2490 return mtrr;
2491}
2492EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
2493
2494static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 2440static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2495{ 2441{
2496 trace_kvm_mmu_unsync_page(sp); 2442 trace_kvm_mmu_unsync_page(sp);
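
The block removed above is the MTRR type lookup leaving mmu.c. Its fixed-range indexing is still worth remembering: 8 entries of 64 KiB below 0x80000, 16 entries of 16 KiB up to 0xC0000, then 64 entries of 4 KiB up to 1 MiB. A stand-alone restatement of just that index computation, following the deleted code:

#include <stdint.h>
#include <stdio.h>

/* Index into the 88-entry fixed_ranges[] array for a physical address
 * below 1 MiB, matching the removed get_mtrr_type() fixed-range branch. */
static int fixed_mtrr_index(uint64_t start)
{
        if (start < 0x80000)                    /* 8 x 64 KiB ranges */
                return start >> 16;
        if (start < 0xC0000)                    /* 16 x 16 KiB ranges */
                return 1 * 8 + ((start - 0x80000) >> 14);
        if (start < 0x100000)                   /* 64 x 4 KiB ranges */
                return 3 * 8 + ((start - 0xC0000) >> 12);
        return -1;                              /* not a fixed range */
}

int main(void)
{
        uint64_t addrs[] = { 0x0, 0x7ffff, 0x80000, 0xb8000, 0xc0000, 0xf0000 };

        for (unsigned i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
                printf("addr 0x%06llx -> fixed_ranges[%d]\n",
                       (unsigned long long)addrs[i],
                       fixed_mtrr_index(addrs[i]));
        return 0;
}
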
@@ -2541,7 +2487,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2541 u64 spte; 2487 u64 spte;
2542 int ret = 0; 2488 int ret = 0;
2543 2489
2544 if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) 2490 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2545 return 0; 2491 return 0;
2546 2492
2547 spte = PT_PRESENT_MASK; 2493 spte = PT_PRESENT_MASK;
@@ -2578,7 +2524,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2578 * be fixed if guest refault. 2524 * be fixed if guest refault.
2579 */ 2525 */
2580 if (level > PT_PAGE_TABLE_LEVEL && 2526 if (level > PT_PAGE_TABLE_LEVEL &&
2581 has_wrprotected_page(vcpu->kvm, gfn, level)) 2527 has_wrprotected_page(vcpu, gfn, level))
2582 goto done; 2528 goto done;
2583 2529
2584 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2530 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2602,7 +2548,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2602 } 2548 }
2603 2549
2604 if (pte_access & ACC_WRITE_MASK) { 2550 if (pte_access & ACC_WRITE_MASK) {
2605 mark_page_dirty(vcpu->kvm, gfn); 2551 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2606 spte |= shadow_dirty_mask; 2552 spte |= shadow_dirty_mask;
2607 } 2553 }
2608 2554
@@ -2692,15 +2638,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2692 u64 *start, u64 *end) 2638 u64 *start, u64 *end)
2693{ 2639{
2694 struct page *pages[PTE_PREFETCH_NUM]; 2640 struct page *pages[PTE_PREFETCH_NUM];
2641 struct kvm_memory_slot *slot;
2695 unsigned access = sp->role.access; 2642 unsigned access = sp->role.access;
2696 int i, ret; 2643 int i, ret;
2697 gfn_t gfn; 2644 gfn_t gfn;
2698 2645
2699 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2646 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2700 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) 2647 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2648 if (!slot)
2701 return -1; 2649 return -1;
2702 2650
2703 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); 2651 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2704 if (ret <= 0) 2652 if (ret <= 0)
2705 return -1; 2653 return -1;
2706 2654
@@ -2818,7 +2766,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2818 return 1; 2766 return 1;
2819 2767
2820 if (pfn == KVM_PFN_ERR_HWPOISON) { 2768 if (pfn == KVM_PFN_ERR_HWPOISON) {
2821 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); 2769 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
2822 return 0; 2770 return 0;
2823 } 2771 }
2824 2772
@@ -2841,7 +2789,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2841 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && 2789 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
2842 level == PT_PAGE_TABLE_LEVEL && 2790 level == PT_PAGE_TABLE_LEVEL &&
2843 PageTransCompound(pfn_to_page(pfn)) && 2791 PageTransCompound(pfn_to_page(pfn)) &&
2844 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { 2792 !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
2845 unsigned long mask; 2793 unsigned long mask;
2846 /* 2794 /*
2847 * mmu_notifier_retry was successful and we hold the 2795 * mmu_notifier_retry was successful and we hold the
@@ -2933,7 +2881,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2933 * Compare with set_spte where instead shadow_dirty_mask is set. 2881 * Compare with set_spte where instead shadow_dirty_mask is set.
2934 */ 2882 */
2935 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2883 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2936 mark_page_dirty(vcpu->kvm, gfn); 2884 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2937 2885
2938 return true; 2886 return true;
2939} 2887}
@@ -3388,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3388 gfn_t gfn = get_mmio_spte_gfn(spte); 3336 gfn_t gfn = get_mmio_spte_gfn(spte);
3389 unsigned access = get_mmio_spte_access(spte); 3337 unsigned access = get_mmio_spte_access(spte);
3390 3338
3391 if (!check_mmio_spte(vcpu->kvm, spte)) 3339 if (!check_mmio_spte(vcpu, spte))
3392 return RET_MMIO_PF_INVALID; 3340 return RET_MMIO_PF_INVALID;
3393 3341
3394 if (direct) 3342 if (direct)
@@ -3460,7 +3408,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3460 arch.direct_map = vcpu->arch.mmu.direct_map; 3408 arch.direct_map = vcpu->arch.mmu.direct_map;
3461 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); 3409 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
3462 3410
3463 return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); 3411 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
3464} 3412}
3465 3413
3466static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3414static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3475,10 +3423,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3475static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3423static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3476 gva_t gva, pfn_t *pfn, bool write, bool *writable) 3424 gva_t gva, pfn_t *pfn, bool write, bool *writable)
3477{ 3425{
3426 struct kvm_memory_slot *slot;
3478 bool async; 3427 bool async;
3479 3428
3480 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); 3429 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3481 3430 async = false;
3431 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
3482 if (!async) 3432 if (!async)
3483 return false; /* *pfn has correct page already */ 3433 return false; /* *pfn has correct page already */
3484 3434
@@ -3492,11 +3442,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3492 return true; 3442 return true;
3493 } 3443 }
3494 3444
3495 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); 3445 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
3496
3497 return false; 3446 return false;
3498} 3447}
3499 3448
3449static bool
3450check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
3451{
3452 int page_num = KVM_PAGES_PER_HPAGE(level);
3453
3454 gfn &= ~(page_num - 1);
3455
3456 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
3457}
3458
3500static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 3459static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3501 bool prefault) 3460 bool prefault)
3502{ 3461{
@@ -3522,9 +3481,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3522 if (r) 3481 if (r)
3523 return r; 3482 return r;
3524 3483
3525 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 3484 if (mapping_level_dirty_bitmap(vcpu, gfn) ||
3485 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
3486 force_pt_level = 1;
3487 else
3488 force_pt_level = 0;
3489
3526 if (likely(!force_pt_level)) { 3490 if (likely(!force_pt_level)) {
3527 level = mapping_level(vcpu, gfn); 3491 level = mapping_level(vcpu, gfn);
3492 if (level > PT_DIRECTORY_LEVEL &&
3493 !check_hugepage_cache_consistency(vcpu, gfn, level))
3494 level = PT_DIRECTORY_LEVEL;
3528 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3495 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3529 } else 3496 } else
3530 level = PT_PAGE_TABLE_LEVEL; 3497 level = PT_PAGE_TABLE_LEVEL;
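
check_hugepage_cache_consistency() above rounds the faulting gfn down to a hugepage boundary and asks whether the whole run of small pages shares a single MTRR type. Assuming the usual x86 relation KVM_PAGES_PER_HPAGE(level) = 1 << ((level - 1) * 9), the alignment arithmetic looks like this:

#include <stdint.h>
#include <stdio.h>

/* 9 gfn bits per paging level, as on x86 (4 KiB / 2 MiB / 1 GiB pages). */
#define PAGES_PER_HPAGE(level) (1ULL << (((level) - 1) * 9))

int main(void)
{
        uint64_t gfn = 0x12345;                 /* arbitrary faulting gfn */

        for (int level = 1; level <= 3; level++) {
                uint64_t page_num = PAGES_PER_HPAGE(level);
                uint64_t base = gfn & ~(page_num - 1);

                printf("level %d: %8llu pages, gfn range 0x%llx - 0x%llx\n",
                       level, (unsigned long long)page_num,
                       (unsigned long long)base,
                       (unsigned long long)(base + page_num - 1));
        }
        return 0;
}
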
@@ -3590,7 +3557,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
3590 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 3557 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
3591} 3558}
3592 3559
3593static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3560static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
3594 unsigned access, int *nr_present) 3561 unsigned access, int *nr_present)
3595{ 3562{
3596 if (unlikely(is_mmio_spte(*sptep))) { 3563 if (unlikely(is_mmio_spte(*sptep))) {
@@ -3600,7 +3567,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3600 } 3567 }
3601 3568
3602 (*nr_present)++; 3569 (*nr_present)++;
3603 mark_mmio_spte(kvm, sptep, gfn, access); 3570 mark_mmio_spte(vcpu, sptep, gfn, access);
3604 return true; 3571 return true;
3605 } 3572 }
3606 3573
@@ -3878,6 +3845,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3878 struct kvm_mmu *context = &vcpu->arch.mmu; 3845 struct kvm_mmu *context = &vcpu->arch.mmu;
3879 3846
3880 context->base_role.word = 0; 3847 context->base_role.word = 0;
3848 context->base_role.smm = is_smm(vcpu);
3881 context->page_fault = tdp_page_fault; 3849 context->page_fault = tdp_page_fault;
3882 context->sync_page = nonpaging_sync_page; 3850 context->sync_page = nonpaging_sync_page;
3883 context->invlpg = nonpaging_invlpg; 3851 context->invlpg = nonpaging_invlpg;
@@ -3939,6 +3907,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
3939 = smep && !is_write_protection(vcpu); 3907 = smep && !is_write_protection(vcpu);
3940 context->base_role.smap_andnot_wp 3908 context->base_role.smap_andnot_wp
3941 = smap && !is_write_protection(vcpu); 3909 = smap && !is_write_protection(vcpu);
3910 context->base_role.smm = is_smm(vcpu);
3942} 3911}
3943EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3912EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3944 3913
@@ -4110,7 +4079,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
4110 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 4079 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
4111 *gpa &= ~(gpa_t)7; 4080 *gpa &= ~(gpa_t)7;
4112 *bytes = 8; 4081 *bytes = 8;
4113 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); 4082 r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
4114 if (r) 4083 if (r)
4115 gentry = 0; 4084 gentry = 0;
4116 new = (const u8 *)&gentry; 4085 new = (const u8 *)&gentry;
@@ -4222,6 +4191,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
4222 mask.nxe = 1; 4191 mask.nxe = 1;
4223 mask.smep_andnot_wp = 1; 4192 mask.smep_andnot_wp = 1;
4224 mask.smap_andnot_wp = 1; 4193 mask.smap_andnot_wp = 1;
4194 mask.smm = 1;
4225 4195
4226 /* 4196 /*
4227 * If we don't have indirect shadow pages, it means no page is 4197 * If we don't have indirect shadow pages, it means no page is
@@ -4420,36 +4390,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
4420 init_kvm_mmu(vcpu); 4390 init_kvm_mmu(vcpu);
4421} 4391}
4422 4392
4423void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 4393/* The return value indicates if tlb flush on all vcpus is needed. */
4424 struct kvm_memory_slot *memslot) 4394typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
4395
4396/* The caller should hold mmu-lock before calling this function. */
4397static bool
4398slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
4399 slot_level_handler fn, int start_level, int end_level,
4400 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
4425{ 4401{
4426 gfn_t last_gfn; 4402 struct slot_rmap_walk_iterator iterator;
4427 int i;
4428 bool flush = false; 4403 bool flush = false;
4429 4404
4430 last_gfn = memslot->base_gfn + memslot->npages - 1; 4405 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
4406 end_gfn, &iterator) {
4407 if (iterator.rmap)
4408 flush |= fn(kvm, iterator.rmap);
4431 4409
4432 spin_lock(&kvm->mmu_lock); 4410 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4411 if (flush && lock_flush_tlb) {
4412 kvm_flush_remote_tlbs(kvm);
4413 flush = false;
4414 }
4415 cond_resched_lock(&kvm->mmu_lock);
4416 }
4417 }
4433 4418
4434 for (i = PT_PAGE_TABLE_LEVEL; 4419 if (flush && lock_flush_tlb) {
4435 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 4420 kvm_flush_remote_tlbs(kvm);
4436 unsigned long *rmapp; 4421 flush = false;
4437 unsigned long last_index, index; 4422 }
4438 4423
4439 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; 4424 return flush;
4440 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); 4425}
4441 4426
4442 for (index = 0; index <= last_index; ++index, ++rmapp) { 4427static bool
4443 if (*rmapp) 4428slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
4444 flush |= __rmap_write_protect(kvm, rmapp, 4429 slot_level_handler fn, int start_level, int end_level,
4445 false); 4430 bool lock_flush_tlb)
4431{
4432 return slot_handle_level_range(kvm, memslot, fn, start_level,
4433 end_level, memslot->base_gfn,
4434 memslot->base_gfn + memslot->npages - 1,
4435 lock_flush_tlb);
4436}
4437
4438static bool
4439slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
4440 slot_level_handler fn, bool lock_flush_tlb)
4441{
4442 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
4443 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
4444}
4445
4446static bool
4447slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
4448 slot_level_handler fn, bool lock_flush_tlb)
4449{
4450 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
4451 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
4452}
4453
4454static bool
4455slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
4456 slot_level_handler fn, bool lock_flush_tlb)
4457{
4458 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
4459 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
4460}
4446 4461
4447 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) 4462void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
4448 cond_resched_lock(&kvm->mmu_lock); 4463{
4464 struct kvm_memslots *slots;
4465 struct kvm_memory_slot *memslot;
4466 int i;
4467
4468 spin_lock(&kvm->mmu_lock);
4469 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4470 slots = __kvm_memslots(kvm, i);
4471 kvm_for_each_memslot(memslot, slots) {
4472 gfn_t start, end;
4473
4474 start = max(gfn_start, memslot->base_gfn);
4475 end = min(gfn_end, memslot->base_gfn + memslot->npages);
4476 if (start >= end)
4477 continue;
4478
4479 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
4480 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
4481 start, end - 1, true);
4449 } 4482 }
4450 } 4483 }
4451 4484
4452 spin_unlock(&kvm->mmu_lock); 4485 spin_unlock(&kvm->mmu_lock);
4486}
4487
4488static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
4489{
4490 return __rmap_write_protect(kvm, rmapp, false);
4491}
4492
4493void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4494 struct kvm_memory_slot *memslot)
4495{
4496 bool flush;
4497
4498 spin_lock(&kvm->mmu_lock);
4499 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
4500 false);
4501 spin_unlock(&kvm->mmu_lock);
4453 4502
4454 /* 4503 /*
4455 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() 4504 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4482,9 +4531,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4482 pfn_t pfn; 4531 pfn_t pfn;
4483 struct kvm_mmu_page *sp; 4532 struct kvm_mmu_page *sp;
4484 4533
4485 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 4534restart:
4486 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 4535 for_each_rmap_spte(rmapp, &iter, sptep) {
4487
4488 sp = page_header(__pa(sptep)); 4536 sp = page_header(__pa(sptep));
4489 pfn = spte_to_pfn(*sptep); 4537 pfn = spte_to_pfn(*sptep);
4490 4538
@@ -4499,71 +4547,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4499 !kvm_is_reserved_pfn(pfn) && 4547 !kvm_is_reserved_pfn(pfn) &&
4500 PageTransCompound(pfn_to_page(pfn))) { 4548 PageTransCompound(pfn_to_page(pfn))) {
4501 drop_spte(kvm, sptep); 4549 drop_spte(kvm, sptep);
4502 sptep = rmap_get_first(*rmapp, &iter);
4503 need_tlb_flush = 1; 4550 need_tlb_flush = 1;
4504 } else 4551 goto restart;
4505 sptep = rmap_get_next(&iter); 4552 }
4506 } 4553 }
4507 4554
4508 return need_tlb_flush; 4555 return need_tlb_flush;
4509} 4556}
4510 4557
4511void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 4558void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
4512 struct kvm_memory_slot *memslot) 4559 const struct kvm_memory_slot *memslot)
4513{ 4560{
4514 bool flush = false; 4561 /* FIXME: const-ify all uses of struct kvm_memory_slot. */
4515 unsigned long *rmapp;
4516 unsigned long last_index, index;
4517
4518 spin_lock(&kvm->mmu_lock); 4562 spin_lock(&kvm->mmu_lock);
4519 4563 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
4520 rmapp = memslot->arch.rmap[0]; 4564 kvm_mmu_zap_collapsible_spte, true);
4521 last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
4522 memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
4523
4524 for (index = 0; index <= last_index; ++index, ++rmapp) {
4525 if (*rmapp)
4526 flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
4527
4528 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4529 if (flush) {
4530 kvm_flush_remote_tlbs(kvm);
4531 flush = false;
4532 }
4533 cond_resched_lock(&kvm->mmu_lock);
4534 }
4535 }
4536
4537 if (flush)
4538 kvm_flush_remote_tlbs(kvm);
4539
4540 spin_unlock(&kvm->mmu_lock); 4565 spin_unlock(&kvm->mmu_lock);
4541} 4566}
4542 4567
4543void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 4568void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4544 struct kvm_memory_slot *memslot) 4569 struct kvm_memory_slot *memslot)
4545{ 4570{
4546 gfn_t last_gfn; 4571 bool flush;
4547 unsigned long *rmapp;
4548 unsigned long last_index, index;
4549 bool flush = false;
4550
4551 last_gfn = memslot->base_gfn + memslot->npages - 1;
4552 4572
4553 spin_lock(&kvm->mmu_lock); 4573 spin_lock(&kvm->mmu_lock);
4554 4574 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
4555 rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
4556 last_index = gfn_to_index(last_gfn, memslot->base_gfn,
4557 PT_PAGE_TABLE_LEVEL);
4558
4559 for (index = 0; index <= last_index; ++index, ++rmapp) {
4560 if (*rmapp)
4561 flush |= __rmap_clear_dirty(kvm, rmapp);
4562
4563 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4564 cond_resched_lock(&kvm->mmu_lock);
4565 }
4566
4567 spin_unlock(&kvm->mmu_lock); 4575 spin_unlock(&kvm->mmu_lock);
4568 4576
4569 lockdep_assert_held(&kvm->slots_lock); 4577 lockdep_assert_held(&kvm->slots_lock);
@@ -4582,31 +4590,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
4582void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 4590void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
4583 struct kvm_memory_slot *memslot) 4591 struct kvm_memory_slot *memslot)
4584{ 4592{
4585 gfn_t last_gfn; 4593 bool flush;
4586 int i;
4587 bool flush = false;
4588
4589 last_gfn = memslot->base_gfn + memslot->npages - 1;
4590 4594
4591 spin_lock(&kvm->mmu_lock); 4595 spin_lock(&kvm->mmu_lock);
4592 4596 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
4593 for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ 4597 false);
4594 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4595 unsigned long *rmapp;
4596 unsigned long last_index, index;
4597
4598 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4599 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4600
4601 for (index = 0; index <= last_index; ++index, ++rmapp) {
4602 if (*rmapp)
4603 flush |= __rmap_write_protect(kvm, rmapp,
4604 false);
4605
4606 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4607 cond_resched_lock(&kvm->mmu_lock);
4608 }
4609 }
4610 spin_unlock(&kvm->mmu_lock); 4598 spin_unlock(&kvm->mmu_lock);
4611 4599
4612 /* see kvm_mmu_slot_remove_write_access */ 4600 /* see kvm_mmu_slot_remove_write_access */
@@ -4620,31 +4608,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
4620void kvm_mmu_slot_set_dirty(struct kvm *kvm, 4608void kvm_mmu_slot_set_dirty(struct kvm *kvm,
4621 struct kvm_memory_slot *memslot) 4609 struct kvm_memory_slot *memslot)
4622{ 4610{
4623 gfn_t last_gfn; 4611 bool flush;
4624 int i;
4625 bool flush = false;
4626
4627 last_gfn = memslot->base_gfn + memslot->npages - 1;
4628 4612
4629 spin_lock(&kvm->mmu_lock); 4613 spin_lock(&kvm->mmu_lock);
4630 4614 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
4631 for (i = PT_PAGE_TABLE_LEVEL;
4632 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4633 unsigned long *rmapp;
4634 unsigned long last_index, index;
4635
4636 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4637 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4638
4639 for (index = 0; index <= last_index; ++index, ++rmapp) {
4640 if (*rmapp)
4641 flush |= __rmap_set_dirty(kvm, rmapp);
4642
4643 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4644 cond_resched_lock(&kvm->mmu_lock);
4645 }
4646 }
4647
4648 spin_unlock(&kvm->mmu_lock); 4615 spin_unlock(&kvm->mmu_lock);
4649 4616
4650 lockdep_assert_held(&kvm->slots_lock); 4617 lockdep_assert_held(&kvm->slots_lock);
@@ -4741,13 +4708,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
4741 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 4708 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
4742} 4709}
4743 4710
4744void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) 4711void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
4745{ 4712{
4746 /* 4713 /*
4747 * The very rare case: if the generation-number is round, 4714 * The very rare case: if the generation-number is round,
4748 * zap all shadow pages. 4715 * zap all shadow pages.
4749 */ 4716 */
4750 if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { 4717 if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
4751 printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); 4718 printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
4752 kvm_mmu_invalidate_zap_all_pages(kvm); 4719 kvm_mmu_invalidate_zap_all_pages(kvm);
4753 } 4720 }
@@ -4869,15 +4836,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
4869 unsigned int nr_pages = 0; 4836 unsigned int nr_pages = 0;
4870 struct kvm_memslots *slots; 4837 struct kvm_memslots *slots;
4871 struct kvm_memory_slot *memslot; 4838 struct kvm_memory_slot *memslot;
4839 int i;
4872 4840
4873 slots = kvm_memslots(kvm); 4841 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4842 slots = __kvm_memslots(kvm, i);
4874 4843
4875 kvm_for_each_memslot(memslot, slots) 4844 kvm_for_each_memslot(memslot, slots)
4876 nr_pages += memslot->npages; 4845 nr_pages += memslot->npages;
4846 }
4877 4847
4878 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 4848 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
4879 nr_mmu_pages = max(nr_mmu_pages, 4849 nr_mmu_pages = max(nr_mmu_pages,
4880 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); 4850 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
4881 4851
4882 return nr_mmu_pages; 4852 return nr_mmu_pages;
4883} 4853}
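As an illustration (not part of the patch): every memslot-wide operation in the hunks above is now a thin wrapper around the generic slot_handle_*() walkers. A hedged sketch of how a hypothetical new operation would plug into the same framework — my_rmap_op() and kvm_mmu_slot_my_op() are invented names, everything else comes from the code above:

	/* hypothetical handler: return true when a TLB flush is needed */
	static bool my_rmap_op(struct kvm *kvm, unsigned long *rmapp)
	{
		/* operate on every spte reachable from this rmap head */
		return false;
	}

	static void kvm_mmu_slot_my_op(struct kvm *kvm, struct kvm_memory_slot *memslot)
	{
		bool flush;

		spin_lock(&kvm->mmu_lock);
		/* walk the rmaps of every page size; the walker may drop the
		 * lock and reschedule between rmap heads */
		flush = slot_handle_all_level(kvm, memslot, my_rmap_op, false);
		spin_unlock(&kvm->mmu_lock);

		if (flush)
			kvm_flush_remote_tlbs(kvm);
	}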
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ada65ecddcf..398d21c0f6dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,6 +43,7 @@
43#define PT_PDPE_LEVEL 3 43#define PT_PDPE_LEVEL 3
44#define PT_DIRECTORY_LEVEL 2 44#define PT_DIRECTORY_LEVEL 2
45#define PT_PAGE_TABLE_LEVEL 1 45#define PT_PAGE_TABLE_LEVEL 1
46#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
46 47
47static inline u64 rsvd_bits(int s, int e) 48static inline u64 rsvd_bits(int s, int e)
48{ 49{
@@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
170} 171}
171 172
172void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); 173void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
174void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
173#endif 175#endif
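The header change above exports kvm_zap_gfn_range() (defined in the mmu.c hunk) and adds PT_MAX_HUGEPAGE_LEVEL. As a rough, hedged usage sketch mirroring what the new MTRR code later in this diff does when guest memory types change — the range values are examples and vcpu is assumed to be in scope:

	u64 start = 0xa0000, end = 0xc0000;	/* example guest-physical range */

	/* drop every shadow mapping that overlaps the range so it is
	 * rebuilt with the new memory type on the next fault */
	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));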
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..a4f62e6f2db2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
114 return; 114 return;
115 115
116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); 117 pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
118 118
119 if (is_error_pfn(pfn)) 119 if (is_error_pfn(pfn))
120 return; 120 return;
@@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
131 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); 131 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
132 unsigned long *rmapp; 132 unsigned long *rmapp;
133 struct kvm_mmu_page *rev_sp; 133 struct kvm_mmu_page *rev_sp;
134 struct kvm_memslots *slots;
135 struct kvm_memory_slot *slot;
134 gfn_t gfn; 136 gfn_t gfn;
135 137
136 rev_sp = page_header(__pa(sptep)); 138 rev_sp = page_header(__pa(sptep));
137 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 139 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
138 140
139 if (!gfn_to_memslot(kvm, gfn)) { 141 slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
142 slot = __gfn_to_memslot(slots, gfn);
143 if (!slot) {
140 if (!__ratelimit(&ratelimit_state)) 144 if (!__ratelimit(&ratelimit_state))
141 return; 145 return;
142 audit_printk(kvm, "no memslot for gfn %llx\n", gfn); 146 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
@@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
146 return; 150 return;
147 } 151 }
148 152
149 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 153 rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
150 if (!*rmapp) { 154 if (!*rmapp) {
151 if (!__ratelimit(&ratelimit_state)) 155 if (!__ratelimit(&ratelimit_state))
152 return; 156 return;
@@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
191 unsigned long *rmapp; 195 unsigned long *rmapp;
192 u64 *sptep; 196 u64 *sptep;
193 struct rmap_iterator iter; 197 struct rmap_iterator iter;
198 struct kvm_memslots *slots;
199 struct kvm_memory_slot *slot;
194 200
195 if (sp->role.direct || sp->unsync || sp->role.invalid) 201 if (sp->role.direct || sp->unsync || sp->role.invalid)
196 return; 202 return;
197 203
198 rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); 204 slots = kvm_memslots_for_spte_role(kvm, sp->role);
205 slot = __gfn_to_memslot(slots, sp->gfn);
206 rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
199 207
200 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 208 for_each_rmap_spte(rmapp, &iter, sptep)
201 sptep = rmap_get_next(&iter)) {
202 if (is_writable_pte(*sptep)) 209 if (is_writable_pte(*sptep))
203 audit_printk(kvm, "shadow page has writable " 210 audit_printk(kvm, "shadow page has writable "
204 "mappings: gfn %llx role %x\n", 211 "mappings: gfn %llx role %x\n",
205 sp->gfn, sp->role.word); 212 sp->gfn, sp->role.word);
206 }
207} 213}
208 214
209static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 215static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..de1d2d8062e2
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,699 @@
1/*
2 * vMTRR implementation
3 *
4 * Copyright (C) 2006 Qumranet, Inc.
5 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
6 * Copyright(C) 2015 Intel Corporation.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 * Marcelo Tosatti <mtosatti@redhat.com>
12 * Paolo Bonzini <pbonzini@redhat.com>
13 * Xiao Guangrong <guangrong.xiao@linux.intel.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 */
18
19#include <linux/kvm_host.h>
20#include <asm/mtrr.h>
21
22#include "cpuid.h"
23#include "mmu.h"
24
25#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
26#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
27#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
28
29static bool msr_mtrr_valid(unsigned msr)
30{
31 switch (msr) {
32 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
33 case MSR_MTRRfix64K_00000:
34 case MSR_MTRRfix16K_80000:
35 case MSR_MTRRfix16K_A0000:
36 case MSR_MTRRfix4K_C0000:
37 case MSR_MTRRfix4K_C8000:
38 case MSR_MTRRfix4K_D0000:
39 case MSR_MTRRfix4K_D8000:
40 case MSR_MTRRfix4K_E0000:
41 case MSR_MTRRfix4K_E8000:
42 case MSR_MTRRfix4K_F0000:
43 case MSR_MTRRfix4K_F8000:
44 case MSR_MTRRdefType:
45 case MSR_IA32_CR_PAT:
46 return true;
47 case 0x2f8:
48 return true;
49 }
50 return false;
51}
52
53static bool valid_pat_type(unsigned t)
54{
55 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
56}
57
58static bool valid_mtrr_type(unsigned t)
59{
60 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
61}
62
63bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
64{
65 int i;
66 u64 mask;
67
68 if (!msr_mtrr_valid(msr))
69 return false;
70
71 if (msr == MSR_IA32_CR_PAT) {
72 for (i = 0; i < 8; i++)
73 if (!valid_pat_type((data >> (i * 8)) & 0xff))
74 return false;
75 return true;
76 } else if (msr == MSR_MTRRdefType) {
77 if (data & ~0xcff)
78 return false;
79 return valid_mtrr_type(data & 0xff);
80 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
81 for (i = 0; i < 8 ; i++)
82 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
83 return false;
84 return true;
85 }
86
87 /* variable MTRRs */
88 WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
89
90 mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
91 if ((msr & 1) == 0) {
92 /* MTRR base */
93 if (!valid_mtrr_type(data & 0xff))
94 return false;
95 mask |= 0xf00;
96 } else
97 /* MTRR mask */
98 mask |= 0x7ff;
99 if (data & mask) {
100 kvm_inject_gp(vcpu, 0);
101 return false;
102 }
103
104 return true;
105}
106EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
107
108static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
109{
110 return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
111}
112
113static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
114{
115 return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
116}
117
118static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
119{
120 return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
121}
122
123/*
124* Three terms are used in the following code:
125* - segment: one of the address ranges covered by the fixed MTRRs.
126* - unit: one MSR entry within a segment.
127* - range: the smallest piece of a unit, covered by a single memory cache type.
128*/
129struct fixed_mtrr_segment {
130 u64 start;
131 u64 end;
132
133 int range_shift;
134
135 /* the start position in kvm_mtrr.fixed_ranges[]. */
136 int range_start;
137};
138
139static struct fixed_mtrr_segment fixed_seg_table[] = {
140 /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
141 {
142 .start = 0x0,
143 .end = 0x80000,
144 .range_shift = 16, /* 64K */
145 .range_start = 0,
146 },
147
148 /*
149 * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
150 * 16K fixed mtrr.
151 */
152 {
153 .start = 0x80000,
154 .end = 0xc0000,
155 .range_shift = 14, /* 16K */
156 .range_start = 8,
157 },
158
159 /*
160 * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
161 * 4K fixed mtrr.
162 */
163 {
164 .start = 0xc0000,
165 .end = 0x100000,
166 .range_shift = 12, /* 4K */
167 .range_start = 24,
168 }
169};
170
171/*
172 * One unit is covered by one MSR; each MSR entry contains 8 ranges,
173 * so the unit size is always 8 * 2^range_shift.
174 */
175static u64 fixed_mtrr_seg_unit_size(int seg)
176{
177 return 8 << fixed_seg_table[seg].range_shift;
178}
179
180static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
181{
182 switch (msr) {
183 case MSR_MTRRfix64K_00000:
184 *seg = 0;
185 *unit = 0;
186 break;
187 case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
188 *seg = 1;
189 *unit = msr - MSR_MTRRfix16K_80000;
190 break;
191 case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
192 *seg = 2;
193 *unit = msr - MSR_MTRRfix4K_C0000;
194 break;
195 default:
196 return false;
197 }
198
199 return true;
200}
201
202static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
203{
204 struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
205 u64 unit_size = fixed_mtrr_seg_unit_size(seg);
206
207 *start = mtrr_seg->start + unit * unit_size;
208 *end = *start + unit_size;
209 WARN_ON(*end > mtrr_seg->end);
210}
211
212static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
213{
214 struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
215
216 WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
217 > mtrr_seg->end);
218
219 /* each unit has 8 ranges. */
220 return mtrr_seg->range_start + 8 * unit;
221}
222
223static int fixed_mtrr_seg_end_range_index(int seg)
224{
225 struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
226 int n;
227
228 n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
229 return mtrr_seg->range_start + n - 1;
230}
231
232static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
233{
234 int seg, unit;
235
236 if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
237 return false;
238
239 fixed_mtrr_seg_unit_range(seg, unit, start, end);
240 return true;
241}
242
243static int fixed_msr_to_range_index(u32 msr)
244{
245 int seg, unit;
246
247 if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
248 return -1;
249
250 return fixed_mtrr_seg_unit_range_index(seg, unit);
251}
252
253static int fixed_mtrr_addr_to_seg(u64 addr)
254{
255 struct fixed_mtrr_segment *mtrr_seg;
256 int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
257
258 for (seg = 0; seg < seg_num; seg++) {
259 mtrr_seg = &fixed_seg_table[seg];
260 if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
261 return seg;
262 }
263
264 return -1;
265}
266
267static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
268{
269 struct fixed_mtrr_segment *mtrr_seg;
270 int index;
271
272 mtrr_seg = &fixed_seg_table[seg];
273 index = mtrr_seg->range_start;
274 index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
275 return index;
276}
277
278static u64 fixed_mtrr_range_end_addr(int seg, int index)
279{
280 struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
281 int pos = index - mtrr_seg->range_start;
282
283 return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
284}
285
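A worked example of the segment/unit/range arithmetic above (illustration only, not part of the patch): MSR_MTRRfix16K_A0000 is unit 1 of segment 1, its unit size is 8 << 14 = 128K, so the unit spans 0xa0000..0xc0000 and its first range index is range_start + 8 * unit = 8 + 8 = 16:

	u64 start, end;
	int seg, unit, index;

	fixed_msr_to_seg_unit(MSR_MTRRfix16K_A0000, &seg, &unit);	/* seg == 1, unit == 1 */
	fixed_mtrr_seg_unit_range(seg, unit, &start, &end);		/* start == 0xa0000, end == 0xc0000 */
	index = fixed_mtrr_seg_unit_range_index(seg, unit);		/* index == 16 */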
286static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
287{
288 u64 mask;
289
290 *start = range->base & PAGE_MASK;
291
292 mask = range->mask & PAGE_MASK;
293 mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
294
295 /* This cannot overflow because writing to the reserved bits of
296 * variable MTRRs causes a #GP.
297 */
298 *end = (*start | ~mask) + 1;
299}
300
301static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
302{
303 struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
304 gfn_t start, end;
305 int index;
306
307 if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
308 !kvm_arch_has_noncoherent_dma(vcpu->kvm))
309 return;
310
311 if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
312 return;
313
314 /* fixed MTRRs. */
315 if (fixed_msr_to_range(msr, &start, &end)) {
316 if (!fixed_mtrr_is_enabled(mtrr_state))
317 return;
318 } else if (msr == MSR_MTRRdefType) {
319 start = 0x0;
320 end = ~0ULL;
321 } else {
322 /* variable range MTRRs. */
323 index = (msr - 0x200) / 2;
324 var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
325 }
326
327 kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
328}
329
330static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
331{
332 return (range->mask & (1 << 11)) != 0;
333}
334
335static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
336{
337 struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
338 struct kvm_mtrr_range *tmp, *cur;
339 int index, is_mtrr_mask;
340
341 index = (msr - 0x200) / 2;
342 is_mtrr_mask = msr - 0x200 - 2 * index;
343 cur = &mtrr_state->var_ranges[index];
344
345 /* remove the entry if it's in the list. */
346 if (var_mtrr_range_is_valid(cur))
347 list_del(&mtrr_state->var_ranges[index].node);
348
349 if (!is_mtrr_mask)
350 cur->base = data;
351 else
352 cur->mask = data;
353
354 /* add it to the list if it's enabled. */
355 if (var_mtrr_range_is_valid(cur)) {
356 list_for_each_entry(tmp, &mtrr_state->head, node)
357 if (cur->base >= tmp->base)
358 break;
359 list_add_tail(&cur->node, &tmp->node);
360 }
361}
362
363int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
364{
365 int index;
366
367 if (!kvm_mtrr_valid(vcpu, msr, data))
368 return 1;
369
370 index = fixed_msr_to_range_index(msr);
371 if (index >= 0)
372 *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
373 else if (msr == MSR_MTRRdefType)
374 vcpu->arch.mtrr_state.deftype = data;
375 else if (msr == MSR_IA32_CR_PAT)
376 vcpu->arch.pat = data;
377 else
378 set_var_mtrr_msr(vcpu, msr, data);
379
380 update_mtrr(vcpu, msr);
381 return 0;
382}
383
384int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
385{
386 int index;
387
388 /* MSR_MTRRcap is a readonly MSR. */
389 if (msr == MSR_MTRRcap) {
390 /*
391 * SMRR = 0
392 * WC = 1
393 * FIX = 1
394 * VCNT = KVM_NR_VAR_MTRR
395 */
396 *pdata = 0x500 | KVM_NR_VAR_MTRR;
397 return 0;
398 }
399
400 if (!msr_mtrr_valid(msr))
401 return 1;
402
403 index = fixed_msr_to_range_index(msr);
404 if (index >= 0)
405 *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
406 else if (msr == MSR_MTRRdefType)
407 *pdata = vcpu->arch.mtrr_state.deftype;
408 else if (msr == MSR_IA32_CR_PAT)
409 *pdata = vcpu->arch.pat;
410 else { /* Variable MTRRs */
411 int is_mtrr_mask;
412
413 index = (msr - 0x200) / 2;
414 is_mtrr_mask = msr - 0x200 - 2 * index;
415 if (!is_mtrr_mask)
416 *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
417 else
418 *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
419 }
420
421 return 0;
422}
423
424void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
425{
426 INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
427}
428
429struct mtrr_iter {
430 /* input fields. */
431 struct kvm_mtrr *mtrr_state;
432 u64 start;
433 u64 end;
434
435 /* output fields. */
436 int mem_type;
437 /* [start, end) is not fully covered in MTRRs? */
438 bool partial_map;
439
440 /* private fields. */
441 union {
442 /* used for fixed MTRRs. */
443 struct {
444 int index;
445 int seg;
446 };
447
448 /* used for var MTRRs. */
449 struct {
450 struct kvm_mtrr_range *range;
451 /* the maximum address that has been covered by var MTRRs. */
452 u64 start_max;
453 };
454 };
455
456 bool fixed;
457};
458
459static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
460{
461 int seg, index;
462
463 if (!fixed_mtrr_is_enabled(iter->mtrr_state))
464 return false;
465
466 seg = fixed_mtrr_addr_to_seg(iter->start);
467 if (seg < 0)
468 return false;
469
470 iter->fixed = true;
471 index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
472 iter->index = index;
473 iter->seg = seg;
474 return true;
475}
476
477static bool match_var_range(struct mtrr_iter *iter,
478 struct kvm_mtrr_range *range)
479{
480 u64 start, end;
481
482 var_mtrr_range(range, &start, &end);
483 if (!(start >= iter->end || end <= iter->start)) {
484 iter->range = range;
485
486 /*
487 * This function is called while walking the kvm_mtrr.head list;
488 * the range has the lowest base address that overlaps
489 * [iter->start_max, iter->end).
490 */
491 iter->partial_map |= iter->start_max < start;
492
493 /* update the maximum address that has been covered. */
494 iter->start_max = max(iter->start_max, end);
495 return true;
496 }
497
498 return false;
499}
500
501static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
502{
503 struct kvm_mtrr *mtrr_state = iter->mtrr_state;
504
505 list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
506 if (match_var_range(iter, iter->range))
507 return;
508
509 iter->range = NULL;
510 iter->partial_map |= iter->start_max < iter->end;
511}
512
513static void mtrr_lookup_var_start(struct mtrr_iter *iter)
514{
515 struct kvm_mtrr *mtrr_state = iter->mtrr_state;
516
517 iter->fixed = false;
518 iter->start_max = iter->start;
519 iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
520
521 __mtrr_lookup_var_next(iter);
522}
523
524static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
525{
526 /* terminate the lookup. */
527 if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
528 iter->fixed = false;
529 iter->range = NULL;
530 return;
531 }
532
533 iter->index++;
534
535 /* we have looked up all the fixed MTRRs. */
536 if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
537 return mtrr_lookup_var_start(iter);
538
539 /* switch to next segment. */
540 if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
541 iter->seg++;
542}
543
544static void mtrr_lookup_var_next(struct mtrr_iter *iter)
545{
546 __mtrr_lookup_var_next(iter);
547}
548
549static void mtrr_lookup_start(struct mtrr_iter *iter)
550{
551 if (!mtrr_is_enabled(iter->mtrr_state)) {
552 iter->partial_map = true;
553 return;
554 }
555
556 if (!mtrr_lookup_fixed_start(iter))
557 mtrr_lookup_var_start(iter);
558}
559
560static void mtrr_lookup_init(struct mtrr_iter *iter,
561 struct kvm_mtrr *mtrr_state, u64 start, u64 end)
562{
563 iter->mtrr_state = mtrr_state;
564 iter->start = start;
565 iter->end = end;
566 iter->partial_map = false;
567 iter->fixed = false;
568 iter->range = NULL;
569
570 mtrr_lookup_start(iter);
571}
572
573static bool mtrr_lookup_okay(struct mtrr_iter *iter)
574{
575 if (iter->fixed) {
576 iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
577 return true;
578 }
579
580 if (iter->range) {
581 iter->mem_type = iter->range->base & 0xff;
582 return true;
583 }
584
585 return false;
586}
587
588static void mtrr_lookup_next(struct mtrr_iter *iter)
589{
590 if (iter->fixed)
591 mtrr_lookup_fixed_next(iter);
592 else
593 mtrr_lookup_var_next(iter);
594}
595
596#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
597 for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
598 mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
599
600u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
601{
602 struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
603 struct mtrr_iter iter;
604 u64 start, end;
605 int type = -1;
606 const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
607 | (1 << MTRR_TYPE_WRTHROUGH);
608
609 start = gfn_to_gpa(gfn);
610 end = start + PAGE_SIZE;
611
612 mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
613 int curr_type = iter.mem_type;
614
615 /*
616 * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
617 * Precedences.
618 */
619
620 if (type == -1) {
621 type = curr_type;
622 continue;
623 }
624
625 /*
626 * If two or more variable memory ranges match and the
627 * memory types are identical, then that memory type is
628 * used.
629 */
630 if (type == curr_type)
631 continue;
632
633 /*
634 * If two or more variable memory ranges match and one of
635 * the memory types is UC, the UC memory type is used.
636 */
637 if (curr_type == MTRR_TYPE_UNCACHABLE)
638 return MTRR_TYPE_UNCACHABLE;
639
640 /*
641 * If two or more variable memory ranges match and the
642 * memory types are WT and WB, the WT memory type is used.
643 */
644 if (((1 << type) & wt_wb_mask) &&
645 ((1 << curr_type) & wt_wb_mask)) {
646 type = MTRR_TYPE_WRTHROUGH;
647 continue;
648 }
649
650 /*
651 * For overlaps not defined by the above rules, processor
652 * behavior is undefined.
653 */
654
655 /* We use WB for this undefined behavior. :( */
656 return MTRR_TYPE_WRBACK;
657 }
658
659 /* It is not covered by MTRRs. */
660 if (iter.partial_map) {
661 /*
662 * We only checked a single page, so it cannot be partially
663 * covered by MTRRs.
664 */
665 WARN_ON(type != -1);
666 type = mtrr_default_type(mtrr_state);
667 }
668 return type;
669}
670EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
671
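The precedence rules implemented by the loop above (identical types keep that type, UC always wins, WT beats WB, any other overlap is undefined and resolved to WB) can be restated as a small combining helper. This is only an illustrative paraphrase of the logic above, not code from the patch:

	static u8 combine_mtrr_types(u8 a, u8 b)
	{
		const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
				     | (1 << MTRR_TYPE_WRTHROUGH);

		if (a == b)
			return a;
		if (a == MTRR_TYPE_UNCACHABLE || b == MTRR_TYPE_UNCACHABLE)
			return MTRR_TYPE_UNCACHABLE;
		if (((1 << a) & wt_wb_mask) && ((1 << b) & wt_wb_mask))
			return MTRR_TYPE_WRTHROUGH;

		return MTRR_TYPE_WRBACK;	/* undefined overlap: treated as WB above */
	}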
672bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
673 int page_num)
674{
675 struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
676 struct mtrr_iter iter;
677 u64 start, end;
678 int type = -1;
679
680 start = gfn_to_gpa(gfn);
681 end = gfn_to_gpa(gfn + page_num);
682 mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
683 if (type == -1) {
684 type = iter.mem_type;
685 continue;
686 }
687
688 if (type != iter.mem_type)
689 return false;
690 }
691
692 if (!iter.partial_map)
693 return true;
694
695 if (type == -1)
696 return true;
697
698 return type == mtrr_default_type(mtrr_state);
699}
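kvm_mtrr_check_gfn_range_consistency() above returns true only when every MTRR overlapping the gfn range reports the same memory type. A hedged sketch of the intended caller pattern when deciding whether a huge mapping is safe — gfn, level and vcpu are assumed locals of the caller; the KVM symbols are taken from the code above and from kvm_host.h:

	/* fall back to 4K mappings when the huge range mixes memory types */
	if (!kvm_mtrr_check_gfn_range_consistency(vcpu, gfn,
						  KVM_PAGES_PER_HPAGE(level)))
		level = PT_PAGE_TABLE_LEVEL;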
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6e6d115fe9b5..0f67d7e24800 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
256 if (ret) 256 if (ret)
257 return ret; 257 return ret;
258 258
259 mark_page_dirty(vcpu->kvm, table_gfn); 259 kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
260 walker->ptes[level] = pte; 260 walker->ptes[level] = pte;
261 } 261 }
262 return 0; 262 return 0;
@@ -338,7 +338,7 @@ retry_walk:
338 338
339 real_gfn = gpa_to_gfn(real_gfn); 339 real_gfn = gpa_to_gfn(real_gfn);
340 340
341 host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, 341 host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
342 &walker->pte_writable[walker->level - 1]); 342 &walker->pte_writable[walker->level - 1]);
343 if (unlikely(kvm_is_error_hva(host_addr))) 343 if (unlikely(kvm_is_error_hva(host_addr)))
344 goto error; 344 goto error;
@@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
511 base_gpa = pte_gpa & ~mask; 511 base_gpa = pte_gpa & ~mask;
512 index = (pte_gpa - base_gpa) / sizeof(pt_element_t); 512 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
513 513
514 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, 514 r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
515 gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); 515 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
516 curr_pte = gw->prefetch_ptes[index]; 516 curr_pte = gw->prefetch_ptes[index];
517 } else 517 } else
518 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, 518 r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
519 &curr_pte, sizeof(curr_pte)); 519 &curr_pte, sizeof(curr_pte));
520 520
521 return r || curr_pte != gw->ptes[level - 1]; 521 return r || curr_pte != gw->ptes[level - 1];
@@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
869 if (!rmap_can_add(vcpu)) 869 if (!rmap_can_add(vcpu))
870 break; 870 break;
871 871
872 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 872 if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
873 sizeof(pt_element_t))) 873 sizeof(pt_element_t)))
874 break; 874 break;
875 875
876 FNAME(update_pte)(vcpu, sp, sptep, &gpte); 876 FNAME(update_pte)(vcpu, sp, sptep, &gpte);
@@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
956 956
957 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); 957 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
958 958
959 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 959 if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
960 sizeof(pt_element_t))) 960 sizeof(pt_element_t)))
961 return -EINVAL; 961 return -EINVAL;
962 962
963 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 963 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
@@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
970 pte_access &= FNAME(gpte_access)(vcpu, gpte); 970 pte_access &= FNAME(gpte_access)(vcpu, gpte);
971 FNAME(protect_clean_gpte)(&pte_access, gpte); 971 FNAME(protect_clean_gpte)(&pte_access, gpte);
972 972
973 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 973 if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
974 &nr_present)) 974 &nr_present))
975 continue; 975 continue;
976 976
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 29fbf9dfdc54..31aa2c85dc97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support 2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 * 3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates. 4 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * Authors: 6 * Authors:
7 * Avi Kivity <avi@redhat.com> 7 * Avi Kivity <avi@redhat.com>
8 * Gleb Natapov <gleb@redhat.com> 8 * Gleb Natapov <gleb@redhat.com>
9 * Wei Huang <wei@redhat.com>
9 * 10 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
@@ -19,88 +20,39 @@
19#include "x86.h" 20#include "x86.h"
20#include "cpuid.h" 21#include "cpuid.h"
21#include "lapic.h" 22#include "lapic.h"
23#include "pmu.h"
24
25/* NOTE:
26 * - Each perf counter is defined as "struct kvm_pmc";
27 * - There are two types of perf counters: general purpose (gp) and fixed.
28 * gp counters are stored in gp_counters[] and fixed counters are stored
29 * in fixed_counters[] respectively. Both of them are part of "struct
30 * kvm_pmu";
31 * - pmu.c understands the difference between gp counters and fixed counters.
32 * However AMD doesn't support fixed-counters;
33 * - There are three types of index to access perf counters (PMC):
34 * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
35 * has MSR_K7_PERFCTRn.
36 * 2. MSR Index (named idx): This is normally used by the RDPMC instruction.
37 * For instance the AMD RDPMC instruction uses 0000_0003h in ECX to access
38 * C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
39 * that it also supports fixed counters. idx can be used as an index
40 * into the gp and fixed counters.
41 * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
42 * code. Each pmc, stored in kvm_pmc.idx field, is unique across
43 * all perf counters (both gp and fixed). The mapping relationship
44 * between pmc and perf counters is as follows:
45 * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
46 * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
47 * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
48 */
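To make the NOTE above concrete with its own AMD example (illustration only): the same counter can be reached three ways — as msr = MSR_K7_PERFCTR3 (C001_0007h) via RDMSR/WRMSR, as idx = 3 in ECX of a guest RDPMC, and as pmc = 3 in kvm_pmc.idx inside KVM. A hedged sketch of the RDPMC translation step, with vcpu assumed to be in scope:

	/* the vendor ops translate the guest-supplied idx (ECX of RDPMC)
	 * into the kvm_pmc that also backs MSR_K7_PERFCTR3 */
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, 3);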
22 49
23static struct kvm_arch_event_perf_mapping { 50static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
24 u8 eventsel;
25 u8 unit_mask;
26 unsigned event_type;
27 bool inexact;
28} arch_events[] = {
29 /* Index must match CPUID 0x0A.EBX bit vector */
30 [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
31 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
32 [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
33 [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
34 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
35 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
36 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
37 [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
38};
39
40/* mapping between fixed pmc index and arch_events array */
41static int fixed_pmc_events[] = {1, 0, 7};
42
43static bool pmc_is_gp(struct kvm_pmc *pmc)
44{
45 return pmc->type == KVM_PMC_GP;
46}
47
48static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
49{
50 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
51
52 return pmu->counter_bitmask[pmc->type];
53}
54
55static inline bool pmc_enabled(struct kvm_pmc *pmc)
56{
57 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
58 return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
59}
60
61static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
62 u32 base)
63{
64 if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
65 return &pmu->gp_counters[msr - base];
66 return NULL;
67}
68
69static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
70{
71 int base = MSR_CORE_PERF_FIXED_CTR0;
72 if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
73 return &pmu->fixed_counters[msr - base];
74 return NULL;
75}
76
77static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
78{
79 return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
80}
81
82static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
83{
84 if (idx < INTEL_PMC_IDX_FIXED)
85 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
86 else
87 return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
88}
89
90void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
91{
92 if (vcpu->arch.apic)
93 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
94}
95
96static void trigger_pmi(struct irq_work *irq_work)
97{ 51{
98 struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, 52 struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
99 irq_work); 53 struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
100 struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
101 arch.pmu);
102 54
103 kvm_deliver_pmi(vcpu); 55 kvm_pmu_deliver_pmi(vcpu);
104} 56}
105 57
106static void kvm_perf_overflow(struct perf_event *perf_event, 58static void kvm_perf_overflow(struct perf_event *perf_event,
@@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
108 struct pt_regs *regs) 60 struct pt_regs *regs)
109{ 61{
110 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 62 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
111 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 63 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
112 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { 64
65 if (!test_and_set_bit(pmc->idx,
66 (unsigned long *)&pmu->reprogram_pmi)) {
113 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 67 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
114 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 68 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
115 } 69 }
116} 70}
117 71
118static void kvm_perf_overflow_intr(struct perf_event *perf_event, 72static void kvm_perf_overflow_intr(struct perf_event *perf_event,
119 struct perf_sample_data *data, struct pt_regs *regs) 73 struct perf_sample_data *data,
74 struct pt_regs *regs)
120{ 75{
121 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 76 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
122 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 77 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
123 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { 78
79 if (!test_and_set_bit(pmc->idx,
80 (unsigned long *)&pmu->reprogram_pmi)) {
124 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 81 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
125 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 82 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
83
126 /* 84 /*
127 * Inject PMI. If vcpu was in a guest mode during NMI PMI 85 * Inject PMI. If vcpu was in a guest mode during NMI PMI
128 * can be ejected on a guest mode re-entry. Otherwise we can't 86 * can be ejected on a guest mode re-entry. Otherwise we can't
129 * be sure that vcpu wasn't executing hlt instruction at the 87 * be sure that vcpu wasn't executing hlt instruction at the
130 * time of vmexit and is not going to re-enter guest mode until, 88 * time of vmexit and is not going to re-enter guest mode until
131 * woken up. So we should wake it, but this is impossible from 89 * woken up. So we should wake it, but this is impossible from
132 * NMI context. Do it from irq work instead. 90 * NMI context. Do it from irq work instead.
133 */ 91 */
134 if (!kvm_is_in_guest()) 92 if (!kvm_is_in_guest())
135 irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); 93 irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
136 else 94 else
137 kvm_make_request(KVM_REQ_PMI, pmc->vcpu); 95 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
138 } 96 }
139} 97}
140 98
141static u64 read_pmc(struct kvm_pmc *pmc) 99static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
142{ 100 unsigned config, bool exclude_user,
143 u64 counter, enabled, running; 101 bool exclude_kernel, bool intr,
144 102 bool in_tx, bool in_tx_cp)
145 counter = pmc->counter;
146
147 if (pmc->perf_event)
148 counter += perf_event_read_value(pmc->perf_event,
149 &enabled, &running);
150
151 /* FIXME: Scaling needed? */
152
153 return counter & pmc_bitmask(pmc);
154}
155
156static void stop_counter(struct kvm_pmc *pmc)
157{
158 if (pmc->perf_event) {
159 pmc->counter = read_pmc(pmc);
160 perf_event_release_kernel(pmc->perf_event);
161 pmc->perf_event = NULL;
162 }
163}
164
165static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
166 unsigned config, bool exclude_user, bool exclude_kernel,
167 bool intr, bool in_tx, bool in_tx_cp)
168{ 103{
169 struct perf_event *event; 104 struct perf_event *event;
170 struct perf_event_attr attr = { 105 struct perf_event_attr attr = {
@@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
177 .exclude_kernel = exclude_kernel, 112 .exclude_kernel = exclude_kernel,
178 .config = config, 113 .config = config,
179 }; 114 };
115
180 if (in_tx) 116 if (in_tx)
181 attr.config |= HSW_IN_TX; 117 attr.config |= HSW_IN_TX;
182 if (in_tx_cp) 118 if (in_tx_cp)
@@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
188 intr ? kvm_perf_overflow_intr : 124 intr ? kvm_perf_overflow_intr :
189 kvm_perf_overflow, pmc); 125 kvm_perf_overflow, pmc);
190 if (IS_ERR(event)) { 126 if (IS_ERR(event)) {
191 printk_once("kvm: pmu event creation failed %ld\n", 127 printk_once("kvm_pmu: event creation failed %ld\n",
192 PTR_ERR(event)); 128 PTR_ERR(event));
193 return; 129 return;
194 } 130 }
195 131
196 pmc->perf_event = event; 132 pmc->perf_event = event;
197 clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); 133 clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
198}
199
200static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
201 u8 unit_mask)
202{
203 int i;
204
205 for (i = 0; i < ARRAY_SIZE(arch_events); i++)
206 if (arch_events[i].eventsel == event_select
207 && arch_events[i].unit_mask == unit_mask
208 && (pmu->available_event_types & (1 << i)))
209 break;
210
211 if (i == ARRAY_SIZE(arch_events))
212 return PERF_COUNT_HW_MAX;
213
214 return arch_events[i].event_type;
215} 134}
216 135
217static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) 136void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
218{ 137{
219 unsigned config, type = PERF_TYPE_RAW; 138 unsigned config, type = PERF_TYPE_RAW;
220 u8 event_select, unit_mask; 139 u8 event_select, unit_mask;
@@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
224 143
225 pmc->eventsel = eventsel; 144 pmc->eventsel = eventsel;
226 145
227 stop_counter(pmc); 146 pmc_stop_counter(pmc);
228 147
229 if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) 148 if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
230 return; 149 return;
231 150
232 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; 151 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
233 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 152 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
234 153
235 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 154 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
236 ARCH_PERFMON_EVENTSEL_INV | 155 ARCH_PERFMON_EVENTSEL_INV |
237 ARCH_PERFMON_EVENTSEL_CMASK | 156 ARCH_PERFMON_EVENTSEL_CMASK |
238 HSW_IN_TX | 157 HSW_IN_TX |
239 HSW_IN_TX_CHECKPOINTED))) { 158 HSW_IN_TX_CHECKPOINTED))) {
240 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 159 config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
241 unit_mask); 160 event_select,
161 unit_mask);
242 if (config != PERF_COUNT_HW_MAX) 162 if (config != PERF_COUNT_HW_MAX)
243 type = PERF_TYPE_HARDWARE; 163 type = PERF_TYPE_HARDWARE;
244 } 164 }
@@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
246 if (type == PERF_TYPE_RAW) 166 if (type == PERF_TYPE_RAW)
247 config = eventsel & X86_RAW_EVENT_MASK; 167 config = eventsel & X86_RAW_EVENT_MASK;
248 168
249 reprogram_counter(pmc, type, config, 169 pmc_reprogram_counter(pmc, type, config,
250 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 170 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
251 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 171 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
252 eventsel & ARCH_PERFMON_EVENTSEL_INT, 172 eventsel & ARCH_PERFMON_EVENTSEL_INT,
253 (eventsel & HSW_IN_TX), 173 (eventsel & HSW_IN_TX),
254 (eventsel & HSW_IN_TX_CHECKPOINTED)); 174 (eventsel & HSW_IN_TX_CHECKPOINTED));
255} 175}
176EXPORT_SYMBOL_GPL(reprogram_gp_counter);
256 177
257static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 178void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
258{ 179{
259 unsigned en = en_pmi & 0x3; 180 unsigned en_field = ctrl & 0x3;
260 bool pmi = en_pmi & 0x8; 181 bool pmi = ctrl & 0x8;
261 182
262 stop_counter(pmc); 183 pmc_stop_counter(pmc);
263 184
264 if (!en || !pmc_enabled(pmc)) 185 if (!en_field || !pmc_is_enabled(pmc))
265 return; 186 return;
266 187
267 reprogram_counter(pmc, PERF_TYPE_HARDWARE, 188 pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
268 arch_events[fixed_pmc_events[idx]].event_type, 189 kvm_x86_ops->pmu_ops->find_fixed_event(idx),
269 !(en & 0x2), /* exclude user */ 190 !(en_field & 0x2), /* exclude user */
270 !(en & 0x1), /* exclude kernel */ 191 !(en_field & 0x1), /* exclude kernel */
271 pmi, false, false); 192 pmi, false, false);
272} 193}
194EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
273 195
274static inline u8 fixed_en_pmi(u64 ctrl, int idx) 196void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
275{ 197{
276 return (ctrl >> (idx * 4)) & 0xf; 198 struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
277}
278
279static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
280{
281 int i;
282
283 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
284 u8 en_pmi = fixed_en_pmi(data, i);
285 struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
286
287 if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
288 continue;
289
290 reprogram_fixed_counter(pmc, en_pmi, i);
291 }
292
293 pmu->fixed_ctr_ctrl = data;
294}
295
296static void reprogram_idx(struct kvm_pmu *pmu, int idx)
297{
298 struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
299 199
300 if (!pmc) 200 if (!pmc)
301 return; 201 return;
@@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
303 if (pmc_is_gp(pmc)) 203 if (pmc_is_gp(pmc))
304 reprogram_gp_counter(pmc, pmc->eventsel); 204 reprogram_gp_counter(pmc, pmc->eventsel);
305 else { 205 else {
306 int fidx = idx - INTEL_PMC_IDX_FIXED; 206 int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
307 reprogram_fixed_counter(pmc, 207 u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
308 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); 208
209 reprogram_fixed_counter(pmc, ctrl, idx);
309 } 210 }
310} 211}
212EXPORT_SYMBOL_GPL(reprogram_counter);
311 213
312static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) 214void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
313{ 215{
216 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
217 u64 bitmask;
314 int bit; 218 int bit;
315 u64 diff = pmu->global_ctrl ^ data;
316
317 pmu->global_ctrl = data;
318
319 for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
320 reprogram_idx(pmu, bit);
321}
322 219
323bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) 220 bitmask = pmu->reprogram_pmi;
324{
325 struct kvm_pmu *pmu = &vcpu->arch.pmu;
326 int ret;
327
328 switch (msr) {
329 case MSR_CORE_PERF_FIXED_CTR_CTRL:
330 case MSR_CORE_PERF_GLOBAL_STATUS:
331 case MSR_CORE_PERF_GLOBAL_CTRL:
332 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
333 ret = pmu->version > 1;
334 break;
335 default:
336 ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
337 || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
338 || get_fixed_pmc(pmu, msr);
339 break;
340 }
341 return ret;
342}
343 221
344int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 222 for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
345{ 223 struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
346 struct kvm_pmu *pmu = &vcpu->arch.pmu;
347 struct kvm_pmc *pmc;
348 224
349 switch (index) { 225 if (unlikely(!pmc || !pmc->perf_event)) {
350 case MSR_CORE_PERF_FIXED_CTR_CTRL: 226 clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
351 *data = pmu->fixed_ctr_ctrl; 227 continue;
352 return 0;
353 case MSR_CORE_PERF_GLOBAL_STATUS:
354 *data = pmu->global_status;
355 return 0;
356 case MSR_CORE_PERF_GLOBAL_CTRL:
357 *data = pmu->global_ctrl;
358 return 0;
359 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
360 *data = pmu->global_ovf_ctrl;
361 return 0;
362 default:
363 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
364 (pmc = get_fixed_pmc(pmu, index))) {
365 *data = read_pmc(pmc);
366 return 0;
367 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
368 *data = pmc->eventsel;
369 return 0;
370 } 228 }
371 }
372 return 1;
373}
left column - old arch/x86/kvm/pmu.c, lines 374-576:

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	struct kvm_pmc *pmc;
	u32 index = msr_info->index;
	u64 data = msr_info->data;

	switch (index) {
	case MSR_CORE_PERF_FIXED_CTR_CTRL:
		if (pmu->fixed_ctr_ctrl == data)
			return 0;
		if (!(data & 0xfffffffffffff444ull)) {
			reprogram_fixed_counters(pmu, data);
			return 0;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (msr_info->host_initiated) {
			pmu->global_status = data;
			return 0;
		}
		break; /* RO MSR */
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (pmu->global_ctrl == data)
			return 0;
		if (!(data & pmu->global_ctrl_mask)) {
			global_ctrl_changed(pmu, data);
			return 0;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
			if (!msr_info->host_initiated)
				pmu->global_status &= ~data;
			pmu->global_ovf_ctrl = data;
			return 0;
		}
		break;
	default:
		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
		    (pmc = get_fixed_pmc(pmu, index))) {
			if (!msr_info->host_initiated)
				data = (s64)(s32)data;
			pmc->counter += data - read_pmc(pmc);
			return 0;
		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
			if (data == pmc->eventsel)
				return 0;
			if (!(data & pmu->reserved_bits)) {
				reprogram_gp_counter(pmc, data);
				return 0;
			}
		}
	}
	return 1;
}

int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	bool fixed = pmc & (1u << 30);
	pmc &= ~(3u << 30);
	return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
		(fixed && pmc >= pmu->nr_arch_fixed_counters);
}

int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	bool fast_mode = pmc & (1u << 31);
	bool fixed = pmc & (1u << 30);
	struct kvm_pmc *counters;
	u64 ctr;

	pmc &= ~(3u << 30);
	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
		return 1;
	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
		return 1;
	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
	ctr = read_pmc(&counters[pmc]);
	if (fast_mode)
		ctr = (u32)ctr;
	*data = ctr;

	return 0;
}

void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	struct kvm_cpuid_entry2 *entry;
	union cpuid10_eax eax;
	union cpuid10_edx edx;

	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->version = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;

	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
	if (!entry)
		return;
	eax.full = entry->eax;
	edx.full = entry->edx;

	pmu->version = eax.split.version_id;
	if (!pmu->version)
		return;

	pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
					INTEL_PMC_MAX_GENERIC);
	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
	pmu->available_event_types = ~entry->ebx &
					((1ull << eax.split.mask_length) - 1);

	if (pmu->version == 1) {
		pmu->nr_arch_fixed_counters = 0;
	} else {
		pmu->nr_arch_fixed_counters =
			min_t(int, edx.split.num_counters_fixed,
				INTEL_PMC_MAX_FIXED);
		pmu->counter_bitmask[KVM_PMC_FIXED] =
			((u64)1 << edx.split.bit_width_fixed) - 1;
	}

	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
	pmu->global_ctrl_mask = ~pmu->global_ctrl;

	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
	if (entry &&
	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_pmu *pmu = &vcpu->arch.pmu;

	memset(pmu, 0, sizeof(*pmu));
	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
		pmu->gp_counters[i].type = KVM_PMC_GP;
		pmu->gp_counters[i].vcpu = vcpu;
		pmu->gp_counters[i].idx = i;
	}
	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
		pmu->fixed_counters[i].vcpu = vcpu;
		pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
	}
	init_irq_work(&pmu->irq_work, trigger_pmi);
	kvm_pmu_cpuid_update(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	int i;

	irq_work_sync(&pmu->irq_work);
	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
		struct kvm_pmc *pmc = &pmu->gp_counters[i];
		stop_counter(pmc);
		pmc->counter = pmc->eventsel = 0;
	}

	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
		stop_counter(&pmu->fixed_counters[i]);

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
		pmu->global_ovf_ctrl = 0;
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = &vcpu->arch.pmu;
	u64 bitmask;
	int bit;

	bitmask = pmu->reprogram_pmi;

	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
			continue;
		}

		reprogram_idx(pmu, bit);
	}
}

right column - new arch/x86/kvm/pmu.c, lines 229-309 (kvm_pmu_handle_event() continues from above):

		reprogram_counter(pmu, bit);
	}
}

/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmc *pmc;
	u64 ctr_val;

	pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
	if (!pmc)
		return 1;

	ctr_val = pmc_read_counter(pmc);
	if (fast_mode)
		ctr_val = (u32)ctr_val;

	*data = ctr_val;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.apic)
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* refresh PMU settings. This function generally is called when underlying
 * settings are changed (such as changes of PMU CPUID by guest VMs), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops->pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	kvm_pmu_refresh(vcpu);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}
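
The new kvm_pmu_rdpmc() above decodes the RDPMC index and truncates the result in "fast" mode, while pmc_read_counter() (added in pmu.h below) masks the raw count to the counter's architectural width. The following is a minimal standalone sketch of that decode and masking, not kernel code; decode_rdpmc_index() and mask_counter() are made-up helper names and the 48-bit width is an example value.

/*
 * Sketch: RDPMC index decoding (bit 31 = fast read, bit 30 = fixed
 * counter class on Intel) and masking a count to the counter width.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rdpmc_index {
	bool fast_mode;   /* bit 31: return only the low 32 bits */
	bool fixed;       /* bit 30: fixed-function counter (Intel) */
	unsigned idx;     /* remaining bits: counter number */
};

static struct rdpmc_index decode_rdpmc_index(unsigned ecx)
{
	struct rdpmc_index r = {
		.fast_mode = ecx & (1u << 31),
		.fixed     = ecx & (1u << 30),
		.idx       = ecx & ~(3u << 30),
	};
	return r;
}

/* mask a raw count to the bit width advertised for the counter */
static uint64_t mask_counter(uint64_t counter, unsigned bit_width)
{
	uint64_t bitmask = ((uint64_t)1 << bit_width) - 1;
	return counter & bitmask;
}

int main(void)
{
	struct rdpmc_index r = decode_rdpmc_index((1u << 30) | 1); /* fixed ctr 1 */
	uint64_t val = mask_counter(0x123456789abcdefULL, 48);

	if (r.fast_mode)
		val = (uint32_t)val;
	printf("fixed=%d idx=%u value=0x%llx\n", r.fixed, r.idx,
	       (unsigned long long)val);
	return 0;
}
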
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..f96e1f962587
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,118 @@
1#ifndef __KVM_X86_PMU_H
2#define __KVM_X86_PMU_H
3
4#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
5#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
6#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
7
8/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
9#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
10
11struct kvm_event_hw_type_mapping {
12 u8 eventsel;
13 u8 unit_mask;
14 unsigned event_type;
15};
16
17struct kvm_pmu_ops {
18 unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
19 u8 unit_mask);
20 unsigned (*find_fixed_event)(int idx);
21 bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
22 struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
23 struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
24 int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
25 bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
26 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
27 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
28 void (*refresh)(struct kvm_vcpu *vcpu);
29 void (*init)(struct kvm_vcpu *vcpu);
30 void (*reset)(struct kvm_vcpu *vcpu);
31};
32
33static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
34{
35 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
36
37 return pmu->counter_bitmask[pmc->type];
38}
39
40static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
41{
42 u64 counter, enabled, running;
43
44 counter = pmc->counter;
45 if (pmc->perf_event)
46 counter += perf_event_read_value(pmc->perf_event,
47 &enabled, &running);
48 /* FIXME: Scaling needed? */
49 return counter & pmc_bitmask(pmc);
50}
51
52static inline void pmc_stop_counter(struct kvm_pmc *pmc)
53{
54 if (pmc->perf_event) {
55 pmc->counter = pmc_read_counter(pmc);
56 perf_event_release_kernel(pmc->perf_event);
57 pmc->perf_event = NULL;
58 }
59}
60
61static inline bool pmc_is_gp(struct kvm_pmc *pmc)
62{
63 return pmc->type == KVM_PMC_GP;
64}
65
66static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
67{
68 return pmc->type == KVM_PMC_FIXED;
69}
70
71static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
72{
73 return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
74}
75
76/* returns general purpose PMC with the specified MSR. Note that it can be
77 * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
78 * parameter to tell them apart.
79 */
80static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
81 u32 base)
82{
83 if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
84 return &pmu->gp_counters[msr - base];
85
86 return NULL;
87}
88
89/* returns fixed PMC with the specified MSR */
90static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
91{
92 int base = MSR_CORE_PERF_FIXED_CTR0;
93
94 if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
95 return &pmu->fixed_counters[msr - base];
96
97 return NULL;
98}
99
100void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
101void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
102void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
103
104void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
105void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
106int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
107int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
108bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
109int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
110int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
111void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
112void kvm_pmu_reset(struct kvm_vcpu *vcpu);
113void kvm_pmu_init(struct kvm_vcpu *vcpu);
114void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
115
116extern struct kvm_pmu_ops intel_pmu_ops;
117extern struct kvm_pmu_ops amd_pmu_ops;
118#endif /* __KVM_X86_PMU_H */
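
pmu.h turns the PMU into a vendor-neutral interface: generic code calls through kvm_x86_ops->pmu_ops, and each vendor module fills in a kvm_pmu_ops table (intel_pmu_ops, amd_pmu_ops). The standalone sketch below shows the same function-pointer dispatch pattern; sample_pmu_ops, intel_like and amd_like are made-up names for the example, not kernel code.

/* Sketch: an ops table of function pointers with two backends. */
#include <stdio.h>

struct sample_pmu_ops {
	const char *name;
	int (*num_counters)(void);
};

static int intel_like_num_counters(void) { return 8; }
static int amd_like_num_counters(void)   { return 4; }

static const struct sample_pmu_ops intel_like = {
	.name = "intel-like",
	.num_counters = intel_like_num_counters,
};

static const struct sample_pmu_ops amd_like = {
	.name = "amd-like",
	.num_counters = amd_like_num_counters,
};

/* generic code only sees the ops pointer, the way pmu.c goes through
 * kvm_x86_ops->pmu_ops */
static void report(const struct sample_pmu_ops *ops)
{
	printf("%s: %d general-purpose counters\n",
	       ops->name, ops->num_counters());
}

int main(void)
{
	report(&intel_like);
	report(&amd_like);
	return 0;
}
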
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000000..886aa25a7131
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,207 @@
1/*
2 * KVM PMU support for AMD
3 *
4 * Copyright 2015, Red Hat, Inc. and/or its affiliates.
5 *
6 * Author:
7 * Wei Huang <wei@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Implementation is based on pmu_intel.c file
13 */
14#include <linux/types.h>
15#include <linux/kvm_host.h>
16#include <linux/perf_event.h>
17#include "x86.h"
18#include "cpuid.h"
19#include "lapic.h"
20#include "pmu.h"
21
22/* duplicated from amd_perfmon_event_map, K7 and above should work. */
23static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
24 [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
25 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
26 [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
27 [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },
28 [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
29 [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
30 [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
31 [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
32};
33
34static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
35 u8 event_select,
36 u8 unit_mask)
37{
38 int i;
39
40 for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
41 if (amd_event_mapping[i].eventsel == event_select
42 && amd_event_mapping[i].unit_mask == unit_mask)
43 break;
44
45 if (i == ARRAY_SIZE(amd_event_mapping))
46 return PERF_COUNT_HW_MAX;
47
48 return amd_event_mapping[i].event_type;
49}
50
51/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
52static unsigned amd_find_fixed_event(int idx)
53{
54 return PERF_COUNT_HW_MAX;
55}
56
57/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
58 * AMD CPUs don't have a global_ctrl MSR, all PMCs are enabled (return TRUE).
59 */
60static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
61{
62 return true;
63}
64
65static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
66{
67 return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
68}
69
70/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
71static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
72{
73 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
74
75 idx &= ~(3u << 30);
76
77 return (idx >= pmu->nr_arch_gp_counters);
78}
79
80/* idx is the ECX register of RDPMC instruction */
81static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx)
82{
83 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
84 struct kvm_pmc *counters;
85
86 idx &= ~(3u << 30);
87 if (idx >= pmu->nr_arch_gp_counters)
88 return NULL;
89 counters = pmu->gp_counters;
90
91 return &counters[idx];
92}
93
94static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
95{
96 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
97 int ret = false;
98
99 ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
100 get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
101
102 return ret;
103}
104
105static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
106{
107 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
108 struct kvm_pmc *pmc;
109
110 /* MSR_K7_PERFCTRn */
111 pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
112 if (pmc) {
113 *data = pmc_read_counter(pmc);
114 return 0;
115 }
116 /* MSR_K7_EVNTSELn */
117 pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
118 if (pmc) {
119 *data = pmc->eventsel;
120 return 0;
121 }
122
123 return 1;
124}
125
126static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
127{
128 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
129 struct kvm_pmc *pmc;
130 u32 msr = msr_info->index;
131 u64 data = msr_info->data;
132
133 /* MSR_K7_PERFCTRn */
134 pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
135 if (pmc) {
136 if (!msr_info->host_initiated)
137 data = (s64)data;
138 pmc->counter += data - pmc_read_counter(pmc);
139 return 0;
140 }
141 /* MSR_K7_EVNTSELn */
142 pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
143 if (pmc) {
144 if (data == pmc->eventsel)
145 return 0;
146 if (!(data & pmu->reserved_bits)) {
147 reprogram_gp_counter(pmc, data);
148 return 0;
149 }
150 }
151
152 return 1;
153}
154
155static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
156{
157 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
158
159 pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
160 pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
161 pmu->reserved_bits = 0xffffffff00200000ull;
162	/* not applicable to AMD; but clean them to prevent any fallout */
163 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
164 pmu->nr_arch_fixed_counters = 0;
165 pmu->version = 0;
166 pmu->global_status = 0;
167}
168
169static void amd_pmu_init(struct kvm_vcpu *vcpu)
170{
171 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
172 int i;
173
174 for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
175 pmu->gp_counters[i].type = KVM_PMC_GP;
176 pmu->gp_counters[i].vcpu = vcpu;
177 pmu->gp_counters[i].idx = i;
178 }
179}
180
181static void amd_pmu_reset(struct kvm_vcpu *vcpu)
182{
183 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
184 int i;
185
186 for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
187 struct kvm_pmc *pmc = &pmu->gp_counters[i];
188
189 pmc_stop_counter(pmc);
190 pmc->counter = pmc->eventsel = 0;
191 }
192}
193
194struct kvm_pmu_ops amd_pmu_ops = {
195 .find_arch_event = amd_find_arch_event,
196 .find_fixed_event = amd_find_fixed_event,
197 .pmc_is_enabled = amd_pmc_is_enabled,
198 .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
199 .msr_idx_to_pmc = amd_msr_idx_to_pmc,
200 .is_valid_msr_idx = amd_is_valid_msr_idx,
201 .is_valid_msr = amd_is_valid_msr,
202 .get_msr = amd_pmu_get_msr,
203 .set_msr = amd_pmu_set_msr,
204 .refresh = amd_pmu_refresh,
205 .init = amd_pmu_init,
206 .reset = amd_pmu_reset,
207};
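
amd_find_arch_event() resolves a guest event-select/unit-mask pair to a generic perf event by a linear scan of amd_event_mapping, falling back to PERF_COUNT_HW_MAX when there is no match. Below is a standalone sketch of that lookup, using a trimmed copy of the table; EVENT_MAX stands in for PERF_COUNT_HW_MAX and the event_type numbers are illustrative only.

/* Sketch: linear lookup from (eventsel, unit_mask) to a perf event id. */
#include <stdint.h>
#include <stdio.h>

#define EVENT_MAX 0xffffu

struct event_map {
	uint8_t eventsel;
	uint8_t unit_mask;
	unsigned event_type;
};

static const struct event_map amd_events[] = {
	{ 0x76, 0x00, 0 },	/* cpu cycles */
	{ 0xc0, 0x00, 1 },	/* instructions */
	{ 0x80, 0x00, 2 },	/* cache references */
	{ 0x81, 0x00, 3 },	/* cache misses */
};

static unsigned find_arch_event(uint8_t eventsel, uint8_t unit_mask)
{
	size_t i;

	for (i = 0; i < sizeof(amd_events) / sizeof(amd_events[0]); i++)
		if (amd_events[i].eventsel == eventsel &&
		    amd_events[i].unit_mask == unit_mask)
			return amd_events[i].event_type;

	return EVENT_MAX;	/* no matching generic perf event */
}

int main(void)
{
	printf("0x76/0x00 -> %u\n", find_arch_event(0x76, 0x00));
	printf("0x99/0x00 -> %u\n", find_arch_event(0x99, 0x00));
	return 0;
}
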
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000000..ab38af4f4947
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,358 @@
1/*
2 * KVM PMU support for Intel CPUs
3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 *
6 * Authors:
7 * Avi Kivity <avi@redhat.com>
8 * Gleb Natapov <gleb@redhat.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 */
14#include <linux/types.h>
15#include <linux/kvm_host.h>
16#include <linux/perf_event.h>
17#include <asm/perf_event.h>
18#include "x86.h"
19#include "cpuid.h"
20#include "lapic.h"
21#include "pmu.h"
22
23static struct kvm_event_hw_type_mapping intel_arch_events[] = {
24 /* Index must match CPUID 0x0A.EBX bit vector */
25 [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
26 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
27 [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
28 [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
29 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
30 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
31 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
32 [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
33};
34
35/* mapping between fixed pmc index and intel_arch_events array */
36static int fixed_pmc_events[] = {1, 0, 7};
37
38static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
39{
40 int i;
41
42 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
43 u8 new_ctrl = fixed_ctrl_field(data, i);
44 u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
45 struct kvm_pmc *pmc;
46
47 pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
48
49 if (old_ctrl == new_ctrl)
50 continue;
51
52 reprogram_fixed_counter(pmc, new_ctrl, i);
53 }
54
55 pmu->fixed_ctr_ctrl = data;
56}
57
58/* function is called when global control register has been updated. */
59static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
60{
61 int bit;
62 u64 diff = pmu->global_ctrl ^ data;
63
64 pmu->global_ctrl = data;
65
66 for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
67 reprogram_counter(pmu, bit);
68}
69
70static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
71 u8 event_select,
72 u8 unit_mask)
73{
74 int i;
75
76 for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
77 if (intel_arch_events[i].eventsel == event_select
78 && intel_arch_events[i].unit_mask == unit_mask
79 && (pmu->available_event_types & (1 << i)))
80 break;
81
82 if (i == ARRAY_SIZE(intel_arch_events))
83 return PERF_COUNT_HW_MAX;
84
85 return intel_arch_events[i].event_type;
86}
87
88static unsigned intel_find_fixed_event(int idx)
89{
90 if (idx >= ARRAY_SIZE(fixed_pmc_events))
91 return PERF_COUNT_HW_MAX;
92
93 return intel_arch_events[fixed_pmc_events[idx]].event_type;
94}
95
96/* check if a PMC is enabled by comparing it against global_ctrl bits. */
97static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
98{
99 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
100
101 return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
102}
103
104static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
105{
106 if (pmc_idx < INTEL_PMC_IDX_FIXED)
107 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
108 MSR_P6_EVNTSEL0);
109 else {
110 u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
111
112 return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
113 }
114}
115
116/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
117static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
118{
119 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
120 bool fixed = idx & (1u << 30);
121
122 idx &= ~(3u << 30);
123
124 return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
125 (fixed && idx >= pmu->nr_arch_fixed_counters);
126}
127
128static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
129 unsigned idx)
130{
131 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
132 bool fixed = idx & (1u << 30);
133 struct kvm_pmc *counters;
134
135 idx &= ~(3u << 30);
136 if (!fixed && idx >= pmu->nr_arch_gp_counters)
137 return NULL;
138 if (fixed && idx >= pmu->nr_arch_fixed_counters)
139 return NULL;
140 counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
141
142 return &counters[idx];
143}
144
145static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
146{
147 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
148 int ret;
149
150 switch (msr) {
151 case MSR_CORE_PERF_FIXED_CTR_CTRL:
152 case MSR_CORE_PERF_GLOBAL_STATUS:
153 case MSR_CORE_PERF_GLOBAL_CTRL:
154 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
155 ret = pmu->version > 1;
156 break;
157 default:
158 ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
159 get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
160 get_fixed_pmc(pmu, msr);
161 break;
162 }
163
164 return ret;
165}
166
167static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
168{
169 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
170 struct kvm_pmc *pmc;
171
172 switch (msr) {
173 case MSR_CORE_PERF_FIXED_CTR_CTRL:
174 *data = pmu->fixed_ctr_ctrl;
175 return 0;
176 case MSR_CORE_PERF_GLOBAL_STATUS:
177 *data = pmu->global_status;
178 return 0;
179 case MSR_CORE_PERF_GLOBAL_CTRL:
180 *data = pmu->global_ctrl;
181 return 0;
182 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
183 *data = pmu->global_ovf_ctrl;
184 return 0;
185 default:
186 if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
187 (pmc = get_fixed_pmc(pmu, msr))) {
188 *data = pmc_read_counter(pmc);
189 return 0;
190 } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
191 *data = pmc->eventsel;
192 return 0;
193 }
194 }
195
196 return 1;
197}
198
199static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
200{
201 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
202 struct kvm_pmc *pmc;
203 u32 msr = msr_info->index;
204 u64 data = msr_info->data;
205
206 switch (msr) {
207 case MSR_CORE_PERF_FIXED_CTR_CTRL:
208 if (pmu->fixed_ctr_ctrl == data)
209 return 0;
210 if (!(data & 0xfffffffffffff444ull)) {
211 reprogram_fixed_counters(pmu, data);
212 return 0;
213 }
214 break;
215 case MSR_CORE_PERF_GLOBAL_STATUS:
216 if (msr_info->host_initiated) {
217 pmu->global_status = data;
218 return 0;
219 }
220 break; /* RO MSR */
221 case MSR_CORE_PERF_GLOBAL_CTRL:
222 if (pmu->global_ctrl == data)
223 return 0;
224 if (!(data & pmu->global_ctrl_mask)) {
225 global_ctrl_changed(pmu, data);
226 return 0;
227 }
228 break;
229 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
230 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
231 if (!msr_info->host_initiated)
232 pmu->global_status &= ~data;
233 pmu->global_ovf_ctrl = data;
234 return 0;
235 }
236 break;
237 default:
238 if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
239 (pmc = get_fixed_pmc(pmu, msr))) {
240 if (!msr_info->host_initiated)
241 data = (s64)(s32)data;
242 pmc->counter += data - pmc_read_counter(pmc);
243 return 0;
244 } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
245 if (data == pmc->eventsel)
246 return 0;
247 if (!(data & pmu->reserved_bits)) {
248 reprogram_gp_counter(pmc, data);
249 return 0;
250 }
251 }
252 }
253
254 return 1;
255}
256
257static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
258{
259 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
260 struct kvm_cpuid_entry2 *entry;
261 union cpuid10_eax eax;
262 union cpuid10_edx edx;
263
264 pmu->nr_arch_gp_counters = 0;
265 pmu->nr_arch_fixed_counters = 0;
266 pmu->counter_bitmask[KVM_PMC_GP] = 0;
267 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
268 pmu->version = 0;
269 pmu->reserved_bits = 0xffffffff00200000ull;
270
271 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
272 if (!entry)
273 return;
274 eax.full = entry->eax;
275 edx.full = entry->edx;
276
277 pmu->version = eax.split.version_id;
278 if (!pmu->version)
279 return;
280
281 pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
282 INTEL_PMC_MAX_GENERIC);
283 pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
284 pmu->available_event_types = ~entry->ebx &
285 ((1ull << eax.split.mask_length) - 1);
286
287 if (pmu->version == 1) {
288 pmu->nr_arch_fixed_counters = 0;
289 } else {
290 pmu->nr_arch_fixed_counters =
291 min_t(int, edx.split.num_counters_fixed,
292 INTEL_PMC_MAX_FIXED);
293 pmu->counter_bitmask[KVM_PMC_FIXED] =
294 ((u64)1 << edx.split.bit_width_fixed) - 1;
295 }
296
297 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
298 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
299 pmu->global_ctrl_mask = ~pmu->global_ctrl;
300
301 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
302 if (entry &&
303 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
304 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
305 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
306}
307
308static void intel_pmu_init(struct kvm_vcpu *vcpu)
309{
310 int i;
311 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
312
313 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
314 pmu->gp_counters[i].type = KVM_PMC_GP;
315 pmu->gp_counters[i].vcpu = vcpu;
316 pmu->gp_counters[i].idx = i;
317 }
318
319 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
320 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
321 pmu->fixed_counters[i].vcpu = vcpu;
322 pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
323 }
324}
325
326static void intel_pmu_reset(struct kvm_vcpu *vcpu)
327{
328 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
329 int i;
330
331 for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
332 struct kvm_pmc *pmc = &pmu->gp_counters[i];
333
334 pmc_stop_counter(pmc);
335 pmc->counter = pmc->eventsel = 0;
336 }
337
338 for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
339 pmc_stop_counter(&pmu->fixed_counters[i]);
340
341 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
342 pmu->global_ovf_ctrl = 0;
343}
344
345struct kvm_pmu_ops intel_pmu_ops = {
346 .find_arch_event = intel_find_arch_event,
347 .find_fixed_event = intel_find_fixed_event,
348 .pmc_is_enabled = intel_pmc_is_enabled,
349 .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
350 .msr_idx_to_pmc = intel_msr_idx_to_pmc,
351 .is_valid_msr_idx = intel_is_valid_msr_idx,
352 .is_valid_msr = intel_is_valid_msr,
353 .get_msr = intel_pmu_get_msr,
354 .set_msr = intel_pmu_set_msr,
355 .refresh = intel_pmu_refresh,
356 .init = intel_pmu_init,
357 .reset = intel_pmu_reset,
358};
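
intel_pmu_refresh() derives the counter bitmasks and the IA32_PERF_GLOBAL_CTRL layout from CPUID.0xA-style fields: general-purpose counters occupy the low bits and fixed counters start at INTEL_PMC_IDX_FIXED (bit 32). The standalone sketch below reproduces that arithmetic with example values (4 GP and 3 fixed counters, 48-bit widths); IDX_FIXED stands in for INTEL_PMC_IDX_FIXED.

/* Sketch: computing counter bitmasks and the global_ctrl layout. */
#include <stdint.h>
#include <stdio.h>

#define IDX_FIXED 32

int main(void)
{
	unsigned nr_gp = 4, nr_fixed = 3;
	unsigned gp_width = 48, fixed_width = 48;

	uint64_t gp_bitmask    = ((uint64_t)1 << gp_width) - 1;
	uint64_t fixed_bitmask = ((uint64_t)1 << fixed_width) - 1;

	/* GP counters in the low bits, fixed counters starting at bit 32 */
	uint64_t global_ctrl = (((uint64_t)1 << nr_gp) - 1) |
			       ((((uint64_t)1 << nr_fixed) - 1) << IDX_FIXED);
	uint64_t global_ctrl_mask = ~global_ctrl;

	printf("gp bitmask       = 0x%016llx\n", (unsigned long long)gp_bitmask);
	printf("fixed bitmask    = 0x%016llx\n", (unsigned long long)fixed_bitmask);
	printf("global_ctrl      = 0x%016llx\n", (unsigned long long)global_ctrl);
	printf("global_ctrl_mask = 0x%016llx\n", (unsigned long long)global_ctrl_mask);
	return 0;
}
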
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9afa233b5482..851a9a1c6dfc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -21,6 +21,7 @@
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h" 22#include "x86.h"
23#include "cpuid.h" 23#include "cpuid.h"
24#include "pmu.h"
24 25
25#include <linux/module.h> 26#include <linux/module.h>
26#include <linux/mod_devicetable.h> 27#include <linux/mod_devicetable.h>
@@ -511,8 +512,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
511{ 512{
512 struct vcpu_svm *svm = to_svm(vcpu); 513 struct vcpu_svm *svm = to_svm(vcpu);
513 514
514 if (svm->vmcb->control.next_rip != 0) 515 if (svm->vmcb->control.next_rip != 0) {
516 WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
515 svm->next_rip = svm->vmcb->control.next_rip; 517 svm->next_rip = svm->vmcb->control.next_rip;
518 }
516 519
517 if (!svm->next_rip) { 520 if (!svm->next_rip) {
518 if (emulate_instruction(vcpu, EMULTYPE_SKIP) != 521 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
@@ -1082,7 +1085,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1082 return target_tsc - tsc; 1085 return target_tsc - tsc;
1083} 1086}
1084 1087
1085static void init_vmcb(struct vcpu_svm *svm) 1088static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1086{ 1089{
1087 struct vmcb_control_area *control = &svm->vmcb->control; 1090 struct vmcb_control_area *control = &svm->vmcb->control;
1088 struct vmcb_save_area *save = &svm->vmcb->save; 1091 struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1156,17 @@ static void init_vmcb(struct vcpu_svm *svm)
1153 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1156 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1154 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1157 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1155 1158
1156 svm_set_efer(&svm->vcpu, 0); 1159 if (!init_event)
1160 svm_set_efer(&svm->vcpu, 0);
1157 save->dr6 = 0xffff0ff0; 1161 save->dr6 = 0xffff0ff0;
1158 kvm_set_rflags(&svm->vcpu, 2); 1162 kvm_set_rflags(&svm->vcpu, 2);
1159 save->rip = 0x0000fff0; 1163 save->rip = 0x0000fff0;
1160 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1164 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1161 1165
1162 /* 1166 /*
1163 * This is the guest-visible cr0 value.
1164 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 1167 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1168 * It also updates the guest-visible cr0 value.
1165 */ 1169 */
1166 svm->vcpu.arch.cr0 = 0;
1167 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); 1170 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1168 1171
1169 save->cr4 = X86_CR4_PAE; 1172 save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1179,7 @@ static void init_vmcb(struct vcpu_svm *svm)
1176 clr_exception_intercept(svm, PF_VECTOR); 1179 clr_exception_intercept(svm, PF_VECTOR);
1177 clr_cr_intercept(svm, INTERCEPT_CR3_READ); 1180 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1178 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); 1181 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1179 save->g_pat = 0x0007040600070406ULL; 1182 save->g_pat = svm->vcpu.arch.pat;
1180 save->cr3 = 0; 1183 save->cr3 = 0;
1181 save->cr4 = 0; 1184 save->cr4 = 0;
1182 } 1185 }
@@ -1195,13 +1198,19 @@ static void init_vmcb(struct vcpu_svm *svm)
1195 enable_gif(svm); 1198 enable_gif(svm);
1196} 1199}
1197 1200
1198static void svm_vcpu_reset(struct kvm_vcpu *vcpu) 1201static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1199{ 1202{
1200 struct vcpu_svm *svm = to_svm(vcpu); 1203 struct vcpu_svm *svm = to_svm(vcpu);
1201 u32 dummy; 1204 u32 dummy;
1202 u32 eax = 1; 1205 u32 eax = 1;
1203 1206
1204 init_vmcb(svm); 1207 if (!init_event) {
1208 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1209 MSR_IA32_APICBASE_ENABLE;
1210 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1211 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1212 }
1213 init_vmcb(svm, init_event);
1205 1214
1206 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1215 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1207 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1216 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1266,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1257 clear_page(svm->vmcb); 1266 clear_page(svm->vmcb);
1258 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1267 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1259 svm->asid_generation = 0; 1268 svm->asid_generation = 0;
1260 init_vmcb(svm); 1269 init_vmcb(svm, false);
1261
1262 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1263 MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1265 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1266 1270
1267 svm_init_osvw(&svm->vcpu); 1271 svm_init_osvw(&svm->vcpu);
1268 1272
@@ -1575,7 +1579,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1575 * does not do it - this results in some delay at 1579 * does not do it - this results in some delay at
1576 * reboot 1580 * reboot
1577 */ 1581 */
1578 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1582 if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
1583 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1579 svm->vmcb->save.cr0 = cr0; 1584 svm->vmcb->save.cr0 = cr0;
1580 mark_dirty(svm->vmcb, VMCB_CR); 1585 mark_dirty(svm->vmcb, VMCB_CR);
1581 update_cr0_intercept(svm); 1586 update_cr0_intercept(svm);
@@ -1883,7 +1888,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
1883 * so reinitialize it. 1888 * so reinitialize it.
1884 */ 1889 */
1885 clear_page(svm->vmcb); 1890 clear_page(svm->vmcb);
1886 init_vmcb(svm); 1891 init_vmcb(svm, false);
1887 1892
1888 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1893 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1889 return 0; 1894 return 0;
@@ -1953,8 +1958,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1953 u64 pdpte; 1958 u64 pdpte;
1954 int ret; 1959 int ret;
1955 1960
1956 ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, 1961 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
1957 offset_in_page(cr3) + index * 8, 8); 1962 offset_in_page(cr3) + index * 8, 8);
1958 if (ret) 1963 if (ret)
1959 return 0; 1964 return 0;
1960 return pdpte; 1965 return pdpte;
@@ -2112,7 +2117,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2112 2117
2113 might_sleep(); 2118 might_sleep();
2114 2119
2115 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 2120 page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2116 if (is_error_page(page)) 2121 if (is_error_page(page))
2117 goto error; 2122 goto error;
2118 2123
@@ -2151,7 +2156,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2151 mask = (0xf >> (4 - size)) << start_bit; 2156 mask = (0xf >> (4 - size)) << start_bit;
2152 val = 0; 2157 val = 0;
2153 2158
2154 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) 2159 if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2155 return NESTED_EXIT_DONE; 2160 return NESTED_EXIT_DONE;
2156 2161
2157 return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 2162 return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2176,7 +2181,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2176 /* Offset is in 32 bit units but need in 8 bit units */ 2181 /* Offset is in 32 bit units but need in 8 bit units */
2177 offset *= 4; 2182 offset *= 4;
2178 2183
2179 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) 2184 if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2180 return NESTED_EXIT_DONE; 2185 return NESTED_EXIT_DONE;
2181 2186
2182 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 2187 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2447,7 +2452,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2447 p = msrpm_offsets[i]; 2452 p = msrpm_offsets[i];
2448 offset = svm->nested.vmcb_msrpm + (p * 4); 2453 offset = svm->nested.vmcb_msrpm + (p * 4);
2449 2454
2450 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) 2455 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2451 return false; 2456 return false;
2452 2457
2453 svm->nested.msrpm[p] = svm->msrpm[p] | value; 2458 svm->nested.msrpm[p] = svm->msrpm[p] | value;
@@ -3067,42 +3072,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3067 svm_scale_tsc(vcpu, host_tsc); 3072 svm_scale_tsc(vcpu, host_tsc);
3068} 3073}
3069 3074
3070static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 3075static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3071{ 3076{
3072 struct vcpu_svm *svm = to_svm(vcpu); 3077 struct vcpu_svm *svm = to_svm(vcpu);
3073 3078
3074 switch (ecx) { 3079 switch (msr_info->index) {
3075 case MSR_IA32_TSC: { 3080 case MSR_IA32_TSC: {
3076 *data = svm->vmcb->control.tsc_offset + 3081 msr_info->data = svm->vmcb->control.tsc_offset +
3077 svm_scale_tsc(vcpu, native_read_tsc()); 3082 svm_scale_tsc(vcpu, native_read_tsc());
3078 3083
3079 break; 3084 break;
3080 } 3085 }
3081 case MSR_STAR: 3086 case MSR_STAR:
3082 *data = svm->vmcb->save.star; 3087 msr_info->data = svm->vmcb->save.star;
3083 break; 3088 break;
3084#ifdef CONFIG_X86_64 3089#ifdef CONFIG_X86_64
3085 case MSR_LSTAR: 3090 case MSR_LSTAR:
3086 *data = svm->vmcb->save.lstar; 3091 msr_info->data = svm->vmcb->save.lstar;
3087 break; 3092 break;
3088 case MSR_CSTAR: 3093 case MSR_CSTAR:
3089 *data = svm->vmcb->save.cstar; 3094 msr_info->data = svm->vmcb->save.cstar;
3090 break; 3095 break;
3091 case MSR_KERNEL_GS_BASE: 3096 case MSR_KERNEL_GS_BASE:
3092 *data = svm->vmcb->save.kernel_gs_base; 3097 msr_info->data = svm->vmcb->save.kernel_gs_base;
3093 break; 3098 break;
3094 case MSR_SYSCALL_MASK: 3099 case MSR_SYSCALL_MASK:
3095 *data = svm->vmcb->save.sfmask; 3100 msr_info->data = svm->vmcb->save.sfmask;
3096 break; 3101 break;
3097#endif 3102#endif
3098 case MSR_IA32_SYSENTER_CS: 3103 case MSR_IA32_SYSENTER_CS:
3099 *data = svm->vmcb->save.sysenter_cs; 3104 msr_info->data = svm->vmcb->save.sysenter_cs;
3100 break; 3105 break;
3101 case MSR_IA32_SYSENTER_EIP: 3106 case MSR_IA32_SYSENTER_EIP:
3102 *data = svm->sysenter_eip; 3107 msr_info->data = svm->sysenter_eip;
3103 break; 3108 break;
3104 case MSR_IA32_SYSENTER_ESP: 3109 case MSR_IA32_SYSENTER_ESP:
3105 *data = svm->sysenter_esp; 3110 msr_info->data = svm->sysenter_esp;
3106 break; 3111 break;
3107 /* 3112 /*
3108 * Nobody will change the following 5 values in the VMCB so we can 3113 * Nobody will change the following 5 values in the VMCB so we can
@@ -3110,31 +3115,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3110 * implemented. 3115 * implemented.
3111 */ 3116 */
3112 case MSR_IA32_DEBUGCTLMSR: 3117 case MSR_IA32_DEBUGCTLMSR:
3113 *data = svm->vmcb->save.dbgctl; 3118 msr_info->data = svm->vmcb->save.dbgctl;
3114 break; 3119 break;
3115 case MSR_IA32_LASTBRANCHFROMIP: 3120 case MSR_IA32_LASTBRANCHFROMIP:
3116 *data = svm->vmcb->save.br_from; 3121 msr_info->data = svm->vmcb->save.br_from;
3117 break; 3122 break;
3118 case MSR_IA32_LASTBRANCHTOIP: 3123 case MSR_IA32_LASTBRANCHTOIP:
3119 *data = svm->vmcb->save.br_to; 3124 msr_info->data = svm->vmcb->save.br_to;
3120 break; 3125 break;
3121 case MSR_IA32_LASTINTFROMIP: 3126 case MSR_IA32_LASTINTFROMIP:
3122 *data = svm->vmcb->save.last_excp_from; 3127 msr_info->data = svm->vmcb->save.last_excp_from;
3123 break; 3128 break;
3124 case MSR_IA32_LASTINTTOIP: 3129 case MSR_IA32_LASTINTTOIP:
3125 *data = svm->vmcb->save.last_excp_to; 3130 msr_info->data = svm->vmcb->save.last_excp_to;
3126 break; 3131 break;
3127 case MSR_VM_HSAVE_PA: 3132 case MSR_VM_HSAVE_PA:
3128 *data = svm->nested.hsave_msr; 3133 msr_info->data = svm->nested.hsave_msr;
3129 break; 3134 break;
3130 case MSR_VM_CR: 3135 case MSR_VM_CR:
3131 *data = svm->nested.vm_cr_msr; 3136 msr_info->data = svm->nested.vm_cr_msr;
3132 break; 3137 break;
3133 case MSR_IA32_UCODE_REV: 3138 case MSR_IA32_UCODE_REV:
3134 *data = 0x01000065; 3139 msr_info->data = 0x01000065;
3135 break; 3140 break;
3136 default: 3141 default:
3137 return kvm_get_msr_common(vcpu, ecx, data); 3142 return kvm_get_msr_common(vcpu, msr_info);
3138 } 3143 }
3139 return 0; 3144 return 0;
3140} 3145}
@@ -3142,16 +3147,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3142static int rdmsr_interception(struct vcpu_svm *svm) 3147static int rdmsr_interception(struct vcpu_svm *svm)
3143{ 3148{
3144 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 3149 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3145 u64 data; 3150 struct msr_data msr_info;
3146 3151
3147 if (svm_get_msr(&svm->vcpu, ecx, &data)) { 3152 msr_info.index = ecx;
3153 msr_info.host_initiated = false;
3154 if (svm_get_msr(&svm->vcpu, &msr_info)) {
3148 trace_kvm_msr_read_ex(ecx); 3155 trace_kvm_msr_read_ex(ecx);
3149 kvm_inject_gp(&svm->vcpu, 0); 3156 kvm_inject_gp(&svm->vcpu, 0);
3150 } else { 3157 } else {
3151 trace_kvm_msr_read(ecx, data); 3158 trace_kvm_msr_read(ecx, msr_info.data);
3152 3159
3153 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); 3160 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3154 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); 3161 msr_info.data & 0xffffffff);
3162 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3163 msr_info.data >> 32);
3155 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3156 skip_emulated_instruction(&svm->vcpu); 3165 skip_emulated_instruction(&svm->vcpu);
3157 } 3166 }
@@ -3388,6 +3397,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3388 [SVM_EXIT_MWAIT] = mwait_interception, 3397 [SVM_EXIT_MWAIT] = mwait_interception,
3389 [SVM_EXIT_XSETBV] = xsetbv_interception, 3398 [SVM_EXIT_XSETBV] = xsetbv_interception,
3390 [SVM_EXIT_NPF] = pf_interception, 3399 [SVM_EXIT_NPF] = pf_interception,
3400 [SVM_EXIT_RSM] = emulate_on_interception,
3391}; 3401};
3392 3402
3393static void dump_vmcb(struct kvm_vcpu *vcpu) 3403static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4073,6 +4083,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
4073 return false; 4083 return false;
4074} 4084}
4075 4085
4086static bool svm_has_high_real_mode_segbase(void)
4087{
4088 return true;
4089}
4090
4076static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 4091static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4077{ 4092{
4078 return 0; 4093 return 0;
@@ -4317,7 +4332,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
4317 break; 4332 break;
4318 } 4333 }
4319 4334
4320 vmcb->control.next_rip = info->next_rip; 4335 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4336 if (static_cpu_has(X86_FEATURE_NRIPS))
4337 vmcb->control.next_rip = info->next_rip;
4321 vmcb->control.exit_code = icpt_info.exit_code; 4338 vmcb->control.exit_code = icpt_info.exit_code;
4322 vmexit = nested_svm_exit_handled(svm); 4339 vmexit = nested_svm_exit_handled(svm);
4323 4340
@@ -4346,6 +4363,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4346 .hardware_enable = svm_hardware_enable, 4363 .hardware_enable = svm_hardware_enable,
4347 .hardware_disable = svm_hardware_disable, 4364 .hardware_disable = svm_hardware_disable,
4348 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, 4365 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4366 .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
4349 4367
4350 .vcpu_create = svm_create_vcpu, 4368 .vcpu_create = svm_create_vcpu,
4351 .vcpu_free = svm_free_vcpu, 4369 .vcpu_free = svm_free_vcpu,
@@ -4440,6 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops = {
4440 .handle_external_intr = svm_handle_external_intr, 4458 .handle_external_intr = svm_handle_external_intr,
4441 4459
4442 .sched_in = svm_sched_in, 4460 .sched_in = svm_sched_in,
4461
4462 .pmu_ops = &amd_pmu_ops,
4443}; 4463};
4444 4464
4445static int __init svm_init(void) 4465static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..4eae7c35ddf5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
952 __entry->delta < 0 ? "early" : "late") 952 __entry->delta < 0 ? "early" : "late")
953); 953);
954 954
955TRACE_EVENT(kvm_enter_smm,
956 TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
957 TP_ARGS(vcpu_id, smbase, entering),
958
959 TP_STRUCT__entry(
960 __field( unsigned int, vcpu_id )
961 __field( u64, smbase )
962 __field( bool, entering )
963 ),
964
965 TP_fast_assign(
966 __entry->vcpu_id = vcpu_id;
967 __entry->smbase = smbase;
968 __entry->entering = entering;
969 ),
970
971 TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
972 __entry->vcpu_id,
973 __entry->entering ? "entering" : "leaving",
974 __entry->smbase)
975);
976
955#endif /* _TRACE_KVM_H */ 977#endif /* _TRACE_KVM_H */
956 978
957#undef TRACE_INCLUDE_PATH 979#undef TRACE_INCLUDE_PATH
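
The kvm_enter_smm tracepoint above records both SMM entry and exit along with the vcpu id and SMBASE. The standalone sketch below reproduces its TP_printk format as a plain printf so the resulting trace line is easy to see; the vcpu id and smbase values are made-up examples.

/* Sketch: the trace line produced by the kvm_enter_smm format string. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void show_kvm_enter_smm(unsigned int vcpu_id, uint64_t smbase,
			       bool entering)
{
	printf("vcpu %u: %s SMM, smbase 0x%llx\n", vcpu_id,
	       entering ? "entering" : "leaving",
	       (unsigned long long)smbase);
}

int main(void)
{
	show_kvm_enter_smm(0, 0x30000, true);
	show_kvm_enter_smm(0, 0x30000, false);
	return 0;
}
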
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e11dd59398f1..ab53d80b0f64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -47,6 +47,7 @@
47#include <asm/apic.h> 47#include <asm/apic.h>
48 48
49#include "trace.h" 49#include "trace.h"
50#include "pmu.h"
50 51
51#define __ex(x) __kvm_handle_fault_on_reboot(x) 52#define __ex(x) __kvm_handle_fault_on_reboot(x)
52#define __ex_clear(x, reg) \ 53#define __ex_clear(x, reg) \
@@ -785,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
785 786
786static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 787static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
787{ 788{
788 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); 789 struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
789 if (is_error_page(page)) 790 if (is_error_page(page))
790 return NULL; 791 return NULL;
791 792
@@ -2169,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2169 2170
2170 if (is_guest_mode(vcpu)) 2171 if (is_guest_mode(vcpu))
2171 msr_bitmap = vmx_msr_bitmap_nested; 2172 msr_bitmap = vmx_msr_bitmap_nested;
2172 else if (irqchip_in_kernel(vcpu->kvm) && 2173 else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
2173 apic_x2apic_mode(vcpu->arch.apic)) {
2174 if (is_long_mode(vcpu)) 2174 if (is_long_mode(vcpu))
2175 msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 2175 msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2176 else 2176 else
@@ -2622,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2622 * Returns 0 on success, non-0 otherwise. 2622 * Returns 0 on success, non-0 otherwise.
2623 * Assumes vcpu_load() was already called. 2623 * Assumes vcpu_load() was already called.
2624 */ 2624 */
2625static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2625static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2626{ 2626{
2627 u64 data;
2628 struct shared_msr_entry *msr; 2627 struct shared_msr_entry *msr;
2629 2628
2630 if (!pdata) { 2629 switch (msr_info->index) {
2631 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2632 return -EINVAL;
2633 }
2634
2635 switch (msr_index) {
2636#ifdef CONFIG_X86_64 2630#ifdef CONFIG_X86_64
2637 case MSR_FS_BASE: 2631 case MSR_FS_BASE:
2638 data = vmcs_readl(GUEST_FS_BASE); 2632 msr_info->data = vmcs_readl(GUEST_FS_BASE);
2639 break; 2633 break;
2640 case MSR_GS_BASE: 2634 case MSR_GS_BASE:
2641 data = vmcs_readl(GUEST_GS_BASE); 2635 msr_info->data = vmcs_readl(GUEST_GS_BASE);
2642 break; 2636 break;
2643 case MSR_KERNEL_GS_BASE: 2637 case MSR_KERNEL_GS_BASE:
2644 vmx_load_host_state(to_vmx(vcpu)); 2638 vmx_load_host_state(to_vmx(vcpu));
2645 data = to_vmx(vcpu)->msr_guest_kernel_gs_base; 2639 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2646 break; 2640 break;
2647#endif 2641#endif
2648 case MSR_EFER: 2642 case MSR_EFER:
2649 return kvm_get_msr_common(vcpu, msr_index, pdata); 2643 return kvm_get_msr_common(vcpu, msr_info);
2650 case MSR_IA32_TSC: 2644 case MSR_IA32_TSC:
2651 data = guest_read_tsc(); 2645 msr_info->data = guest_read_tsc();
2652 break; 2646 break;
2653 case MSR_IA32_SYSENTER_CS: 2647 case MSR_IA32_SYSENTER_CS:
2654 data = vmcs_read32(GUEST_SYSENTER_CS); 2648 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2655 break; 2649 break;
2656 case MSR_IA32_SYSENTER_EIP: 2650 case MSR_IA32_SYSENTER_EIP:
2657 data = vmcs_readl(GUEST_SYSENTER_EIP); 2651 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2658 break; 2652 break;
2659 case MSR_IA32_SYSENTER_ESP: 2653 case MSR_IA32_SYSENTER_ESP:
2660 data = vmcs_readl(GUEST_SYSENTER_ESP); 2654 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2661 break; 2655 break;
2662 case MSR_IA32_BNDCFGS: 2656 case MSR_IA32_BNDCFGS:
2663 if (!vmx_mpx_supported()) 2657 if (!vmx_mpx_supported())
2664 return 1; 2658 return 1;
2665 data = vmcs_read64(GUEST_BNDCFGS); 2659 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2666 break; 2660 break;
2667 case MSR_IA32_FEATURE_CONTROL: 2661 case MSR_IA32_FEATURE_CONTROL:
2668 if (!nested_vmx_allowed(vcpu)) 2662 if (!nested_vmx_allowed(vcpu))
2669 return 1; 2663 return 1;
2670 data = to_vmx(vcpu)->nested.msr_ia32_feature_control; 2664 msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2671 break; 2665 break;
2672 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2666 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2673 if (!nested_vmx_allowed(vcpu)) 2667 if (!nested_vmx_allowed(vcpu))
2674 return 1; 2668 return 1;
2675 return vmx_get_vmx_msr(vcpu, msr_index, pdata); 2669 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
2676 case MSR_IA32_XSS: 2670 case MSR_IA32_XSS:
2677 if (!vmx_xsaves_supported()) 2671 if (!vmx_xsaves_supported())
2678 return 1; 2672 return 1;
2679 data = vcpu->arch.ia32_xss; 2673 msr_info->data = vcpu->arch.ia32_xss;
2680 break; 2674 break;
2681 case MSR_TSC_AUX: 2675 case MSR_TSC_AUX:
2682 if (!to_vmx(vcpu)->rdtscp_enabled) 2676 if (!to_vmx(vcpu)->rdtscp_enabled)
2683 return 1; 2677 return 1;
2684 /* Otherwise falls through */ 2678 /* Otherwise falls through */
2685 default: 2679 default:
2686 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2680 msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
2687 if (msr) { 2681 if (msr) {
2688 data = msr->data; 2682 msr_info->data = msr->data;
2689 break; 2683 break;
2690 } 2684 }
2691 return kvm_get_msr_common(vcpu, msr_index, pdata); 2685 return kvm_get_msr_common(vcpu, msr_info);
2692 } 2686 }
2693 2687
2694 *pdata = data;
2695 return 0; 2688 return 0;
2696} 2689}
2697 2690
@@ -4122,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
4122 kvm_userspace_mem.flags = 0; 4115 kvm_userspace_mem.flags = 0;
4123 kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; 4116 kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
4124 kvm_userspace_mem.memory_size = PAGE_SIZE; 4117 kvm_userspace_mem.memory_size = PAGE_SIZE;
4125 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4118 r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
4126 if (r) 4119 if (r)
4127 goto out; 4120 goto out;
4128 4121
@@ -4157,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
4157 kvm_userspace_mem.guest_phys_addr = 4150 kvm_userspace_mem.guest_phys_addr =
4158 kvm->arch.ept_identity_map_addr; 4151 kvm->arch.ept_identity_map_addr;
4159 kvm_userspace_mem.memory_size = PAGE_SIZE; 4152 kvm_userspace_mem.memory_size = PAGE_SIZE;
4160 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4153 r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
4161 4154
4162 return r; 4155 return r;
4163} 4156}
@@ -4666,16 +4659,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4666 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4659 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4667 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 4660 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
4668 4661
4669 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 4662 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4670 u32 msr_low, msr_high; 4663 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4671 u64 host_pat;
4672 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
4673 host_pat = msr_low | ((u64) msr_high << 32);
4674 /* Write the default value follow host pat */
4675 vmcs_write64(GUEST_IA32_PAT, host_pat);
4676 /* Keep arch.pat sync with GUEST_IA32_PAT */
4677 vmx->vcpu.arch.pat = host_pat;
4678 }
4679 4664
4680 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { 4665 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
4681 u32 index = vmx_msr_index[i]; 4666 u32 index = vmx_msr_index[i];
@@ -4707,22 +4692,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4707 return 0; 4692 return 0;
4708} 4693}
4709 4694
4710static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4695static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4711{ 4696{
4712 struct vcpu_vmx *vmx = to_vmx(vcpu); 4697 struct vcpu_vmx *vmx = to_vmx(vcpu);
4713 struct msr_data apic_base_msr; 4698 struct msr_data apic_base_msr;
4699 u64 cr0;
4714 4700
4715 vmx->rmode.vm86_active = 0; 4701 vmx->rmode.vm86_active = 0;
4716 4702
4717 vmx->soft_vnmi_blocked = 0; 4703 vmx->soft_vnmi_blocked = 0;
4718 4704
4719 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4705 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4720 kvm_set_cr8(&vmx->vcpu, 0); 4706 kvm_set_cr8(vcpu, 0);
4721 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; 4707
4722 if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) 4708 if (!init_event) {
4723 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4709 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
4724 apic_base_msr.host_initiated = true; 4710 MSR_IA32_APICBASE_ENABLE;
4725 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); 4711 if (kvm_vcpu_is_reset_bsp(vcpu))
4712 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4713 apic_base_msr.host_initiated = true;
4714 kvm_set_apic_base(vcpu, &apic_base_msr);
4715 }
4726 4716
4727 vmx_segment_cache_clear(vmx); 4717 vmx_segment_cache_clear(vmx);
4728 4718
@@ -4746,9 +4736,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4746 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4736 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4747 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4737 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4748 4738
4749 vmcs_write32(GUEST_SYSENTER_CS, 0); 4739 if (!init_event) {
4750 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4740 vmcs_write32(GUEST_SYSENTER_CS, 0);
4751 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4741 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4742 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4743 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4744 }
4752 4745
4753 vmcs_writel(GUEST_RFLAGS, 0x02); 4746 vmcs_writel(GUEST_RFLAGS, 0x02);
4754 kvm_rip_write(vcpu, 0xfff0); 4747 kvm_rip_write(vcpu, 0xfff0);
@@ -4763,18 +4756,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4763 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4756 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4764 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4757 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4765 4758
4766 /* Special registers */
4767 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4768
4769 setup_msrs(vmx); 4759 setup_msrs(vmx);
4770 4760
4771 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4761 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4772 4762
4773 if (cpu_has_vmx_tpr_shadow()) { 4763 if (cpu_has_vmx_tpr_shadow() && !init_event) {
4774 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4764 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4775 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 4765 if (vm_need_tpr_shadow(vcpu->kvm))
4776 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4766 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4777 __pa(vmx->vcpu.arch.apic->regs)); 4767 __pa(vcpu->arch.apic->regs));
4778 vmcs_write32(TPR_THRESHOLD, 0); 4768 vmcs_write32(TPR_THRESHOLD, 0);
4779 } 4769 }
4780 4770
@@ -4786,12 +4776,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4786 if (vmx->vpid != 0) 4776 if (vmx->vpid != 0)
4787 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4777 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4788 4778
4789 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4779 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4790 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4780 vmx_set_cr0(vcpu, cr0); /* enter rmode */
4791 vmx_set_cr4(&vmx->vcpu, 0); 4781 vmx->vcpu.arch.cr0 = cr0;
4792 vmx_set_efer(&vmx->vcpu, 0); 4782 vmx_set_cr4(vcpu, 0);
4793 vmx_fpu_activate(&vmx->vcpu); 4783 if (!init_event)
4794 update_exception_bitmap(&vmx->vcpu); 4784 vmx_set_efer(vcpu, 0);
4785 vmx_fpu_activate(vcpu);
4786 update_exception_bitmap(vcpu);
4795 4787
4796 vpid_sync_context(vmx); 4788 vpid_sync_context(vmx);
4797} 4789}
@@ -4964,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4964 .flags = 0, 4956 .flags = 0,
4965 }; 4957 };
4966 4958
4967 ret = kvm_set_memory_region(kvm, &tss_mem); 4959 ret = x86_set_memory_region(kvm, &tss_mem);
4968 if (ret) 4960 if (ret)
4969 return ret; 4961 return ret;
4970 kvm->arch.tss_addr = addr; 4962 kvm->arch.tss_addr = addr;
@@ -5474,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
5474static int handle_rdmsr(struct kvm_vcpu *vcpu) 5466static int handle_rdmsr(struct kvm_vcpu *vcpu)
5475{ 5467{
5476 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5468 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5477 u64 data; 5469 struct msr_data msr_info;
5478 5470
5479 if (vmx_get_msr(vcpu, ecx, &data)) { 5471 msr_info.index = ecx;
5472 msr_info.host_initiated = false;
5473 if (vmx_get_msr(vcpu, &msr_info)) {
5480 trace_kvm_msr_read_ex(ecx); 5474 trace_kvm_msr_read_ex(ecx);
5481 kvm_inject_gp(vcpu, 0); 5475 kvm_inject_gp(vcpu, 0);
5482 return 1; 5476 return 1;
5483 } 5477 }
5484 5478
5485 trace_kvm_msr_read(ecx, data); 5479 trace_kvm_msr_read(ecx, msr_info.data);
5486 5480
5487 /* FIXME: handling of bits 32:63 of rax, rdx */ 5481 /* FIXME: handling of bits 32:63 of rax, rdx */
5488 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 5482 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
5489 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 5483 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
5490 skip_emulated_instruction(vcpu); 5484 skip_emulated_instruction(vcpu);
5491 return 1; 5485 return 1;
5492} 5486}
@@ -5709,9 +5703,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
5709 return 0; 5703 return 0;
5710 } 5704 }
5711 5705
5712 /* clear all local breakpoint enable flags */
5713 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
5714
5715 /* 5706 /*
5716 * TODO: What about debug traps on tss switch? 5707 * TODO: What about debug traps on tss switch?
5717 * Are we supposed to inject them and update dr6? 5708 * Are we supposed to inject them and update dr6?
@@ -7332,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
7332 bitmap += (port & 0x7fff) / 8; 7323 bitmap += (port & 0x7fff) / 8;
7333 7324
7334 if (last_bitmap != bitmap) 7325 if (last_bitmap != bitmap)
7335 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 7326 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
7336 return true; 7327 return true;
7337 if (b & (1 << (port & 7))) 7328 if (b & (1 << (port & 7)))
7338 return true; 7329 return true;
@@ -7376,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7376 /* Then read the msr_index'th bit from this bitmap: */ 7367 /* Then read the msr_index'th bit from this bitmap: */
7377 if (msr_index < 1024*8) { 7368 if (msr_index < 1024*8) {
7378 unsigned char b; 7369 unsigned char b;
7379 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 7370 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
7380 return true; 7371 return true;
7381 return 1 & (b >> (msr_index & 7)); 7372 return 1 & (b >> (msr_index & 7));
7382 } else 7373 } else
@@ -7641,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx)
7641 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7632 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7642} 7633}
7643 7634
7644static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) 7635static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
7645{ 7636{
7646 struct kvm *kvm = vmx->vcpu.kvm; 7637 struct vcpu_vmx *vmx = to_vmx(vcpu);
7647 u64 *pml_buf; 7638 u64 *pml_buf;
7648 u16 pml_idx; 7639 u16 pml_idx;
7649 7640
@@ -7665,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7665 7656
7666 gpa = pml_buf[pml_idx]; 7657 gpa = pml_buf[pml_idx];
7667 WARN_ON(gpa & (PAGE_SIZE - 1)); 7658 WARN_ON(gpa & (PAGE_SIZE - 1));
7668 mark_page_dirty(kvm, gpa >> PAGE_SHIFT); 7659 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
7669 } 7660 }
7670 7661
7671 /* reset PML index */ 7662 /* reset PML index */
@@ -7690,6 +7681,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
7690 kvm_vcpu_kick(vcpu); 7681 kvm_vcpu_kick(vcpu);
7691} 7682}
7692 7683
7684static void vmx_dump_sel(char *name, uint32_t sel)
7685{
7686 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
7687 name, vmcs_read32(sel),
7688 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
7689 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
7690 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
7691}
7692
7693static void vmx_dump_dtsel(char *name, uint32_t limit)
7694{
7695 pr_err("%s limit=0x%08x, base=0x%016lx\n",
7696 name, vmcs_read32(limit),
7697 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
7698}
7699
7700static void dump_vmcs(void)
7701{
7702 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
7703 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
7704 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7705 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
7706 u32 secondary_exec_control = 0;
7707 unsigned long cr4 = vmcs_readl(GUEST_CR4);
7708 u64 efer = vmcs_readl(GUEST_IA32_EFER);
7709 int i, n;
7710
7711 if (cpu_has_secondary_exec_ctrls())
7712 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7713
7714 pr_err("*** Guest State ***\n");
7715 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
7716 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
7717 vmcs_readl(CR0_GUEST_HOST_MASK));
7718 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
7719 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
7720 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
7721 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
7722 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
7723 {
7724 pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
7725 vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
7726 pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
7727 vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
7728 }
7729 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
7730 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
7731 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
7732 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
7733 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
7734 vmcs_readl(GUEST_SYSENTER_ESP),
7735 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
7736 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
7737 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
7738 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
7739 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
7740 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
7741 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
7742 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
7743 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
7744 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
7745 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
7746 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
7747 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
7748 pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
7749 efer, vmcs_readl(GUEST_IA32_PAT));
7750 pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
7751 vmcs_readl(GUEST_IA32_DEBUGCTL),
7752 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
7753 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
7754 pr_err("PerfGlobCtl = 0x%016lx\n",
7755 vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
7756 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
7757 pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
7758 pr_err("Interruptibility = %08x ActivityState = %08x\n",
7759 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
7760 vmcs_read32(GUEST_ACTIVITY_STATE));
7761 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
7762 pr_err("InterruptStatus = %04x\n",
7763 vmcs_read16(GUEST_INTR_STATUS));
7764
7765 pr_err("*** Host State ***\n");
7766 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
7767 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
7768 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
7769 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
7770 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
7771 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
7772 vmcs_read16(HOST_TR_SELECTOR));
7773 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
7774 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
7775 vmcs_readl(HOST_TR_BASE));
7776 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
7777 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
7778 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
7779 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
7780 vmcs_readl(HOST_CR4));
7781 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
7782 vmcs_readl(HOST_IA32_SYSENTER_ESP),
7783 vmcs_read32(HOST_IA32_SYSENTER_CS),
7784 vmcs_readl(HOST_IA32_SYSENTER_EIP));
7785 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
7786 pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
7787 vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
7788 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
7789 pr_err("PerfGlobCtl = 0x%016lx\n",
7790 vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
7791
7792 pr_err("*** Control State ***\n");
7793 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
7794 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
7795 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
7796 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
7797 vmcs_read32(EXCEPTION_BITMAP),
7798 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
7799 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
7800 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
7801 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7802 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
7803 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
7804 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
7805 vmcs_read32(VM_EXIT_INTR_INFO),
7806 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
7807 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
7808 pr_err(" reason=%08x qualification=%016lx\n",
7809 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
7810 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
7811 vmcs_read32(IDT_VECTORING_INFO_FIELD),
7812 vmcs_read32(IDT_VECTORING_ERROR_CODE));
7813 pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
7814 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
7815 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
7816 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
7817 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
7818 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
7819 pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
7820 n = vmcs_read32(CR3_TARGET_COUNT);
7821 for (i = 0; i + 1 < n; i += 4)
7822 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
7823 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
7824 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
7825 if (i < n)
7826 pr_err("CR3 target%u=%016lx\n",
7827 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
7828 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
7829 pr_err("PLE Gap=%08x Window=%08x\n",
7830 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
7831 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
7832 pr_err("Virtual processor ID = 0x%04x\n",
7833 vmcs_read16(VIRTUAL_PROCESSOR_ID));
7834}
7835
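dump_vmcs() leans on a property of the VMCS field encodings: for each segment, the limit, access-rights and base fields sit at the same encoding offset from that segment's selector field as the corresponding ES fields do, so vmx_dump_sel() can dump any segment given only its selector encoding. An illustrative, standalone sketch of that offset trick; vmcs_read() and the toy array are stand-ins, not the real VMCS interface:

#include <stdint.h>
#include <stdio.h>

static uint64_t fake_vmcs[0x10000];		/* toy VMCS, indexed by field encoding */

static uint64_t vmcs_read(uint32_t field)
{
	return fake_vmcs[field];
}

#define GUEST_ES_SELECTOR	0x0800
#define GUEST_ES_LIMIT		0x4800
#define GUEST_ES_AR_BYTES	0x4814
#define GUEST_ES_BASE		0x6806
#define GUEST_CS_SELECTOR	0x0802

static void dump_sel(const char *name, uint32_t sel)
{
	/* limit/AR/base are found at a fixed delta from the selector encoding */
	printf("%s sel=%#llx attr=%#llx limit=%#llx base=%#llx\n", name,
	       (unsigned long long)vmcs_read(sel),
	       (unsigned long long)vmcs_read(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
	       (unsigned long long)vmcs_read(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
	       (unsigned long long)vmcs_read(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
}

int main(void)
{
	fake_vmcs[GUEST_CS_SELECTOR] = 0x10;	/* pretend guest CS selector */
	dump_sel("CS: ", GUEST_CS_SELECTOR);
	return 0;
}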
7693/* 7836/*
7694 * The guest has exited. See if we can fix it or if we need userspace 7837 * The guest has exited. See if we can fix it or if we need userspace
7695 * assistance. 7838 * assistance.
@@ -7708,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7708 * flushed already. 7851 * flushed already.
7709 */ 7852 */
7710 if (enable_pml) 7853 if (enable_pml)
7711 vmx_flush_pml_buffer(vmx); 7854 vmx_flush_pml_buffer(vcpu);
7712 7855
7713 /* If guest state is invalid, start emulating */ 7856 /* If guest state is invalid, start emulating */
7714 if (vmx->emulation_required) 7857 if (vmx->emulation_required)
@@ -7722,6 +7865,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7722 } 7865 }
7723 7866
7724 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 7867 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
7868 dump_vmcs();
7725 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 7869 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7726 vcpu->run->fail_entry.hardware_entry_failure_reason 7870 vcpu->run->fail_entry.hardware_entry_failure_reason
7727 = exit_reason; 7871 = exit_reason;
@@ -7995,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
7995 local_irq_enable(); 8139 local_irq_enable();
7996} 8140}
7997 8141
8142static bool vmx_has_high_real_mode_segbase(void)
8143{
8144 return enable_unrestricted_guest || emulate_invalid_guest_state;
8145}
8146
7998static bool vmx_mpx_supported(void) 8147static bool vmx_mpx_supported(void)
7999{ 8148{
8000 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && 8149 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -8479,7 +8628,8 @@ static int get_ept_level(void)
8479 8628
8480static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 8629static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
8481{ 8630{
8482 u64 ret; 8631 u8 cache;
8632 u64 ipat = 0;
8483 8633
8484 /* For VT-d and EPT combination 8634 /* For VT-d and EPT combination
8485 * 1. MMIO: always map as UC 8635 * 1. MMIO: always map as UC
@@ -8492,16 +8642,27 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
8492 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep 8642 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
8493 * consistent with host MTRR 8643 * consistent with host MTRR
8494 */ 8644 */
8495 if (is_mmio) 8645 if (is_mmio) {
8496 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 8646 cache = MTRR_TYPE_UNCACHABLE;
8497 else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) 8647 goto exit;
8498 ret = kvm_get_guest_memory_type(vcpu, gfn) << 8648 }
8499 VMX_EPT_MT_EPTE_SHIFT;
8500 else
8501 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
8502 | VMX_EPT_IPAT_BIT;
8503 8649
8504 return ret; 8650 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
8651 ipat = VMX_EPT_IPAT_BIT;
8652 cache = MTRR_TYPE_WRBACK;
8653 goto exit;
8654 }
8655
8656 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
8657 ipat = VMX_EPT_IPAT_BIT;
8658 cache = MTRR_TYPE_UNCACHABLE;
8659 goto exit;
8660 }
8661
8662 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
8663
8664exit:
8665 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
8505} 8666}
8506 8667
8507static int vmx_get_lpage_level(void) 8668static int vmx_get_lpage_level(void)
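The rewritten vmx_get_mt_mask() above replaces the nested conditional expression with an explicit decision tree: MMIO is always UC, guests without non-coherent DMA get WB with IPAT forced, CR0.CD forces UC with IPAT, and only the remaining case consults the guest MTRRs. A standalone sketch of that logic under the usual EPT leaf-entry layout (memory type in bits 5:3, ignore-PAT in bit 6); constants and helpers here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MTRR_TYPE_UNCACHABLE	0
#define MTRR_TYPE_WRBACK	6
#define EPT_MT_SHIFT		3
#define EPT_IPAT_BIT		(1ULL << 6)

static uint64_t ept_memtype(bool is_mmio, bool noncoherent_dma,
			    bool cr0_cd, uint8_t mtrr_type)
{
	uint8_t cache;
	uint64_t ipat = 0;

	if (is_mmio) {
		cache = MTRR_TYPE_UNCACHABLE;	/* MMIO is always UC */
	} else if (!noncoherent_dma) {
		ipat = EPT_IPAT_BIT;		/* no non-coherent DMA: force WB */
		cache = MTRR_TYPE_WRBACK;
	} else if (cr0_cd) {
		ipat = EPT_IPAT_BIT;		/* caches disabled: force UC */
		cache = MTRR_TYPE_UNCACHABLE;
	} else {
		cache = mtrr_type;		/* otherwise honor the guest MTRRs */
	}
	return ((uint64_t)cache << EPT_MT_SHIFT) | ipat;
}

int main(void)
{
	printf("mmio:	%#llx\n", (unsigned long long)ept_memtype(true, false, false, 6));
	printf("no dma: %#llx\n", (unsigned long long)ept_memtype(false, false, false, 6));
	printf("cd set: %#llx\n", (unsigned long long)ept_memtype(false, true, true, 6));
	return 0;
}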
@@ -8923,7 +9084,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8923 struct vmx_msr_entry *e) 9084 struct vmx_msr_entry *e)
8924{ 9085{
8925 /* x2APIC MSR accesses are not allowed */ 9086 /* x2APIC MSR accesses are not allowed */
8926 if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) 9087 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
8927 return -EINVAL; 9088 return -EINVAL;
8928 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ 9089 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8929 e->index == MSR_IA32_UCODE_REV) 9090 e->index == MSR_IA32_UCODE_REV)
@@ -8965,8 +9126,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8965 9126
8966 msr.host_initiated = false; 9127 msr.host_initiated = false;
8967 for (i = 0; i < count; i++) { 9128 for (i = 0; i < count; i++) {
8968 if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), 9129 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
8969 &e, sizeof(e))) { 9130 &e, sizeof(e))) {
8970 pr_warn_ratelimited( 9131 pr_warn_ratelimited(
8971 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9132 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8972 __func__, i, gpa + i * sizeof(e)); 9133 __func__, i, gpa + i * sizeof(e));
@@ -8998,9 +9159,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8998 struct vmx_msr_entry e; 9159 struct vmx_msr_entry e;
8999 9160
9000 for (i = 0; i < count; i++) { 9161 for (i = 0; i < count; i++) {
9001 if (kvm_read_guest(vcpu->kvm, 9162 struct msr_data msr_info;
9002 gpa + i * sizeof(e), 9163 if (kvm_vcpu_read_guest(vcpu,
9003 &e, 2 * sizeof(u32))) { 9164 gpa + i * sizeof(e),
9165 &e, 2 * sizeof(u32))) {
9004 pr_warn_ratelimited( 9166 pr_warn_ratelimited(
9005 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9167 "%s cannot read MSR entry (%u, 0x%08llx)\n",
9006 __func__, i, gpa + i * sizeof(e)); 9168 __func__, i, gpa + i * sizeof(e));
@@ -9012,19 +9174,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
9012 __func__, i, e.index, e.reserved); 9174 __func__, i, e.index, e.reserved);
9013 return -EINVAL; 9175 return -EINVAL;
9014 } 9176 }
9015 if (kvm_get_msr(vcpu, e.index, &e.value)) { 9177 msr_info.host_initiated = false;
9178 msr_info.index = e.index;
9179 if (kvm_get_msr(vcpu, &msr_info)) {
9016 pr_warn_ratelimited( 9180 pr_warn_ratelimited(
9017 "%s cannot read MSR (%u, 0x%x)\n", 9181 "%s cannot read MSR (%u, 0x%x)\n",
9018 __func__, i, e.index); 9182 __func__, i, e.index);
9019 return -EINVAL; 9183 return -EINVAL;
9020 } 9184 }
9021 if (kvm_write_guest(vcpu->kvm, 9185 if (kvm_vcpu_write_guest(vcpu,
9022 gpa + i * sizeof(e) + 9186 gpa + i * sizeof(e) +
9023 offsetof(struct vmx_msr_entry, value), 9187 offsetof(struct vmx_msr_entry, value),
9024 &e.value, sizeof(e.value))) { 9188 &msr_info.data, sizeof(msr_info.data))) {
9025 pr_warn_ratelimited( 9189 pr_warn_ratelimited(
9026 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9190 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9027 __func__, i, e.index, e.value); 9191 __func__, i, e.index, msr_info.data);
9028 return -EINVAL; 9192 return -EINVAL;
9029 } 9193 }
9030 } 9194 }
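Both nested_vmx_load_msr() and nested_vmx_store_msr() above walk the VM-entry/VM-exit MSR area one struct vmx_msr_entry at a time, now through the vcpu-based guest accessors. A self-contained sketch of that record layout and walk, where read_guest() merely stands in for kvm_vcpu_read_guest() and copies from a local buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msr_entry {
	uint32_t index;
	uint32_t reserved;	/* must be zero, per the checks in the hunk */
	uint64_t value;
};

static int read_guest(const void *guest_mem, uint64_t gpa, void *dst, size_t len)
{
	memcpy(dst, (const uint8_t *)guest_mem + gpa, len);
	return 0;
}

static int load_msr_area(const void *guest_mem, uint64_t gpa, uint32_t count)
{
	struct msr_entry e;
	uint32_t i;

	for (i = 0; i < count; i++) {
		if (read_guest(guest_mem, gpa + i * sizeof(e), &e, sizeof(e)))
			return -1;
		if (e.reserved != 0)	/* malformed entry aborts the load */
			return -1;
		printf("load MSR %#x <- %#llx\n", e.index,
		       (unsigned long long)e.value);
	}
	return 0;
}

int main(void)
{
	struct msr_entry area[2] = {
		{ .index = 0x174, .value = 0x10 },	/* hypothetical entries */
		{ .index = 0x175, .value = 0x20 },
	};
	return load_msr_area(area, 0, 2);
}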
@@ -10149,6 +10313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
10149 .hardware_enable = hardware_enable, 10313 .hardware_enable = hardware_enable,
10150 .hardware_disable = hardware_disable, 10314 .hardware_disable = hardware_disable,
10151 .cpu_has_accelerated_tpr = report_flexpriority, 10315 .cpu_has_accelerated_tpr = report_flexpriority,
10316 .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
10152 10317
10153 .vcpu_create = vmx_create_vcpu, 10318 .vcpu_create = vmx_create_vcpu,
10154 .vcpu_free = vmx_free_vcpu, 10319 .vcpu_free = vmx_free_vcpu,
@@ -10254,6 +10419,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
10254 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 10419 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10255 .flush_log_dirty = vmx_flush_log_dirty, 10420 .flush_log_dirty = vmx_flush_log_dirty,
10256 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10421 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
10422
10423 .pmu_ops = &intel_pmu_ops,
10257}; 10424};
10258 10425
10259static int __init vmx_init(void) 10426static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26eaeb522cab..ac165c2fb8e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -28,6 +28,7 @@
28#include "x86.h" 28#include "x86.h"
29#include "cpuid.h" 29#include "cpuid.h"
30#include "assigned-dev.h" 30#include "assigned-dev.h"
31#include "pmu.h"
31 32
32#include <linux/clocksource.h> 33#include <linux/clocksource.h>
33#include <linux/interrupt.h> 34#include <linux/interrupt.h>
@@ -57,7 +58,6 @@
57#include <asm/debugreg.h> 58#include <asm/debugreg.h>
58#include <asm/msr.h> 59#include <asm/msr.h>
59#include <asm/desc.h> 60#include <asm/desc.h>
60#include <asm/mtrr.h>
61#include <asm/mce.h> 61#include <asm/mce.h>
62#include <linux/kernel_stat.h> 62#include <linux/kernel_stat.h>
63#include <asm/fpu/internal.h> /* Ugh! */ 63#include <asm/fpu/internal.h> /* Ugh! */
@@ -98,6 +98,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
98unsigned int min_timer_period_us = 500; 98unsigned int min_timer_period_us = 500;
99module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); 99module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
100 100
101static bool __read_mostly kvmclock_periodic_sync = true;
102module_param(kvmclock_periodic_sync, bool, S_IRUGO);
103
101bool kvm_has_tsc_control; 104bool kvm_has_tsc_control;
102EXPORT_SYMBOL_GPL(kvm_has_tsc_control); 105EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
103u32 kvm_max_guest_tsc_khz; 106u32 kvm_max_guest_tsc_khz;
@@ -474,7 +477,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr);
474 477
475/* 478/*
476 * This function will be used to read from the physical memory of the currently 479 * This function will be used to read from the physical memory of the currently
477 * running guest. The difference to kvm_read_guest_page is that this function 480 * running guest. The difference to kvm_vcpu_read_guest_page is that this function
478 * can read from guest physical or from the guest's guest physical memory. 481 * can read from guest physical or from the guest's guest physical memory.
479 */ 482 */
480int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 483int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -492,7 +495,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
492 495
493 real_gfn = gpa_to_gfn(real_gfn); 496 real_gfn = gpa_to_gfn(real_gfn);
494 497
495 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); 498 return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
496} 499}
497EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 500EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
498 501
@@ -571,8 +574,7 @@ out:
571int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 574int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
572{ 575{
573 unsigned long old_cr0 = kvm_read_cr0(vcpu); 576 unsigned long old_cr0 = kvm_read_cr0(vcpu);
574 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 577 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
575 X86_CR0_CD | X86_CR0_NW;
576 578
577 cr0 |= X86_CR0_ET; 579 cr0 |= X86_CR0_ET;
578 580
@@ -618,6 +620,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
618 620
619 if ((cr0 ^ old_cr0) & update_bits) 621 if ((cr0 ^ old_cr0) & update_bits)
620 kvm_mmu_reset_context(vcpu); 622 kvm_mmu_reset_context(vcpu);
623
624 if ((cr0 ^ old_cr0) & X86_CR0_CD)
625 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
626
621 return 0; 627 return 0;
622} 628}
623EXPORT_SYMBOL_GPL(kvm_set_cr0); 629EXPORT_SYMBOL_GPL(kvm_set_cr0);
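The kvm_set_cr0() hunk above drops CD/NW from the bits that force an MMU reset and instead zaps the whole GFN range when CR0.CD toggles, so the EPT memory types get recomputed on the next faults. The changed-bit test itself is plain XOR-and-mask; a tiny standalone demo:

#include <stdio.h>

#define X86_CR0_CD (1UL << 30)

int main(void)
{
	unsigned long old_cr0 = 0x80050033UL;		/* CD clear */
	unsigned long new_cr0 = old_cr0 | X86_CR0_CD;	/* guest sets CD */

	if ((new_cr0 ^ old_cr0) & X86_CR0_CD)
		printf("CR0.CD toggled: drop cached mappings so memory types are recomputed\n");
	return 0;
}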
@@ -907,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
907 u64 data; 913 u64 data;
908 int err; 914 int err;
909 915
910 err = kvm_pmu_read_pmc(vcpu, ecx, &data); 916 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
911 if (err) 917 if (err)
912 return err; 918 return err;
913 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); 919 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
@@ -922,17 +928,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
922 * 928 *
923 * This list is modified at module load time to reflect the 929 * This list is modified at module load time to reflect the
924 * capabilities of the host cpu. This capabilities test skips MSRs that are 930 * capabilities of the host cpu. This capabilities test skips MSRs that are
925 * kvm-specific. Those are put in the beginning of the list. 931 * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
932 * may depend on host virtualization features rather than host cpu features.
926 */ 933 */
927 934
928#define KVM_SAVE_MSRS_BEGIN 12
929static u32 msrs_to_save[] = { 935static u32 msrs_to_save[] = {
930 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
931 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
932 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
933 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
934 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
935 MSR_KVM_PV_EOI_EN,
936 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 936 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
937 MSR_STAR, 937 MSR_STAR,
938#ifdef CONFIG_X86_64 938#ifdef CONFIG_X86_64
@@ -944,14 +944,24 @@ static u32 msrs_to_save[] = {
944 944
945static unsigned num_msrs_to_save; 945static unsigned num_msrs_to_save;
946 946
947static const u32 emulated_msrs[] = { 947static u32 emulated_msrs[] = {
948 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
949 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
950 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
951 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
952 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
953 MSR_KVM_PV_EOI_EN,
954
948 MSR_IA32_TSC_ADJUST, 955 MSR_IA32_TSC_ADJUST,
949 MSR_IA32_TSCDEADLINE, 956 MSR_IA32_TSCDEADLINE,
950 MSR_IA32_MISC_ENABLE, 957 MSR_IA32_MISC_ENABLE,
951 MSR_IA32_MCG_STATUS, 958 MSR_IA32_MCG_STATUS,
952 MSR_IA32_MCG_CTL, 959 MSR_IA32_MCG_CTL,
960 MSR_IA32_SMBASE,
953}; 961};
954 962
963static unsigned num_emulated_msrs;
964
955bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 965bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
956{ 966{
957 if (efer & efer_reserved_bits) 967 if (efer & efer_reserved_bits)
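With the split above, the KVM paravirtual and Hyper-V MSRs move from msrs_to_save into emulated_msrs, and both lists are reported together through the MSR index list ioctl, now sized by num_emulated_msrs. A hedged userspace sketch of querying that combined list; the fixed 256-entry buffer is an assumption, and a robust caller retries on E2BIG with the count the kernel writes back:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	struct kvm_msr_list *list;
	unsigned int i;

	if (kvm < 0)
		return 1;
	list = calloc(1, sizeof(*list) + 256 * sizeof(list->indices[0]));
	if (!list)
		return 1;
	list->nmsrs = 256;
	if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		perror("KVM_GET_MSR_INDEX_LIST");
		return 1;
	}
	for (i = 0; i < list->nmsrs; i++)
		printf("MSR %#x\n", list->indices[i]);
	return 0;
}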
@@ -1045,6 +1055,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr);
1045/* 1055/*
1046 * Adapt set_msr() to msr_io()'s calling convention 1056 * Adapt set_msr() to msr_io()'s calling convention
1047 */ 1057 */
1058static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1059{
1060 struct msr_data msr;
1061 int r;
1062
1063 msr.index = index;
1064 msr.host_initiated = true;
1065 r = kvm_get_msr(vcpu, &msr);
1066 if (r)
1067 return r;
1068
1069 *data = msr.data;
1070 return 0;
1071}
1072
1048static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1073static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1049{ 1074{
1050 struct msr_data msr; 1075 struct msr_data msr;
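do_get_msr() above is only an adapter: msr_io() still hands around (index, u64 *) callbacks, while the MSR core now wants a struct msr_data carrying index, data and host_initiated. A generic, illustrative-only sketch of that wrapper pattern, with made-up names:

#include <stdint.h>
#include <stdio.h>

struct msr_data {
	int host_initiated;
	uint32_t index;
	uint64_t data;
};

static int core_get_msr(struct msr_data *msr)
{
	msr->data = 0x1000 + msr->index;	/* pretend MSR contents */
	return 0;
}

static int do_get_msr(uint32_t index, uint64_t *data)
{
	struct msr_data msr = { .host_initiated = 1, .index = index };
	int r = core_get_msr(&msr);

	if (r)
		return r;
	*data = msr.data;
	return 0;
}

int main(void)
{
	uint64_t val;

	if (!do_get_msr(0x10, &val))
		printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
	return 0;
}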
@@ -1697,6 +1722,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1697 vcpu->pvclock_set_guest_stopped_request = false; 1722 vcpu->pvclock_set_guest_stopped_request = false;
1698 } 1723 }
1699 1724
1725 pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
1726
1700 /* If the host uses TSC clocksource, then it is stable */ 1727 /* If the host uses TSC clocksource, then it is stable */
1701 if (use_master_clock) 1728 if (use_master_clock)
1702 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 1729 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1767,127 +1794,14 @@ static void kvmclock_sync_fn(struct work_struct *work)
1767 kvmclock_sync_work); 1794 kvmclock_sync_work);
1768 struct kvm *kvm = container_of(ka, struct kvm, arch); 1795 struct kvm *kvm = container_of(ka, struct kvm, arch);
1769 1796
1797 if (!kvmclock_periodic_sync)
1798 return;
1799
1770 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 1800 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
1771 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 1801 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
1772 KVMCLOCK_SYNC_PERIOD); 1802 KVMCLOCK_SYNC_PERIOD);
1773} 1803}
1774 1804
1775static bool msr_mtrr_valid(unsigned msr)
1776{
1777 switch (msr) {
1778 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1779 case MSR_MTRRfix64K_00000:
1780 case MSR_MTRRfix16K_80000:
1781 case MSR_MTRRfix16K_A0000:
1782 case MSR_MTRRfix4K_C0000:
1783 case MSR_MTRRfix4K_C8000:
1784 case MSR_MTRRfix4K_D0000:
1785 case MSR_MTRRfix4K_D8000:
1786 case MSR_MTRRfix4K_E0000:
1787 case MSR_MTRRfix4K_E8000:
1788 case MSR_MTRRfix4K_F0000:
1789 case MSR_MTRRfix4K_F8000:
1790 case MSR_MTRRdefType:
1791 case MSR_IA32_CR_PAT:
1792 return true;
1793 case 0x2f8:
1794 return true;
1795 }
1796 return false;
1797}
1798
1799static bool valid_pat_type(unsigned t)
1800{
1801 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1802}
1803
1804static bool valid_mtrr_type(unsigned t)
1805{
1806 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1807}
1808
1809bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1810{
1811 int i;
1812 u64 mask;
1813
1814 if (!msr_mtrr_valid(msr))
1815 return false;
1816
1817 if (msr == MSR_IA32_CR_PAT) {
1818 for (i = 0; i < 8; i++)
1819 if (!valid_pat_type((data >> (i * 8)) & 0xff))
1820 return false;
1821 return true;
1822 } else if (msr == MSR_MTRRdefType) {
1823 if (data & ~0xcff)
1824 return false;
1825 return valid_mtrr_type(data & 0xff);
1826 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1827 for (i = 0; i < 8 ; i++)
1828 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1829 return false;
1830 return true;
1831 }
1832
1833 /* variable MTRRs */
1834 WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
1835
1836 mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1837 if ((msr & 1) == 0) {
1838 /* MTRR base */
1839 if (!valid_mtrr_type(data & 0xff))
1840 return false;
1841 mask |= 0xf00;
1842 } else
1843 /* MTRR mask */
1844 mask |= 0x7ff;
1845 if (data & mask) {
1846 kvm_inject_gp(vcpu, 0);
1847 return false;
1848 }
1849
1850 return true;
1851}
1852EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
1853
1854static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1855{
1856 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1857
1858 if (!kvm_mtrr_valid(vcpu, msr, data))
1859 return 1;
1860
1861 if (msr == MSR_MTRRdefType) {
1862 vcpu->arch.mtrr_state.def_type = data;
1863 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1864 } else if (msr == MSR_MTRRfix64K_00000)
1865 p[0] = data;
1866 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1867 p[1 + msr - MSR_MTRRfix16K_80000] = data;
1868 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1869 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1870 else if (msr == MSR_IA32_CR_PAT)
1871 vcpu->arch.pat = data;
1872 else { /* Variable MTRRs */
1873 int idx, is_mtrr_mask;
1874 u64 *pt;
1875
1876 idx = (msr - 0x200) / 2;
1877 is_mtrr_mask = msr - 0x200 - 2 * idx;
1878 if (!is_mtrr_mask)
1879 pt =
1880 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1881 else
1882 pt =
1883 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1884 *pt = data;
1885 }
1886
1887 kvm_mmu_reset_context(vcpu);
1888 return 0;
1889}
1890
1891static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1805static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1892{ 1806{
1893 u64 mcg_cap = vcpu->arch.mcg_cap; 1807 u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -1946,7 +1860,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1946 r = PTR_ERR(page); 1860 r = PTR_ERR(page);
1947 goto out; 1861 goto out;
1948 } 1862 }
1949 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1863 if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
1950 goto out_free; 1864 goto out_free;
1951 r = 0; 1865 r = 0;
1952out_free: 1866out_free:
@@ -2046,13 +1960,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
2046 break; 1960 break;
2047 } 1961 }
2048 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; 1962 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
2049 addr = gfn_to_hva(vcpu->kvm, gfn); 1963 addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
2050 if (kvm_is_error_hva(addr)) 1964 if (kvm_is_error_hva(addr))
2051 return 1; 1965 return 1;
2052 if (__clear_user((void __user *)addr, PAGE_SIZE)) 1966 if (__clear_user((void __user *)addr, PAGE_SIZE))
2053 return 1; 1967 return 1;
2054 vcpu->arch.hv_vapic = data; 1968 vcpu->arch.hv_vapic = data;
2055 mark_page_dirty(vcpu->kvm, gfn); 1969 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2056 if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) 1970 if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
2057 return 1; 1971 return 1;
2058 break; 1972 break;
@@ -2179,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2179 __func__, data); 2093 __func__, data);
2180 break; 2094 break;
2181 case 0x200 ... 0x2ff: 2095 case 0x200 ... 0x2ff:
2182 return set_msr_mtrr(vcpu, msr, data); 2096 return kvm_mtrr_set_msr(vcpu, msr, data);
2183 case MSR_IA32_APICBASE: 2097 case MSR_IA32_APICBASE:
2184 return kvm_set_apic_base(vcpu, msr_info); 2098 return kvm_set_apic_base(vcpu, msr_info);
2185 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 2099 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
@@ -2199,6 +2113,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2199 case MSR_IA32_MISC_ENABLE: 2113 case MSR_IA32_MISC_ENABLE:
2200 vcpu->arch.ia32_misc_enable_msr = data; 2114 vcpu->arch.ia32_misc_enable_msr = data;
2201 break; 2115 break;
2116 case MSR_IA32_SMBASE:
2117 if (!msr_info->host_initiated)
2118 return 1;
2119 vcpu->arch.smbase = data;
2120 break;
2202 case MSR_KVM_WALL_CLOCK_NEW: 2121 case MSR_KVM_WALL_CLOCK_NEW:
2203 case MSR_KVM_WALL_CLOCK: 2122 case MSR_KVM_WALL_CLOCK:
2204 vcpu->kvm->arch.wall_clock = data; 2123 vcpu->kvm->arch.wall_clock = data;
@@ -2219,6 +2138,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2219 &vcpu->requests); 2138 &vcpu->requests);
2220 2139
2221 ka->boot_vcpu_runs_old_kvmclock = tmp; 2140 ka->boot_vcpu_runs_old_kvmclock = tmp;
2141
2142 ka->kvmclock_offset = -get_kernel_ns();
2222 } 2143 }
2223 2144
2224 vcpu->arch.time = data; 2145 vcpu->arch.time = data;
@@ -2280,37 +2201,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2280 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2201 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2281 return set_msr_mce(vcpu, msr, data); 2202 return set_msr_mce(vcpu, msr, data);
2282 2203
2283 /* Performance counters are not protected by a CPUID bit, 2204 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2284 * so we should check all of them in the generic path for the sake of 2205 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2285 * cross vendor migration. 2206 pr = true; /* fall through */
2286 * Writing a zero into the event select MSRs disables them, 2207 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2287 * which we perfectly emulate ;-). Any other value should be at least 2208 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2288 * reported, some guests depend on them. 2209 if (kvm_pmu_is_valid_msr(vcpu, msr))
2289 */
2290 case MSR_K7_EVNTSEL0:
2291 case MSR_K7_EVNTSEL1:
2292 case MSR_K7_EVNTSEL2:
2293 case MSR_K7_EVNTSEL3:
2294 if (data != 0)
2295 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2296 "0x%x data 0x%llx\n", msr, data);
2297 break;
2298 /* at least RHEL 4 unconditionally writes to the perfctr registers,
2299 * so we ignore writes to make it happy.
2300 */
2301 case MSR_K7_PERFCTR0:
2302 case MSR_K7_PERFCTR1:
2303 case MSR_K7_PERFCTR2:
2304 case MSR_K7_PERFCTR3:
2305 vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2306 "0x%x data 0x%llx\n", msr, data);
2307 break;
2308 case MSR_P6_PERFCTR0:
2309 case MSR_P6_PERFCTR1:
2310 pr = true;
2311 case MSR_P6_EVNTSEL0:
2312 case MSR_P6_EVNTSEL1:
2313 if (kvm_pmu_msr(vcpu, msr))
2314 return kvm_pmu_set_msr(vcpu, msr_info); 2210 return kvm_pmu_set_msr(vcpu, msr_info);
2315 2211
2316 if (pr || data != 0) 2212 if (pr || data != 0)
@@ -2356,7 +2252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2356 default: 2252 default:
2357 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 2253 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2358 return xen_hvm_config(vcpu, data); 2254 return xen_hvm_config(vcpu, data);
2359 if (kvm_pmu_msr(vcpu, msr)) 2255 if (kvm_pmu_is_valid_msr(vcpu, msr))
2360 return kvm_pmu_set_msr(vcpu, msr_info); 2256 return kvm_pmu_set_msr(vcpu, msr_info);
2361 if (!ignore_msrs) { 2257 if (!ignore_msrs) {
2362 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2258 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
@@ -2378,48 +2274,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2378 * Returns 0 on success, non-0 otherwise. 2274 * Returns 0 on success, non-0 otherwise.
2379 * Assumes vcpu_load() was already called. 2275 * Assumes vcpu_load() was already called.
2380 */ 2276 */
2381int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2277int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2382{ 2278{
2383 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2279 return kvm_x86_ops->get_msr(vcpu, msr);
2384} 2280}
2385EXPORT_SYMBOL_GPL(kvm_get_msr); 2281EXPORT_SYMBOL_GPL(kvm_get_msr);
2386 2282
2387static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2388{
2389 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2390
2391 if (!msr_mtrr_valid(msr))
2392 return 1;
2393
2394 if (msr == MSR_MTRRdefType)
2395 *pdata = vcpu->arch.mtrr_state.def_type +
2396 (vcpu->arch.mtrr_state.enabled << 10);
2397 else if (msr == MSR_MTRRfix64K_00000)
2398 *pdata = p[0];
2399 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2400 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2401 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2402 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2403 else if (msr == MSR_IA32_CR_PAT)
2404 *pdata = vcpu->arch.pat;
2405 else { /* Variable MTRRs */
2406 int idx, is_mtrr_mask;
2407 u64 *pt;
2408
2409 idx = (msr - 0x200) / 2;
2410 is_mtrr_mask = msr - 0x200 - 2 * idx;
2411 if (!is_mtrr_mask)
2412 pt =
2413 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2414 else
2415 pt =
2416 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2417 *pdata = *pt;
2418 }
2419
2420 return 0;
2421}
2422
2423static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2283static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2424{ 2284{
2425 u64 data; 2285 u64 data;
@@ -2517,11 +2377,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2517 return 0; 2377 return 0;
2518} 2378}
2519 2379
2520int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2380int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2521{ 2381{
2522 u64 data; 2382 u64 data;
2523 2383
2524 switch (msr) { 2384 switch (msr_info->index) {
2525 case MSR_IA32_PLATFORM_ID: 2385 case MSR_IA32_PLATFORM_ID:
2526 case MSR_IA32_EBL_CR_POWERON: 2386 case MSR_IA32_EBL_CR_POWERON:
2527 case MSR_IA32_DEBUGCTLMSR: 2387 case MSR_IA32_DEBUGCTLMSR:
@@ -2532,38 +2392,28 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2532 case MSR_K8_SYSCFG: 2392 case MSR_K8_SYSCFG:
2533 case MSR_K7_HWCR: 2393 case MSR_K7_HWCR:
2534 case MSR_VM_HSAVE_PA: 2394 case MSR_VM_HSAVE_PA:
2535 case MSR_K7_EVNTSEL0:
2536 case MSR_K7_EVNTSEL1:
2537 case MSR_K7_EVNTSEL2:
2538 case MSR_K7_EVNTSEL3:
2539 case MSR_K7_PERFCTR0:
2540 case MSR_K7_PERFCTR1:
2541 case MSR_K7_PERFCTR2:
2542 case MSR_K7_PERFCTR3:
2543 case MSR_K8_INT_PENDING_MSG: 2395 case MSR_K8_INT_PENDING_MSG:
2544 case MSR_AMD64_NB_CFG: 2396 case MSR_AMD64_NB_CFG:
2545 case MSR_FAM10H_MMIO_CONF_BASE: 2397 case MSR_FAM10H_MMIO_CONF_BASE:
2546 case MSR_AMD64_BU_CFG2: 2398 case MSR_AMD64_BU_CFG2:
2547 data = 0; 2399 msr_info->data = 0;
2548 break; 2400 break;
2549 case MSR_P6_PERFCTR0: 2401 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2550 case MSR_P6_PERFCTR1: 2402 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2551 case MSR_P6_EVNTSEL0: 2403 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2552 case MSR_P6_EVNTSEL1: 2404 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2553 if (kvm_pmu_msr(vcpu, msr)) 2405 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2554 return kvm_pmu_get_msr(vcpu, msr, pdata); 2406 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2555 data = 0; 2407 msr_info->data = 0;
2556 break; 2408 break;
2557 case MSR_IA32_UCODE_REV: 2409 case MSR_IA32_UCODE_REV:
2558 data = 0x100000000ULL; 2410 msr_info->data = 0x100000000ULL;
2559 break; 2411 break;
2560 case MSR_MTRRcap: 2412 case MSR_MTRRcap:
2561 data = 0x500 | KVM_NR_VAR_MTRR;
2562 break;
2563 case 0x200 ... 0x2ff: 2413 case 0x200 ... 0x2ff:
2564 return get_msr_mtrr(vcpu, msr, pdata); 2414 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2565 case 0xcd: /* fsb frequency */ 2415 case 0xcd: /* fsb frequency */
2566 data = 3; 2416 msr_info->data = 3;
2567 break; 2417 break;
2568 /* 2418 /*
2569 * MSR_EBC_FREQUENCY_ID 2419 * MSR_EBC_FREQUENCY_ID
@@ -2577,48 +2427,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2577 * multiplying by zero otherwise. 2427 * multiplying by zero otherwise.
2578 */ 2428 */
2579 case MSR_EBC_FREQUENCY_ID: 2429 case MSR_EBC_FREQUENCY_ID:
2580 data = 1 << 24; 2430 msr_info->data = 1 << 24;
2581 break; 2431 break;
2582 case MSR_IA32_APICBASE: 2432 case MSR_IA32_APICBASE:
2583 data = kvm_get_apic_base(vcpu); 2433 msr_info->data = kvm_get_apic_base(vcpu);
2584 break; 2434 break;
2585 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 2435 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2586 return kvm_x2apic_msr_read(vcpu, msr, pdata); 2436 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2587 break; 2437 break;
2588 case MSR_IA32_TSCDEADLINE: 2438 case MSR_IA32_TSCDEADLINE:
2589 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2439 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2590 break; 2440 break;
2591 case MSR_IA32_TSC_ADJUST: 2441 case MSR_IA32_TSC_ADJUST:
2592 data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 2442 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2593 break; 2443 break;
2594 case MSR_IA32_MISC_ENABLE: 2444 case MSR_IA32_MISC_ENABLE:
2595 data = vcpu->arch.ia32_misc_enable_msr; 2445 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2446 break;
2447 case MSR_IA32_SMBASE:
2448 if (!msr_info->host_initiated)
2449 return 1;
2450 msr_info->data = vcpu->arch.smbase;
2596 break; 2451 break;
2597 case MSR_IA32_PERF_STATUS: 2452 case MSR_IA32_PERF_STATUS:
2598 /* TSC increment by tick */ 2453 /* TSC increment by tick */
2599 data = 1000ULL; 2454 msr_info->data = 1000ULL;
2600 /* CPU multiplier */ 2455 /* CPU multiplier */
2601 data |= (((uint64_t)4ULL) << 40); 2456 data |= (((uint64_t)4ULL) << 40);
2602 break; 2457 break;
2603 case MSR_EFER: 2458 case MSR_EFER:
2604 data = vcpu->arch.efer; 2459 msr_info->data = vcpu->arch.efer;
2605 break; 2460 break;
2606 case MSR_KVM_WALL_CLOCK: 2461 case MSR_KVM_WALL_CLOCK:
2607 case MSR_KVM_WALL_CLOCK_NEW: 2462 case MSR_KVM_WALL_CLOCK_NEW:
2608 data = vcpu->kvm->arch.wall_clock; 2463 msr_info->data = vcpu->kvm->arch.wall_clock;
2609 break; 2464 break;
2610 case MSR_KVM_SYSTEM_TIME: 2465 case MSR_KVM_SYSTEM_TIME:
2611 case MSR_KVM_SYSTEM_TIME_NEW: 2466 case MSR_KVM_SYSTEM_TIME_NEW:
2612 data = vcpu->arch.time; 2467 msr_info->data = vcpu->arch.time;
2613 break; 2468 break;
2614 case MSR_KVM_ASYNC_PF_EN: 2469 case MSR_KVM_ASYNC_PF_EN:
2615 data = vcpu->arch.apf.msr_val; 2470 msr_info->data = vcpu->arch.apf.msr_val;
2616 break; 2471 break;
2617 case MSR_KVM_STEAL_TIME: 2472 case MSR_KVM_STEAL_TIME:
2618 data = vcpu->arch.st.msr_val; 2473 msr_info->data = vcpu->arch.st.msr_val;
2619 break; 2474 break;
2620 case MSR_KVM_PV_EOI_EN: 2475 case MSR_KVM_PV_EOI_EN:
2621 data = vcpu->arch.pv_eoi.msr_val; 2476 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2622 break; 2477 break;
2623 case MSR_IA32_P5_MC_ADDR: 2478 case MSR_IA32_P5_MC_ADDR:
2624 case MSR_IA32_P5_MC_TYPE: 2479 case MSR_IA32_P5_MC_TYPE:
@@ -2626,7 +2481,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2626 case MSR_IA32_MCG_CTL: 2481 case MSR_IA32_MCG_CTL:
2627 case MSR_IA32_MCG_STATUS: 2482 case MSR_IA32_MCG_STATUS:
2628 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2483 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2629 return get_msr_mce(vcpu, msr, pdata); 2484 return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
2630 case MSR_K7_CLK_CTL: 2485 case MSR_K7_CLK_CTL:
2631 /* 2486 /*
2632 * Provide expected ramp-up count for K7. All other 2487 * Provide expected ramp-up count for K7. All other
@@ -2637,17 +2492,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2637 * type 6, model 8 and higher from exploding due to 2492 * type 6, model 8 and higher from exploding due to
2638 * the rdmsr failing. 2493 * the rdmsr failing.
2639 */ 2494 */
2640 data = 0x20000000; 2495 msr_info->data = 0x20000000;
2641 break; 2496 break;
2642 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 2497 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2643 if (kvm_hv_msr_partition_wide(msr)) { 2498 if (kvm_hv_msr_partition_wide(msr_info->index)) {
2644 int r; 2499 int r;
2645 mutex_lock(&vcpu->kvm->lock); 2500 mutex_lock(&vcpu->kvm->lock);
2646 r = get_msr_hyperv_pw(vcpu, msr, pdata); 2501 r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
2647 mutex_unlock(&vcpu->kvm->lock); 2502 mutex_unlock(&vcpu->kvm->lock);
2648 return r; 2503 return r;
2649 } else 2504 } else
2650 return get_msr_hyperv(vcpu, msr, pdata); 2505 return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
2651 break; 2506 break;
2652 case MSR_IA32_BBL_CR_CTL3: 2507 case MSR_IA32_BBL_CR_CTL3:
2653 /* This legacy MSR exists but isn't fully documented in current 2508 /* This legacy MSR exists but isn't fully documented in current
@@ -2660,31 +2515,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2660 * L2 cache control register 3: 64GB range, 256KB size, 2515 * L2 cache control register 3: 64GB range, 256KB size,
2661 * enabled, latency 0x1, configured 2516 * enabled, latency 0x1, configured
2662 */ 2517 */
2663 data = 0xbe702111; 2518 msr_info->data = 0xbe702111;
2664 break; 2519 break;
2665 case MSR_AMD64_OSVW_ID_LENGTH: 2520 case MSR_AMD64_OSVW_ID_LENGTH:
2666 if (!guest_cpuid_has_osvw(vcpu)) 2521 if (!guest_cpuid_has_osvw(vcpu))
2667 return 1; 2522 return 1;
2668 data = vcpu->arch.osvw.length; 2523 msr_info->data = vcpu->arch.osvw.length;
2669 break; 2524 break;
2670 case MSR_AMD64_OSVW_STATUS: 2525 case MSR_AMD64_OSVW_STATUS:
2671 if (!guest_cpuid_has_osvw(vcpu)) 2526 if (!guest_cpuid_has_osvw(vcpu))
2672 return 1; 2527 return 1;
2673 data = vcpu->arch.osvw.status; 2528 msr_info->data = vcpu->arch.osvw.status;
2674 break; 2529 break;
2675 default: 2530 default:
2676 if (kvm_pmu_msr(vcpu, msr)) 2531 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2677 return kvm_pmu_get_msr(vcpu, msr, pdata); 2532 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2678 if (!ignore_msrs) { 2533 if (!ignore_msrs) {
2679 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 2534 vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
2680 return 1; 2535 return 1;
2681 } else { 2536 } else {
2682 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 2537 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
2683 data = 0; 2538 msr_info->data = 0;
2684 } 2539 }
2685 break; 2540 break;
2686 } 2541 }
2687 *pdata = data;
2688 return 0; 2542 return 0;
2689} 2543}
2690EXPORT_SYMBOL_GPL(kvm_get_msr_common); 2544EXPORT_SYMBOL_GPL(kvm_get_msr_common);
@@ -2797,12 +2651,25 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2797 case KVM_CAP_HYPERV_TIME: 2651 case KVM_CAP_HYPERV_TIME:
2798 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 2652 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2799 case KVM_CAP_TSC_DEADLINE_TIMER: 2653 case KVM_CAP_TSC_DEADLINE_TIMER:
2654 case KVM_CAP_ENABLE_CAP_VM:
2655 case KVM_CAP_DISABLE_QUIRKS:
2800#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2656#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2801 case KVM_CAP_ASSIGN_DEV_IRQ: 2657 case KVM_CAP_ASSIGN_DEV_IRQ:
2802 case KVM_CAP_PCI_2_3: 2658 case KVM_CAP_PCI_2_3:
2803#endif 2659#endif
2804 r = 1; 2660 r = 1;
2805 break; 2661 break;
2662 case KVM_CAP_X86_SMM:
2663 /* SMBASE is usually relocated above 1M on modern chipsets,
2664 * and SMM handlers might indeed rely on 4G segment limits,
2665 * so do not report SMM to be available if real mode is
2666 * emulated via vm86 mode. Still, do not go to great lengths
2667 * to avoid userspace's usage of the feature, because it is a
2668 * fringe case that is not enabled except via specific settings
2669 * of the module parameters.
2670 */
2671 r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
2672 break;
2806 case KVM_CAP_COALESCED_MMIO: 2673 case KVM_CAP_COALESCED_MMIO:
2807 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 2674 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2808 break; 2675 break;
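As the comment above notes, KVM_CAP_X86_SMM is only advertised when real mode does not have to be emulated via vm86. A hedged userspace probe of the capability; it assumes kernel headers new enough to define KVM_CAP_X86_SMM:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0)
		return 1;
	if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_X86_SMM) > 0)
		printf("KVM_CAP_X86_SMM available: guest SMM can be used\n");
	else
		printf("SMM not reported (e.g. real mode emulated via vm86)\n");
	return 0;
}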
@@ -2859,7 +2726,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
2859 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 2726 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2860 goto out; 2727 goto out;
2861 n = msr_list.nmsrs; 2728 n = msr_list.nmsrs;
2862 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 2729 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
2863 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 2730 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2864 goto out; 2731 goto out;
2865 r = -E2BIG; 2732 r = -E2BIG;
@@ -2871,7 +2738,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
2871 goto out; 2738 goto out;
2872 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 2739 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2873 &emulated_msrs, 2740 &emulated_msrs,
2874 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 2741 num_emulated_msrs * sizeof(u32)))
2875 goto out; 2742 goto out;
2876 r = 0; 2743 r = 0;
2877 break; 2744 break;
@@ -3015,6 +2882,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3015 return 0; 2882 return 0;
3016} 2883}
3017 2884
2885static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
2886{
2887 kvm_make_request(KVM_REQ_SMI, vcpu);
2888
2889 return 0;
2890}
2891
3018static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 2892static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3019 struct kvm_tpr_access_ctl *tac) 2893 struct kvm_tpr_access_ctl *tac)
3020{ 2894{
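kvm_vcpu_ioctl_smi() above only queues KVM_REQ_SMI; the actual SMM entry happens on the next vcpu run. A hedged userspace sketch of triggering it through the new KVM_SMI vcpu ioctl, with the bare minimum VM/vCPU setup and no guest memory or run loop:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm, vcpu;

	if (kvm < 0)
		return 1;
	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	if (vcpu < 0)
		return 1;
	if (ioctl(vcpu, KVM_SMI, 0) == 0)
		printf("SMI queued; it is delivered on the next KVM_RUN\n");
	else
		perror("KVM_SMI (kernel or headers may predate SMM support)");
	return 0;
}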
@@ -3120,8 +2994,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3120 2994
3121 events->sipi_vector = 0; /* never valid when reporting to user space */ 2995 events->sipi_vector = 0; /* never valid when reporting to user space */
3122 2996
2997 events->smi.smm = is_smm(vcpu);
2998 events->smi.pending = vcpu->arch.smi_pending;
2999 events->smi.smm_inside_nmi =
3000 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3001 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3002
3123 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 3003 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3124 | KVM_VCPUEVENT_VALID_SHADOW); 3004 | KVM_VCPUEVENT_VALID_SHADOW
3005 | KVM_VCPUEVENT_VALID_SMM);
3125 memset(&events->reserved, 0, sizeof(events->reserved)); 3006 memset(&events->reserved, 0, sizeof(events->reserved));
3126} 3007}
3127 3008
@@ -3130,7 +3011,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3130{ 3011{
3131 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 3012 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3132 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 3013 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3133 | KVM_VCPUEVENT_VALID_SHADOW)) 3014 | KVM_VCPUEVENT_VALID_SHADOW
3015 | KVM_VCPUEVENT_VALID_SMM))
3134 return -EINVAL; 3016 return -EINVAL;
3135 3017
3136 process_nmi(vcpu); 3018 process_nmi(vcpu);
@@ -3155,6 +3037,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3155 kvm_vcpu_has_lapic(vcpu)) 3037 kvm_vcpu_has_lapic(vcpu))
3156 vcpu->arch.apic->sipi_vector = events->sipi_vector; 3038 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3157 3039
3040 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3041 if (events->smi.smm)
3042 vcpu->arch.hflags |= HF_SMM_MASK;
3043 else
3044 vcpu->arch.hflags &= ~HF_SMM_MASK;
3045 vcpu->arch.smi_pending = events->smi.pending;
3046 if (events->smi.smm_inside_nmi)
3047 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3048 else
3049 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3050 if (kvm_vcpu_has_lapic(vcpu)) {
3051 if (events->smi.latched_init)
3052 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3053 else
3054 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3055 }
3056 }
3057
3158 kvm_make_request(KVM_REQ_EVENT, vcpu); 3058 kvm_make_request(KVM_REQ_EVENT, vcpu);
3159 3059
3160 return 0; 3060 return 0;
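The KVM_GET/SET_VCPU_EVENTS extension above lets userspace (for example, during live migration) save and restore whether the vCPU is in SMM, whether an SMI is pending, and whether an INIT is latched. A hedged sketch of reading that state; it assumes headers new enough to carry the smi fields and KVM_VCPUEVENT_VALID_SMM:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	struct kvm_vcpu_events events;

	if (vcpu < 0 || ioctl(vcpu, KVM_GET_VCPU_EVENTS, &events) < 0)
		return 1;
	if (events.flags & KVM_VCPUEVENT_VALID_SMM)
		printf("in SMM: %u, SMI pending: %u, latched INIT: %u\n",
		       events.smi.smm, events.smi.pending,
		       events.smi.latched_init);
	return 0;
}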
@@ -3414,6 +3314,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3414 r = kvm_vcpu_ioctl_nmi(vcpu); 3314 r = kvm_vcpu_ioctl_nmi(vcpu);
3415 break; 3315 break;
3416 } 3316 }
3317 case KVM_SMI: {
3318 r = kvm_vcpu_ioctl_smi(vcpu);
3319 break;
3320 }
3417 case KVM_SET_CPUID: { 3321 case KVM_SET_CPUID: {
3418 struct kvm_cpuid __user *cpuid_arg = argp; 3322 struct kvm_cpuid __user *cpuid_arg = argp;
3419 struct kvm_cpuid cpuid; 3323 struct kvm_cpuid cpuid;
@@ -3453,7 +3357,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3453 break; 3357 break;
3454 } 3358 }
3455 case KVM_GET_MSRS: 3359 case KVM_GET_MSRS:
3456 r = msr_io(vcpu, argp, kvm_get_msr, 1); 3360 r = msr_io(vcpu, argp, do_get_msr, 1);
3457 break; 3361 break;
3458 case KVM_SET_MSRS: 3362 case KVM_SET_MSRS:
3459 r = msr_io(vcpu, argp, do_set_msr, 0); 3363 r = msr_io(vcpu, argp, do_set_msr, 0);
@@ -3844,6 +3748,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
3844 return 0; 3748 return 0;
3845} 3749}
3846 3750
3751static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3752 struct kvm_enable_cap *cap)
3753{
3754 int r;
3755
3756 if (cap->flags)
3757 return -EINVAL;
3758
3759 switch (cap->cap) {
3760 case KVM_CAP_DISABLE_QUIRKS:
3761 kvm->arch.disabled_quirks = cap->args[0];
3762 r = 0;
3763 break;
3764 default:
3765 r = -EINVAL;
3766 break;
3767 }
3768 return r;
3769}
3770
3847long kvm_arch_vm_ioctl(struct file *filp, 3771long kvm_arch_vm_ioctl(struct file *filp,
3848 unsigned int ioctl, unsigned long arg) 3772 unsigned int ioctl, unsigned long arg)
3849{ 3773{
@@ -4096,7 +4020,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
4096 r = 0; 4020 r = 0;
4097 break; 4021 break;
4098 } 4022 }
4023 case KVM_ENABLE_CAP: {
4024 struct kvm_enable_cap cap;
4099 4025
4026 r = -EFAULT;
4027 if (copy_from_user(&cap, argp, sizeof(cap)))
4028 goto out;
4029 r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4030 break;
4031 }
4100 default: 4032 default:
4101 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 4033 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
4102 } 4034 }
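KVM_ENABLE_CAP is now accepted on the VM file descriptor, and KVM_CAP_DISABLE_QUIRKS takes a bitmask of quirks to disable in args[0]. A hedged userspace sketch; the quirk bit used below is purely illustrative, and the real bit definitions come from the uapi headers of a matching kernel:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	struct kvm_enable_cap cap;

	if (vm < 0)
		return 1;
	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DISABLE_QUIRKS;
	cap.args[0] = 1 << 0;	/* hypothetical quirk bit */
	return ioctl(vm, KVM_ENABLE_CAP, &cap) ? 1 : 0;
}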
@@ -4109,8 +4041,7 @@ static void kvm_init_msr_list(void)
4109 u32 dummy[2]; 4041 u32 dummy[2];
4110 unsigned i, j; 4042 unsigned i, j;
4111 4043
4112 /* skip the first msrs in the list. KVM-specific */ 4044 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4113 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
4114 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 4045 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
4115 continue; 4046 continue;
4116 4047
@@ -4135,6 +4066,22 @@ static void kvm_init_msr_list(void)
4135 j++; 4066 j++;
4136 } 4067 }
4137 num_msrs_to_save = j; 4068 num_msrs_to_save = j;
4069
4070 for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4071 switch (emulated_msrs[i]) {
4072 case MSR_IA32_SMBASE:
4073 if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
4074 continue;
4075 break;
4076 default:
4077 break;
4078 }
4079
4080 if (j < i)
4081 emulated_msrs[j] = emulated_msrs[i];
4082 j++;
4083 }
4084 num_emulated_msrs = j;
4138} 4085}
4139 4086
4140static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 4087static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
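The new second loop in kvm_init_msr_list() filters emulated_msrs with the same in-place compaction idiom as the first loop: walk with index i, copy survivors down to index j, and let j become the new element count. A standalone demo of the idiom:

#include <stdio.h>

int main(void)
{
	unsigned int msrs[] = { 0x174, 0x175, 0x176, 0x9e, 0xc0000080 };
	unsigned int i, j, n = sizeof(msrs) / sizeof(msrs[0]);

	for (i = j = 0; i < n; i++) {
		if (msrs[i] == 0x9e)	/* pretend this MSR is unsupported */
			continue;
		if (j < i)
			msrs[j] = msrs[i];
		j++;
	}
	for (i = 0; i < j; i++)
		printf("keep MSR %#x\n", msrs[i]);
	return 0;
}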
@@ -4252,8 +4199,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4252 4199
4253 if (gpa == UNMAPPED_GVA) 4200 if (gpa == UNMAPPED_GVA)
4254 return X86EMUL_PROPAGATE_FAULT; 4201 return X86EMUL_PROPAGATE_FAULT;
4255 ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, 4202 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
4256 offset, toread); 4203 offset, toread);
4257 if (ret < 0) { 4204 if (ret < 0) {
4258 r = X86EMUL_IO_NEEDED; 4205 r = X86EMUL_IO_NEEDED;
4259 goto out; 4206 goto out;
@@ -4286,8 +4233,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
4286 offset = addr & (PAGE_SIZE-1); 4233 offset = addr & (PAGE_SIZE-1);
4287 if (WARN_ON(offset + bytes > PAGE_SIZE)) 4234 if (WARN_ON(offset + bytes > PAGE_SIZE))
4288 bytes = (unsigned)PAGE_SIZE - offset; 4235 bytes = (unsigned)PAGE_SIZE - offset;
4289 ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, 4236 ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
4290 offset, bytes); 4237 offset, bytes);
4291 if (unlikely(ret < 0)) 4238 if (unlikely(ret < 0))
4292 return X86EMUL_IO_NEEDED; 4239 return X86EMUL_IO_NEEDED;
4293 4240
@@ -4333,7 +4280,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4333 4280
4334 if (gpa == UNMAPPED_GVA) 4281 if (gpa == UNMAPPED_GVA)
4335 return X86EMUL_PROPAGATE_FAULT; 4282 return X86EMUL_PROPAGATE_FAULT;
4336 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 4283 ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
4337 if (ret < 0) { 4284 if (ret < 0) {
4338 r = X86EMUL_IO_NEEDED; 4285 r = X86EMUL_IO_NEEDED;
4339 goto out; 4286 goto out;
@@ -4386,7 +4333,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4386{ 4333{
4387 int ret; 4334 int ret;
4388 4335
4389 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 4336 ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
4390 if (ret < 0) 4337 if (ret < 0)
4391 return 0; 4338 return 0;
4392 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 4339 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
@@ -4420,7 +4367,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4420static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4367static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4421 void *val, int bytes) 4368 void *val, int bytes)
4422{ 4369{
4423 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); 4370 return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
4424} 4371}
4425 4372
4426static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, 4373static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4618,7 +4565,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4618 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 4565 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4619 goto emul_write; 4566 goto emul_write;
4620 4567
4621 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4568 page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
4622 if (is_error_page(page)) 4569 if (is_error_page(page))
4623 goto emul_write; 4570 goto emul_write;
4624 4571
@@ -4646,7 +4593,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4646 if (!exchanged) 4593 if (!exchanged)
4647 return X86EMUL_CMPXCHG_FAILED; 4594 return X86EMUL_CMPXCHG_FAILED;
4648 4595
4649 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); 4596 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
4650 kvm_mmu_pte_write(vcpu, gpa, new, bytes); 4597 kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4651 4598
4652 return X86EMUL_CONTINUE; 4599 return X86EMUL_CONTINUE;
@@ -4945,7 +4892,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4945static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 4892static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4946 u32 msr_index, u64 *pdata) 4893 u32 msr_index, u64 *pdata)
4947{ 4894{
4948 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); 4895 struct msr_data msr;
4896 int r;
4897
4898 msr.index = msr_index;
4899 msr.host_initiated = false;
4900 r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
4901 if (r)
4902 return r;
4903
4904 *pdata = msr.data;
4905 return 0;
4949} 4906}
4950 4907
4951static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4908static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
@@ -4959,16 +4916,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4959 return kvm_set_msr(emul_to_vcpu(ctxt), &msr); 4916 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4960} 4917}
4961 4918
4919static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
4920{
4921 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4922
4923 return vcpu->arch.smbase;
4924}
4925
4926static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
4927{
4928 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4929
4930 vcpu->arch.smbase = smbase;
4931}
4932
4962static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, 4933static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
4963 u32 pmc) 4934 u32 pmc)
4964{ 4935{
4965 return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); 4936 return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
4966} 4937}
4967 4938
4968static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4939static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4969 u32 pmc, u64 *pdata) 4940 u32 pmc, u64 *pdata)
4970{ 4941{
4971 return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); 4942 return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
4972} 4943}
4973 4944
4974static void emulator_halt(struct x86_emulate_ctxt *ctxt) 4945static void emulator_halt(struct x86_emulate_ctxt *ctxt)
@@ -5044,6 +5015,8 @@ static const struct x86_emulate_ops emulate_ops = {
5044 .cpl = emulator_get_cpl, 5015 .cpl = emulator_get_cpl,
5045 .get_dr = emulator_get_dr, 5016 .get_dr = emulator_get_dr,
5046 .set_dr = emulator_set_dr, 5017 .set_dr = emulator_set_dr,
5018 .get_smbase = emulator_get_smbase,
5019 .set_smbase = emulator_set_smbase,
5047 .set_msr = emulator_set_msr, 5020 .set_msr = emulator_set_msr,
5048 .get_msr = emulator_get_msr, 5021 .get_msr = emulator_get_msr,
5049 .check_pmc = emulator_check_pmc, 5022 .check_pmc = emulator_check_pmc,
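The two new callbacks above are plain accessors for vcpu->arch.smbase, registered in the x86_emulate_ops table next to get_msr/set_msr so the instruction emulator can reach SMBASE without knowing the vcpu layout. A compact C model of that ops-table pattern, with made-up type names rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

struct vcpu_ctx {
	uint64_t smbase;                /* stand-in for vcpu->arch.smbase */
};

/* Callback table, analogous to struct x86_emulate_ops. */
struct emul_ops {
	uint64_t (*get_smbase)(struct vcpu_ctx *ctx);
	void (*set_smbase)(struct vcpu_ctx *ctx, uint64_t smbase);
};

static uint64_t get_smbase(struct vcpu_ctx *ctx)
{
	return ctx->smbase;
}

static void set_smbase(struct vcpu_ctx *ctx, uint64_t smbase)
{
	ctx->smbase = smbase;
}

static const struct emul_ops ops = {
	.get_smbase = get_smbase,
	.set_smbase = set_smbase,
};

int main(void)
{
	struct vcpu_ctx vcpu = { .smbase = 0x30000 };   /* architectural default */

	/* The emulator sees only the ops table, never the vcpu internals. */
	ops.set_smbase(&vcpu, 0xa0000);
	printf("smbase = 0x%llx\n", (unsigned long long)ops.get_smbase(&vcpu));
	return 0;
}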
@@ -5105,7 +5078,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
5105 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : 5078 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
5106 cs_db ? X86EMUL_MODE_PROT32 : 5079 cs_db ? X86EMUL_MODE_PROT32 :
5107 X86EMUL_MODE_PROT16; 5080 X86EMUL_MODE_PROT16;
5108 ctxt->guest_mode = is_guest_mode(vcpu); 5081 BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
5082 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
5083 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
5084 ctxt->emul_flags = vcpu->arch.hflags;
5109 5085
5110 init_decode_cache(ctxt); 5086 init_decode_cache(ctxt);
5111 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 5087 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
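init_emulate_ctxt() now copies vcpu->arch.hflags directly into ctxt->emul_flags, which is only safe if the HF_* bits and the emulator's X86EMUL_* bits share the same values; the BUILD_BUG_ON lines make any mismatch a compile error. The same idea in standalone C11, using _Static_assert and placeholder constants:

#include <stdio.h>

/* Two headers that must agree bit for bit (values here are illustrative). */
#define HF_GUEST_MASK        (1 << 0)
#define HF_SMM_MASK          (1 << 1)

#define X86EMUL_GUEST_MASK   (1 << 0)
#define X86EMUL_SMM_MASK     (1 << 1)

/* Compile-time checks: a mismatch breaks the build, not the guest. */
_Static_assert(HF_GUEST_MASK == X86EMUL_GUEST_MASK, "guest flag mismatch");
_Static_assert(HF_SMM_MASK == X86EMUL_SMM_MASK, "SMM flag mismatch");

int main(void)
{
	unsigned hflags = HF_SMM_MASK;

	/* Because the bit layouts are identical, no translation is needed. */
	unsigned emul_flags = hflags;

	printf("emul_flags = %#x\n", emul_flags);
	return 0;
}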
@@ -5274,6 +5250,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
5274static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 5250static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
5275static int complete_emulated_pio(struct kvm_vcpu *vcpu); 5251static int complete_emulated_pio(struct kvm_vcpu *vcpu);
5276 5252
5253static void kvm_smm_changed(struct kvm_vcpu *vcpu)
5254{
5255 if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
5256 /* This is a good place to trace that we are exiting SMM. */
5257 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
5258
5259 if (unlikely(vcpu->arch.smi_pending)) {
5260 kvm_make_request(KVM_REQ_SMI, vcpu);
5261 vcpu->arch.smi_pending = 0;
5262 } else {
5263 /* Process a latched INIT, if any. */
5264 kvm_make_request(KVM_REQ_EVENT, vcpu);
5265 }
5266 }
5267
5268 kvm_mmu_reset_context(vcpu);
5269}
5270
5271static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
5272{
5273 unsigned changed = vcpu->arch.hflags ^ emul_flags;
5274
5275 vcpu->arch.hflags = emul_flags;
5276
5277 if (changed & HF_SMM_MASK)
5278 kvm_smm_changed(vcpu);
5279}
5280
5277static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, 5281static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
5278 unsigned long *db) 5282 unsigned long *db)
5279{ 5283{
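kvm_set_hflags() above finds the hidden-flag bits that actually changed by XORing the old and new values, and only calls kvm_smm_changed() when HF_SMM_MASK is among them. A compact illustration of that change-detection idiom (the bit position is a placeholder):

#include <stdio.h>

#define HF_SMM_MASK (1 << 6)            /* placeholder bit position */

static unsigned hflags;

static void smm_changed(void)
{
	printf("SMM state toggled, now %s SMM\n",
	       (hflags & HF_SMM_MASK) ? "in" : "out of");
}

static void set_hflags(unsigned new_flags)
{
	unsigned changed = hflags ^ new_flags;  /* bits that differ */

	hflags = new_flags;
	if (changed & HF_SMM_MASK)              /* react only to real transitions */
		smm_changed();
}

int main(void)
{
	set_hflags(HF_SMM_MASK);        /* enter SMM  -> prints a message */
	set_hflags(HF_SMM_MASK);        /* no change  -> silent           */
	set_hflags(0);                  /* leave SMM  -> prints a message */
	return 0;
}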
@@ -5473,6 +5477,8 @@ restart:
5473 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); 5477 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5474 toggle_interruptibility(vcpu, ctxt->interruptibility); 5478 toggle_interruptibility(vcpu, ctxt->interruptibility);
5475 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5479 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5480 if (vcpu->arch.hflags != ctxt->emul_flags)
5481 kvm_set_hflags(vcpu, ctxt->emul_flags);
5476 kvm_rip_write(vcpu, ctxt->eip); 5482 kvm_rip_write(vcpu, ctxt->eip);
5477 if (r == EMULATE_DONE) 5483 if (r == EMULATE_DONE)
5478 kvm_vcpu_check_singlestep(vcpu, rflags, &r); 5484 kvm_vcpu_check_singlestep(vcpu, rflags, &r);
@@ -5951,6 +5957,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5951 lapic_irq.shorthand = 0; 5957 lapic_irq.shorthand = 0;
5952 lapic_irq.dest_mode = 0; 5958 lapic_irq.dest_mode = 0;
5953 lapic_irq.dest_id = apicid; 5959 lapic_irq.dest_id = apicid;
5960 lapic_irq.msi_redir_hint = false;
5954 5961
5955 lapic_irq.delivery_mode = APIC_DM_REMRD; 5962 lapic_irq.delivery_mode = APIC_DM_REMRD;
5956 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); 5963 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6038,6 +6045,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
6038 struct kvm_run *kvm_run = vcpu->run; 6045 struct kvm_run *kvm_run = vcpu->run;
6039 6046
6040 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 6047 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
6048 kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
6041 kvm_run->cr8 = kvm_get_cr8(vcpu); 6049 kvm_run->cr8 = kvm_get_cr8(vcpu);
6042 kvm_run->apic_base = kvm_get_apic_base(vcpu); 6050 kvm_run->apic_base = kvm_get_apic_base(vcpu);
6043 if (irqchip_in_kernel(vcpu->kvm)) 6051 if (irqchip_in_kernel(vcpu->kvm))
@@ -6161,6 +6169,233 @@ static void process_nmi(struct kvm_vcpu *vcpu)
6161 kvm_make_request(KVM_REQ_EVENT, vcpu); 6169 kvm_make_request(KVM_REQ_EVENT, vcpu);
6162} 6170}
6163 6171
6172#define put_smstate(type, buf, offset, val) \
6173 *(type *)((buf) + (offset) - 0x7e00) = val
6174
6175static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
6176{
6177 u32 flags = 0;
6178 flags |= seg->g << 23;
6179 flags |= seg->db << 22;
6180 flags |= seg->l << 21;
6181 flags |= seg->avl << 20;
6182 flags |= seg->present << 15;
6183 flags |= seg->dpl << 13;
6184 flags |= seg->s << 12;
6185 flags |= seg->type << 8;
6186 return flags;
6187}
6188
6189static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
6190{
6191 struct kvm_segment seg;
6192 int offset;
6193
6194 kvm_get_segment(vcpu, &seg, n);
6195 put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
6196
6197 if (n < 3)
6198 offset = 0x7f84 + n * 12;
6199 else
6200 offset = 0x7f2c + (n - 3) * 12;
6201
6202 put_smstate(u32, buf, offset + 8, seg.base);
6203 put_smstate(u32, buf, offset + 4, seg.limit);
6204 put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
6205}
6206
6207static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
6208{
6209 struct kvm_segment seg;
6210 int offset;
6211 u16 flags;
6212
6213 kvm_get_segment(vcpu, &seg, n);
6214 offset = 0x7e00 + n * 16;
6215
6216 flags = process_smi_get_segment_flags(&seg) >> 8;
6217 put_smstate(u16, buf, offset, seg.selector);
6218 put_smstate(u16, buf, offset + 2, flags);
6219 put_smstate(u32, buf, offset + 4, seg.limit);
6220 put_smstate(u64, buf, offset + 8, seg.base);
6221}
6222
6223static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
6224{
6225 struct desc_ptr dt;
6226 struct kvm_segment seg;
6227 unsigned long val;
6228 int i;
6229
6230 put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
6231 put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
6232 put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
6233 put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
6234
6235 for (i = 0; i < 8; i++)
6236 put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
6237
6238 kvm_get_dr(vcpu, 6, &val);
6239 put_smstate(u32, buf, 0x7fcc, (u32)val);
6240 kvm_get_dr(vcpu, 7, &val);
6241 put_smstate(u32, buf, 0x7fc8, (u32)val);
6242
6243 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
6244 put_smstate(u32, buf, 0x7fc4, seg.selector);
6245 put_smstate(u32, buf, 0x7f64, seg.base);
6246 put_smstate(u32, buf, 0x7f60, seg.limit);
6247 put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
6248
6249 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
6250 put_smstate(u32, buf, 0x7fc0, seg.selector);
6251 put_smstate(u32, buf, 0x7f80, seg.base);
6252 put_smstate(u32, buf, 0x7f7c, seg.limit);
6253 put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
6254
6255 kvm_x86_ops->get_gdt(vcpu, &dt);
6256 put_smstate(u32, buf, 0x7f74, dt.address);
6257 put_smstate(u32, buf, 0x7f70, dt.size);
6258
6259 kvm_x86_ops->get_idt(vcpu, &dt);
6260 put_smstate(u32, buf, 0x7f58, dt.address);
6261 put_smstate(u32, buf, 0x7f54, dt.size);
6262
6263 for (i = 0; i < 6; i++)
6264 process_smi_save_seg_32(vcpu, buf, i);
6265
6266 put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
6267
6268 /* revision id */
6269 put_smstate(u32, buf, 0x7efc, 0x00020000);
6270 put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
6271}
6272
6273static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
6274{
6275#ifdef CONFIG_X86_64
6276 struct desc_ptr dt;
6277 struct kvm_segment seg;
6278 unsigned long val;
6279 int i;
6280
6281 for (i = 0; i < 16; i++)
6282 put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
6283
6284 put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
6285 put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
6286
6287 kvm_get_dr(vcpu, 6, &val);
6288 put_smstate(u64, buf, 0x7f68, val);
6289 kvm_get_dr(vcpu, 7, &val);
6290 put_smstate(u64, buf, 0x7f60, val);
6291
6292 put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
6293 put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
6294 put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
6295
6296 put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
6297
6298 /* revision id */
6299 put_smstate(u32, buf, 0x7efc, 0x00020064);
6300
6301 put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
6302
6303 kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
6304 put_smstate(u16, buf, 0x7e90, seg.selector);
6305 put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
6306 put_smstate(u32, buf, 0x7e94, seg.limit);
6307 put_smstate(u64, buf, 0x7e98, seg.base);
6308
6309 kvm_x86_ops->get_idt(vcpu, &dt);
6310 put_smstate(u32, buf, 0x7e84, dt.size);
6311 put_smstate(u64, buf, 0x7e88, dt.address);
6312
6313 kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
6314 put_smstate(u16, buf, 0x7e70, seg.selector);
6315 put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
6316 put_smstate(u32, buf, 0x7e74, seg.limit);
6317 put_smstate(u64, buf, 0x7e78, seg.base);
6318
6319 kvm_x86_ops->get_gdt(vcpu, &dt);
6320 put_smstate(u32, buf, 0x7e64, dt.size);
6321 put_smstate(u64, buf, 0x7e68, dt.address);
6322
6323 for (i = 0; i < 6; i++)
6324 process_smi_save_seg_64(vcpu, buf, i);
6325#else
6326 WARN_ON_ONCE(1);
6327#endif
6328}
6329
6330static void process_smi(struct kvm_vcpu *vcpu)
6331{
6332 struct kvm_segment cs, ds;
6333 char buf[512];
6334 u32 cr0;
6335
6336 if (is_smm(vcpu)) {
6337 vcpu->arch.smi_pending = true;
6338 return;
6339 }
6340
6341 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
6342 vcpu->arch.hflags |= HF_SMM_MASK;
6343 memset(buf, 0, 512);
6344 if (guest_cpuid_has_longmode(vcpu))
6345 process_smi_save_state_64(vcpu, buf);
6346 else
6347 process_smi_save_state_32(vcpu, buf);
6348
6349 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
6350
6351 if (kvm_x86_ops->get_nmi_mask(vcpu))
6352 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
6353 else
6354 kvm_x86_ops->set_nmi_mask(vcpu, true);
6355
6356 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
6357 kvm_rip_write(vcpu, 0x8000);
6358
6359 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
6360 kvm_x86_ops->set_cr0(vcpu, cr0);
6361 vcpu->arch.cr0 = cr0;
6362
6363 kvm_x86_ops->set_cr4(vcpu, 0);
6364
6365 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
6366
6367 cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
6368 cs.base = vcpu->arch.smbase;
6369
6370 ds.selector = 0;
6371 ds.base = 0;
6372
6373 cs.limit = ds.limit = 0xffffffff;
6374 cs.type = ds.type = 0x3;
6375 cs.dpl = ds.dpl = 0;
6376 cs.db = ds.db = 0;
6377 cs.s = ds.s = 1;
6378 cs.l = ds.l = 0;
6379 cs.g = ds.g = 1;
6380 cs.avl = ds.avl = 0;
6381 cs.present = ds.present = 1;
6382 cs.unusable = ds.unusable = 0;
6383 cs.padding = ds.padding = 0;
6384
6385 kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
6386 kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
6387 kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
6388 kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
6389 kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
6390 kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
6391
6392 if (guest_cpuid_has_longmode(vcpu))
6393 kvm_x86_ops->set_efer(vcpu, 0);
6394
6395 kvm_update_cpuid(vcpu);
6396 kvm_mmu_reset_context(vcpu);
6397}
6398
6164static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 6399static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6165{ 6400{
6166 u64 eoi_exit_bitmap[4]; 6401 u64 eoi_exit_bitmap[4];
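process_smi() above builds a 512-byte image of the top of SMRAM in a local buffer and writes it to smbase + 0xfe00; put_smstate() maps the architectural offsets (0x7e00..0x7fff) onto that buffer by subtracting 0x7e00, and process_smi_get_segment_flags() packs the segment attributes into the descriptor-style word stored in the state-save area. A userspace sketch of those two helpers, reusing the offsets the 32-bit save routine uses for the revision id and SMBASE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same trick as the kernel macro: offsets are SMRAM-relative (0x7e00..0x7fff),
 * and the buffer holds only those top 512 bytes. */
#define put_smstate(type, buf, offset, val) \
	(*(type *)((buf) + (offset) - 0x7e00) = (val))

struct seg {                    /* cut-down stand-in for struct kvm_segment */
	uint8_t g, db, l, avl, present, dpl, s, type;
};

static uint32_t seg_flags(const struct seg *seg)
{
	/* Pack the attributes at the bit positions used in the save area. */
	return (seg->g << 23) | (seg->db << 22) | (seg->l << 21) |
	       (seg->avl << 20) | (seg->present << 15) | (seg->dpl << 13) |
	       (seg->s << 12) | (seg->type << 8);
}

int main(void)
{
	char buf[512];
	struct seg cs = { .g = 1, .present = 1, .s = 1, .type = 0x3 };

	memset(buf, 0, sizeof(buf));
	put_smstate(uint32_t, buf, 0x7efc, 0x00020000);   /* revision id (32-bit layout) */
	put_smstate(uint32_t, buf, 0x7ef8, 0x30000);      /* saved SMBASE */
	put_smstate(uint32_t, buf, 0x7f5c, seg_flags(&cs));

	printf("revision id lands at buf[0x%x] = %#x\n",
	       0x7efc - 0x7e00, *(uint32_t *)(buf + 0x7efc - 0x7e00));
	printf("cs flags = %#x\n", seg_flags(&cs));
	return 0;
}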
@@ -6269,12 +6504,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6269 } 6504 }
6270 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) 6505 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
6271 record_steal_time(vcpu); 6506 record_steal_time(vcpu);
6507 if (kvm_check_request(KVM_REQ_SMI, vcpu))
6508 process_smi(vcpu);
6272 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 6509 if (kvm_check_request(KVM_REQ_NMI, vcpu))
6273 process_nmi(vcpu); 6510 process_nmi(vcpu);
6274 if (kvm_check_request(KVM_REQ_PMU, vcpu)) 6511 if (kvm_check_request(KVM_REQ_PMU, vcpu))
6275 kvm_handle_pmu_event(vcpu); 6512 kvm_pmu_handle_event(vcpu);
6276 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 6513 if (kvm_check_request(KVM_REQ_PMI, vcpu))
6277 kvm_deliver_pmi(vcpu); 6514 kvm_pmu_deliver_pmi(vcpu);
6278 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 6515 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
6279 vcpu_scan_ioapic(vcpu); 6516 vcpu_scan_ioapic(vcpu);
6280 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) 6517 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
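vcpu_enter_guest() now also drains KVM_REQ_SMI before entering the guest; kvm_check_request() is a test-and-clear on the vcpu's request bitmap, so each pending request is handled exactly once per entry. A simplified model of that dispatch (bit numbers and handler names are invented for the sketch):

#include <stdio.h>

#define REQ_NMI 0
#define REQ_SMI 1

static unsigned long requests;

static void make_request(int bit)
{
	requests |= 1UL << bit;
}

static int check_request(int bit)
{
	/* test-and-clear, like kvm_check_request() */
	if (requests & (1UL << bit)) {
		requests &= ~(1UL << bit);
		return 1;
	}
	return 0;
}

int main(void)
{
	make_request(REQ_SMI);

	/* The entry path drains pending requests before running the guest. */
	if (check_request(REQ_SMI))
		printf("process_smi()\n");
	if (check_request(REQ_NMI))
		printf("process_nmi()\n");

	printf("remaining requests: %#lx\n", requests);
	return 0;
}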
@@ -6346,7 +6583,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6346 if (req_immediate_exit) 6583 if (req_immediate_exit)
6347 smp_send_reschedule(vcpu->cpu); 6584 smp_send_reschedule(vcpu->cpu);
6348 6585
6349 kvm_guest_enter(); 6586 __kvm_guest_enter();
6350 6587
6351 if (unlikely(vcpu->arch.switch_db_regs)) { 6588 if (unlikely(vcpu->arch.switch_db_regs)) {
6352 set_debugreg(0, 7); 6589 set_debugreg(0, 7);
@@ -7038,16 +7275,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
7038{ 7275{
7039 kvm_put_guest_xcr0(vcpu); 7276 kvm_put_guest_xcr0(vcpu);
7040 7277
7041 if (!vcpu->guest_fpu_loaded) 7278 if (!vcpu->guest_fpu_loaded) {
7279 vcpu->fpu_counter = 0;
7042 return; 7280 return;
7281 }
7043 7282
7044 vcpu->guest_fpu_loaded = 0; 7283 vcpu->guest_fpu_loaded = 0;
7045 copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 7284 copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
7046 __kernel_fpu_end(); 7285 __kernel_fpu_end();
7047 ++vcpu->stat.fpu_reload; 7286 ++vcpu->stat.fpu_reload;
7048 if (!vcpu->arch.eager_fpu) 7287 /*
7049 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 7288 * If using eager FPU mode, or if the guest is a frequent user
7050 7289 * of the FPU, just leave the FPU active for next time.
7290 * Every 255 times fpu_counter rolls over to 0; a guest that uses
7291 * the FPU in bursts will revert to loading it on demand.
7292 */
7293 if (!vcpu->arch.eager_fpu) {
7294 if (++vcpu->fpu_counter < 5)
7295 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
7296 }
7051 trace_kvm_fpu(0); 7297 trace_kvm_fpu(0);
7052} 7298}
7053 7299
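The kvm_put_guest_fpu() change adds a small per-vcpu fpu_counter: a guest that keeps touching the FPU between reloads quickly reaches the threshold and the FPU is simply left active, while a bursty guest falls back to lazy loading, and the counter is cleared whenever the FPU was not loaded at all. A toy model of that heuristic, using the same threshold of 5 as the hunk above:

#include <stdbool.h>
#include <stdio.h>

struct vcpu {
	bool eager_fpu;                 /* always keep guest FPU state loaded */
	bool guest_fpu_loaded;
	unsigned char fpu_counter;      /* 8-bit, wraps at 256 like the kernel's */
};

/* Returns true when we would request FPU deactivation for the next entry. */
static bool put_guest_fpu(struct vcpu *v)
{
	if (!v->guest_fpu_loaded) {
		v->fpu_counter = 0;     /* guest did not touch the FPU */
		return false;
	}
	v->guest_fpu_loaded = false;

	/* Frequent users (counter reaches 5) keep the FPU active next time. */
	if (!v->eager_fpu && ++v->fpu_counter < 5)
		return true;
	return false;
}

int main(void)
{
	struct vcpu v = { 0 };
	int i;

	for (i = 0; i < 7; i++) {
		bool deactivate;

		v.guest_fpu_loaded = true;      /* guest used the FPU again */
		deactivate = put_guest_fpu(&v);
		printf("round %d: deactivate=%d counter=%u\n",
		       i, deactivate, v.fpu_counter);
	}
	return 0;
}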
@@ -7083,14 +7329,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
7083{ 7329{
7084 int r; 7330 int r;
7085 7331
7086 vcpu->arch.mtrr_state.have_fixed = 1; 7332 kvm_vcpu_mtrr_init(vcpu);
7087 r = vcpu_load(vcpu); 7333 r = vcpu_load(vcpu);
7088 if (r) 7334 if (r)
7089 return r; 7335 return r;
7090 kvm_vcpu_reset(vcpu); 7336 kvm_vcpu_reset(vcpu, false);
7091 kvm_mmu_setup(vcpu); 7337 kvm_mmu_setup(vcpu);
7092 vcpu_put(vcpu); 7338 vcpu_put(vcpu);
7093
7094 return r; 7339 return r;
7095} 7340}
7096 7341
@@ -7107,6 +7352,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7107 kvm_write_tsc(vcpu, &msr); 7352 kvm_write_tsc(vcpu, &msr);
7108 vcpu_put(vcpu); 7353 vcpu_put(vcpu);
7109 7354
7355 if (!kvmclock_periodic_sync)
7356 return;
7357
7110 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 7358 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
7111 KVMCLOCK_SYNC_PERIOD); 7359 KVMCLOCK_SYNC_PERIOD);
7112} 7360}
@@ -7124,8 +7372,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
7124 kvm_x86_ops->vcpu_free(vcpu); 7372 kvm_x86_ops->vcpu_free(vcpu);
7125} 7373}
7126 7374
7127void kvm_vcpu_reset(struct kvm_vcpu *vcpu) 7375void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
7128{ 7376{
7377 vcpu->arch.hflags = 0;
7378
7129 atomic_set(&vcpu->arch.nmi_queued, 0); 7379 atomic_set(&vcpu->arch.nmi_queued, 0);
7130 vcpu->arch.nmi_pending = 0; 7380 vcpu->arch.nmi_pending = 0;
7131 vcpu->arch.nmi_injected = false; 7381 vcpu->arch.nmi_injected = false;
@@ -7151,13 +7401,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
7151 kvm_async_pf_hash_reset(vcpu); 7401 kvm_async_pf_hash_reset(vcpu);
7152 vcpu->arch.apf.halted = false; 7402 vcpu->arch.apf.halted = false;
7153 7403
7154 kvm_pmu_reset(vcpu); 7404 if (!init_event) {
7405 kvm_pmu_reset(vcpu);
7406 vcpu->arch.smbase = 0x30000;
7407 }
7155 7408
7156 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 7409 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
7157 vcpu->arch.regs_avail = ~0; 7410 vcpu->arch.regs_avail = ~0;
7158 vcpu->arch.regs_dirty = ~0; 7411 vcpu->arch.regs_dirty = ~0;
7159 7412
7160 kvm_x86_ops->vcpu_reset(vcpu); 7413 kvm_x86_ops->vcpu_reset(vcpu, init_event);
7161} 7414}
7162 7415
7163void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 7416void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7356,6 +7609,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7356 7609
7357 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 7610 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
7358 7611
7612 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
7613
7359 kvm_async_pf_hash_reset(vcpu); 7614 kvm_async_pf_hash_reset(vcpu);
7360 kvm_pmu_init(vcpu); 7615 kvm_pmu_init(vcpu);
7361 7616
@@ -7462,6 +7717,40 @@ void kvm_arch_sync_events(struct kvm *kvm)
7462 kvm_free_pit(kvm); 7717 kvm_free_pit(kvm);
7463} 7718}
7464 7719
7720int __x86_set_memory_region(struct kvm *kvm,
7721 const struct kvm_userspace_memory_region *mem)
7722{
7723 int i, r;
7724
7725 /* Called with kvm->slots_lock held. */
7726 BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM);
7727
7728 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
7729 struct kvm_userspace_memory_region m = *mem;
7730
7731 m.slot |= i << 16;
7732 r = __kvm_set_memory_region(kvm, &m);
7733 if (r < 0)
7734 return r;
7735 }
7736
7737 return 0;
7738}
7739EXPORT_SYMBOL_GPL(__x86_set_memory_region);
7740
7741int x86_set_memory_region(struct kvm *kvm,
7742 const struct kvm_userspace_memory_region *mem)
7743{
7744 int r;
7745
7746 mutex_lock(&kvm->slots_lock);
7747 r = __x86_set_memory_region(kvm, mem);
7748 mutex_unlock(&kvm->slots_lock);
7749
7750 return r;
7751}
7752EXPORT_SYMBOL_GPL(x86_set_memory_region);
7753
7465void kvm_arch_destroy_vm(struct kvm *kvm) 7754void kvm_arch_destroy_vm(struct kvm *kvm)
7466{ 7755{
7467 if (current->mm == kvm->mm) { 7756 if (current->mm == kvm->mm) {
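__x86_set_memory_region() installs a private slot into every address space by replaying the same region with the address-space index folded into the upper 16 bits of the slot number (m.slot |= i << 16); x86_set_memory_region() is the slots_lock-taking wrapper now used when tearing down the private slots below. A small sketch of that encode/decode convention (struct region is a stand-in for struct kvm_userspace_memory_region):

#include <stdint.h>
#include <stdio.h>

#define ADDRESS_SPACE_NUM 2             /* x86 with SMM ends up with two spaces */

struct region {
	uint32_t slot;                  /* as_id in bits 16..31, slot id in bits 0..15 */
	uint64_t guest_phys_addr;
};

static void print_decoded(const struct region *r)
{
	unsigned as_id = r->slot >> 16;         /* which address space */
	unsigned id = (uint16_t)r->slot;        /* slot within that space */

	printf("slot %#x -> as_id=%u id=%u gpa=%#llx\n",
	       (unsigned)r->slot, as_id, id,
	       (unsigned long long)r->guest_phys_addr);
}

int main(void)
{
	struct region mem = { .slot = 8, .guest_phys_addr = 0xfee00000 };
	int i;

	/* Replicate the same slot id into every address space. */
	for (i = 0; i < ADDRESS_SPACE_NUM; i++) {
		struct region m = mem;

		m.slot |= (uint32_t)i << 16;
		print_decoded(&m);
	}
	return 0;
}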
@@ -7473,13 +7762,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
7473 struct kvm_userspace_memory_region mem; 7762 struct kvm_userspace_memory_region mem;
7474 memset(&mem, 0, sizeof(mem)); 7763 memset(&mem, 0, sizeof(mem));
7475 mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 7764 mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
7476 kvm_set_memory_region(kvm, &mem); 7765 x86_set_memory_region(kvm, &mem);
7477 7766
7478 mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 7767 mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
7479 kvm_set_memory_region(kvm, &mem); 7768 x86_set_memory_region(kvm, &mem);
7480 7769
7481 mem.slot = TSS_PRIVATE_MEMSLOT; 7770 mem.slot = TSS_PRIVATE_MEMSLOT;
7482 kvm_set_memory_region(kvm, &mem); 7771 x86_set_memory_region(kvm, &mem);
7483 } 7772 }
7484 kvm_iommu_unmap_guest(kvm); 7773 kvm_iommu_unmap_guest(kvm);
7485 kfree(kvm->arch.vpic); 7774 kfree(kvm->arch.vpic);
@@ -7568,18 +7857,18 @@ out_free:
7568 return -ENOMEM; 7857 return -ENOMEM;
7569} 7858}
7570 7859
7571void kvm_arch_memslots_updated(struct kvm *kvm) 7860void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
7572{ 7861{
7573 /* 7862 /*
7574 * memslots->generation has been incremented. 7863 * memslots->generation has been incremented.
7575 * mmio generation may have reached its maximum value. 7864 * mmio generation may have reached its maximum value.
7576 */ 7865 */
7577 kvm_mmu_invalidate_mmio_sptes(kvm); 7866 kvm_mmu_invalidate_mmio_sptes(kvm, slots);
7578} 7867}
7579 7868
7580int kvm_arch_prepare_memory_region(struct kvm *kvm, 7869int kvm_arch_prepare_memory_region(struct kvm *kvm,
7581 struct kvm_memory_slot *memslot, 7870 struct kvm_memory_slot *memslot,
7582 struct kvm_userspace_memory_region *mem, 7871 const struct kvm_userspace_memory_region *mem,
7583 enum kvm_mr_change change) 7872 enum kvm_mr_change change)
7584{ 7873{
7585 /* 7874 /*
@@ -7657,14 +7946,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
7657} 7946}
7658 7947
7659void kvm_arch_commit_memory_region(struct kvm *kvm, 7948void kvm_arch_commit_memory_region(struct kvm *kvm,
7660 struct kvm_userspace_memory_region *mem, 7949 const struct kvm_userspace_memory_region *mem,
7661 const struct kvm_memory_slot *old, 7950 const struct kvm_memory_slot *old,
7951 const struct kvm_memory_slot *new,
7662 enum kvm_mr_change change) 7952 enum kvm_mr_change change)
7663{ 7953{
7664 struct kvm_memory_slot *new;
7665 int nr_mmu_pages = 0; 7954 int nr_mmu_pages = 0;
7666 7955
7667 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { 7956 if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
7668 int ret; 7957 int ret;
7669 7958
7670 ret = vm_munmap(old->userspace_addr, 7959 ret = vm_munmap(old->userspace_addr,
@@ -7681,9 +7970,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7681 if (nr_mmu_pages) 7970 if (nr_mmu_pages)
7682 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7971 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7683 7972
7684 /* It's OK to get 'new' slot here as it has already been installed */
7685 new = id_to_memslot(kvm->memslots, mem->slot);
7686
7687 /* 7973 /*
7688 * Dirty logging tracks sptes in 4k granularity, meaning that large 7974 * Dirty logging tracks sptes in 4k granularity, meaning that large
7689 * sptes have to be split. If live migration is successful, the guest 7975 * sptes have to be split. If live migration is successful, the guest
@@ -7708,9 +7994,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7708 * been zapped so no dirty logging staff is needed for old slot. For 7994 * been zapped so no dirty logging staff is needed for old slot. For
7709 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the 7995 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
7710 * new and it's also covered when dealing with the new slot. 7996 * new and it's also covered when dealing with the new slot.
7997 *
7998 * FIXME: const-ify all uses of struct kvm_memory_slot.
7711 */ 7999 */
7712 if (change != KVM_MR_DELETE) 8000 if (change != KVM_MR_DELETE)
7713 kvm_mmu_slot_apply_flags(kvm, new); 8001 kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
7714} 8002}
7715 8003
7716void kvm_arch_flush_shadow_all(struct kvm *kvm) 8004void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..edc8cdcd786b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h" 5#include "kvm_cache_regs.h"
6 6
7#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
8
7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 9static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
8{ 10{
9 vcpu->arch.exception.pending = false; 11 vcpu->arch.exception.pending = false;
@@ -160,7 +162,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
160 gva_t addr, void *val, unsigned int bytes, 162 gva_t addr, void *val, unsigned int bytes,
161 struct x86_exception *exception); 163 struct x86_exception *exception);
162 164
165void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
166u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
163bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); 167bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
168int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
169int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
170bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
171 int page_num);
164 172
165#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ 173#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
166 | XSTATE_BNDREGS | XSTATE_BNDCSR \ 174 | XSTATE_BNDREGS | XSTATE_BNDCSR \
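MSR_IA32_CR_PAT_DEFAULT above encodes the architectural power-on PAT: eight one-byte entries, PA0 in the lowest byte, cycling through WB(6), WT(4), UC-(7) and UC(0). A quick decoder for the constant:

#include <stdint.h>
#include <stdio.h>

#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL

static const char *pat_type(uint8_t t)
{
	switch (t) {
	case 0: return "UC";    /* uncacheable */
	case 1: return "WC";    /* write combining */
	case 4: return "WT";    /* write through */
	case 5: return "WP";    /* write protected */
	case 6: return "WB";    /* write back */
	case 7: return "UC-";   /* uncached, may be overridden to WC */
	default: return "reserved";
	}
}

int main(void)
{
	uint64_t pat = MSR_IA32_CR_PAT_DEFAULT;
	int i;

	/* One byte per PAT entry, PA0 in the lowest byte. */
	for (i = 0; i < 8; i++)
		printf("PA%d = %s\n", i, pat_type((pat >> (8 * i)) & 0xff));
	return 0;
}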
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad45054309a0..9564fd78c547 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -44,6 +44,10 @@
44/* Two fragments for cross MMIO pages. */ 44/* Two fragments for cross MMIO pages. */
45#define KVM_MAX_MMIO_FRAGMENTS 2 45#define KVM_MAX_MMIO_FRAGMENTS 2
46 46
47#ifndef KVM_ADDRESS_SPACE_NUM
48#define KVM_ADDRESS_SPACE_NUM 1
49#endif
50
47/* 51/*
48 * For the normal pfn, the highest 12 bits should be zero, 52 * For the normal pfn, the highest 12 bits should be zero,
49 * so we can mask bit 62 ~ bit 52 to indicate the error pfn, 53 * so we can mask bit 62 ~ bit 52 to indicate the error pfn,
@@ -134,6 +138,7 @@ static inline bool is_error_page(struct page *page)
134#define KVM_REQ_ENABLE_IBS 23 138#define KVM_REQ_ENABLE_IBS 23
135#define KVM_REQ_DISABLE_IBS 24 139#define KVM_REQ_DISABLE_IBS 24
136#define KVM_REQ_APIC_PAGE_RELOAD 25 140#define KVM_REQ_APIC_PAGE_RELOAD 25
141#define KVM_REQ_SMI 26
137 142
138#define KVM_USERSPACE_IRQ_SOURCE_ID 0 143#define KVM_USERSPACE_IRQ_SOURCE_ID 0
139#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 144#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -230,6 +235,7 @@ struct kvm_vcpu {
230 235
231 int fpu_active; 236 int fpu_active;
232 int guest_fpu_loaded, guest_xcr0_loaded; 237 int guest_fpu_loaded, guest_xcr0_loaded;
238 unsigned char fpu_counter;
233 wait_queue_head_t wq; 239 wait_queue_head_t wq;
234 struct pid *pid; 240 struct pid *pid;
235 int sigset_active; 241 int sigset_active;
@@ -329,6 +335,13 @@ struct kvm_kernel_irq_routing_entry {
329#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) 335#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
330#endif 336#endif
331 337
338#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
339static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
340{
341 return 0;
342}
343#endif
344
332/* 345/*
333 * Note: 346 * Note:
334 * memslots are not sorted by id anymore, please use id_to_memslot() 347 * memslots are not sorted by id anymore, please use id_to_memslot()
@@ -347,7 +360,7 @@ struct kvm {
347 spinlock_t mmu_lock; 360 spinlock_t mmu_lock;
348 struct mutex slots_lock; 361 struct mutex slots_lock;
349 struct mm_struct *mm; /* userspace tied to this vm */ 362 struct mm_struct *mm; /* userspace tied to this vm */
350 struct kvm_memslots *memslots; 363 struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
351 struct srcu_struct srcu; 364 struct srcu_struct srcu;
352 struct srcu_struct irq_srcu; 365 struct srcu_struct irq_srcu;
353#ifdef CONFIG_KVM_APIC_ARCHITECTURE 366#ifdef CONFIG_KVM_APIC_ARCHITECTURE
@@ -462,13 +475,25 @@ void kvm_exit(void);
462void kvm_get_kvm(struct kvm *kvm); 475void kvm_get_kvm(struct kvm *kvm);
463void kvm_put_kvm(struct kvm *kvm); 476void kvm_put_kvm(struct kvm *kvm);
464 477
465static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) 478static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
466{ 479{
467 return rcu_dereference_check(kvm->memslots, 480 return rcu_dereference_check(kvm->memslots[as_id],
468 srcu_read_lock_held(&kvm->srcu) 481 srcu_read_lock_held(&kvm->srcu)
469 || lockdep_is_held(&kvm->slots_lock)); 482 || lockdep_is_held(&kvm->slots_lock));
470} 483}
471 484
485static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
486{
487 return __kvm_memslots(kvm, 0);
488}
489
490static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
491{
492 int as_id = kvm_arch_vcpu_memslots_id(vcpu);
493
494 return __kvm_memslots(vcpu->kvm, as_id);
495}
496
472static inline struct kvm_memory_slot * 497static inline struct kvm_memory_slot *
473id_to_memslot(struct kvm_memslots *slots, int id) 498id_to_memslot(struct kvm_memslots *slots, int id)
474{ 499{
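The new __kvm_memslots() takes an address-space index, kvm_memslots() keeps the old behaviour by passing 0, and kvm_vcpu_memslots() asks the architecture (via kvm_arch_vcpu_memslots_id()) which space the vcpu currently sees; on x86 that ends up separating SMM from normal operation, though that wiring is outside this hunk. A minimal model of the three accessors, with the SMM-selects-space-1 rule written in as an assumption:

#include <stdbool.h>
#include <stdio.h>

#define ADDRESS_SPACE_NUM 2

struct memslots { const char *name; };

struct vm {
	struct memslots *slots[ADDRESS_SPACE_NUM];
};

struct vcpu {
	struct vm *vm;
	bool in_smm;                    /* stand-in for the arch-specific state */
};

static struct memslots *vm_memslots(struct vm *vm, int as_id)
{
	return vm->slots[as_id];        /* like __kvm_memslots() */
}

/* Legacy entry point: address space 0. */
static struct memslots *default_memslots(struct vm *vm)
{
	return vm_memslots(vm, 0);
}

/* vCPU-scoped lookup: the arch decides which space applies right now. */
static struct memslots *vcpu_memslots(struct vcpu *vcpu)
{
	int as_id = vcpu->in_smm ? 1 : 0;       /* assumed x86-style mapping */

	return vm_memslots(vcpu->vm, as_id);
}

int main(void)
{
	struct memslots normal = { "normal" }, smm = { "smm" };
	struct vm vm = { .slots = { &normal, &smm } };
	struct vcpu vcpu = { .vm = &vm, .in_smm = true };

	printf("vm default view : %s\n", default_memslots(&vm)->name);
	printf("vcpu current view: %s\n", vcpu_memslots(&vcpu)->name);
	return 0;
}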
@@ -500,21 +525,22 @@ enum kvm_mr_change {
500}; 525};
501 526
502int kvm_set_memory_region(struct kvm *kvm, 527int kvm_set_memory_region(struct kvm *kvm,
503 struct kvm_userspace_memory_region *mem); 528 const struct kvm_userspace_memory_region *mem);
504int __kvm_set_memory_region(struct kvm *kvm, 529int __kvm_set_memory_region(struct kvm *kvm,
505 struct kvm_userspace_memory_region *mem); 530 const struct kvm_userspace_memory_region *mem);
506void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 531void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
507 struct kvm_memory_slot *dont); 532 struct kvm_memory_slot *dont);
508int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 533int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
509 unsigned long npages); 534 unsigned long npages);
510void kvm_arch_memslots_updated(struct kvm *kvm); 535void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots);
511int kvm_arch_prepare_memory_region(struct kvm *kvm, 536int kvm_arch_prepare_memory_region(struct kvm *kvm,
512 struct kvm_memory_slot *memslot, 537 struct kvm_memory_slot *memslot,
513 struct kvm_userspace_memory_region *mem, 538 const struct kvm_userspace_memory_region *mem,
514 enum kvm_mr_change change); 539 enum kvm_mr_change change);
515void kvm_arch_commit_memory_region(struct kvm *kvm, 540void kvm_arch_commit_memory_region(struct kvm *kvm,
516 struct kvm_userspace_memory_region *mem, 541 const struct kvm_userspace_memory_region *mem,
517 const struct kvm_memory_slot *old, 542 const struct kvm_memory_slot *old,
543 const struct kvm_memory_slot *new,
518 enum kvm_mr_change change); 544 enum kvm_mr_change change);
519bool kvm_largepages_enabled(void); 545bool kvm_largepages_enabled(void);
520void kvm_disable_largepages(void); 546void kvm_disable_largepages(void);
@@ -524,8 +550,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm);
524void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 550void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
525 struct kvm_memory_slot *slot); 551 struct kvm_memory_slot *slot);
526 552
527int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 553int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
528 int nr_pages); 554 struct page **pages, int nr_pages);
529 555
530struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 556struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
531unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); 557unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -538,13 +564,13 @@ void kvm_release_page_dirty(struct page *page);
538void kvm_set_page_accessed(struct page *page); 564void kvm_set_page_accessed(struct page *page);
539 565
540pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 566pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
541pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
542 bool write_fault, bool *writable);
543pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 567pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
544pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 568pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
545 bool *writable); 569 bool *writable);
546pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); 570pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
547pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); 571pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
572pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
573 bool *async, bool write_fault, bool *writable);
548 574
549void kvm_release_pfn_clean(pfn_t pfn); 575void kvm_release_pfn_clean(pfn_t pfn);
550void kvm_set_pfn_dirty(pfn_t pfn); 576void kvm_set_pfn_dirty(pfn_t pfn);
@@ -573,6 +599,25 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
573unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); 599unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
574void mark_page_dirty(struct kvm *kvm, gfn_t gfn); 600void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
575 601
602struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
603struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
604pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
605pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
606struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
607unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
608unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
609int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
610 int len);
611int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
612 unsigned long len);
613int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
614 unsigned long len);
615int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data,
616 int offset, int len);
617int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
618 unsigned long len);
619void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
620
576void kvm_vcpu_block(struct kvm_vcpu *vcpu); 621void kvm_vcpu_block(struct kvm_vcpu *vcpu);
577void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 622void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
578int kvm_vcpu_yield_to(struct kvm_vcpu *target); 623int kvm_vcpu_yield_to(struct kvm_vcpu *target);
@@ -762,16 +807,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
762} 807}
763#endif 808#endif
764 809
765static inline void kvm_guest_enter(void) 810/* must be called with irqs disabled */
811static inline void __kvm_guest_enter(void)
766{ 812{
767 unsigned long flags;
768
769 BUG_ON(preemptible());
770
771 local_irq_save(flags);
772 guest_enter(); 813 guest_enter();
773 local_irq_restore(flags);
774
775 /* KVM does not hold any references to rcu protected data when it 814 /* KVM does not hold any references to rcu protected data when it
776 * switches CPU into a guest mode. In fact switching to a guest mode 815 * switches CPU into a guest mode. In fact switching to a guest mode
777 * is very similar to exiting to userspace from rcu point of view. In 816 * is very similar to exiting to userspace from rcu point of view. In
@@ -783,12 +822,27 @@ static inline void kvm_guest_enter(void)
783 rcu_virt_note_context_switch(smp_processor_id()); 822 rcu_virt_note_context_switch(smp_processor_id());
784} 823}
785 824
825/* must be called with irqs disabled */
826static inline void __kvm_guest_exit(void)
827{
828 guest_exit();
829}
830
831static inline void kvm_guest_enter(void)
832{
833 unsigned long flags;
834
835 local_irq_save(flags);
836 __kvm_guest_enter();
837 local_irq_restore(flags);
838}
839
786static inline void kvm_guest_exit(void) 840static inline void kvm_guest_exit(void)
787{ 841{
788 unsigned long flags; 842 unsigned long flags;
789 843
790 local_irq_save(flags); 844 local_irq_save(flags);
791 guest_exit(); 845 __kvm_guest_exit();
792 local_irq_restore(flags); 846 local_irq_restore(flags);
793} 847}
794 848
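The refactor above splits out __kvm_guest_enter()/__kvm_guest_exit(), which require interrupts to already be disabled, so the hot path in vcpu_enter_guest() (running with IRQs off) no longer pays for a redundant save/restore while the old wrappers keep their behaviour. The same inner-helper-plus-irq-save-wrapper shape, modelled with a plain flag instead of real interrupt state:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled = true;

static bool irq_save(void)
{
	bool old = irqs_enabled;

	irqs_enabled = false;
	return old;
}

static void irq_restore(bool old)
{
	irqs_enabled = old;
}

/* Must be called with interrupts disabled. */
static void __guest_enter(void)
{
	assert(!irqs_enabled);
	printf("switched accounting to guest mode\n");
}

/* Convenience wrapper for callers that still run with IRQs on. */
static void guest_enter(void)
{
	bool flags = irq_save();

	__guest_enter();
	irq_restore(flags);
}

int main(void)
{
	guest_enter();                  /* slow path: wrapper handles IRQ state */

	irqs_enabled = false;           /* fast path: caller disabled IRQs itself */
	__guest_enter();
	irqs_enabled = true;
	return 0;
}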
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 931da7e917cf..1b47a185c2f0 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -28,6 +28,7 @@ struct kvm_run;
28struct kvm_userspace_memory_region; 28struct kvm_userspace_memory_region;
29struct kvm_vcpu; 29struct kvm_vcpu;
30struct kvm_vcpu_init; 30struct kvm_vcpu_init;
31struct kvm_memslots;
31 32
32enum kvm_mr_change; 33enum kvm_mr_change;
33 34
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b60056776d1..716ad4ae4d4b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -202,7 +202,7 @@ struct kvm_run {
202 __u32 exit_reason; 202 __u32 exit_reason;
203 __u8 ready_for_interrupt_injection; 203 __u8 ready_for_interrupt_injection;
204 __u8 if_flag; 204 __u8 if_flag;
205 __u8 padding2[2]; 205 __u16 flags;
206 206
207 /* in (pre_kvm_run), out (post_kvm_run) */ 207 /* in (pre_kvm_run), out (post_kvm_run) */
208 __u64 cr8; 208 __u64 cr8;
@@ -814,6 +814,9 @@ struct kvm_ppc_smmu_info {
814#define KVM_CAP_S390_INJECT_IRQ 113 814#define KVM_CAP_S390_INJECT_IRQ 113
815#define KVM_CAP_S390_IRQ_STATE 114 815#define KVM_CAP_S390_IRQ_STATE 114
816#define KVM_CAP_PPC_HWRNG 115 816#define KVM_CAP_PPC_HWRNG 115
817#define KVM_CAP_DISABLE_QUIRKS 116
818#define KVM_CAP_X86_SMM 117
819#define KVM_CAP_MULTI_ADDRESS_SPACE 118
817 820
818#ifdef KVM_CAP_IRQ_ROUTING 821#ifdef KVM_CAP_IRQ_ROUTING
819 822
@@ -894,7 +897,7 @@ struct kvm_xen_hvm_config {
894 * 897 *
895 * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies 898 * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
896 * the irqfd to operate in resampling mode for level triggered interrupt 899 * the irqfd to operate in resampling mode for level triggered interrupt
897 * emlation. See Documentation/virtual/kvm/api.txt. 900 * emulation. See Documentation/virtual/kvm/api.txt.
898 */ 901 */
899#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) 902#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
900 903
@@ -1199,6 +1202,8 @@ struct kvm_s390_ucas_mapping {
1199/* Available with KVM_CAP_S390_IRQ_STATE */ 1202/* Available with KVM_CAP_S390_IRQ_STATE */
1200#define KVM_S390_SET_IRQ_STATE _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state) 1203#define KVM_S390_SET_IRQ_STATE _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
1201#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) 1204#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
1205/* Available with KVM_CAP_X86_SMM */
1206#define KVM_SMI _IO(KVMIO, 0xb7)
1202 1207
1203#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 1208#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
1204#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) 1209#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
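Taken together, KVM_CAP_X86_SMM, the new flags word in struct kvm_run and the KVM_SMI ioctl let userspace inject an SMI and see whether the vcpu is currently in SMM (post_kvm_run_save() fills in KVM_RUN_X86_SMM on every exit). A hedged fragment showing the intended calling convention; it assumes uapi headers with this series applied, and that vcpu_fd and the mmapped run structure were already set up via the usual KVM_CREATE_VM/KVM_CREATE_VCPU path, which is omitted here:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Sketch only: vcpu_fd must be a vcpu file descriptor and run its mmapped
 * struct kvm_run; guest memory and register setup are not shown. */
static int inject_and_check_smm(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_SMI, 0) < 0) {   /* request an SMI (KVM_CAP_X86_SMM) */
		perror("KVM_SMI");
		return -1;
	}

	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {   /* let the vcpu take the SMI */
		perror("KVM_RUN");
		return -1;
	}

	/* flags is updated on every exit; KVM_RUN_X86_SMM means "in SMM now". */
	printf("vcpu %s in SMM (exit_reason=%u)\n",
	       (run->flags & KVM_RUN_X86_SMM) ? "is" : "is not",
	       run->exit_reason);
	return 0;
}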
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index e9c3a7a83833..e661e7fb9d91 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -76,8 +76,6 @@ static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
76 vgic_reg_access(mmio, &reg, offset, 76 vgic_reg_access(mmio, &reg, offset,
77 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); 77 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
78 if (mmio->is_write) { 78 if (mmio->is_write) {
79 if (reg & GICD_CTLR_ENABLE_SS_G0)
80 kvm_info("guest tried to enable unsupported Group0 interrupts\n");
81 vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1); 79 vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
82 vgic_update_state(vcpu->kvm); 80 vgic_update_state(vcpu->kvm);
83 return true; 81 return true;
@@ -173,6 +171,32 @@ static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
173 return false; 171 return false;
174} 172}
175 173
174static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
175 struct kvm_exit_mmio *mmio,
176 phys_addr_t offset)
177{
178 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
179 return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
180 vcpu->vcpu_id);
181
182 vgic_reg_access(mmio, NULL, offset,
183 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
184 return false;
185}
186
187static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
188 struct kvm_exit_mmio *mmio,
189 phys_addr_t offset)
190{
191 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
192 return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
193 vcpu->vcpu_id);
194
195 vgic_reg_access(mmio, NULL, offset,
196 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
197 return false;
198}
199
176static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu, 200static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
177 struct kvm_exit_mmio *mmio, 201 struct kvm_exit_mmio *mmio,
178 phys_addr_t offset) 202 phys_addr_t offset)
@@ -428,13 +452,13 @@ static const struct vgic_io_range vgic_v3_dist_ranges[] = {
428 .base = GICD_ISACTIVER, 452 .base = GICD_ISACTIVER,
429 .len = 0x80, 453 .len = 0x80,
430 .bits_per_irq = 1, 454 .bits_per_irq = 1,
431 .handle_mmio = handle_mmio_raz_wi, 455 .handle_mmio = handle_mmio_set_active_reg_dist,
432 }, 456 },
433 { 457 {
434 .base = GICD_ICACTIVER, 458 .base = GICD_ICACTIVER,
435 .len = 0x80, 459 .len = 0x80,
436 .bits_per_irq = 1, 460 .bits_per_irq = 1,
437 .handle_mmio = handle_mmio_raz_wi, 461 .handle_mmio = handle_mmio_clear_active_reg_dist,
438 }, 462 },
439 { 463 {
440 .base = GICD_IPRIORITYR, 464 .base = GICD_IPRIORITYR,
@@ -561,6 +585,26 @@ static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
561 ACCESS_WRITE_CLEARBIT); 585 ACCESS_WRITE_CLEARBIT);
562} 586}
563 587
588static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
589 struct kvm_exit_mmio *mmio,
590 phys_addr_t offset)
591{
592 struct kvm_vcpu *redist_vcpu = mmio->private;
593
594 return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
595 redist_vcpu->vcpu_id);
596}
597
598static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
599 struct kvm_exit_mmio *mmio,
600 phys_addr_t offset)
601{
602 struct kvm_vcpu *redist_vcpu = mmio->private;
603
604 return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
605 redist_vcpu->vcpu_id);
606}
607
564static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu, 608static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
565 struct kvm_exit_mmio *mmio, 609 struct kvm_exit_mmio *mmio,
566 phys_addr_t offset) 610 phys_addr_t offset)
@@ -674,13 +718,13 @@ static const struct vgic_io_range vgic_redist_ranges[] = {
674 .base = SGI_base(GICR_ISACTIVER0), 718 .base = SGI_base(GICR_ISACTIVER0),
675 .len = 0x04, 719 .len = 0x04,
676 .bits_per_irq = 1, 720 .bits_per_irq = 1,
677 .handle_mmio = handle_mmio_raz_wi, 721 .handle_mmio = handle_mmio_set_active_reg_redist,
678 }, 722 },
679 { 723 {
680 .base = SGI_base(GICR_ICACTIVER0), 724 .base = SGI_base(GICR_ICACTIVER0),
681 .len = 0x04, 725 .len = 0x04,
682 .bits_per_irq = 1, 726 .bits_per_irq = 1,
683 .handle_mmio = handle_mmio_raz_wi, 727 .handle_mmio = handle_mmio_clear_active_reg_redist,
684 }, 728 },
685 { 729 {
686 .base = SGI_base(GICR_IPRIORITYR0), 730 .base = SGI_base(GICR_IPRIORITYR0),
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 78fb8201014f..f94d887d20e6 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -26,8 +26,6 @@
26#include <linux/of_irq.h> 26#include <linux/of_irq.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28 28
29#include <linux/irqchip/arm-gic.h>
30
31#include <asm/kvm_emulate.h> 29#include <asm/kvm_emulate.h>
32#include <asm/kvm_arm.h> 30#include <asm/kvm_arm.h>
33#include <asm/kvm_mmu.h> 31#include <asm/kvm_mmu.h>
@@ -1561,7 +1559,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
1561 goto out; 1559 goto out;
1562 } 1560 }
1563 1561
1564 if (irq_num >= kvm->arch.vgic.nr_irqs) 1562 if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
1565 return -EINVAL; 1563 return -EINVAL;
1566 1564
1567 vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level); 1565 vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
@@ -2161,10 +2159,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id,
2161 2159
2162 BUG_ON(!vgic_initialized(kvm)); 2160 BUG_ON(!vgic_initialized(kvm));
2163 2161
2164 if (spi > kvm->arch.vgic.nr_irqs)
2165 return -EINVAL;
2166 return kvm_vgic_inject_irq(kvm, 0, spi, level); 2162 return kvm_vgic_inject_irq(kvm, 0, spi, level);
2167
2168} 2163}
2169 2164
2170/* MSI not implemented yet */ 2165/* MSI not implemented yet */
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
index e7ef6447cb82..ec4cfa278f04 100644
--- a/virt/kvm/async_pf.h
+++ b/virt/kvm/async_pf.h
@@ -29,8 +29,8 @@ void kvm_async_pf_deinit(void);
29void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); 29void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
30#else 30#else
31#define kvm_async_pf_init() (0) 31#define kvm_async_pf_init() (0)
32#define kvm_async_pf_deinit() do{}while(0) 32#define kvm_async_pf_deinit() do {} while (0)
33#define kvm_async_pf_vcpu_init(C) do{}while(0) 33#define kvm_async_pf_vcpu_init(C) do {} while (0)
34#endif 34#endif
35 35
36#endif 36#endif
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index b280c20444d1..5cbf190d238c 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -24,9 +24,9 @@ struct kvm_coalesced_mmio_dev {
24int kvm_coalesced_mmio_init(struct kvm *kvm); 24int kvm_coalesced_mmio_init(struct kvm *kvm);
25void kvm_coalesced_mmio_free(struct kvm *kvm); 25void kvm_coalesced_mmio_free(struct kvm *kvm);
26int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 26int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
27 struct kvm_coalesced_mmio_zone *zone); 27 struct kvm_coalesced_mmio_zone *zone);
28int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, 28int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
29 struct kvm_coalesced_mmio_zone *zone); 29 struct kvm_coalesced_mmio_zone *zone);
30 30
31#else 31#else
32 32
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1d56a901e791..21c14244f4c4 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -33,7 +33,6 @@
33 33
34struct kvm_irq_routing_table { 34struct kvm_irq_routing_table {
35 int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; 35 int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
36 struct kvm_kernel_irq_routing_entry *rt_entries;
37 u32 nr_rt_entries; 36 u32 nr_rt_entries;
38 /* 37 /*
39 * Array indexed by gsi. Each entry contains list of irq chips 38 * Array indexed by gsi. Each entry contains list of irq chips
@@ -118,11 +117,32 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
118 return ret; 117 return ret;
119} 118}
120 119
120static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
121{
122 int i;
123
124 if (!rt)
125 return;
126
127 for (i = 0; i < rt->nr_rt_entries; ++i) {
128 struct kvm_kernel_irq_routing_entry *e;
129 struct hlist_node *n;
130
131 hlist_for_each_entry_safe(e, n, &rt->map[i], link) {
132 hlist_del(&e->link);
133 kfree(e);
134 }
135 }
136
137 kfree(rt);
138}
139
121void kvm_free_irq_routing(struct kvm *kvm) 140void kvm_free_irq_routing(struct kvm *kvm)
122{ 141{
123 /* Called only during vm destruction. Nobody can use the pointer 142 /* Called only during vm destruction. Nobody can use the pointer
124 at this stage */ 143 at this stage */
125 kfree(kvm->irq_routing); 144 struct kvm_irq_routing_table *rt = rcu_access_pointer(kvm->irq_routing);
145 free_irq_routing_table(rt);
126} 146}
127 147
128static int setup_routing_entry(struct kvm_irq_routing_table *rt, 148static int setup_routing_entry(struct kvm_irq_routing_table *rt,
@@ -173,25 +193,29 @@ int kvm_set_irq_routing(struct kvm *kvm,
173 193
174 nr_rt_entries += 1; 194 nr_rt_entries += 1;
175 195
176 new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) 196 new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
177 + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
178 GFP_KERNEL); 197 GFP_KERNEL);
179 198
180 if (!new) 199 if (!new)
181 return -ENOMEM; 200 return -ENOMEM;
182 201
183 new->rt_entries = (void *)&new->map[nr_rt_entries];
184
185 new->nr_rt_entries = nr_rt_entries; 202 new->nr_rt_entries = nr_rt_entries;
186 for (i = 0; i < KVM_NR_IRQCHIPS; i++) 203 for (i = 0; i < KVM_NR_IRQCHIPS; i++)
187 for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) 204 for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
188 new->chip[i][j] = -1; 205 new->chip[i][j] = -1;
189 206
190 for (i = 0; i < nr; ++i) { 207 for (i = 0; i < nr; ++i) {
208 struct kvm_kernel_irq_routing_entry *e;
209
210 r = -ENOMEM;
211 e = kzalloc(sizeof(*e), GFP_KERNEL);
212 if (!e)
213 goto out;
214
191 r = -EINVAL; 215 r = -EINVAL;
192 if (ue->flags) 216 if (ue->flags)
193 goto out; 217 goto out;
194 r = setup_routing_entry(new, &new->rt_entries[i], ue); 218 r = setup_routing_entry(new, e, ue);
195 if (r) 219 if (r)
196 goto out; 220 goto out;
197 ++ue; 221 ++ue;
@@ -209,6 +233,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
209 r = 0; 233 r = 0;
210 234
211out: 235out:
212 kfree(new); 236 free_irq_routing_table(new);
237
213 return r; 238 return r;
214} 239}
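The routing table no longer embeds a flat rt_entries array: each entry is allocated separately, linked into the per-GSI list, and free_irq_routing_table() walks every list to release it, which also covers the error path through the shared out: label. A plain-C model of that ownership scheme, using singly linked lists instead of the kernel's hlist:

#include <stdio.h>
#include <stdlib.h>

#define NR_GSI 4

struct entry {
	int gsi;
	struct entry *next;
};

struct routing_table {
	struct entry *map[NR_GSI];      /* one list head per GSI */
};

static void free_routing_table(struct routing_table *rt)
{
	int i;

	if (!rt)
		return;
	for (i = 0; i < NR_GSI; i++) {
		struct entry *e = rt->map[i];

		while (e) {             /* free every entry on the list */
			struct entry *next = e->next;

			free(e);
			e = next;
		}
	}
	free(rt);
}

static int add_route(struct routing_table *rt, int gsi)
{
	struct entry *e = calloc(1, sizeof(*e));        /* one allocation per entry */

	if (!e)
		return -1;
	e->gsi = gsi;
	e->next = rt->map[gsi];
	rt->map[gsi] = e;
	return 0;
}

int main(void)
{
	struct routing_table *rt = calloc(1, sizeof(*rt));

	if (!rt)
		return 1;
	if (add_route(rt, 1) || add_route(rt, 1) || add_route(rt, 3)) {
		free_routing_table(rt);         /* same cleanup as the error path */
		return 1;
	}
	printf("routes installed\n");
	free_routing_table(rt);
	return 0;
}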
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 90977418aeb6..848af90b8091 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,8 +103,7 @@ static void hardware_disable_all(void);
103static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 103static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
104 104
105static void kvm_release_pfn_dirty(pfn_t pfn); 105static void kvm_release_pfn_dirty(pfn_t pfn);
106static void mark_page_dirty_in_slot(struct kvm *kvm, 106static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
107 struct kvm_memory_slot *memslot, gfn_t gfn);
108 107
109__visible bool kvm_rebooting; 108__visible bool kvm_rebooting;
110EXPORT_SYMBOL_GPL(kvm_rebooting); 109EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -440,13 +439,60 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
440 439
441#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 440#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
442 441
443static void kvm_init_memslots_id(struct kvm *kvm) 442static struct kvm_memslots *kvm_alloc_memslots(void)
444{ 443{
445 int i; 444 int i;
446 struct kvm_memslots *slots = kvm->memslots; 445 struct kvm_memslots *slots;
447 446
447 slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
448 if (!slots)
449 return NULL;
450
451 /*
452 * Init kvm generation close to the maximum to easily test the
453 * code of handling generation number wrap-around.
454 */
455 slots->generation = -150;
448 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 456 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
449 slots->id_to_index[i] = slots->memslots[i].id = i; 457 slots->id_to_index[i] = slots->memslots[i].id = i;
458
459 return slots;
460}
461
462static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
463{
464 if (!memslot->dirty_bitmap)
465 return;
466
467 kvfree(memslot->dirty_bitmap);
468 memslot->dirty_bitmap = NULL;
469}
470
471/*
472 * Free any memory in @free but not in @dont.
473 */
474static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
475 struct kvm_memory_slot *dont)
476{
477 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
478 kvm_destroy_dirty_bitmap(free);
479
480 kvm_arch_free_memslot(kvm, free, dont);
481
482 free->npages = 0;
483}
484
485static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
486{
487 struct kvm_memory_slot *memslot;
488
489 if (!slots)
490 return;
491
492 kvm_for_each_memslot(memslot, slots)
493 kvm_free_memslot(kvm, memslot, NULL);
494
495 kvfree(slots);
450} 496}
451 497
452static struct kvm *kvm_create_vm(unsigned long type) 498static struct kvm *kvm_create_vm(unsigned long type)
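kvm_alloc_memslots() seeds slots->generation with -150 so the unsigned counter starts just below the wrap point and the MMIO-spte generation-wrap handling is exercised early in every VM's life. A tiny demonstration of how quickly such a counter wraps (a 64-bit counter is used here purely for illustration; the exact width is whatever the kernel declares):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Start just below the wrap boundary, like generation = -150 above. */
	uint64_t generation = (uint64_t)-150;
	int i;

	/* Each memslot update bumps the generation twice: once to mark the
	 * update in flight (odd), once when the new slots are visible (even). */
	for (i = 0; i < 80; i++)
		generation += 2;

	printf("generation after 80 updates: %" PRIu64 "\n", generation);
	printf("wrapped around zero: %s\n", generation < 200 ? "yes" : "no");
	return 0;
}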
@@ -472,17 +518,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
472 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 518 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
473 519
474 r = -ENOMEM; 520 r = -ENOMEM;
475 kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots)); 521 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
476 if (!kvm->memslots) 522 kvm->memslots[i] = kvm_alloc_memslots();
477 goto out_err_no_srcu; 523 if (!kvm->memslots[i])
478 524 goto out_err_no_srcu;
479 /* 525 }
480 * Init kvm generation close to the maximum to easily test the
481 * code of handling generation number wrap-around.
482 */
483 kvm->memslots->generation = -150;
484 526
485 kvm_init_memslots_id(kvm);
486 if (init_srcu_struct(&kvm->srcu)) 527 if (init_srcu_struct(&kvm->srcu))
487 goto out_err_no_srcu; 528 goto out_err_no_srcu;
488 if (init_srcu_struct(&kvm->irq_srcu)) 529 if (init_srcu_struct(&kvm->irq_srcu))
@@ -523,7 +564,8 @@ out_err_no_srcu:
523out_err_no_disable: 564out_err_no_disable:
524 for (i = 0; i < KVM_NR_BUSES; i++) 565 for (i = 0; i < KVM_NR_BUSES; i++)
525 kfree(kvm->buses[i]); 566 kfree(kvm->buses[i]);
526 kvfree(kvm->memslots); 567 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
568 kvm_free_memslots(kvm, kvm->memslots[i]);
527 kvm_arch_free_vm(kvm); 569 kvm_arch_free_vm(kvm);
528 return ERR_PTR(r); 570 return ERR_PTR(r);
529} 571}
@@ -540,40 +582,6 @@ void *kvm_kvzalloc(unsigned long size)
540 return kzalloc(size, GFP_KERNEL); 582 return kzalloc(size, GFP_KERNEL);
541} 583}
542 584
543static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
544{
545 if (!memslot->dirty_bitmap)
546 return;
547
548 kvfree(memslot->dirty_bitmap);
549 memslot->dirty_bitmap = NULL;
550}
551
552/*
553 * Free any memory in @free but not in @dont.
554 */
555static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free,
556 struct kvm_memory_slot *dont)
557{
558 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
559 kvm_destroy_dirty_bitmap(free);
560
561 kvm_arch_free_memslot(kvm, free, dont);
562
563 free->npages = 0;
564}
565
566static void kvm_free_physmem(struct kvm *kvm)
567{
568 struct kvm_memslots *slots = kvm->memslots;
569 struct kvm_memory_slot *memslot;
570
571 kvm_for_each_memslot(memslot, slots)
572 kvm_free_physmem_slot(kvm, memslot, NULL);
573
574 kvfree(kvm->memslots);
575}
576
577static void kvm_destroy_devices(struct kvm *kvm) 585static void kvm_destroy_devices(struct kvm *kvm)
578{ 586{
579 struct list_head *node, *tmp; 587 struct list_head *node, *tmp;
@@ -607,7 +615,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
607#endif 615#endif
608 kvm_arch_destroy_vm(kvm); 616 kvm_arch_destroy_vm(kvm);
609 kvm_destroy_devices(kvm); 617 kvm_destroy_devices(kvm);
610 kvm_free_physmem(kvm); 618 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
619 kvm_free_memslots(kvm, kvm->memslots[i]);
611 cleanup_srcu_struct(&kvm->irq_srcu); 620 cleanup_srcu_struct(&kvm->irq_srcu);
612 cleanup_srcu_struct(&kvm->srcu); 621 cleanup_srcu_struct(&kvm->srcu);
613 kvm_arch_free_vm(kvm); 622 kvm_arch_free_vm(kvm);
@@ -670,8 +679,6 @@ static void update_memslots(struct kvm_memslots *slots,
670 WARN_ON(mslots[i].id != id); 679 WARN_ON(mslots[i].id != id);
671 if (!new->npages) { 680 if (!new->npages) {
672 WARN_ON(!mslots[i].npages); 681 WARN_ON(!mslots[i].npages);
673 new->base_gfn = 0;
674 new->flags = 0;
675 if (mslots[i].npages) 682 if (mslots[i].npages)
676 slots->used_slots--; 683 slots->used_slots--;
677 } else { 684 } else {
@@ -711,7 +718,7 @@ static void update_memslots(struct kvm_memslots *slots,
711 slots->id_to_index[mslots[i].id] = i; 718 slots->id_to_index[mslots[i].id] = i;
712} 719}
713 720
714static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 721static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
715{ 722{
716 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 723 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
717 724
@@ -726,9 +733,9 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
726} 733}
727 734
728static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 735static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
729 struct kvm_memslots *slots) 736 int as_id, struct kvm_memslots *slots)
730{ 737{
731 struct kvm_memslots *old_memslots = kvm->memslots; 738 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
732 739
733 /* 740 /*
734 * Set the low bit in the generation, which disables SPTE caching 741 * Set the low bit in the generation, which disables SPTE caching
@@ -737,7 +744,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
737 WARN_ON(old_memslots->generation & 1); 744 WARN_ON(old_memslots->generation & 1);
738 slots->generation = old_memslots->generation + 1; 745 slots->generation = old_memslots->generation + 1;
739 746
740 rcu_assign_pointer(kvm->memslots, slots); 747 rcu_assign_pointer(kvm->memslots[as_id], slots);
741 synchronize_srcu_expedited(&kvm->srcu); 748 synchronize_srcu_expedited(&kvm->srcu);
742 749
743 /* 750 /*
@@ -747,7 +754,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
747 */ 754 */
748 slots->generation++; 755 slots->generation++;
749 756
750 kvm_arch_memslots_updated(kvm); 757 kvm_arch_memslots_updated(kvm, slots);
751 758
752 return old_memslots; 759 return old_memslots;
753} 760}
@@ -761,7 +768,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
  * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-			    struct kvm_userspace_memory_region *mem)
+			    const struct kvm_userspace_memory_region *mem)
 {
	int r;
	gfn_t base_gfn;
@@ -769,6 +776,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
	struct kvm_memory_slot *slot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots = NULL, *old_memslots;
+	int as_id, id;
	enum kvm_mr_change change;
 
	r = check_memory_region_flags(mem);
@@ -776,36 +784,36 @@ int __kvm_set_memory_region(struct kvm *kvm,
		goto out;
 
	r = -EINVAL;
+	as_id = mem->slot >> 16;
+	id = (u16)mem->slot;
+
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
-	if ((mem->slot < KVM_USER_MEM_SLOTS) &&
+	if ((id < KVM_USER_MEM_SLOTS) &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
-	if (mem->slot >= KVM_MEM_SLOTS_NUM)
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;
 
-	slot = id_to_memslot(kvm->memslots, mem->slot);
+	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;
 
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;
 
-	if (!npages)
-		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
	new = old = *slot;
 
-	new.id = mem->slot;
+	new.id = id;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;
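
The decode above (as_id = mem->slot >> 16, id = (u16)mem->slot) means userspace now selects an address space by packing its id into the upper 16 bits of the slot field passed to KVM_SET_USER_MEMORY_REGION. A minimal, hedged userspace sketch; set_region and the parameter values are placeholders, not part of this series:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Register guest memory in a given KVM address space; as_id 0 is the
 * default space.  The slot number occupies the low 16 bits and the
 * address space id the high 16 bits, matching the decode in
 * __kvm_set_memory_region(). */
static int set_region(int vm_fd, unsigned short as_id, unsigned short slot,
		      unsigned long long gpa, unsigned long long size,
		      void *host_mem)
{
	struct kvm_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.slot = ((unsigned int)as_id << 16) | slot;
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (unsigned long)host_mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}

Passing as_id 0 keeps the old behaviour, so existing callers that never set the high bits are unaffected.
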
@@ -828,17 +836,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
				goto out;
			}
		}
-	} else if (old.npages) {
+	} else {
+		if (!old.npages)
+			goto out;
+
		change = KVM_MR_DELETE;
-	} else /* Modify a non-existent slot: disallowed. */
-		goto out;
+		new.base_gfn = 0;
+		new.flags = 0;
+	}
 
	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		r = -EEXIST;
-		kvm_for_each_memslot(slot, kvm->memslots) {
+		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
			if ((slot->id >= KVM_USER_MEM_SLOTS) ||
-			    (slot->id == mem->slot))
+			    (slot->id == id))
				continue;
			if (!((base_gfn + npages <= slot->base_gfn) ||
			      (base_gfn >= slot->base_gfn + slot->npages)))
@@ -867,13 +879,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
	slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
	if (!slots)
		goto out_free;
-	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
 
	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-		slot = id_to_memslot(slots, mem->slot);
+		slot = id_to_memslot(slots, id);
		slot->flags |= KVM_MEMSLOT_INVALID;
 
-		old_memslots = install_new_memslots(kvm, slots);
+		old_memslots = install_new_memslots(kvm, as_id, slots);
 
		/* slot was deleted or moved, clear iommu mapping */
		kvm_iommu_unmap_pages(kvm, &old);
@@ -898,18 +910,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
	if (r)
		goto out_slots;
 
-	/* actual memory is freed via old in kvm_free_physmem_slot below */
+	/* actual memory is freed via old in kvm_free_memslot below */
	if (change == KVM_MR_DELETE) {
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	}
 
	update_memslots(slots, &new);
-	old_memslots = install_new_memslots(kvm, slots);
+	old_memslots = install_new_memslots(kvm, as_id, slots);
 
-	kvm_arch_commit_memory_region(kvm, mem, &old, change);
+	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
 
-	kvm_free_physmem_slot(kvm, &old, &new);
+	kvm_free_memslot(kvm, &old, &new);
	kvfree(old_memslots);
 
	/*
@@ -931,14 +943,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
 out_slots:
	kvfree(slots);
 out_free:
-	kvm_free_physmem_slot(kvm, &new, &old);
+	kvm_free_memslot(kvm, &new, &old);
 out:
	return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  struct kvm_userspace_memory_region *mem)
+			  const struct kvm_userspace_memory_region *mem)
 {
	int r;
 
@@ -952,24 +964,29 @@ EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					   struct kvm_userspace_memory_region *mem)
 {
-	if (mem->slot >= KVM_USER_MEM_SLOTS)
+	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;
+
	return kvm_set_memory_region(kvm, mem);
 }
 
 int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
 {
+	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
-	int r, i;
+	int r, i, as_id, id;
	unsigned long n;
	unsigned long any = 0;
 
	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		goto out;
 
-	memslot = id_to_memslot(kvm->memslots, log->slot);
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;
@@ -1018,17 +1035,21 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 int kvm_get_dirty_log_protect(struct kvm *kvm,
			struct kvm_dirty_log *log, bool *is_dirty)
 {
+	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
-	int r, i;
+	int r, i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
 
	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		goto out;
 
-	memslot = id_to_memslot(kvm->memslots, log->slot);
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
 
	dirty_bitmap = memslot->dirty_bitmap;
	r = -ENOENT;
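
Both dirty-log paths now apply the same packing to log->slot, so a dirty bitmap can be fetched per address space. A hedged userspace sketch; get_dirty_log is a made-up helper name and the caller must size the bitmap for the slot, as before:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch the dirty bitmap of a slot in a (possibly non-default) address
 * space.  log.slot uses the same packing as KVM_SET_USER_MEMORY_REGION:
 * address space id in bits 31:16, slot number in bits 15:0. */
static int get_dirty_log(int vm_fd, unsigned short as_id, unsigned short slot,
			 void *bitmap)
{
	struct kvm_dirty_log log;

	memset(&log, 0, sizeof(log));
	log.slot = ((unsigned int)as_id << 16) | slot;
	log.dirty_bitmap = bitmap;

	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
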
@@ -1091,6 +1112,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+}
+
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
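
kvm_vcpu_gfn_to_memslot resolves the gfn against kvm_vcpu_memslots(vcpu), that is, against the memslots of whichever address space the vcpu is currently using (the x86 SMM case from the merge description). A hedged usage sketch; example_gfn_is_backed is an invented name, not an in-tree helper:

/* Illustrative only: code running in vcpu context should prefer the
 * kvm_vcpu_* accessors so the lookup happens in the vcpu's current
 * address space rather than always in address space 0. */
static bool example_gfn_is_backed(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return slot && !(slot->flags & KVM_MEMSLOT_INVALID);
}
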
@@ -1166,6 +1192,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
+unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
+
 /*
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
@@ -1188,6 +1220,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
	return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
+unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
	unsigned long start, int write, struct page **page)
 {
@@ -1355,9 +1394,8 @@ exit:
	return pfn;
 }
 
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-		     bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+			   bool *async, bool write_fault, bool *writable)
 {
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
@@ -1376,65 +1414,59 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
	return hva_to_pfn(addr, atomic, async, write_fault,
			  writable);
 }
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
-			  bool write_fault, bool *writable)
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+		      bool *writable)
 {
-	struct kvm_memory_slot *slot;
-
-	if (async)
-		*async = false;
-
-	slot = gfn_to_memslot(kvm, gfn);
-
-	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
-				    writable);
+	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
+				    write_fault, writable);
 }
+EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable)
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn);
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
-		      bool *writable)
+pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
 }
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-			    int nr_pages)
+int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+			    struct page **pages, int nr_pages)
 {
	unsigned long addr;
	gfn_t entry;
 
-	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
+	addr = gfn_to_hva_many(slot, gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;
 
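
With __gfn_to_pfn_memslot exported and the static __gfn_to_pfn wrapper gone, the kvm-wide translators are thin wrappers that first pick a memslot and then translate within it. A hedged illustration of that composition; example_translate is an invented name, not part of the patch:

/* Illustrative only: equivalent to gfn_to_pfn(kvm, gfn) after this change,
 * spelled out as the explicit two-step lookup the wrappers now perform. */
static pfn_t example_translate(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	/* atomic=false, async=NULL, write_fault=true, writable=NULL */
	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}
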
@@ -1468,6 +1500,16 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	pfn_t pfn;
+
+	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
+
+	return kvm_pfn_to_page(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
+
 void kvm_release_page_clean(struct page *page)
 {
	WARN_ON(is_error_page(page));
@@ -1530,13 +1572,13 @@ static int next_segment(unsigned long len, int offset)
		return len;
 }
 
-int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
-			int len)
+static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
+				 void *data, int offset, int len)
 {
	int r;
	unsigned long addr;
 
-	addr = gfn_to_hva_prot(kvm, gfn, NULL);
+	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user(data, (void __user *)addr + offset, len);
@@ -1544,8 +1586,25 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
		return -EFAULT;
	return 0;
 }
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+			int len)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+}
 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
 
+int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
+			     int offset, int len)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
+
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 {
	gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -1566,15 +1625,33 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest);
 
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
-			  unsigned long len)
+int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
 {
-	int r;
-	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int seg;
	int offset = offset_in_page(gpa);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		++gfn;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
 
-	addr = gfn_to_hva_prot(kvm, gfn, NULL);
+static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+				   void *data, int offset, unsigned long len)
+{
+	int r;
+	unsigned long addr;
+
+	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
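
kvm_vcpu_read_guest (and kvm_vcpu_write_guest further down) walk a gpa range page by page: next_segment caps each chunk at the end of the current page, and the offset drops to zero for the following pages. A standalone sketch of that arithmetic, with a 4096-byte page size assumed purely for illustration (not kernel code):

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL

/* Mirrors the splitting rule used by kvm_vcpu_read_guest(): each segment
 * is at most the remainder of the current page. */
static unsigned long segment_len(unsigned long len, unsigned long offset)
{
	if (len > EXAMPLE_PAGE_SIZE - offset)
		return EXAMPLE_PAGE_SIZE - offset;
	return len;
}

int main(void)
{
	unsigned long gpa = 0x1ff0;		/* 16 bytes before a page boundary */
	unsigned long len = 4096 + 32;		/* spans three guest pages */
	unsigned long gfn = gpa / EXAMPLE_PAGE_SIZE;
	unsigned long offset = gpa % EXAMPLE_PAGE_SIZE;
	unsigned long seg;

	while ((seg = segment_len(len, offset)) != 0) {
		printf("gfn=%lu offset=%lu len=%lu\n", gfn, offset, seg);
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
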
@@ -1584,25 +1661,63 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
		return -EFAULT;
	return 0;
 }
-EXPORT_SYMBOL(kvm_read_guest_atomic);
 
-int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
-			 int offset, int len)
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int offset = offset_in_page(gpa);
+
+	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
+
+int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       void *data, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	int offset = offset_in_page(gpa);
+
+	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+
+static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+				  const void *data, int offset, int len)
 {
	int r;
	unsigned long addr;
 
-	addr = gfn_to_hva(kvm, gfn);
+	addr = gfn_to_hva_memslot(memslot, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
-	mark_page_dirty(kvm, gfn);
+	mark_page_dirty_in_slot(memslot, gfn);
	return 0;
 }
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
+			 const void *data, int offset, int len)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+}
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
+int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+			      const void *data, int offset, int len)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
+
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
 {
@@ -1624,6 +1739,27 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest);
 
+int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+			 unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int seg;
+	int offset = offset_in_page(gpa);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		++gfn;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
+
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa, unsigned long len)
 {
@@ -1681,7 +1817,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
	r = __copy_to_user((void __user *)ghc->hva, data, len);
	if (r)
		return -EFAULT;
-	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
 
	return 0;
 }
@@ -1739,8 +1875,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm *kvm,
-				    struct kvm_memory_slot *memslot,
+static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
				    gfn_t gfn)
 {
	if (memslot && memslot->dirty_bitmap) {
@@ -1755,10 +1890,19 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
	struct kvm_memory_slot *memslot;
 
	memslot = gfn_to_memslot(kvm, gfn);
-	mark_page_dirty_in_slot(kvm, memslot, gfn);
+	mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
+void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot;
+
+	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	mark_page_dirty_in_slot(memslot, gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
	if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -2488,6 +2632,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
 #endif
+#if KVM_ADDRESS_SPACE_NUM > 1
+	case KVM_CAP_MULTI_ADDRESS_SPACE:
+		return KVM_ADDRESS_SPACE_NUM;
+#endif
	default:
		break;
	}
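
Per the new case above, KVM_CHECK_EXTENSION(KVM_CAP_MULTI_ADDRESS_SPACE) reports KVM_ADDRESS_SPACE_NUM, so userspace can discover at run time whether the slot/as_id packing is available. A hedged sketch, assuming a linux/kvm.h new enough to define the capability constant; query_address_spaces is an invented helper name:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* kvm_fd is the /dev/kvm system fd.  A return of 0 means the capability
 * (and therefore the extra address spaces) is absent on this kernel. */
static int query_address_spaces(int kvm_fd)
{
	int n = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MULTI_ADDRESS_SPACE);

	if (n < 0)
		perror("KVM_CHECK_EXTENSION");
	else
		printf("address spaces: %d\n", n);
	return n;
}
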
@@ -2882,18 +3030,12 @@ static int hardware_enable_all(void)
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
 {
-	int cpu = (long)v;
-
	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
-		pr_info("kvm: disabling virtualization on CPU%d\n",
-			cpu);
		hardware_disable();
		break;
	case CPU_STARTING:
-		pr_info("kvm: enabling virtualization on CPU%d\n",
-			cpu);
		hardware_enable();
		break;
	}