-rw-r--r--  Documentation/virtual/kvm/api.txt | 35
-rw-r--r--  Documentation/virtual/kvm/devices/arm-vgic.txt | 37
-rw-r--r--  Documentation/virtual/kvm/devices/vm.txt | 59
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 5
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 6
-rw-r--r--  arch/arm/include/asm/kvm_mmio.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 21
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h | 1
-rw-r--r--  arch/arm/include/uapi/asm/kvm.h | 2
-rw-r--r--  arch/arm/kvm/Kconfig | 2
-rw-r--r--  arch/arm/kvm/Makefile | 1
-rw-r--r--  arch/arm/kvm/arm.c | 58
-rw-r--r--  arch/arm/kvm/handle_exit.c | 8
-rw-r--r--  arch/arm/kvm/interrupts.S | 11
-rw-r--r--  arch/arm/kvm/mmu.c | 271
-rw-r--r--  arch/arm/kvm/psci.c | 17
-rw-r--r--  arch/arm/kvm/trace.h | 11
-rw-r--r--  arch/arm64/include/asm/esr.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 10
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 7
-rw-r--r--  arch/arm64/include/asm/kvm_mmio.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 21
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 1
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h | 9
-rw-r--r--  arch/arm64/kernel/asm-offsets.c | 1
-rw-r--r--  arch/arm64/kvm/Kconfig | 2
-rw-r--r--  arch/arm64/kvm/Makefile | 2
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 13
-rw-r--r--  arch/arm64/kvm/hyp.S | 22
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 40
-rw-r--r--  arch/arm64/kvm/trace.h | 55
-rw-r--r--  arch/arm64/kvm/vgic-v3-switch.S | 14
-rw-r--r--  arch/ia64/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 1
-rw-r--r--  arch/mips/kvm/locore.S | 2
-rw-r--r--  arch/mips/kvm/mips.c | 23
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 1
-rw-r--r--  arch/powerpc/kvm/book3s.c | 1
-rw-r--r--  arch/powerpc/kvm/booke.c | 1
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 3
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 56
-rw-r--r--  arch/s390/include/asm/sclp.h | 4
-rw-r--r--  arch/s390/include/asm/sysinfo.h | 10
-rw-r--r--  arch/s390/include/uapi/asm/kvm.h | 37
-rw-r--r--  arch/s390/kernel/sysinfo.c | 29
-rw-r--r--  arch/s390/kvm/gaccess.c | 4
-rw-r--r--  arch/s390/kvm/intercept.c | 41
-rw-r--r--  arch/s390/kvm/interrupt.c | 191
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 596
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 19
-rw-r--r--  arch/s390/kvm/priv.c | 13
-rw-r--r--  arch/s390/kvm/sigp.c | 160
-rw-r--r--  arch/s390/kvm/trace-s390.h | 14
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 59
-rw-r--r--  arch/x86/include/asm/vmx.h | 4
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h | 3
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/emulate.c | 230
-rw-r--r--  arch/x86/kvm/ioapic.h | 2
-rw-r--r--  arch/x86/kvm/iommu.c | 4
-rw-r--r--  arch/x86/kvm/lapic.c | 147
-rw-r--r--  arch/x86/kvm/lapic.h | 6
-rw-r--r--  arch/x86/kvm/mmu.c | 351
-rw-r--r--  arch/x86/kvm/mmu.h | 17
-rw-r--r--  arch/x86/kvm/svm.c | 4
-rw-r--r--  arch/x86/kvm/trace.h | 38
-rw-r--r--  arch/x86/kvm/vmx.c | 1086
-rw-r--r--  arch/x86/kvm/x86.c | 209
-rw-r--r--  arch/x86/kvm/x86.h | 3
-rw-r--r--  drivers/irqchip/irq-gic-v3.c | 14
-rw-r--r--  drivers/s390/char/sclp_early.c | 8
-rw-r--r--  include/kvm/arm_vgic.h | 43
-rw-r--r--  include/linux/irqchip/arm-gic-v3.h | 44
-rw-r--r--  include/linux/kvm_host.h | 17
-rw-r--r--  include/trace/events/kvm.h | 19
-rw-r--r--  include/uapi/linux/kvm.h | 9
-rw-r--r--  virt/kvm/Kconfig | 10
-rw-r--r--  virt/kvm/arm/vgic-v2-emul.c | 847
-rw-r--r--  virt/kvm/arm/vgic-v2.c | 4
-rw-r--r--  virt/kvm/arm/vgic-v3-emul.c | 1036
-rw-r--r--  virt/kvm/arm/vgic-v3.c | 82
-rw-r--r--  virt/kvm/arm/vgic.c | 1127
-rw-r--r--  virt/kvm/arm/vgic.h | 123
-rw-r--r--  virt/kvm/kvm_main.c | 144
88 files changed, 6026 insertions(+), 1626 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 0007fef4ed81..b112efc816f1 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -612,11 +612,14 @@ Type: vm ioctl
 Parameters: none
 Returns: 0 on success, -1 on error
 
-Creates an interrupt controller model in the kernel. On x86, creates a virtual
-ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
-local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
-only go to the IOAPIC. On ARM/arm64, a GIC is
-created. On s390, a dummy irq routing table is created.
+Creates an interrupt controller model in the kernel.
+On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up
+future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both
+PIC and IOAPIC; GSI 16-23 only go to the IOAPIC.
+On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of
+KVM_CREATE_DEVICE, which also supports creating a GICv2. Using
+KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2.
+On s390, a dummy irq routing table is created.
 
 Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled
 before KVM_CREATE_IRQCHIP can be used.
@@ -2312,7 +2315,7 @@ struct kvm_s390_interrupt {
 
 type can be one of the following:
 
-KVM_S390_SIGP_STOP (vcpu) - sigp restart
+KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm
 KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
 KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
 KVM_S390_RESTART (vcpu) - restart
@@ -3225,3 +3228,23 @@ userspace from doing that.
 If the hcall number specified is not one that has an in-kernel
 implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL
 error.
+
+7.2 KVM_CAP_S390_USER_SIGP
+
+Architectures: s390
+Parameters: none
+
+This capability controls which SIGP orders will be handled completely in user
+space. With this capability enabled, all fast orders will be handled completely
+in the kernel:
+- SENSE
+- SENSE RUNNING
+- EXTERNAL CALL
+- EMERGENCY SIGNAL
+- CONDITIONAL EMERGENCY SIGNAL
+
+All other orders will be handled completely in user space.
+
+Only privileged operation exceptions will be checked for in the kernel (or even
+in the hardware prior to interception). If this capability is not enabled, the
+old way of handling SIGP orders is used (partially in kernel and user space).
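
For context on the capability mechanism used by KVM_CAP_S390_USER_SIGP above, a
minimal userspace sketch of enabling it on a VM file descriptor might look as
follows (vm_fd is an already-created VM; availability is probed first since
older kernels do not offer the capability, and error handling is trimmed):

/* Sketch only: enable KVM_CAP_S390_USER_SIGP on an existing VM fd. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_user_sigp(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_S390_USER_SIGP,
        };

        /* Probe first; a return <= 0 means the kernel does not offer it. */
        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_USER_SIGP) <= 0)
                return -1;

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

Once enabled, the non-fast SIGP orders listed above intercept to user space,
which is then responsible for emulating them.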
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index df8b0c7540b6..3fb905429e8a 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -3,22 +3,42 @@ ARM Virtual Generic Interrupt Controller (VGIC)
 
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
+  KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
 
 Only one VGIC instance may be instantiated through either this API or the
 legacy KVM_CREATE_IRQCHIP api.  The created VGIC will act as the VM interrupt
 controller, requiring emulated user-space devices to inject interrupts to the
 VGIC instead of directly to CPUs.
 
+Creating a guest GICv3 device requires a host GICv3 as well.
+GICv3 implementations with hardware compatibility support allow a guest GICv2
+as well.
+
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
   Attributes:
     KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit)
       Base address in the guest physical address space of the GIC distributor
-      register mappings.
+      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
+      This address needs to be 4K aligned and the region covers 4 KByte.
 
     KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit)
       Base address in the guest physical address space of the GIC virtual cpu
-      interface register mappings.
+      interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
+      This address needs to be 4K aligned and the region covers 4 KByte.
+
+    KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 distributor
+      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned and the region covers 64 KByte.
+
+    KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3
+      redistributor register mappings. There are two 64K pages for each
+      VCPU and all of the redistributor pages are contiguous.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned.
+
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
@@ -36,6 +56,7 @@ Groups:
     the register.
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
+    - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
     -ENODEV: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
@@ -68,6 +89,7 @@ Groups:
 
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
+    - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
     -ENODEV: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
@@ -81,3 +103,14 @@ Groups:
     -EINVAL: Value set is out of the expected range
     -EBUSY: Value has already be set, or GIC has already been initialized
       with default values.
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+    KVM_DEV_ARM_VGIC_CTRL_INIT
+      request the initialization of the VGIC, no additional parameter in
+      kvm_device_attr.addr.
+  Errors:
+    -ENXIO: VGIC not properly configured as required prior to calling
+     this attribute
+    -ENODEV: no online VCPU
+    -ENOMEM: memory shortage when allocating vgic internal data
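
To illustrate the device API documented above, a hedged userspace sketch of
creating a GICv3 with the new address types and requesting
KVM_DEV_ARM_VGIC_CTRL_INIT could look like this; dist_base and redist_base are
hypothetical guest-physical addresses chosen by the VMM, and all VCPUs are
assumed to have been created already:

/* Sketch only: create a vGICv3, place its regions, then request init. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int create_vgic_v3(int vm_fd, __u64 dist_base, __u64 redist_base)
{
        struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
        struct kvm_device_attr attr;

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
                return -1;

        attr = (struct kvm_device_attr) {
                .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
                .attr  = KVM_VGIC_V3_ADDR_TYPE_DIST,    /* 64K aligned, 64 KByte */
                .addr  = (__u64)(unsigned long)&dist_base,
        };
        if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr))
                return -1;

        attr.attr = KVM_VGIC_V3_ADDR_TYPE_REDIST;       /* 2 x 64K per VCPU, contiguous */
        attr.addr = (__u64)(unsigned long)&redist_base;
        if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr))
                return -1;

        /* Finally ask KVM to initialize the VGIC; see the errors listed above. */
        attr = (struct kvm_device_attr) {
                .group = KVM_DEV_ARM_VGIC_GRP_CTRL,
                .attr  = KVM_DEV_ARM_VGIC_CTRL_INIT,
        };
        return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}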
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index d426fc87fe93..5542c4641a3c 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -24,3 +24,62 @@ Returns: 0
 
 Clear the CMMA status for all guest pages, so any pages the guest marked
 as unused are again used any may not be reclaimed by the host.
+
+1.3. ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE
+Parameters: in attr->addr the address for the new limit of guest memory
+Returns: -EFAULT if the given address is not accessible
+         -EINVAL if the virtual machine is of type UCONTROL
+         -E2BIG if the given guest memory is too big for that machine
+         -EBUSY if a vcpu is already defined
+         -ENOMEM if not enough memory is available for a new shadow guest mapping
+         0 otherwise
+
+Allows userspace to query the actual limit and set a new limit for
+the maximum guest memory size. The limit will be rounded up to
+2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by
+the number of page table levels.
+
+2. GROUP: KVM_S390_VM_CPU_MODEL
+Architectures: s390
+
+2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o)
+
+Allows user space to retrieve machine and kvm specific cpu related information:
+
+struct kvm_s390_vm_cpu_machine {
+       __u64 cpuid;           # CPUID of host
+       __u32 ibc;             # IBC level range offered by host
+       __u8  pad[4];
+       __u64 fac_mask[256];   # set of cpu facilities enabled by KVM
+       __u64 fac_list[256];   # set of cpu facilities offered by host
+}
+
+Parameters: address of buffer to store the machine related cpu data
+            of type struct kvm_s390_vm_cpu_machine*
+Returns:    -EFAULT if the given address is not accessible from kernel space
+            -ENOMEM if not enough memory is available to process the ioctl
+            0 in case of success
+
+2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w)
+
+Allows user space to retrieve or request to change cpu related information for a vcpu:
+
+struct kvm_s390_vm_cpu_processor {
+       __u64 cpuid;           # CPUID currently (to be) used by this vcpu
+       __u16 ibc;             # IBC level currently (to be) used by this vcpu
+       __u8  pad[6];
+       __u64 fac_list[256];   # set of cpu facilities currently (to be) used
+                              # by this vcpu
+}
+
+KVM does not enforce or limit the cpu model data in any form. Take the information
+retrieved by means of KVM_S390_VM_CPU_MACHINE as a hint for a reasonable configuration
+setup. Instruction interceptions triggered by additionally set facility bits that
+are not handled by KVM need to be implemented in the VM driver code.
+
+Parameters: address of buffer to store/set the processor related cpu
+            data of type struct kvm_s390_vm_cpu_processor*.
+Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write case)
+            -EFAULT if the given address is not accessible from kernel space
+            -ENOMEM if not enough memory is available to process the ioctl
+            0 in case of success
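
As a rough usage sketch for attribute 2.1 above (not part of this patch), user
space could query the machine CPU model through the VM attribute interface
roughly as follows; the KVM_S390_VM_CPU_* constants and struct definition are
assumed to come from the uapi headers added elsewhere in this series:

/* Sketch only: read KVM_S390_VM_CPU_MACHINE via the VM attribute interface. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int query_cpu_machine(int vm_fd, struct kvm_s390_vm_cpu_machine *machine)
{
        struct kvm_device_attr attr = {
                .group = KVM_S390_VM_CPU_MODEL,
                .attr  = KVM_S390_VM_CPU_MACHINE,
                .addr  = (__u64)(unsigned long)machine,
        };

        /* Probe first; older kernels do not know this attribute group. */
        if (ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr))
                return -1;

        return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}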
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3a67bec72d0c..25410b2d8bc1 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -96,6 +96,7 @@ extern char __kvm_hyp_code_end[];
96 96
97extern void __kvm_flush_vm_context(void); 97extern void __kvm_flush_vm_context(void);
98extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); 98extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
99extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
99 100
100extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 101extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
101#endif 102#endif
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 7b0152321b20..a9c80a2ea1a7 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -23,6 +23,7 @@
23#include <asm/kvm_asm.h> 23#include <asm/kvm_asm.h>
24#include <asm/kvm_mmio.h> 24#include <asm/kvm_mmio.h>
25#include <asm/kvm_arm.h> 25#include <asm/kvm_arm.h>
26#include <asm/cputype.h>
26 27
27unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); 28unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
28unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); 29unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
@@ -177,9 +178,9 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
177 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; 178 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
178} 179}
179 180
180static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) 181static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
181{ 182{
182 return vcpu->arch.cp15[c0_MPIDR]; 183 return vcpu->arch.cp15[c0_MPIDR] & MPIDR_HWID_BITMASK;
183} 184}
184 185
185static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) 186static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 04b4ea0b550a..41008cd7c53f 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -68,6 +68,7 @@ struct kvm_arch {
68 68
69 /* Interrupt controller */ 69 /* Interrupt controller */
70 struct vgic_dist vgic; 70 struct vgic_dist vgic;
71 int max_vcpus;
71}; 72};
72 73
73#define KVM_NR_MEM_OBJS 40 74#define KVM_NR_MEM_OBJS 40
@@ -144,6 +145,7 @@ struct kvm_vm_stat {
144}; 145};
145 146
146struct kvm_vcpu_stat { 147struct kvm_vcpu_stat {
148 u32 halt_successful_poll;
147 u32 halt_wakeup; 149 u32 halt_wakeup;
148}; 150};
149 151
@@ -231,6 +233,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
231int kvm_perf_init(void); 233int kvm_perf_init(void);
232int kvm_perf_teardown(void); 234int kvm_perf_teardown(void);
233 235
236void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
237
238struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
239
234static inline void kvm_arch_hardware_disable(void) {} 240static inline void kvm_arch_hardware_disable(void) {}
235static inline void kvm_arch_hardware_unsetup(void) {} 241static inline void kvm_arch_hardware_unsetup(void) {}
236static inline void kvm_arch_sync_events(struct kvm *kvm) {} 242static inline void kvm_arch_sync_events(struct kvm *kvm) {}
diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h
index adcc0d7d3175..3f83db2f6cf0 100644
--- a/arch/arm/include/asm/kvm_mmio.h
+++ b/arch/arm/include/asm/kvm_mmio.h
@@ -37,6 +37,7 @@ struct kvm_exit_mmio {
37 u8 data[8]; 37 u8 data[8];
38 u32 len; 38 u32 len;
39 bool is_write; 39 bool is_write;
40 void *private;
40}; 41};
41 42
42static inline void kvm_prepare_mmio(struct kvm_run *run, 43static inline void kvm_prepare_mmio(struct kvm_run *run,
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 1bca8f8af442..37ca2a4c6f09 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -115,6 +115,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
115 pmd_val(*pmd) |= L_PMD_S2_RDWR; 115 pmd_val(*pmd) |= L_PMD_S2_RDWR;
116} 116}
117 117
118static inline void kvm_set_s2pte_readonly(pte_t *pte)
119{
120 pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
121}
122
123static inline bool kvm_s2pte_readonly(pte_t *pte)
124{
125 return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
126}
127
128static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
129{
130 pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
131}
132
133static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
134{
135 return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
136}
137
138
118/* Open coded p*d_addr_end that can deal with 64bit addresses */ 139/* Open coded p*d_addr_end that can deal with 64bit addresses */
119#define kvm_pgd_addr_end(addr, end) \ 140#define kvm_pgd_addr_end(addr, end) \
120({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ 141({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 423a5ac09d3a..a745a2a53853 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -129,6 +129,7 @@
129#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ 129#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */
130#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ 130#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
131 131
132#define L_PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[1] */
132#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ 133#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
133 134
134/* 135/*
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 09ee408c1a67..0db25bc32864 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -175,6 +175,8 @@ struct kvm_arch_memory_slot {
175#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 175#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
176#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) 176#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
177#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 177#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
178#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
179#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
178 180
179/* KVM_IRQ_LINE irq field index values */ 181/* KVM_IRQ_LINE irq field index values */
180#define KVM_ARM_IRQ_TYPE_SHIFT 24 182#define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 3afee5f40f4f..338ace78ed18 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -21,8 +21,10 @@ config KVM
21 select PREEMPT_NOTIFIERS 21 select PREEMPT_NOTIFIERS
22 select ANON_INODES 22 select ANON_INODES
23 select HAVE_KVM_CPU_RELAX_INTERCEPT 23 select HAVE_KVM_CPU_RELAX_INTERCEPT
24 select HAVE_KVM_ARCH_TLB_FLUSH_ALL
24 select KVM_MMIO 25 select KVM_MMIO
25 select KVM_ARM_HOST 26 select KVM_ARM_HOST
27 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
26 select SRCU 28 select SRCU
27 depends on ARM_VIRT_EXT && ARM_LPAE 29 depends on ARM_VIRT_EXT && ARM_LPAE
28 ---help--- 30 ---help---
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index f7057ed045b6..443b8bea43e9 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -22,4 +22,5 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
22obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o 22obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
23obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o 23obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
24obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o 24obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
25obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
25obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o 26obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 0b0d58a905c4..07e7eb1d7ab6 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -132,6 +132,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
132 /* Mark the initial VMID generation invalid */ 132 /* Mark the initial VMID generation invalid */
133 kvm->arch.vmid_gen = 0; 133 kvm->arch.vmid_gen = 0;
134 134
135 /* The maximum number of VCPUs is limited by the host's GIC model */
136 kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus();
137
135 return ret; 138 return ret;
136out_free_stage2_pgd: 139out_free_stage2_pgd:
137 kvm_free_stage2_pgd(kvm); 140 kvm_free_stage2_pgd(kvm);
@@ -218,6 +221,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
218 goto out; 221 goto out;
219 } 222 }
220 223
224 if (id >= kvm->arch.max_vcpus) {
225 err = -EINVAL;
226 goto out;
227 }
228
221 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 229 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
222 if (!vcpu) { 230 if (!vcpu) {
223 err = -ENOMEM; 231 err = -ENOMEM;
@@ -241,9 +249,8 @@ out:
241 return ERR_PTR(err); 249 return ERR_PTR(err);
242} 250}
243 251
244int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 252void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
245{ 253{
246 return 0;
247} 254}
248 255
249void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 256void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -777,9 +784,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
777 } 784 }
778} 785}
779 786
787/**
788 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
789 * @kvm: kvm instance
790 * @log: slot id and address to which we copy the log
791 *
792 * Steps 1-4 below provide general overview of dirty page logging. See
793 * kvm_get_dirty_log_protect() function description for additional details.
794 *
795 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
796 * always flush the TLB (step 4) even if previous step failed and the dirty
797 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
798 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
799 * writes will be marked dirty for next log read.
800 *
801 * 1. Take a snapshot of the bit and clear it if needed.
802 * 2. Write protect the corresponding page.
803 * 3. Copy the snapshot to the userspace.
804 * 4. Flush TLB's if needed.
805 */
780int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 806int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
781{ 807{
782 return -EINVAL; 808 bool is_dirty = false;
809 int r;
810
811 mutex_lock(&kvm->slots_lock);
812
813 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
814
815 if (is_dirty)
816 kvm_flush_remote_tlbs(kvm);
817
818 mutex_unlock(&kvm->slots_lock);
819 return r;
783} 820}
784 821
785static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, 822static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
@@ -811,7 +848,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
811 switch (ioctl) { 848 switch (ioctl) {
812 case KVM_CREATE_IRQCHIP: { 849 case KVM_CREATE_IRQCHIP: {
813 if (vgic_present) 850 if (vgic_present)
814 return kvm_vgic_create(kvm); 851 return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
815 else 852 else
816 return -ENXIO; 853 return -ENXIO;
817 } 854 }
@@ -1035,6 +1072,19 @@ static void check_kvm_target_cpu(void *ret)
1035 *(int *)ret = kvm_target_cpu(); 1072 *(int *)ret = kvm_target_cpu();
1036} 1073}
1037 1074
1075struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
1076{
1077 struct kvm_vcpu *vcpu;
1078 int i;
1079
1080 mpidr &= MPIDR_HWID_BITMASK;
1081 kvm_for_each_vcpu(i, vcpu, kvm) {
1082 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
1083 return vcpu;
1084 }
1085 return NULL;
1086}
1087
1038/** 1088/**
1039 * Initialize Hyp-mode and memory mappings on all CPUs. 1089 * Initialize Hyp-mode and memory mappings on all CPUs.
1040 */ 1090 */
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index a96a8043277c..95f12b2ccdcb 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -87,11 +87,13 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
87 */ 87 */
88static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) 88static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
89{ 89{
90 trace_kvm_wfi(*vcpu_pc(vcpu)); 90 if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) {
91 if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) 91 trace_kvm_wfx(*vcpu_pc(vcpu), true);
92 kvm_vcpu_on_spin(vcpu); 92 kvm_vcpu_on_spin(vcpu);
93 else 93 } else {
94 trace_kvm_wfx(*vcpu_pc(vcpu), false);
94 kvm_vcpu_block(vcpu); 95 kvm_vcpu_block(vcpu);
96 }
95 97
96 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 98 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
97 99
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 01dcb0e752d9..79caf79b304a 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa)
66 bx lr 66 bx lr
67ENDPROC(__kvm_tlb_flush_vmid_ipa) 67ENDPROC(__kvm_tlb_flush_vmid_ipa)
68 68
69/**
70 * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
71 *
72 * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address
73 * parameter
74 */
75
76ENTRY(__kvm_tlb_flush_vmid)
77 b __kvm_tlb_flush_vmid_ipa
78ENDPROC(__kvm_tlb_flush_vmid)
79
69/******************************************************************** 80/********************************************************************
70 * Flush TLBs and instruction caches of all CPUs inside the inner-shareable 81 * Flush TLBs and instruction caches of all CPUs inside the inner-shareable
71 * domain, for all VMIDs 82 * domain, for all VMIDs
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 136662547ca6..3e6859bc3e11 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -45,6 +45,26 @@ static phys_addr_t hyp_idmap_vector;
45#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) 45#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
46 46
47#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) 47#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x))
48#define kvm_pud_huge(_x) pud_huge(_x)
49
50#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
51#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
52
53static bool memslot_is_logging(struct kvm_memory_slot *memslot)
54{
55 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
56}
57
58/**
59 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
60 * @kvm: pointer to kvm structure.
61 *
62 * Interface to HYP function to flush all VM TLB entries
63 */
64void kvm_flush_remote_tlbs(struct kvm *kvm)
65{
66 kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
67}
48 68
49static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 69static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
50{ 70{
@@ -78,6 +98,25 @@ static void kvm_flush_dcache_pud(pud_t pud)
78 __kvm_flush_dcache_pud(pud); 98 __kvm_flush_dcache_pud(pud);
79} 99}
80 100
101/**
102 * stage2_dissolve_pmd() - clear and flush huge PMD entry
103 * @kvm: pointer to kvm structure.
104 * @addr: IPA
105 * @pmd: pmd pointer for IPA
106 *
107 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
108 * pages in the range dirty.
109 */
110static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
111{
112 if (!kvm_pmd_huge(*pmd))
113 return;
114
115 pmd_clear(pmd);
116 kvm_tlb_flush_vmid_ipa(kvm, addr);
117 put_page(virt_to_page(pmd));
118}
119
81static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 120static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
82 int min, int max) 121 int min, int max)
83{ 122{
@@ -819,10 +858,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
819} 858}
820 859
821static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 860static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
822 phys_addr_t addr, const pte_t *new_pte, bool iomap) 861 phys_addr_t addr, const pte_t *new_pte,
862 unsigned long flags)
823{ 863{
824 pmd_t *pmd; 864 pmd_t *pmd;
825 pte_t *pte, old_pte; 865 pte_t *pte, old_pte;
866 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
867 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
868
869 VM_BUG_ON(logging_active && !cache);
826 870
827 /* Create stage-2 page table mapping - Levels 0 and 1 */ 871 /* Create stage-2 page table mapping - Levels 0 and 1 */
828 pmd = stage2_get_pmd(kvm, cache, addr); 872 pmd = stage2_get_pmd(kvm, cache, addr);
@@ -834,6 +878,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
834 return 0; 878 return 0;
835 } 879 }
836 880
881 /*
882 * While dirty page logging - dissolve huge PMD, then continue on to
883 * allocate page.
884 */
885 if (logging_active)
886 stage2_dissolve_pmd(kvm, addr, pmd);
887
837 /* Create stage-2 page mappings - Level 2 */ 888 /* Create stage-2 page mappings - Level 2 */
838 if (pmd_none(*pmd)) { 889 if (pmd_none(*pmd)) {
839 if (!cache) 890 if (!cache)
@@ -890,7 +941,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
890 if (ret) 941 if (ret)
891 goto out; 942 goto out;
892 spin_lock(&kvm->mmu_lock); 943 spin_lock(&kvm->mmu_lock);
893 ret = stage2_set_pte(kvm, &cache, addr, &pte, true); 944 ret = stage2_set_pte(kvm, &cache, addr, &pte,
945 KVM_S2PTE_FLAG_IS_IOMAP);
894 spin_unlock(&kvm->mmu_lock); 946 spin_unlock(&kvm->mmu_lock);
895 if (ret) 947 if (ret)
896 goto out; 948 goto out;
@@ -957,6 +1009,165 @@ static bool kvm_is_device_pfn(unsigned long pfn)
957 return !pfn_valid(pfn); 1009 return !pfn_valid(pfn);
958} 1010}
959 1011
1012/**
1013 * stage2_wp_ptes - write protect PMD range
1014 * @pmd: pointer to pmd entry
1015 * @addr: range start address
1016 * @end: range end address
1017 */
1018static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1019{
1020 pte_t *pte;
1021
1022 pte = pte_offset_kernel(pmd, addr);
1023 do {
1024 if (!pte_none(*pte)) {
1025 if (!kvm_s2pte_readonly(pte))
1026 kvm_set_s2pte_readonly(pte);
1027 }
1028 } while (pte++, addr += PAGE_SIZE, addr != end);
1029}
1030
1031/**
1032 * stage2_wp_pmds - write protect PUD range
1033 * @pud: pointer to pud entry
1034 * @addr: range start address
1035 * @end: range end address
1036 */
1037static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
1038{
1039 pmd_t *pmd;
1040 phys_addr_t next;
1041
1042 pmd = pmd_offset(pud, addr);
1043
1044 do {
1045 next = kvm_pmd_addr_end(addr, end);
1046 if (!pmd_none(*pmd)) {
1047 if (kvm_pmd_huge(*pmd)) {
1048 if (!kvm_s2pmd_readonly(pmd))
1049 kvm_set_s2pmd_readonly(pmd);
1050 } else {
1051 stage2_wp_ptes(pmd, addr, next);
1052 }
1053 }
1054 } while (pmd++, addr = next, addr != end);
1055}
1056
1057/**
1058 * stage2_wp_puds - write protect PGD range
1059 * @pgd: pointer to pgd entry
1060 * @addr: range start address
1061 * @end: range end address
1062 *
1063 * Process PUD entries, for a huge PUD we cause a panic.
1064 */
1065static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
1066{
1067 pud_t *pud;
1068 phys_addr_t next;
1069
1070 pud = pud_offset(pgd, addr);
1071 do {
1072 next = kvm_pud_addr_end(addr, end);
1073 if (!pud_none(*pud)) {
1074 /* TODO:PUD not supported, revisit later if supported */
1075 BUG_ON(kvm_pud_huge(*pud));
1076 stage2_wp_pmds(pud, addr, next);
1077 }
1078 } while (pud++, addr = next, addr != end);
1079}
1080
1081/**
1082 * stage2_wp_range() - write protect stage2 memory region range
1083 * @kvm: The KVM pointer
1084 * @addr: Start address of range
1085 * @end: End address of range
1086 */
1087static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1088{
1089 pgd_t *pgd;
1090 phys_addr_t next;
1091
1092 pgd = kvm->arch.pgd + pgd_index(addr);
1093 do {
1094 /*
1095 * Release kvm_mmu_lock periodically if the memory region is
1096 * large. Otherwise, we may see kernel panics with
1097 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1098 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1099 * will also starve other vCPUs.
1100 */
1101 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
1102 cond_resched_lock(&kvm->mmu_lock);
1103
1104 next = kvm_pgd_addr_end(addr, end);
1105 if (pgd_present(*pgd))
1106 stage2_wp_puds(pgd, addr, next);
1107 } while (pgd++, addr = next, addr != end);
1108}
1109
1110/**
1111 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1112 * @kvm: The KVM pointer
1113 * @slot: The memory slot to write protect
1114 *
1115 * Called to start logging dirty pages after memory region
1116 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1117 * all present PMD and PTEs are write protected in the memory region.
1118 * Afterwards read of dirty page log can be called.
1119 *
1120 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1121 * serializing operations for VM memory regions.
1122 */
1123void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1124{
1125 struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
1126 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1127 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1128
1129 spin_lock(&kvm->mmu_lock);
1130 stage2_wp_range(kvm, start, end);
1131 spin_unlock(&kvm->mmu_lock);
1132 kvm_flush_remote_tlbs(kvm);
1133}
1134
1135/**
1136 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1137 * @kvm: The KVM pointer
1138 * @slot: The memory slot associated with mask
1139 * @gfn_offset: The gfn offset in memory slot
1140 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1141 * slot to be write protected
1142 *
1143 * Walks bits set in mask write protects the associated pte's. Caller must
1144 * acquire kvm_mmu_lock.
1145 */
1146static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1147 struct kvm_memory_slot *slot,
1148 gfn_t gfn_offset, unsigned long mask)
1149{
1150 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1151 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1152 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1153
1154 stage2_wp_range(kvm, start, end);
1155}
1156
1157/*
1158 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1159 * dirty pages.
1160 *
1161 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1162 * enable dirty logging for them.
1163 */
1164void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1165 struct kvm_memory_slot *slot,
1166 gfn_t gfn_offset, unsigned long mask)
1167{
1168 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1169}
1170
960static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 1171static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
961 unsigned long size, bool uncached) 1172 unsigned long size, bool uncached)
962{ 1173{
@@ -977,6 +1188,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
977 pfn_t pfn; 1188 pfn_t pfn;
978 pgprot_t mem_type = PAGE_S2; 1189 pgprot_t mem_type = PAGE_S2;
979 bool fault_ipa_uncached; 1190 bool fault_ipa_uncached;
1191 bool logging_active = memslot_is_logging(memslot);
1192 unsigned long flags = 0;
980 1193
981 write_fault = kvm_is_write_fault(vcpu); 1194 write_fault = kvm_is_write_fault(vcpu);
982 if (fault_status == FSC_PERM && !write_fault) { 1195 if (fault_status == FSC_PERM && !write_fault) {
@@ -993,7 +1206,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
993 return -EFAULT; 1206 return -EFAULT;
994 } 1207 }
995 1208
996 if (is_vm_hugetlb_page(vma)) { 1209 if (is_vm_hugetlb_page(vma) && !logging_active) {
997 hugetlb = true; 1210 hugetlb = true;
998 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; 1211 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
999 } else { 1212 } else {
@@ -1034,12 +1247,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1034 if (is_error_pfn(pfn)) 1247 if (is_error_pfn(pfn))
1035 return -EFAULT; 1248 return -EFAULT;
1036 1249
1037 if (kvm_is_device_pfn(pfn)) 1250 if (kvm_is_device_pfn(pfn)) {
1038 mem_type = PAGE_S2_DEVICE; 1251 mem_type = PAGE_S2_DEVICE;
1252 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1253 } else if (logging_active) {
1254 /*
1255 * Faults on pages in a memslot with logging enabled
1256 * should not be mapped with huge pages (it introduces churn
1257 * and performance degradation), so force a pte mapping.
1258 */
1259 force_pte = true;
1260 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1261
1262 /*
1263 * Only actually map the page as writable if this was a write
1264 * fault.
1265 */
1266 if (!write_fault)
1267 writable = false;
1268 }
1039 1269
1040 spin_lock(&kvm->mmu_lock); 1270 spin_lock(&kvm->mmu_lock);
1041 if (mmu_notifier_retry(kvm, mmu_seq)) 1271 if (mmu_notifier_retry(kvm, mmu_seq))
1042 goto out_unlock; 1272 goto out_unlock;
1273
1043 if (!hugetlb && !force_pte) 1274 if (!hugetlb && !force_pte)
1044 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 1275 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1045 1276
@@ -1056,16 +1287,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1056 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1287 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1057 } else { 1288 } else {
1058 pte_t new_pte = pfn_pte(pfn, mem_type); 1289 pte_t new_pte = pfn_pte(pfn, mem_type);
1290
1059 if (writable) { 1291 if (writable) {
1060 kvm_set_s2pte_writable(&new_pte); 1292 kvm_set_s2pte_writable(&new_pte);
1061 kvm_set_pfn_dirty(pfn); 1293 kvm_set_pfn_dirty(pfn);
1294 mark_page_dirty(kvm, gfn);
1062 } 1295 }
1063 coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); 1296 coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
1064 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, 1297 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1065 pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
1066 } 1298 }
1067 1299
1068
1069out_unlock: 1300out_unlock:
1070 spin_unlock(&kvm->mmu_lock); 1301 spin_unlock(&kvm->mmu_lock);
1071 kvm_release_pfn_clean(pfn); 1302 kvm_release_pfn_clean(pfn);
@@ -1215,7 +1446,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
1215{ 1446{
1216 pte_t *pte = (pte_t *)data; 1447 pte_t *pte = (pte_t *)data;
1217 1448
1218 stage2_set_pte(kvm, NULL, gpa, pte, false); 1449 /*
1450 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
1451 * flag clear because MMU notifiers will have unmapped a huge PMD before
1452 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1453 * therefore stage2_set_pte() never needs to clear out a huge PMD
1454 * through this calling path.
1455 */
1456 stage2_set_pte(kvm, NULL, gpa, pte, 0);
1219} 1457}
1220 1458
1221 1459
@@ -1348,6 +1586,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
1348 const struct kvm_memory_slot *old, 1586 const struct kvm_memory_slot *old,
1349 enum kvm_mr_change change) 1587 enum kvm_mr_change change)
1350{ 1588{
1589 /*
1590 * At this point memslot has been committed and there is an
 1589 * allocated dirty_bitmap[], dirty pages will be tracked while the
1592 * memory slot is write protected.
1593 */
1594 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
1595 kvm_mmu_wp_memory_region(kvm, mem->slot);
1351} 1596}
1352 1597
1353int kvm_arch_prepare_memory_region(struct kvm *kvm, 1598int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -1360,7 +1605,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1360 bool writable = !(mem->flags & KVM_MEM_READONLY); 1605 bool writable = !(mem->flags & KVM_MEM_READONLY);
1361 int ret = 0; 1606 int ret = 0;
1362 1607
1363 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) 1608 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1609 change != KVM_MR_FLAGS_ONLY)
1364 return 0; 1610 return 0;
1365 1611
1366 /* 1612 /*
@@ -1411,6 +1657,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1411 phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) + 1657 phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
1412 vm_start - vma->vm_start; 1658 vm_start - vma->vm_start;
1413 1659
1660 /* IO region dirty page logging not allowed */
1661 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
1662 return -EINVAL;
1663
1414 ret = kvm_phys_addr_ioremap(kvm, gpa, pa, 1664 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1415 vm_end - vm_start, 1665 vm_end - vm_start,
1416 writable); 1666 writable);
@@ -1420,6 +1670,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1420 hva = vm_end; 1670 hva = vm_end;
1421 } while (hva < reg_end); 1671 } while (hva < reg_end);
1422 1672
1673 if (change == KVM_MR_FLAGS_ONLY)
1674 return ret;
1675
1423 spin_lock(&kvm->mmu_lock); 1676 spin_lock(&kvm->mmu_lock);
1424 if (ret) 1677 if (ret)
1425 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); 1678 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
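
Since the mmu.c changes above hook ARM into generic dirty-page logging, a short
sketch of the corresponding userspace flow (standard KVM API, not introduced by
this patch) is shown below; region and bitmap are placeholders managed by the
VMM, and a 4K guest page size is assumed:

/* Sketch only: enable dirty logging on an existing memslot, then read the
 * bitmap. "bitmap" must hold (npages + 7) / 8 zeroed bytes. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int fetch_dirty_log(int vm_fd, struct kvm_userspace_memory_region *region,
                           void *bitmap)
{
        struct kvm_dirty_log log = {
                .slot = region->slot,
                .dirty_bitmap = bitmap,
        };

        /* Re-register the slot with logging enabled (a KVM_MR_FLAGS_ONLY change). */
        region->flags |= KVM_MEM_LOG_DIRTY_PAGES;
        if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, region))
                return -1;

        /* Pages written by the guest since logging was enabled come back set. */
        return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}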
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 58cb3248d277..02fa8eff6ae1 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -22,6 +22,7 @@
22#include <asm/cputype.h> 22#include <asm/cputype.h>
23#include <asm/kvm_emulate.h> 23#include <asm/kvm_emulate.h>
24#include <asm/kvm_psci.h> 24#include <asm/kvm_psci.h>
25#include <asm/kvm_host.h>
25 26
26/* 27/*
27 * This is an implementation of the Power State Coordination Interface 28 * This is an implementation of the Power State Coordination Interface
@@ -66,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
66static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) 67static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
67{ 68{
68 struct kvm *kvm = source_vcpu->kvm; 69 struct kvm *kvm = source_vcpu->kvm;
69 struct kvm_vcpu *vcpu = NULL, *tmp; 70 struct kvm_vcpu *vcpu = NULL;
70 wait_queue_head_t *wq; 71 wait_queue_head_t *wq;
71 unsigned long cpu_id; 72 unsigned long cpu_id;
72 unsigned long context_id; 73 unsigned long context_id;
73 unsigned long mpidr;
74 phys_addr_t target_pc; 74 phys_addr_t target_pc;
75 int i;
76 75
77 cpu_id = *vcpu_reg(source_vcpu, 1); 76 cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK;
78 if (vcpu_mode_is_32bit(source_vcpu)) 77 if (vcpu_mode_is_32bit(source_vcpu))
79 cpu_id &= ~((u32) 0); 78 cpu_id &= ~((u32) 0);
80 79
81 kvm_for_each_vcpu(i, tmp, kvm) { 80 vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
82 mpidr = kvm_vcpu_get_mpidr(tmp);
83 if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) {
84 vcpu = tmp;
85 break;
86 }
87 }
88 81
89 /* 82 /*
90 * Make sure the caller requested a valid CPU and that the CPU is 83 * Make sure the caller requested a valid CPU and that the CPU is
@@ -155,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
155 * then ON else OFF 148 * then ON else OFF
156 */ 149 */
157 kvm_for_each_vcpu(i, tmp, kvm) { 150 kvm_for_each_vcpu(i, tmp, kvm) {
158 mpidr = kvm_vcpu_get_mpidr(tmp); 151 mpidr = kvm_vcpu_get_mpidr_aff(tmp);
159 if (((mpidr & target_affinity_mask) == target_affinity) && 152 if (((mpidr & target_affinity_mask) == target_affinity) &&
160 !tmp->arch.pause) { 153 !tmp->arch.pause) {
161 return PSCI_0_2_AFFINITY_LEVEL_ON; 154 return PSCI_0_2_AFFINITY_LEVEL_ON;
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index b6a6e7102201..881874b1a036 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -140,19 +140,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp,
140 __entry->CRm, __entry->Op2) 140 __entry->CRm, __entry->Op2)
141); 141);
142 142
143TRACE_EVENT(kvm_wfi, 143TRACE_EVENT(kvm_wfx,
144 TP_PROTO(unsigned long vcpu_pc), 144 TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
145 TP_ARGS(vcpu_pc), 145 TP_ARGS(vcpu_pc, is_wfe),
146 146
147 TP_STRUCT__entry( 147 TP_STRUCT__entry(
148 __field( unsigned long, vcpu_pc ) 148 __field( unsigned long, vcpu_pc )
149 __field( bool, is_wfe )
149 ), 150 ),
150 151
151 TP_fast_assign( 152 TP_fast_assign(
152 __entry->vcpu_pc = vcpu_pc; 153 __entry->vcpu_pc = vcpu_pc;
154 __entry->is_wfe = is_wfe;
153 ), 155 ),
154 156
155 TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) 157 TP_printk("guest executed wf%c at: 0x%08lx",
158 __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
156); 159);
157 160
158TRACE_EVENT(kvm_unmap_hva, 161TRACE_EVENT(kvm_unmap_hva,
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 62167090937d..92bbae381598 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -96,6 +96,7 @@
96#define ESR_ELx_COND_SHIFT (20) 96#define ESR_ELx_COND_SHIFT (20)
97#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) 97#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
98#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) 98#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0)
99#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1)
99 100
100#ifndef __ASSEMBLY__ 101#ifndef __ASSEMBLY__
101#include <asm/types.h> 102#include <asm/types.h>
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 483842180f8f..4f7310fa77f0 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -126,6 +126,7 @@ extern char __kvm_hyp_vector[];
126 126
127extern void __kvm_flush_vm_context(void); 127extern void __kvm_flush_vm_context(void);
128extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); 128extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
129extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
129 130
130extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 131extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
131 132
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0163b5775ca5..17e92f05b1fe 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -29,6 +29,7 @@
29#include <asm/kvm_asm.h> 29#include <asm/kvm_asm.h>
30#include <asm/kvm_mmio.h> 30#include <asm/kvm_mmio.h>
31#include <asm/ptrace.h> 31#include <asm/ptrace.h>
32#include <asm/cputype.h>
32 33
33unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); 34unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
34unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu); 35unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu);
@@ -140,6 +141,11 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu)
140 return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8; 141 return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8;
141} 142}
142 143
144static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu)
145{
146 return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_xVC_IMM_MASK;
147}
148
143static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu) 149static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu)
144{ 150{
145 return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV); 151 return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV);
@@ -201,9 +207,9 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
201 return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE; 207 return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE;
202} 208}
203 209
204static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) 210static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
205{ 211{
206 return vcpu_sys_reg(vcpu, MPIDR_EL1); 212 return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
207} 213}
208 214
209static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) 215static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index acd101a9014d..8ac3c70fe3c6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -59,6 +59,9 @@ struct kvm_arch {
59 /* VTTBR value associated with above pgd and vmid */ 59 /* VTTBR value associated with above pgd and vmid */
60 u64 vttbr; 60 u64 vttbr;
61 61
62 /* The maximum number of vCPUs depends on the used GIC model */
63 int max_vcpus;
64
62 /* Interrupt controller */ 65 /* Interrupt controller */
63 struct vgic_dist vgic; 66 struct vgic_dist vgic;
64 67
@@ -159,6 +162,7 @@ struct kvm_vm_stat {
159}; 162};
160 163
161struct kvm_vcpu_stat { 164struct kvm_vcpu_stat {
165 u32 halt_successful_poll;
162 u32 halt_wakeup; 166 u32 halt_wakeup;
163}; 167};
164 168
@@ -196,6 +200,7 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
196 200
197u64 kvm_call_hyp(void *hypfn, ...); 201u64 kvm_call_hyp(void *hypfn, ...);
198void force_vm_exit(const cpumask_t *mask); 202void force_vm_exit(const cpumask_t *mask);
203void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
199 204
200int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, 205int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
201 int exception_index); 206 int exception_index);
@@ -203,6 +208,8 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
203int kvm_perf_init(void); 208int kvm_perf_init(void);
204int kvm_perf_teardown(void); 209int kvm_perf_teardown(void);
205 210
211struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
212
206static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, 213static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
207 phys_addr_t pgd_ptr, 214 phys_addr_t pgd_ptr,
208 unsigned long hyp_stack_ptr, 215 unsigned long hyp_stack_ptr,
diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h
index fc2f689c0694..9f52beb7cb13 100644
--- a/arch/arm64/include/asm/kvm_mmio.h
+++ b/arch/arm64/include/asm/kvm_mmio.h
@@ -40,6 +40,7 @@ struct kvm_exit_mmio {
40 u8 data[8]; 40 u8 data[8];
41 u32 len; 41 u32 len;
42 bool is_write; 42 bool is_write;
43 void *private;
43}; 44};
44 45
45static inline void kvm_prepare_mmio(struct kvm_run *run, 46static inline void kvm_prepare_mmio(struct kvm_run *run,
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index adcf49547301..6458b5373142 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -118,6 +118,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
118 pmd_val(*pmd) |= PMD_S2_RDWR; 118 pmd_val(*pmd) |= PMD_S2_RDWR;
119} 119}
120 120
121static inline void kvm_set_s2pte_readonly(pte_t *pte)
122{
123 pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
124}
125
126static inline bool kvm_s2pte_readonly(pte_t *pte)
127{
128 return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY;
129}
130
131static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
132{
133 pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
134}
135
136static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
137{
138 return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
139}
140
141
121#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) 142#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end)
122#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) 143#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end)
123#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) 144#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end)
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 88174e0bfafe..5f930cc9ea83 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -119,6 +119,7 @@
119#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ 119#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
120#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ 120#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
121 121
122#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
122#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ 123#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
123 124
124/* 125/*
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 8e38878c87c6..3ef77a466018 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -78,6 +78,13 @@ struct kvm_regs {
78#define KVM_VGIC_V2_DIST_SIZE 0x1000 78#define KVM_VGIC_V2_DIST_SIZE 0x1000
79#define KVM_VGIC_V2_CPU_SIZE 0x2000 79#define KVM_VGIC_V2_CPU_SIZE 0x2000
80 80
81/* Supported VGICv3 address types */
82#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
83#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
84
85#define KVM_VGIC_V3_DIST_SIZE SZ_64K
86#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
87
81#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ 88#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
82#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ 89#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
83#define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ 90#define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */
@@ -161,6 +168,8 @@ struct kvm_arch_memory_slot {
161#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 168#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
162#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) 169#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
163#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 170#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
171#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
172#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
164 173
165/* KVM_IRQ_LINE irq field index values */ 174/* KVM_IRQ_LINE irq field index values */
166#define KVM_ARM_IRQ_TYPE_SHIFT 24 175#define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index a2ae19403abb..f7fa65d4c352 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -140,6 +140,7 @@ int main(void)
140 DEFINE(VGIC_V2_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr)); 140 DEFINE(VGIC_V2_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
141 DEFINE(VGIC_V2_CPU_APR, offsetof(struct vgic_cpu, vgic_v2.vgic_apr)); 141 DEFINE(VGIC_V2_CPU_APR, offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
142 DEFINE(VGIC_V2_CPU_LR, offsetof(struct vgic_cpu, vgic_v2.vgic_lr)); 142 DEFINE(VGIC_V2_CPU_LR, offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
143 DEFINE(VGIC_V3_CPU_SRE, offsetof(struct vgic_cpu, vgic_v3.vgic_sre));
143 DEFINE(VGIC_V3_CPU_HCR, offsetof(struct vgic_cpu, vgic_v3.vgic_hcr)); 144 DEFINE(VGIC_V3_CPU_HCR, offsetof(struct vgic_cpu, vgic_v3.vgic_hcr));
144 DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr)); 145 DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr));
145 DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr)); 146 DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr));
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index b334084d3675..f5590c81d95f 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,10 +22,12 @@ config KVM
22 select PREEMPT_NOTIFIERS 22 select PREEMPT_NOTIFIERS
23 select ANON_INODES 23 select ANON_INODES
24 select HAVE_KVM_CPU_RELAX_INTERCEPT 24 select HAVE_KVM_CPU_RELAX_INTERCEPT
25 select HAVE_KVM_ARCH_TLB_FLUSH_ALL
25 select KVM_MMIO 26 select KVM_MMIO
26 select KVM_ARM_HOST 27 select KVM_ARM_HOST
27 select KVM_ARM_VGIC 28 select KVM_ARM_VGIC
28 select KVM_ARM_TIMER 29 select KVM_ARM_TIMER
30 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
29 select SRCU 31 select SRCU
30 ---help--- 32 ---help---
31 Support hosting virtualized guest machines. 33 Support hosting virtualized guest machines.
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 32a096174b94..4e6e09ee4033 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -21,7 +21,9 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
21 21
22kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o 22kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
23kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o 23kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
24kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
24kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o 25kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
25kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o 26kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
27kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
26kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o 28kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
27kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o 29kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 29b184a8f3f8..524fa25671fc 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -28,12 +28,18 @@
28#include <asm/kvm_mmu.h> 28#include <asm/kvm_mmu.h>
29#include <asm/kvm_psci.h> 29#include <asm/kvm_psci.h>
30 30
31#define CREATE_TRACE_POINTS
32#include "trace.h"
33
31typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); 34typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
32 35
33static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) 36static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
34{ 37{
35 int ret; 38 int ret;
36 39
40 trace_kvm_hvc_arm64(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0),
41 kvm_vcpu_hvc_get_imm(vcpu));
42
37 ret = kvm_psci_call(vcpu); 43 ret = kvm_psci_call(vcpu);
38 if (ret < 0) { 44 if (ret < 0) {
39 kvm_inject_undefined(vcpu); 45 kvm_inject_undefined(vcpu);
@@ -63,10 +69,13 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
63 */ 69 */
64static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) 70static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
65{ 71{
66 if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) 72 if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) {
73 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
67 kvm_vcpu_on_spin(vcpu); 74 kvm_vcpu_on_spin(vcpu);
68 else 75 } else {
76 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
69 kvm_vcpu_block(vcpu); 77 kvm_vcpu_block(vcpu);
78 }
70 79
71 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 80 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
72 81
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 9bff671cc561..5befd010e232 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -1032,6 +1032,28 @@ ENTRY(__kvm_tlb_flush_vmid_ipa)
1032 ret 1032 ret
1033ENDPROC(__kvm_tlb_flush_vmid_ipa) 1033ENDPROC(__kvm_tlb_flush_vmid_ipa)
1034 1034
1035/**
1036 * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
1037 * @struct kvm *kvm - pointer to kvm structure
1038 *
1039 * Invalidates all Stage 1 and 2 TLB entries for current VMID.
1040 */
1041ENTRY(__kvm_tlb_flush_vmid)
1042 dsb ishst
1043
1044 kern_hyp_va x0
1045 ldr x2, [x0, #KVM_VTTBR]
1046 msr vttbr_el2, x2
1047 isb
1048
1049 tlbi vmalls12e1is
1050 dsb ish
1051 isb
1052
1053 msr vttbr_el2, xzr
1054 ret
1055ENDPROC(__kvm_tlb_flush_vmid)
1056
1035ENTRY(__kvm_flush_vm_context) 1057ENTRY(__kvm_flush_vm_context)
1036 dsb ishst 1058 dsb ishst
1037 tlbi alle1is 1059 tlbi alle1is
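Editor's note: the new __kvm_tlb_flush_vmid hyp routine pairs with the HAVE_KVM_ARCH_TLB_FLUSH_ALL select in the Kconfig hunk above. A sketch of the expected C-side consumer is shown below; the prototype is assumed to sit next to the other __kvm_* declarations in kvm_asm.h, and the real call site in this series lives in the arch MMU code, so details may differ.

extern void __kvm_tlb_flush_vmid(struct kvm *kvm);

/* Flush all stage-2 TLB entries for this VM via the hyp trampoline. */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
        kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}
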
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index b96afdf6cee4..c370b4014799 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -113,6 +113,27 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
113 return true; 113 return true;
114} 114}
115 115
116/*
117 * Trap handler for the GICv3 SGI generation system register.
118 * Forward the request to the VGIC emulation.
119 * The cp15_64 code makes sure this automatically works
120 * for both AArch64 and AArch32 accesses.
121 */
122static bool access_gic_sgi(struct kvm_vcpu *vcpu,
123 const struct sys_reg_params *p,
124 const struct sys_reg_desc *r)
125{
126 u64 val;
127
128 if (!p->is_write)
129 return read_from_write_only(vcpu, p);
130
131 val = *vcpu_reg(vcpu, p->Rt);
132 vgic_v3_dispatch_sgi(vcpu, val);
133
134 return true;
135}
136
116static bool trap_raz_wi(struct kvm_vcpu *vcpu, 137static bool trap_raz_wi(struct kvm_vcpu *vcpu,
117 const struct sys_reg_params *p, 138 const struct sys_reg_params *p,
118 const struct sys_reg_desc *r) 139 const struct sys_reg_desc *r)
@@ -200,10 +221,19 @@ static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
200 221
201static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 222static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
202{ 223{
224 u64 mpidr;
225
203 /* 226 /*
204 * Simply map the vcpu_id into the Aff0 field of the MPIDR. 227 * Map the vcpu_id into the first three affinity level fields of
228 * the MPIDR. We limit the number of VCPUs in level 0 due to a
229 * limitation to 16 CPUs in that level in the ICC_SGIxR registers
230 * of the GICv3 to be able to address each CPU directly when
231 * sending IPIs.
205 */ 232 */
206 vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff); 233 mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
234 mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
235 mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
236 vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
207} 237}
208 238
209/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ 239/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
@@ -373,6 +403,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
373 { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), 403 { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
374 NULL, reset_val, VBAR_EL1, 0 }, 404 NULL, reset_val, VBAR_EL1, 0 },
375 405
406 /* ICC_SGI1R_EL1 */
407 { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101),
408 access_gic_sgi },
376 /* ICC_SRE_EL1 */ 409 /* ICC_SRE_EL1 */
377 { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101), 410 { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101),
378 trap_raz_wi }, 411 trap_raz_wi },
@@ -605,6 +638,8 @@ static const struct sys_reg_desc cp14_64_regs[] = {
605 * register). 638 * register).
606 */ 639 */
607static const struct sys_reg_desc cp15_regs[] = { 640static const struct sys_reg_desc cp15_regs[] = {
641 { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
642
608 { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, 643 { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
609 { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, 644 { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
610 { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, 645 { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
@@ -652,6 +687,7 @@ static const struct sys_reg_desc cp15_regs[] = {
652 687
653static const struct sys_reg_desc cp15_64_regs[] = { 688static const struct sys_reg_desc cp15_64_regs[] = {
654 { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, 689 { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
690 { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
655 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, 691 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
656}; 692};
657 693
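Editor's note: the reset_mpidr() change spreads vcpu_id across the Aff0/Aff1/Aff2 fields so that ICC_SGI1R_EL1 (trapped by access_gic_sgi() above) can address each VCPU directly. A standalone model of the packing follows; the shift values 0/8/16 are written out here as an assumption, the kernel derives them via MPIDR_LEVEL_SHIFT().

#include <stdint.h>
#include <stdio.h>

/* Model of the new vcpu_id -> MPIDR affinity packing. */
static uint64_t vcpu_id_to_mpidr(unsigned int vcpu_id)
{
        uint64_t mpidr;

        mpidr  = (uint64_t)(vcpu_id & 0x0f) << 0;          /* Aff0: at most 16 */
        mpidr |= (uint64_t)((vcpu_id >> 4) & 0xff) << 8;   /* Aff1 */
        mpidr |= (uint64_t)((vcpu_id >> 12) & 0xff) << 16; /* Aff2 */
        return (1ULL << 31) | mpidr;                       /* bit 31 is RES1 */
}

int main(void)
{
        /* vcpu 17 lands in Aff0=1, Aff1=1 -> 0x80000101 */
        printf("vcpu 17 -> MPIDR 0x%llx\n",
               (unsigned long long)vcpu_id_to_mpidr(17));
        return 0;
}
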
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
new file mode 100644
index 000000000000..157416e963f2
--- /dev/null
+++ b/arch/arm64/kvm/trace.h
@@ -0,0 +1,55 @@
1#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_ARM64_KVM_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8
9TRACE_EVENT(kvm_wfx_arm64,
10 TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
11 TP_ARGS(vcpu_pc, is_wfe),
12
13 TP_STRUCT__entry(
14 __field(unsigned long, vcpu_pc)
15 __field(bool, is_wfe)
16 ),
17
18 TP_fast_assign(
19 __entry->vcpu_pc = vcpu_pc;
20 __entry->is_wfe = is_wfe;
21 ),
22
23 TP_printk("guest executed wf%c at: 0x%08lx",
24 __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
25);
26
27TRACE_EVENT(kvm_hvc_arm64,
28 TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
29 TP_ARGS(vcpu_pc, r0, imm),
30
31 TP_STRUCT__entry(
32 __field(unsigned long, vcpu_pc)
33 __field(unsigned long, r0)
34 __field(unsigned long, imm)
35 ),
36
37 TP_fast_assign(
38 __entry->vcpu_pc = vcpu_pc;
39 __entry->r0 = r0;
40 __entry->imm = imm;
41 ),
42
43 TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
44 __entry->vcpu_pc, __entry->r0, __entry->imm)
45);
46
47#endif /* _TRACE_ARM64_KVM_H */
48
49#undef TRACE_INCLUDE_PATH
50#define TRACE_INCLUDE_PATH .
51#undef TRACE_INCLUDE_FILE
52#define TRACE_INCLUDE_FILE trace
53
54/* This part must be outside protection */
55#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S
index d16046999e06..617a012a0107 100644
--- a/arch/arm64/kvm/vgic-v3-switch.S
+++ b/arch/arm64/kvm/vgic-v3-switch.S
@@ -148,17 +148,18 @@
148 * x0: Register pointing to VCPU struct 148 * x0: Register pointing to VCPU struct
149 */ 149 */
150.macro restore_vgic_v3_state 150.macro restore_vgic_v3_state
151 // Disable SRE_EL1 access. Necessary, otherwise
152 // ICH_VMCR_EL2.VFIQEn becomes one, and FIQ happens...
153 msr_s ICC_SRE_EL1, xzr
154 isb
155
156 // Compute the address of struct vgic_cpu 151 // Compute the address of struct vgic_cpu
157 add x3, x0, #VCPU_VGIC_CPU 152 add x3, x0, #VCPU_VGIC_CPU
158 153
159 // Restore all interesting registers 154 // Restore all interesting registers
160 ldr w4, [x3, #VGIC_V3_CPU_HCR] 155 ldr w4, [x3, #VGIC_V3_CPU_HCR]
161 ldr w5, [x3, #VGIC_V3_CPU_VMCR] 156 ldr w5, [x3, #VGIC_V3_CPU_VMCR]
157 ldr w25, [x3, #VGIC_V3_CPU_SRE]
158
159 msr_s ICC_SRE_EL1, x25
160
161 // make sure SRE is valid before writing the other registers
162 isb
162 163
163 msr_s ICH_HCR_EL2, x4 164 msr_s ICH_HCR_EL2, x4
164 msr_s ICH_VMCR_EL2, x5 165 msr_s ICH_VMCR_EL2, x5
@@ -244,9 +245,12 @@
244 dsb sy 245 dsb sy
245 246
246 // Prevent the guest from touching the GIC system registers 247 // Prevent the guest from touching the GIC system registers
248 // if SRE isn't enabled for GICv3 emulation
249 cbnz x25, 1f
247 mrs_s x5, ICC_SRE_EL2 250 mrs_s x5, ICC_SRE_EL2
248 and x5, x5, #~ICC_SRE_EL2_ENABLE 251 and x5, x5, #~ICC_SRE_EL2_ENABLE
249 msr_s ICC_SRE_EL2, x5 252 msr_s ICC_SRE_EL2, x5
2531:
250.endm 254.endm
251 255
252ENTRY(__save_vgic_v3_state) 256ENTRY(__save_vgic_v3_state)
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 1b3f5eb5fcdb..891002bbb995 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -18,7 +18,6 @@ header-y += intrinsics.h
18header-y += ioctl.h 18header-y += ioctl.h
19header-y += ioctls.h 19header-y += ioctls.h
20header-y += ipcbuf.h 20header-y += ipcbuf.h
21header-y += kvm.h
22header-y += kvm_para.h 21header-y += kvm_para.h
23header-y += mman.h 22header-y += mman.h
24header-y += msgbuf.h 23header-y += msgbuf.h
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f2c249796ea8..ac4fc716062b 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -120,6 +120,7 @@ struct kvm_vcpu_stat {
120 u32 resvd_inst_exits; 120 u32 resvd_inst_exits;
121 u32 break_inst_exits; 121 u32 break_inst_exits;
122 u32 flush_dcache_exits; 122 u32 flush_dcache_exits;
123 u32 halt_successful_poll;
123 u32 halt_wakeup; 124 u32 halt_wakeup;
124}; 125};
125 126
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index d7279c03c517..4a68b176d6e4 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -434,7 +434,7 @@ __kvm_mips_return_to_guest:
434 /* Setup status register for running guest in UM */ 434 /* Setup status register for running guest in UM */
435 .set at 435 .set at
436 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) 436 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
437 and v1, v1, ~ST0_CU0 437 and v1, v1, ~(ST0_CU0 | ST0_MX)
438 .set noat 438 .set noat
439 mtc0 v1, CP0_STATUS 439 mtc0 v1, CP0_STATUS
440 ehb 440 ehb
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index e3b21e51ff7e..c9eccf5df912 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -15,9 +15,11 @@
15#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/bootmem.h> 17#include <linux/bootmem.h>
18#include <asm/fpu.h>
18#include <asm/page.h> 19#include <asm/page.h>
19#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
20#include <asm/mmu_context.h> 21#include <asm/mmu_context.h>
22#include <asm/pgtable.h>
21 23
22#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
23 25
@@ -47,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
47 { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU }, 49 { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU },
48 { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU }, 50 { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU },
49 { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, 51 { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
52 { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
50 { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, 53 { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU },
51 {NULL} 54 {NULL}
52}; 55};
@@ -378,6 +381,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
378 vcpu->mmio_needed = 0; 381 vcpu->mmio_needed = 0;
379 } 382 }
380 383
384 lose_fpu(1);
385
381 local_irq_disable(); 386 local_irq_disable();
382 /* Check if we have any exceptions/interrupts pending */ 387 /* Check if we have any exceptions/interrupts pending */
383 kvm_mips_deliver_interrupts(vcpu, 388 kvm_mips_deliver_interrupts(vcpu,
@@ -385,8 +390,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
385 390
386 kvm_guest_enter(); 391 kvm_guest_enter();
387 392
393 /* Disable hardware page table walking while in guest */
394 htw_stop();
395
388 r = __kvm_mips_vcpu_run(run, vcpu); 396 r = __kvm_mips_vcpu_run(run, vcpu);
389 397
398 /* Re-enable HTW before enabling interrupts */
399 htw_start();
400
390 kvm_guest_exit(); 401 kvm_guest_exit();
391 local_irq_enable(); 402 local_irq_enable();
392 403
@@ -832,9 +843,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
832 return -ENOIOCTLCMD; 843 return -ENOIOCTLCMD;
833} 844}
834 845
835int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 846void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
836{ 847{
837 return 0;
838} 848}
839 849
840int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 850int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -980,9 +990,6 @@ static void kvm_mips_set_c0_status(void)
980{ 990{
981 uint32_t status = read_c0_status(); 991 uint32_t status = read_c0_status();
982 992
983 if (cpu_has_fpu)
984 status |= (ST0_CU1);
985
986 if (cpu_has_dsp) 993 if (cpu_has_dsp)
987 status |= (ST0_MX); 994 status |= (ST0_MX);
988 995
@@ -1002,6 +1009,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1002 enum emulation_result er = EMULATE_DONE; 1009 enum emulation_result er = EMULATE_DONE;
1003 int ret = RESUME_GUEST; 1010 int ret = RESUME_GUEST;
1004 1011
1012 /* re-enable HTW before enabling interrupts */
1013 htw_start();
1014
1005 /* Set a default exit reason */ 1015 /* Set a default exit reason */
1006 run->exit_reason = KVM_EXIT_UNKNOWN; 1016 run->exit_reason = KVM_EXIT_UNKNOWN;
1007 run->ready_for_interrupt_injection = 1; 1017 run->ready_for_interrupt_injection = 1;
@@ -1136,6 +1146,9 @@ skip_emul:
1136 } 1146 }
1137 } 1147 }
1138 1148
1149 /* Disable HTW before returning to guest or host */
1150 htw_stop();
1151
1139 return ret; 1152 return ret;
1140} 1153}
1141 1154
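Editor's note: the MIPS hunks above bracket guest entry with htw_stop()/htw_start() and hand the FPU back before entering the guest. The condensed sketch below shows only that bracketing; the wrapper name is illustrative and interrupt delivery plus the rest of the real kvm_arch_vcpu_ioctl_run() path are omitted.

/* Shape of the guest-entry bracketing added in this series (condensed). */
static int vcpu_run_bracketed(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
        int r;

        lose_fpu(1);            /* guest must not inherit live host FPU state */

        local_irq_disable();
        kvm_guest_enter();

        htw_stop();             /* no hardware page table walks while in guest */
        r = __kvm_mips_vcpu_run(run, vcpu);
        htw_start();            /* re-enable before interrupts come back on */

        kvm_guest_exit();
        local_irq_enable();

        return r;
}
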
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7efd666a3fa7..8ef05121d3cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -107,6 +107,7 @@ struct kvm_vcpu_stat {
107 u32 emulated_inst_exits; 107 u32 emulated_inst_exits;
108 u32 dec_exits; 108 u32 dec_exits;
109 u32 ext_intr_exits; 109 u32 ext_intr_exits;
110 u32 halt_successful_poll;
110 u32 halt_wakeup; 111 u32 halt_wakeup;
111 u32 dbell_exits; 112 u32 dbell_exits;
112 u32 gdbell_exits; 113 u32 gdbell_exits;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 888bf466d8c6..cfbcdc654201 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,6 +52,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "dec", VCPU_STAT(dec_exits) }, 52 { "dec", VCPU_STAT(dec_exits) },
53 { "ext_intr", VCPU_STAT(ext_intr_exits) }, 53 { "ext_intr", VCPU_STAT(ext_intr_exits) },
54 { "queue_intr", VCPU_STAT(queue_intr) }, 54 { "queue_intr", VCPU_STAT(queue_intr) },
55 { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
55 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 56 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
56 { "pf_storage", VCPU_STAT(pf_storage) }, 57 { "pf_storage", VCPU_STAT(pf_storage) },
57 { "sp_storage", VCPU_STAT(sp_storage) }, 58 { "sp_storage", VCPU_STAT(sp_storage) },
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9b55dec2d6cc..6c1316a15a27 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -62,6 +62,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
62 { "inst_emu", VCPU_STAT(emulated_inst_exits) }, 62 { "inst_emu", VCPU_STAT(emulated_inst_exits) },
63 { "dec", VCPU_STAT(dec_exits) }, 63 { "dec", VCPU_STAT(dec_exits) },
64 { "ext_intr", VCPU_STAT(ext_intr_exits) }, 64 { "ext_intr", VCPU_STAT(ext_intr_exits) },
65 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
65 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 66 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
66 { "doorbell", VCPU_STAT(dbell_exits) }, 67 { "doorbell", VCPU_STAT(dbell_exits) },
67 { "guest doorbell", VCPU_STAT(gdbell_exits) }, 68 { "guest doorbell", VCPU_STAT(gdbell_exits) },
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c45eaab752b0..27c0face86f4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
623 return vcpu; 623 return vcpu;
624} 624}
625 625
626int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 626void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
627{ 627{
628 return 0;
629} 628}
630 629
631void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 630void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9cba74d5d853..d84559e31f32 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -35,11 +35,13 @@
35#define KVM_NR_IRQCHIPS 1 35#define KVM_NR_IRQCHIPS 1
36#define KVM_IRQCHIP_NUM_PINS 4096 36#define KVM_IRQCHIP_NUM_PINS 4096
37 37
38#define SIGP_CTRL_C 0x00800000 38#define SIGP_CTRL_C 0x80
39#define SIGP_CTRL_SCN_MASK 0x3f
39 40
40struct sca_entry { 41struct sca_entry {
41 atomic_t ctrl; 42 __u8 reserved0;
42 __u32 reserved; 43 __u8 sigp_ctrl;
44 __u16 reserved[3];
43 __u64 sda; 45 __u64 sda;
44 __u64 reserved2[2]; 46 __u64 reserved2[2];
45} __attribute__((packed)); 47} __attribute__((packed));
@@ -87,7 +89,8 @@ struct kvm_s390_sie_block {
87 atomic_t cpuflags; /* 0x0000 */ 89 atomic_t cpuflags; /* 0x0000 */
88 __u32 : 1; /* 0x0004 */ 90 __u32 : 1; /* 0x0004 */
89 __u32 prefix : 18; 91 __u32 prefix : 18;
90 __u32 : 13; 92 __u32 : 1;
93 __u32 ibc : 12;
91 __u8 reserved08[4]; /* 0x0008 */ 94 __u8 reserved08[4]; /* 0x0008 */
92#define PROG_IN_SIE (1<<0) 95#define PROG_IN_SIE (1<<0)
93 __u32 prog0c; /* 0x000c */ 96 __u32 prog0c; /* 0x000c */
@@ -132,7 +135,9 @@ struct kvm_s390_sie_block {
132 __u8 reserved60; /* 0x0060 */ 135 __u8 reserved60; /* 0x0060 */
133 __u8 ecb; /* 0x0061 */ 136 __u8 ecb; /* 0x0061 */
134 __u8 ecb2; /* 0x0062 */ 137 __u8 ecb2; /* 0x0062 */
135 __u8 reserved63[1]; /* 0x0063 */ 138#define ECB3_AES 0x04
139#define ECB3_DEA 0x08
140 __u8 ecb3; /* 0x0063 */
136 __u32 scaol; /* 0x0064 */ 141 __u32 scaol; /* 0x0064 */
137 __u8 reserved68[4]; /* 0x0068 */ 142 __u8 reserved68[4]; /* 0x0068 */
138 __u32 todpr; /* 0x006c */ 143 __u32 todpr; /* 0x006c */
@@ -159,6 +164,7 @@ struct kvm_s390_sie_block {
159 __u64 tecmc; /* 0x00e8 */ 164 __u64 tecmc; /* 0x00e8 */
160 __u8 reservedf0[12]; /* 0x00f0 */ 165 __u8 reservedf0[12]; /* 0x00f0 */
161#define CRYCB_FORMAT1 0x00000001 166#define CRYCB_FORMAT1 0x00000001
167#define CRYCB_FORMAT2 0x00000003
162 __u32 crycbd; /* 0x00fc */ 168 __u32 crycbd; /* 0x00fc */
163 __u64 gcr[16]; /* 0x0100 */ 169 __u64 gcr[16]; /* 0x0100 */
164 __u64 gbea; /* 0x0180 */ 170 __u64 gbea; /* 0x0180 */
@@ -192,6 +198,7 @@ struct kvm_vcpu_stat {
192 u32 exit_stop_request; 198 u32 exit_stop_request;
193 u32 exit_validity; 199 u32 exit_validity;
194 u32 exit_instruction; 200 u32 exit_instruction;
201 u32 halt_successful_poll;
195 u32 halt_wakeup; 202 u32 halt_wakeup;
196 u32 instruction_lctl; 203 u32 instruction_lctl;
197 u32 instruction_lctlg; 204 u32 instruction_lctlg;
@@ -378,14 +385,11 @@ struct kvm_s390_interrupt_info {
378 struct kvm_s390_emerg_info emerg; 385 struct kvm_s390_emerg_info emerg;
379 struct kvm_s390_extcall_info extcall; 386 struct kvm_s390_extcall_info extcall;
380 struct kvm_s390_prefix_info prefix; 387 struct kvm_s390_prefix_info prefix;
388 struct kvm_s390_stop_info stop;
381 struct kvm_s390_mchk_info mchk; 389 struct kvm_s390_mchk_info mchk;
382 }; 390 };
383}; 391};
384 392
385/* for local_interrupt.action_flags */
386#define ACTION_STORE_ON_STOP (1<<0)
387#define ACTION_STOP_ON_STOP (1<<1)
388
389struct kvm_s390_irq_payload { 393struct kvm_s390_irq_payload {
390 struct kvm_s390_io_info io; 394 struct kvm_s390_io_info io;
391 struct kvm_s390_ext_info ext; 395 struct kvm_s390_ext_info ext;
@@ -393,6 +397,7 @@ struct kvm_s390_irq_payload {
393 struct kvm_s390_emerg_info emerg; 397 struct kvm_s390_emerg_info emerg;
394 struct kvm_s390_extcall_info extcall; 398 struct kvm_s390_extcall_info extcall;
395 struct kvm_s390_prefix_info prefix; 399 struct kvm_s390_prefix_info prefix;
400 struct kvm_s390_stop_info stop;
396 struct kvm_s390_mchk_info mchk; 401 struct kvm_s390_mchk_info mchk;
397}; 402};
398 403
@@ -401,7 +406,6 @@ struct kvm_s390_local_interrupt {
401 struct kvm_s390_float_interrupt *float_int; 406 struct kvm_s390_float_interrupt *float_int;
402 wait_queue_head_t *wq; 407 wait_queue_head_t *wq;
403 atomic_t *cpuflags; 408 atomic_t *cpuflags;
404 unsigned int action_bits;
405 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); 409 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
406 struct kvm_s390_irq_payload irq; 410 struct kvm_s390_irq_payload irq;
407 unsigned long pending_irqs; 411 unsigned long pending_irqs;
@@ -470,7 +474,6 @@ struct kvm_vcpu_arch {
470 }; 474 };
471 struct gmap *gmap; 475 struct gmap *gmap;
472 struct kvm_guestdbg_info_arch guestdbg; 476 struct kvm_guestdbg_info_arch guestdbg;
473#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL)
474 unsigned long pfault_token; 477 unsigned long pfault_token;
475 unsigned long pfault_select; 478 unsigned long pfault_select;
476 unsigned long pfault_compare; 479 unsigned long pfault_compare;
@@ -504,13 +507,39 @@ struct s390_io_adapter {
504#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) 507#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
505#define MAX_S390_ADAPTER_MAPS 256 508#define MAX_S390_ADAPTER_MAPS 256
506 509
510/* maximum size of facilities and facility mask is 2k bytes */
511#define S390_ARCH_FAC_LIST_SIZE_BYTE (1<<11)
512#define S390_ARCH_FAC_LIST_SIZE_U64 \
513 (S390_ARCH_FAC_LIST_SIZE_BYTE / sizeof(u64))
514#define S390_ARCH_FAC_MASK_SIZE_BYTE S390_ARCH_FAC_LIST_SIZE_BYTE
515#define S390_ARCH_FAC_MASK_SIZE_U64 \
516 (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64))
517
518struct s390_model_fac {
519 /* facilities used in SIE context */
520 __u64 sie[S390_ARCH_FAC_LIST_SIZE_U64];
521 /* subset enabled by kvm */
522 __u64 kvm[S390_ARCH_FAC_LIST_SIZE_U64];
523};
524
525struct kvm_s390_cpu_model {
526 struct s390_model_fac *fac;
527 struct cpuid cpu_id;
528 unsigned short ibc;
529};
530
507struct kvm_s390_crypto { 531struct kvm_s390_crypto {
508 struct kvm_s390_crypto_cb *crycb; 532 struct kvm_s390_crypto_cb *crycb;
509 __u32 crycbd; 533 __u32 crycbd;
534 __u8 aes_kw;
535 __u8 dea_kw;
510}; 536};
511 537
512struct kvm_s390_crypto_cb { 538struct kvm_s390_crypto_cb {
513 __u8 reserved00[128]; /* 0x0000 */ 539 __u8 reserved00[72]; /* 0x0000 */
540 __u8 dea_wrapping_key_mask[24]; /* 0x0048 */
541 __u8 aes_wrapping_key_mask[32]; /* 0x0060 */
542 __u8 reserved80[128]; /* 0x0080 */
514}; 543};
515 544
516struct kvm_arch{ 545struct kvm_arch{
@@ -523,12 +552,15 @@ struct kvm_arch{
523 int use_irqchip; 552 int use_irqchip;
524 int use_cmma; 553 int use_cmma;
525 int user_cpu_state_ctrl; 554 int user_cpu_state_ctrl;
555 int user_sigp;
526 struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; 556 struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
527 wait_queue_head_t ipte_wq; 557 wait_queue_head_t ipte_wq;
528 int ipte_lock_count; 558 int ipte_lock_count;
529 struct mutex ipte_mutex; 559 struct mutex ipte_mutex;
530 spinlock_t start_stop_lock; 560 spinlock_t start_stop_lock;
561 struct kvm_s390_cpu_model model;
531 struct kvm_s390_crypto crypto; 562 struct kvm_s390_crypto crypto;
563 u64 epoch;
532}; 564};
533 565
534#define KVM_HVA_ERR_BAD (-1UL) 566#define KVM_HVA_ERR_BAD (-1UL)
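Editor's note: the new 2 kB facility list (256 u64s) added to the CPU model follows the STFLE bit numbering, where facility n lives in byte n/8 counted from the most significant bit. The gaccess.c hunk further down consults it through test_kvm_facility(); the standalone model below uses an illustrative helper name, since the in-kernel helper layering is not part of this excerpt.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define FAC_LIST_SIZE_BYTE (1 << 11)    /* matches S390_ARCH_FAC_LIST_SIZE_BYTE */

/* STFLE-style lookup: facility nr is bit (nr % 8) from the MSB of byte nr/8. */
static int test_fac_bit(const uint8_t *fac_list, unsigned int nr)
{
        if (nr >= FAC_LIST_SIZE_BYTE * 8)
                return 0;
        return (fac_list[nr >> 3] & (0x80U >> (nr & 7))) != 0;
}

int main(void)
{
        uint8_t fac[FAC_LIST_SIZE_BYTE];

        memset(fac, 0, sizeof(fac));
        fac[1] = 0x80;  /* set facility 8 (EDAT-1), as checked in gaccess.c */

        printf("facility 8: %d, facility 78: %d\n",
               test_fac_bit(fac, 8), test_fac_bit(fac, 78));
        return 0;
}
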
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index edb453cfc2c6..f1096bab5199 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -31,7 +31,8 @@ struct sclp_cpu_entry {
31 u8 reserved0[2]; 31 u8 reserved0[2];
32 u8 : 3; 32 u8 : 3;
33 u8 siif : 1; 33 u8 siif : 1;
34 u8 : 4; 34 u8 sigpif : 1;
35 u8 : 3;
35 u8 reserved2[10]; 36 u8 reserved2[10];
36 u8 type; 37 u8 type;
37 u8 reserved1; 38 u8 reserved1;
@@ -69,6 +70,7 @@ int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode);
69unsigned long sclp_get_hsa_size(void); 70unsigned long sclp_get_hsa_size(void);
70void sclp_early_detect(void); 71void sclp_early_detect(void);
71int sclp_has_siif(void); 72int sclp_has_siif(void);
73int sclp_has_sigpif(void);
72unsigned int sclp_get_ibc(void); 74unsigned int sclp_get_ibc(void);
73 75
74long _sclp_print_early(const char *); 76long _sclp_print_early(const char *);
diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 73f12d21af4d..f7054a892d9e 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -15,6 +15,7 @@
15#define __ASM_S390_SYSINFO_H 15#define __ASM_S390_SYSINFO_H
16 16
17#include <asm/bitsperlong.h> 17#include <asm/bitsperlong.h>
18#include <linux/uuid.h>
18 19
19struct sysinfo_1_1_1 { 20struct sysinfo_1_1_1 {
20 unsigned char p:1; 21 unsigned char p:1;
@@ -116,10 +117,13 @@ struct sysinfo_3_2_2 {
116 char name[8]; 117 char name[8];
117 unsigned int caf; 118 unsigned int caf;
118 char cpi[16]; 119 char cpi[16];
119 char reserved_1[24]; 120 char reserved_1[3];
120 121 char ext_name_encoding;
122 unsigned int reserved_2;
123 uuid_be uuid;
121 } vm[8]; 124 } vm[8];
122 char reserved_544[3552]; 125 char reserved_3[1504];
126 char ext_names[8][256];
123}; 127};
124 128
125extern int topology_max_mnest; 129extern int topology_max_mnest;
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 48eda3ab4944..9c77e60b9a26 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -57,10 +57,44 @@ struct kvm_s390_io_adapter_req {
57 57
58/* kvm attr_group on vm fd */ 58/* kvm attr_group on vm fd */
59#define KVM_S390_VM_MEM_CTRL 0 59#define KVM_S390_VM_MEM_CTRL 0
60#define KVM_S390_VM_TOD 1
61#define KVM_S390_VM_CRYPTO 2
62#define KVM_S390_VM_CPU_MODEL 3
60 63
61/* kvm attributes for mem_ctrl */ 64/* kvm attributes for mem_ctrl */
62#define KVM_S390_VM_MEM_ENABLE_CMMA 0 65#define KVM_S390_VM_MEM_ENABLE_CMMA 0
63#define KVM_S390_VM_MEM_CLR_CMMA 1 66#define KVM_S390_VM_MEM_CLR_CMMA 1
67#define KVM_S390_VM_MEM_LIMIT_SIZE 2
68
69/* kvm attributes for KVM_S390_VM_TOD */
70#define KVM_S390_VM_TOD_LOW 0
71#define KVM_S390_VM_TOD_HIGH 1
72
73/* kvm attributes for KVM_S390_VM_CPU_MODEL */
74/* processor related attributes are r/w */
75#define KVM_S390_VM_CPU_PROCESSOR 0
76struct kvm_s390_vm_cpu_processor {
77 __u64 cpuid;
78 __u16 ibc;
79 __u8 pad[6];
80 __u64 fac_list[256];
81};
82
83/* machine related attributes are r/o */
84#define KVM_S390_VM_CPU_MACHINE 1
85struct kvm_s390_vm_cpu_machine {
86 __u64 cpuid;
87 __u32 ibc;
88 __u8 pad[4];
89 __u64 fac_mask[256];
90 __u64 fac_list[256];
91};
92
93/* kvm attributes for crypto */
94#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
95#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
96#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2
97#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3
64 98
65/* for KVM_GET_REGS and KVM_SET_REGS */ 99/* for KVM_GET_REGS and KVM_SET_REGS */
66struct kvm_regs { 100struct kvm_regs {
@@ -107,6 +141,9 @@ struct kvm_guest_debug_arch {
107 struct kvm_hw_breakpoint __user *hw_bp; 141 struct kvm_hw_breakpoint __user *hw_bp;
108}; 142};
109 143
144/* for KVM_SYNC_PFAULT and KVM_REG_S390_PFTOKEN */
145#define KVM_S390_PFAULT_TOKEN_INVALID 0xffffffffffffffffULL
146
110#define KVM_SYNC_PREFIX (1UL << 0) 147#define KVM_SYNC_PREFIX (1UL << 0)
111#define KVM_SYNC_GPRS (1UL << 1) 148#define KVM_SYNC_GPRS (1UL << 1)
112#define KVM_SYNC_ACRS (1UL << 2) 149#define KVM_SYNC_ACRS (1UL << 2)
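Editor's note: the new VM attribute groups above (TOD, CRYPTO, CPU_MODEL) are reached through KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR issued on the VM fd, as described by the devices/vm.txt addition in this series. A hedged userspace sketch follows; vm_fd handling is assumed, and KVM_HAS_DEVICE_ATTR can be used to probe availability first.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Turn on AES key wrapping for the guest (KVM_S390_VM_CRYPTO group). */
static int enable_aes_kw(int vm_fd)
{
        struct kvm_device_attr attr = {
                .group = KVM_S390_VM_CRYPTO,
                .attr  = KVM_S390_VM_CRYPTO_ENABLE_AES_KW,
        };

        return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

/* Read the read-only machine description (cpuid, ibc, facility lists). */
static int get_cpu_machine(int vm_fd, struct kvm_s390_vm_cpu_machine *mach)
{
        struct kvm_device_attr attr = {
                .group = KVM_S390_VM_CPU_MODEL,
                .attr  = KVM_S390_VM_CPU_MACHINE,
                .addr  = (uint64_t)(uintptr_t)mach,
        };

        return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}
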
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 85565f1ff474..99babea026ca 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -204,6 +204,33 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info)
204 } 204 }
205} 205}
206 206
207static void print_ext_name(struct seq_file *m, int lvl,
208 struct sysinfo_3_2_2 *info)
209{
210 if (info->vm[lvl].ext_name_encoding == 0)
211 return;
212 if (info->ext_names[lvl][0] == 0)
213 return;
214 switch (info->vm[lvl].ext_name_encoding) {
215 case 1: /* EBCDIC */
216 EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl]));
217 break;
218 case 2: /* UTF-8 */
219 break;
220 default:
221 return;
222 }
223 seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl,
224 info->ext_names[lvl]);
225}
226
227static void print_uuid(struct seq_file *m, int i, struct sysinfo_3_2_2 *info)
228{
229 if (!memcmp(&info->vm[i].uuid, &NULL_UUID_BE, sizeof(uuid_be)))
230 return;
231 seq_printf(m, "VM%02d UUID: %pUb\n", i, &info->vm[i].uuid);
232}
233
207static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) 234static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info)
208{ 235{
209 int i; 236 int i;
@@ -221,6 +248,8 @@ static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info)
221 seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured); 248 seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured);
222 seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby); 249 seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby);
223 seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved); 250 seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved);
251 print_ext_name(m, i, info);
252 print_uuid(m, i, info);
224 } 253 }
225} 254}
226 255
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8a1be9017730..267523cac6de 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -357,8 +357,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
357 union asce asce; 357 union asce asce;
358 358
359 ctlreg0.val = vcpu->arch.sie_block->gcr[0]; 359 ctlreg0.val = vcpu->arch.sie_block->gcr[0];
360 edat1 = ctlreg0.edat && test_vfacility(8); 360 edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
361 edat2 = edat1 && test_vfacility(78); 361 edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
362 asce.val = get_vcpu_asce(vcpu); 362 asce.val = get_vcpu_asce(vcpu);
363 if (asce.r) 363 if (asce.r)
364 goto real_address; 364 goto real_address;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 81c77ab8102e..bebd2157edd0 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -68,18 +68,27 @@ static int handle_noop(struct kvm_vcpu *vcpu)
68 68
69static int handle_stop(struct kvm_vcpu *vcpu) 69static int handle_stop(struct kvm_vcpu *vcpu)
70{ 70{
71 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
71 int rc = 0; 72 int rc = 0;
72 unsigned int action_bits; 73 uint8_t flags, stop_pending;
73 74
74 vcpu->stat.exit_stop_request++; 75 vcpu->stat.exit_stop_request++;
75 trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
76 76
77 action_bits = vcpu->arch.local_int.action_bits; 77 /* delay the stop if any non-stop irq is pending */
78 if (kvm_s390_vcpu_has_irq(vcpu, 1))
79 return 0;
80
81 /* avoid races with the injection/SIGP STOP code */
82 spin_lock(&li->lock);
83 flags = li->irq.stop.flags;
84 stop_pending = kvm_s390_is_stop_irq_pending(vcpu);
85 spin_unlock(&li->lock);
78 86
79 if (!(action_bits & ACTION_STOP_ON_STOP)) 87 trace_kvm_s390_stop_request(stop_pending, flags);
88 if (!stop_pending)
80 return 0; 89 return 0;
81 90
82 if (action_bits & ACTION_STORE_ON_STOP) { 91 if (flags & KVM_S390_STOP_FLAG_STORE_STATUS) {
83 rc = kvm_s390_vcpu_store_status(vcpu, 92 rc = kvm_s390_vcpu_store_status(vcpu,
84 KVM_S390_STORE_STATUS_NOADDR); 93 KVM_S390_STORE_STATUS_NOADDR);
85 if (rc) 94 if (rc)
@@ -279,11 +288,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
279 irq.type = KVM_S390_INT_CPU_TIMER; 288 irq.type = KVM_S390_INT_CPU_TIMER;
280 break; 289 break;
281 case EXT_IRQ_EXTERNAL_CALL: 290 case EXT_IRQ_EXTERNAL_CALL:
282 if (kvm_s390_si_ext_call_pending(vcpu))
283 return 0;
284 irq.type = KVM_S390_INT_EXTERNAL_CALL; 291 irq.type = KVM_S390_INT_EXTERNAL_CALL;
285 irq.u.extcall.code = vcpu->arch.sie_block->extcpuaddr; 292 irq.u.extcall.code = vcpu->arch.sie_block->extcpuaddr;
286 break; 293 rc = kvm_s390_inject_vcpu(vcpu, &irq);
294 /* ignore if another external call is already pending */
295 if (rc == -EBUSY)
296 return 0;
297 return rc;
287 default: 298 default:
288 return -EOPNOTSUPP; 299 return -EOPNOTSUPP;
289 } 300 }
@@ -307,17 +318,19 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
307 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); 318 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
308 319
309 /* Make sure that the source is paged-in */ 320 /* Make sure that the source is paged-in */
310 srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]); 321 rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
311 if (kvm_is_error_gpa(vcpu->kvm, srcaddr)) 322 &srcaddr, 0);
312 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 323 if (rc)
324 return kvm_s390_inject_prog_cond(vcpu, rc);
313 rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); 325 rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
314 if (rc != 0) 326 if (rc != 0)
315 return rc; 327 return rc;
316 328
317 /* Make sure that the destination is paged-in */ 329 /* Make sure that the destination is paged-in */
318 dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]); 330 rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
319 if (kvm_is_error_gpa(vcpu->kvm, dstaddr)) 331 &dstaddr, 1);
320 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 332 if (rc)
333 return kvm_s390_inject_prog_cond(vcpu, rc);
321 rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); 334 rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
322 if (rc != 0) 335 if (rc != 0)
323 return rc; 336 return rc;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f00f31e66cd8..073b5f387d1d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -19,6 +19,7 @@
19#include <linux/bitmap.h> 19#include <linux/bitmap.h>
20#include <asm/asm-offsets.h> 20#include <asm/asm-offsets.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/sclp.h>
22#include "kvm-s390.h" 23#include "kvm-s390.h"
23#include "gaccess.h" 24#include "gaccess.h"
24#include "trace-s390.h" 25#include "trace-s390.h"
@@ -159,6 +160,12 @@ static unsigned long deliverable_local_irqs(struct kvm_vcpu *vcpu)
159 if (psw_mchk_disabled(vcpu)) 160 if (psw_mchk_disabled(vcpu))
160 active_mask &= ~IRQ_PEND_MCHK_MASK; 161 active_mask &= ~IRQ_PEND_MCHK_MASK;
161 162
163 /*
164 * STOP irqs will never be actively delivered. They are triggered via
165 * intercept requests and cleared when the stop intercept is performed.
166 */
167 __clear_bit(IRQ_PEND_SIGP_STOP, &active_mask);
168
162 return active_mask; 169 return active_mask;
163} 170}
164 171
@@ -186,9 +193,6 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
186 LCTL_CR10 | LCTL_CR11); 193 LCTL_CR10 | LCTL_CR11);
187 vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT); 194 vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT);
188 } 195 }
189
190 if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP)
191 atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
192} 196}
193 197
194static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) 198static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -216,11 +220,18 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
216 vcpu->arch.sie_block->lctl |= LCTL_CR14; 220 vcpu->arch.sie_block->lctl |= LCTL_CR14;
217} 221}
218 222
223static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu)
224{
225 if (kvm_s390_is_stop_irq_pending(vcpu))
226 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
227}
228
219/* Set interception request for non-deliverable local interrupts */ 229/* Set interception request for non-deliverable local interrupts */
220static void set_intercept_indicators_local(struct kvm_vcpu *vcpu) 230static void set_intercept_indicators_local(struct kvm_vcpu *vcpu)
221{ 231{
222 set_intercept_indicators_ext(vcpu); 232 set_intercept_indicators_ext(vcpu);
223 set_intercept_indicators_mchk(vcpu); 233 set_intercept_indicators_mchk(vcpu);
234 set_intercept_indicators_stop(vcpu);
224} 235}
225 236
226static void __set_intercept_indicator(struct kvm_vcpu *vcpu, 237static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
@@ -392,18 +403,6 @@ static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
392 return rc ? -EFAULT : 0; 403 return rc ? -EFAULT : 0;
393} 404}
394 405
395static int __must_check __deliver_stop(struct kvm_vcpu *vcpu)
396{
397 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
398 vcpu->stat.deliver_stop_signal++;
399 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_SIGP_STOP,
400 0, 0);
401
402 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
403 clear_bit(IRQ_PEND_SIGP_STOP, &vcpu->arch.local_int.pending_irqs);
404 return 0;
405}
406
407static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu) 406static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu)
408{ 407{
409 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 408 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -705,7 +704,6 @@ static const deliver_irq_t deliver_irq_funcs[] = {
705 [IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc, 704 [IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc,
706 [IRQ_PEND_EXT_CPU_TIMER] = __deliver_cpu_timer, 705 [IRQ_PEND_EXT_CPU_TIMER] = __deliver_cpu_timer,
707 [IRQ_PEND_RESTART] = __deliver_restart, 706 [IRQ_PEND_RESTART] = __deliver_restart,
708 [IRQ_PEND_SIGP_STOP] = __deliver_stop,
709 [IRQ_PEND_SET_PREFIX] = __deliver_set_prefix, 707 [IRQ_PEND_SET_PREFIX] = __deliver_set_prefix,
710 [IRQ_PEND_PFAULT_INIT] = __deliver_pfault_init, 708 [IRQ_PEND_PFAULT_INIT] = __deliver_pfault_init,
711}; 709};
@@ -738,21 +736,20 @@ static int __must_check __deliver_floating_interrupt(struct kvm_vcpu *vcpu,
738 return rc; 736 return rc;
739} 737}
740 738
741/* Check whether SIGP interpretation facility has an external call pending */ 739/* Check whether an external call is pending (deliverable or not) */
742int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu) 740int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
743{ 741{
744 atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl; 742 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
743 uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
745 744
746 if (!psw_extint_disabled(vcpu) && 745 if (!sclp_has_sigpif())
747 (vcpu->arch.sie_block->gcr[0] & 0x2000ul) && 746 return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
748 (atomic_read(sigp_ctrl) & SIGP_CTRL_C) &&
749 (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
750 return 1;
751 747
752 return 0; 748 return (sigp_ctrl & SIGP_CTRL_C) &&
749 (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND);
753} 750}
754 751
755int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) 752int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
756{ 753{
757 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; 754 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
758 struct kvm_s390_interrupt_info *inti; 755 struct kvm_s390_interrupt_info *inti;
@@ -773,7 +770,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
773 if (!rc && kvm_cpu_has_pending_timer(vcpu)) 770 if (!rc && kvm_cpu_has_pending_timer(vcpu))
774 rc = 1; 771 rc = 1;
775 772
776 if (!rc && kvm_s390_si_ext_call_pending(vcpu)) 773 /* external call pending and deliverable */
774 if (!rc && kvm_s390_ext_call_pending(vcpu) &&
775 !psw_extint_disabled(vcpu) &&
776 (vcpu->arch.sie_block->gcr[0] & 0x2000ul))
777 rc = 1;
778
779 if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
777 rc = 1; 780 rc = 1;
778 781
779 return rc; 782 return rc;
@@ -804,14 +807,20 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
804 return -EOPNOTSUPP; /* disabled wait */ 807 return -EOPNOTSUPP; /* disabled wait */
805 } 808 }
806 809
807 __set_cpu_idle(vcpu);
808 if (!ckc_interrupts_enabled(vcpu)) { 810 if (!ckc_interrupts_enabled(vcpu)) {
809 VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); 811 VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
812 __set_cpu_idle(vcpu);
810 goto no_timer; 813 goto no_timer;
811 } 814 }
812 815
813 now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; 816 now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
814 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); 817 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
818
819 /* underflow */
820 if (vcpu->arch.sie_block->ckc < now)
821 return 0;
822
823 __set_cpu_idle(vcpu);
815 hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); 824 hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
816 VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); 825 VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
817no_timer: 826no_timer:
@@ -820,7 +829,7 @@ no_timer:
820 __unset_cpu_idle(vcpu); 829 __unset_cpu_idle(vcpu);
821 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 830 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
822 831
823 hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); 832 hrtimer_cancel(&vcpu->arch.ckc_timer);
824 return 0; 833 return 0;
825} 834}
826 835
@@ -840,10 +849,20 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
840enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) 849enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
841{ 850{
842 struct kvm_vcpu *vcpu; 851 struct kvm_vcpu *vcpu;
852 u64 now, sltime;
843 853
844 vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); 854 vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
845 kvm_s390_vcpu_wakeup(vcpu); 855 now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
856 sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
846 857
858 /*
859 * If the monotonic clock runs faster than the tod clock we might be
860 * woken up too early and have to go back to sleep to avoid deadlocks.
861 */
862 if (vcpu->arch.sie_block->ckc > now &&
863 hrtimer_forward_now(timer, ns_to_ktime(sltime)))
864 return HRTIMER_RESTART;
865 kvm_s390_vcpu_wakeup(vcpu);
847 return HRTIMER_NORESTART; 866 return HRTIMER_NORESTART;
848} 867}
849 868
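Editor's note: the hunk above guards against the monotonic hrtimer firing before the guest's TOD-based clock comparator is actually due. The generic shape of that guard is sketched below; deadline_ns_from_now() is a placeholder for the ckc/TOD computation and not a kernel function.

/* Re-arm the timer and keep sleeping if it fired early. */
static enum hrtimer_restart wakeup_cb(struct hrtimer *timer)
{
        s64 remaining = deadline_ns_from_now(timer);    /* placeholder */

        if (remaining > 0 && hrtimer_forward_now(timer, ns_to_ktime(remaining)))
                return HRTIMER_RESTART;

        /* deadline really reached: wake the waiter here */
        return HRTIMER_NORESTART;
}
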
@@ -859,8 +878,7 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
859 878
860 /* clear pending external calls set by sigp interpretation facility */ 879 /* clear pending external calls set by sigp interpretation facility */
861 atomic_clear_mask(CPUSTAT_ECALL_PEND, li->cpuflags); 880 atomic_clear_mask(CPUSTAT_ECALL_PEND, li->cpuflags);
862 atomic_clear_mask(SIGP_CTRL_C, 881 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl = 0;
863 &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl);
864} 882}
865 883
866int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) 884int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
@@ -984,18 +1002,43 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
984 return 0; 1002 return 0;
985} 1003}
986 1004
987int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) 1005static int __inject_extcall_sigpif(struct kvm_vcpu *vcpu, uint16_t src_id)
1006{
1007 unsigned char new_val, old_val;
1008 uint8_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
1009
1010 new_val = SIGP_CTRL_C | (src_id & SIGP_CTRL_SCN_MASK);
1011 old_val = *sigp_ctrl & ~SIGP_CTRL_C;
1012 if (cmpxchg(sigp_ctrl, old_val, new_val) != old_val) {
1013 /* another external call is pending */
1014 return -EBUSY;
1015 }
1016 atomic_set_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
1017 return 0;
1018}
1019
1020static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
988{ 1021{
989 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1022 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
990 struct kvm_s390_extcall_info *extcall = &li->irq.extcall; 1023 struct kvm_s390_extcall_info *extcall = &li->irq.extcall;
1024 uint16_t src_id = irq->u.extcall.code;
991 1025
992 VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", 1026 VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
993 irq->u.extcall.code); 1027 src_id);
994 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, 1028 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL,
995 irq->u.extcall.code, 0, 2); 1029 src_id, 0, 2);
1030
1031 /* sending vcpu invalid */
1032 if (src_id >= KVM_MAX_VCPUS ||
1033 kvm_get_vcpu(vcpu->kvm, src_id) == NULL)
1034 return -EINVAL;
996 1035
1036 if (sclp_has_sigpif())
1037 return __inject_extcall_sigpif(vcpu, src_id);
1038
1039 if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
1040 return -EBUSY;
997 *extcall = irq->u.extcall; 1041 *extcall = irq->u.extcall;
998 set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
999 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 1042 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
1000 return 0; 1043 return 0;
1001} 1044}
@@ -1006,23 +1049,41 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1006 struct kvm_s390_prefix_info *prefix = &li->irq.prefix; 1049 struct kvm_s390_prefix_info *prefix = &li->irq.prefix;
1007 1050
1008 VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)", 1051 VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)",
1009 prefix->address); 1052 irq->u.prefix.address);
1010 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, 1053 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX,
1011 prefix->address, 0, 2); 1054 irq->u.prefix.address, 0, 2);
1055
1056 if (!is_vcpu_stopped(vcpu))
1057 return -EBUSY;
1012 1058
1013 *prefix = irq->u.prefix; 1059 *prefix = irq->u.prefix;
1014 set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); 1060 set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs);
1015 return 0; 1061 return 0;
1016} 1062}
1017 1063
1064#define KVM_S390_STOP_SUPP_FLAGS (KVM_S390_STOP_FLAG_STORE_STATUS)
1018static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) 1065static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1019{ 1066{
1020 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1067 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1068 struct kvm_s390_stop_info *stop = &li->irq.stop;
1069 int rc = 0;
1021 1070
1022 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2); 1071 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2);
1023 1072
1024 li->action_bits |= ACTION_STOP_ON_STOP; 1073 if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS)
1025 set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); 1074 return -EINVAL;
1075
1076 if (is_vcpu_stopped(vcpu)) {
1077 if (irq->u.stop.flags & KVM_S390_STOP_FLAG_STORE_STATUS)
1078 rc = kvm_s390_store_status_unloaded(vcpu,
1079 KVM_S390_STORE_STATUS_NOADDR);
1080 return rc;
1081 }
1082
1083 if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs))
1084 return -EBUSY;
1085 stop->flags = irq->u.stop.flags;
1086 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
1026 return 0; 1087 return 0;
1027} 1088}
1028 1089
@@ -1042,14 +1103,13 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
1042 struct kvm_s390_irq *irq) 1103 struct kvm_s390_irq *irq)
1043{ 1104{
1044 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1105 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1045 struct kvm_s390_emerg_info *emerg = &li->irq.emerg;
1046 1106
1047 VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", 1107 VCPU_EVENT(vcpu, 3, "inject: emergency %u\n",
1048 irq->u.emerg.code); 1108 irq->u.emerg.code);
1049 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, 1109 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
1050 emerg->code, 0, 2); 1110 irq->u.emerg.code, 0, 2);
1051 1111
1052 set_bit(emerg->code, li->sigp_emerg_pending); 1112 set_bit(irq->u.emerg.code, li->sigp_emerg_pending);
1053 set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); 1113 set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs);
1054 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 1114 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
1055 return 0; 1115 return 0;
@@ -1061,9 +1121,9 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1061 struct kvm_s390_mchk_info *mchk = &li->irq.mchk; 1121 struct kvm_s390_mchk_info *mchk = &li->irq.mchk;
1062 1122
1063 VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx", 1123 VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
1064 mchk->mcic); 1124 irq->u.mchk.mcic);
1065 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, 1125 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0,
1066 mchk->mcic, 2); 1126 irq->u.mchk.mcic, 2);
1067 1127
1068 /* 1128 /*
1069 * Because repressible machine checks can be indicated along with 1129 * Because repressible machine checks can be indicated along with
@@ -1121,7 +1181,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
1121 1181
1122 if ((!schid && !cr6) || (schid && cr6)) 1182 if ((!schid && !cr6) || (schid && cr6))
1123 return NULL; 1183 return NULL;
1124 mutex_lock(&kvm->lock);
1125 fi = &kvm->arch.float_int; 1184 fi = &kvm->arch.float_int;
1126 spin_lock(&fi->lock); 1185 spin_lock(&fi->lock);
1127 inti = NULL; 1186 inti = NULL;
@@ -1149,7 +1208,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
1149 if (list_empty(&fi->list)) 1208 if (list_empty(&fi->list))
1150 atomic_set(&fi->active, 0); 1209 atomic_set(&fi->active, 0);
1151 spin_unlock(&fi->lock); 1210 spin_unlock(&fi->lock);
1152 mutex_unlock(&kvm->lock);
1153 return inti; 1211 return inti;
1154} 1212}
1155 1213
@@ -1162,7 +1220,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1162 int sigcpu; 1220 int sigcpu;
1163 int rc = 0; 1221 int rc = 0;
1164 1222
1165 mutex_lock(&kvm->lock);
1166 fi = &kvm->arch.float_int; 1223 fi = &kvm->arch.float_int;
1167 spin_lock(&fi->lock); 1224 spin_lock(&fi->lock);
1168 if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) { 1225 if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
@@ -1187,6 +1244,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1187 list_add_tail(&inti->list, &iter->list); 1244 list_add_tail(&inti->list, &iter->list);
1188 } 1245 }
1189 atomic_set(&fi->active, 1); 1246 atomic_set(&fi->active, 1);
1247 if (atomic_read(&kvm->online_vcpus) == 0)
1248 goto unlock_fi;
1190 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); 1249 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
1191 if (sigcpu == KVM_MAX_VCPUS) { 1250 if (sigcpu == KVM_MAX_VCPUS) {
1192 do { 1251 do {
@@ -1213,7 +1272,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1213 kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu)); 1272 kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
1214unlock_fi: 1273unlock_fi:
1215 spin_unlock(&fi->lock); 1274 spin_unlock(&fi->lock);
1216 mutex_unlock(&kvm->lock);
1217 return rc; 1275 return rc;
1218} 1276}
1219 1277
@@ -1221,6 +1279,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
1221 struct kvm_s390_interrupt *s390int) 1279 struct kvm_s390_interrupt *s390int)
1222{ 1280{
1223 struct kvm_s390_interrupt_info *inti; 1281 struct kvm_s390_interrupt_info *inti;
1282 int rc;
1224 1283
1225 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 1284 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
1226 if (!inti) 1285 if (!inti)
@@ -1239,7 +1298,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
1239 inti->ext.ext_params = s390int->parm; 1298 inti->ext.ext_params = s390int->parm;
1240 break; 1299 break;
1241 case KVM_S390_INT_PFAULT_DONE: 1300 case KVM_S390_INT_PFAULT_DONE:
1242 inti->type = s390int->type;
1243 inti->ext.ext_params2 = s390int->parm64; 1301 inti->ext.ext_params2 = s390int->parm64;
1244 break; 1302 break;
1245 case KVM_S390_MCHK: 1303 case KVM_S390_MCHK:
@@ -1268,7 +1326,10 @@ int kvm_s390_inject_vm(struct kvm *kvm,
1268 trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, 1326 trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
1269 2); 1327 2);
1270 1328
1271 return __inject_vm(kvm, inti); 1329 rc = __inject_vm(kvm, inti);
1330 if (rc)
1331 kfree(inti);
1332 return rc;
1272} 1333}
1273 1334
1274void kvm_s390_reinject_io_int(struct kvm *kvm, 1335void kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -1290,13 +1351,16 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
1290 case KVM_S390_SIGP_SET_PREFIX: 1351 case KVM_S390_SIGP_SET_PREFIX:
1291 irq->u.prefix.address = s390int->parm; 1352 irq->u.prefix.address = s390int->parm;
1292 break; 1353 break;
1354 case KVM_S390_SIGP_STOP:
1355 irq->u.stop.flags = s390int->parm;
1356 break;
1293 case KVM_S390_INT_EXTERNAL_CALL: 1357 case KVM_S390_INT_EXTERNAL_CALL:
1294 if (irq->u.extcall.code & 0xffff0000) 1358 if (s390int->parm & 0xffff0000)
1295 return -EINVAL; 1359 return -EINVAL;
1296 irq->u.extcall.code = s390int->parm; 1360 irq->u.extcall.code = s390int->parm;
1297 break; 1361 break;
1298 case KVM_S390_INT_EMERGENCY: 1362 case KVM_S390_INT_EMERGENCY:
1299 if (irq->u.emerg.code & 0xffff0000) 1363 if (s390int->parm & 0xffff0000)
1300 return -EINVAL; 1364 return -EINVAL;
1301 irq->u.emerg.code = s390int->parm; 1365 irq->u.emerg.code = s390int->parm;
1302 break; 1366 break;
@@ -1307,6 +1371,23 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
1307 return 0; 1371 return 0;
1308} 1372}
1309 1373
1374int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu)
1375{
1376 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1377
1378 return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
1379}
1380
1381void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu)
1382{
1383 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1384
1385 spin_lock(&li->lock);
1386 li->irq.stop.flags = 0;
1387 clear_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
1388 spin_unlock(&li->lock);
1389}
1390
1310int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) 1391int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1311{ 1392{
1312 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1393 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -1363,7 +1444,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
1363 struct kvm_s390_float_interrupt *fi; 1444 struct kvm_s390_float_interrupt *fi;
1364 struct kvm_s390_interrupt_info *n, *inti = NULL; 1445 struct kvm_s390_interrupt_info *n, *inti = NULL;
1365 1446
1366 mutex_lock(&kvm->lock);
1367 fi = &kvm->arch.float_int; 1447 fi = &kvm->arch.float_int;
1368 spin_lock(&fi->lock); 1448 spin_lock(&fi->lock);
1369 list_for_each_entry_safe(inti, n, &fi->list, list) { 1449 list_for_each_entry_safe(inti, n, &fi->list, list) {
@@ -1373,7 +1453,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
1373 fi->irq_count = 0; 1453 fi->irq_count = 0;
1374 atomic_set(&fi->active, 0); 1454 atomic_set(&fi->active, 0);
1375 spin_unlock(&fi->lock); 1455 spin_unlock(&fi->lock);
1376 mutex_unlock(&kvm->lock);
1377} 1456}
1378 1457
1379static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti, 1458static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
@@ -1413,7 +1492,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
1413 int ret = 0; 1492 int ret = 0;
1414 int n = 0; 1493 int n = 0;
1415 1494
1416 mutex_lock(&kvm->lock);
1417 fi = &kvm->arch.float_int; 1495 fi = &kvm->arch.float_int;
1418 spin_lock(&fi->lock); 1496 spin_lock(&fi->lock);
1419 1497
@@ -1432,7 +1510,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
1432 } 1510 }
1433 1511
1434 spin_unlock(&fi->lock); 1512 spin_unlock(&fi->lock);
1435 mutex_unlock(&kvm->lock);
1436 1513
1437 return ret < 0 ? ret : n; 1514 return ret < 0 ? ret : n;
1438} 1515}
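
Note on the interrupt.c hunks above: kvm->lock is removed from the floating-interrupt paths, so fi->lock is now the only lock protecting the floating interrupt list, injection bails out early while no VCPU exists yet, and __inject_vm() no longer owns the interrupt on failure, so kvm_s390_inject_vm() frees it when rc is non-zero. A minimal sketch of that ownership convention, with invented names and no relation to the actual kernel helpers:

    #include <stdlib.h>

    struct pending_irq { int type; };

    /* callee: takes ownership of irq only when it returns 0 */
    static int inject(struct pending_irq *irq)
    {
            (void)irq;              /* ... enqueue under the list lock ... */
            return 0;               /* or -EBUSY / -EFAULT on failure */
    }

    /* caller: must free the object itself whenever the callee failed */
    static int inject_or_free(struct pending_irq *irq)
    {
            int rc = inject(irq);

            if (rc)
                    free(irq);      /* callee did not consume it */
            return rc;
    }
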
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3e09801e3104..0c3623927563 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -22,6 +22,7 @@
22#include <linux/kvm.h> 22#include <linux/kvm.h>
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/random.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/timer.h> 27#include <linux/timer.h>
27#include <asm/asm-offsets.h> 28#include <asm/asm-offsets.h>
@@ -29,7 +30,6 @@
29#include <asm/pgtable.h> 30#include <asm/pgtable.h>
30#include <asm/nmi.h> 31#include <asm/nmi.h>
31#include <asm/switch_to.h> 32#include <asm/switch_to.h>
32#include <asm/facility.h>
33#include <asm/sclp.h> 33#include <asm/sclp.h>
34#include "kvm-s390.h" 34#include "kvm-s390.h"
35#include "gaccess.h" 35#include "gaccess.h"
@@ -50,6 +50,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
50 { "exit_instruction", VCPU_STAT(exit_instruction) }, 50 { "exit_instruction", VCPU_STAT(exit_instruction) },
51 { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, 51 { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
52 { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, 52 { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
53 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
53 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 54 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
54 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, 55 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
55 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 56 { "instruction_lctl", VCPU_STAT(instruction_lctl) },
@@ -98,15 +99,20 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
98 { NULL } 99 { NULL }
99}; 100};
100 101
101unsigned long *vfacilities; 102/* upper facilities limit for kvm */
102static struct gmap_notifier gmap_notifier; 103unsigned long kvm_s390_fac_list_mask[] = {
104 0xff82fffbf4fc2000UL,
105 0x005c000000000000UL,
106};
103 107
104/* test availability of vfacility */ 108unsigned long kvm_s390_fac_list_mask_size(void)
105int test_vfacility(unsigned long nr)
106{ 109{
107 return __test_facility(nr, (void *) vfacilities); 110 BUILD_BUG_ON(ARRAY_SIZE(kvm_s390_fac_list_mask) > S390_ARCH_FAC_MASK_SIZE_U64);
111 return ARRAY_SIZE(kvm_s390_fac_list_mask);
108} 112}
109 113
114static struct gmap_notifier gmap_notifier;
115
110/* Section: not file related */ 116/* Section: not file related */
111int kvm_arch_hardware_enable(void) 117int kvm_arch_hardware_enable(void)
112{ 118{
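
kvm_s390_fac_list_mask is the new upper bound on the facility bits KVM reports to a guest; test_kvm_facility() (added in kvm-s390.h further down) checks a bit against the per-VM facility list built from it. s390 facility numbering is MSB-first within each doubleword: facility 0 is the most significant bit of the first u64. A standalone sketch of that bit test; fac_test() is an invented name that mirrors what __test_facility() does:

    #include <stdint.h>
    #include <stdio.h>

    /* facility nr is available if its MSB-first bit is set */
    static int fac_test(unsigned long nr, const uint64_t *fac_list)
    {
            return (fac_list[nr >> 6] >> (63 - (nr & 63))) & 1;
    }

    int main(void)
    {
            const uint64_t mask[] = { 0xff82fffbf4fc2000ULL,
                                      0x005c000000000000ULL };

            printf("facility 0:  %d\n", fac_test(0, mask));   /* 1 */
            printf("facility 76: %d\n", fac_test(76, mask));  /* 1, MSA ext 3 */
            return 0;
    }
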
@@ -166,6 +172,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
166 case KVM_CAP_S390_IRQCHIP: 172 case KVM_CAP_S390_IRQCHIP:
167 case KVM_CAP_VM_ATTRIBUTES: 173 case KVM_CAP_VM_ATTRIBUTES:
168 case KVM_CAP_MP_STATE: 174 case KVM_CAP_MP_STATE:
175 case KVM_CAP_S390_USER_SIGP:
169 r = 1; 176 r = 1;
170 break; 177 break;
171 case KVM_CAP_NR_VCPUS: 178 case KVM_CAP_NR_VCPUS:
@@ -254,6 +261,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
254 kvm->arch.use_irqchip = 1; 261 kvm->arch.use_irqchip = 1;
255 r = 0; 262 r = 0;
256 break; 263 break;
264 case KVM_CAP_S390_USER_SIGP:
265 kvm->arch.user_sigp = 1;
266 r = 0;
267 break;
257 default: 268 default:
258 r = -EINVAL; 269 r = -EINVAL;
259 break; 270 break;
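
KVM_CAP_S390_USER_SIGP is enabled per VM; once set, the non-trivial SIGP orders are no longer emulated in the kernel but handed back to user space (see the sigp.c changes below). A user-space sketch of flipping the capability; error handling is omitted and the KVM_CAP_S390_USER_SIGP define is assumed to come from the updated uapi headers:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);
            struct kvm_enable_cap cap = { .cap = KVM_CAP_S390_USER_SIGP };

            /* VM-scope KVM_ENABLE_CAP: stop/restart/reset/prefix orders
             * now exit to user space instead of being handled by KVM */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
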
@@ -261,7 +272,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
261 return r; 272 return r;
262} 273}
263 274
264static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) 275static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
276{
277 int ret;
278
279 switch (attr->attr) {
280 case KVM_S390_VM_MEM_LIMIT_SIZE:
281 ret = 0;
282 if (put_user(kvm->arch.gmap->asce_end, (u64 __user *)attr->addr))
283 ret = -EFAULT;
284 break;
285 default:
286 ret = -ENXIO;
287 break;
288 }
289 return ret;
290}
291
292static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
265{ 293{
266 int ret; 294 int ret;
267 unsigned int idx; 295 unsigned int idx;
@@ -283,6 +311,36 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
283 mutex_unlock(&kvm->lock); 311 mutex_unlock(&kvm->lock);
284 ret = 0; 312 ret = 0;
285 break; 313 break;
314 case KVM_S390_VM_MEM_LIMIT_SIZE: {
315 unsigned long new_limit;
316
317 if (kvm_is_ucontrol(kvm))
318 return -EINVAL;
319
320 if (get_user(new_limit, (u64 __user *)attr->addr))
321 return -EFAULT;
322
323 if (new_limit > kvm->arch.gmap->asce_end)
324 return -E2BIG;
325
326 ret = -EBUSY;
327 mutex_lock(&kvm->lock);
328 if (atomic_read(&kvm->online_vcpus) == 0) {
329 /* gmap_alloc will round the limit up */
330 struct gmap *new = gmap_alloc(current->mm, new_limit);
331
332 if (!new) {
333 ret = -ENOMEM;
334 } else {
335 gmap_free(kvm->arch.gmap);
336 new->private = kvm;
337 kvm->arch.gmap = new;
338 ret = 0;
339 }
340 }
341 mutex_unlock(&kvm->lock);
342 break;
343 }
286 default: 344 default:
287 ret = -ENXIO; 345 ret = -ENXIO;
288 break; 346 break;
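
KVM_S390_VM_MEM_LIMIT_SIZE lets user space shrink the guest address space before any VCPU is created; KVM swaps in a smaller gmap and returns -EBUSY once VCPUs exist, -E2BIG if the limit would grow. A sketch of the user-space call, assuming the KVM_S390_VM_MEM_CTRL constants from the updated uapi asm/kvm.h:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* limit the guest address space of a still VCPU-less VM */
    static int set_mem_limit(int vm_fd, uint64_t limit)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_S390_VM_MEM_CTRL,
                    .attr  = KVM_S390_VM_MEM_LIMIT_SIZE,
                    .addr  = (uint64_t)(unsigned long)&limit,
            };

            /* VM attributes are set with KVM_SET_DEVICE_ATTR on the VM fd,
             * as advertised by KVM_CAP_VM_ATTRIBUTES */
            return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
    }
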
@@ -290,13 +348,276 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
290 return ret; 348 return ret;
291} 349}
292 350
351static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu);
352
353static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
354{
355 struct kvm_vcpu *vcpu;
356 int i;
357
358 if (!test_kvm_facility(kvm, 76))
359 return -EINVAL;
360
361 mutex_lock(&kvm->lock);
362 switch (attr->attr) {
363 case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
364 get_random_bytes(
365 kvm->arch.crypto.crycb->aes_wrapping_key_mask,
366 sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
367 kvm->arch.crypto.aes_kw = 1;
368 break;
369 case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
370 get_random_bytes(
371 kvm->arch.crypto.crycb->dea_wrapping_key_mask,
372 sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
373 kvm->arch.crypto.dea_kw = 1;
374 break;
375 case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
376 kvm->arch.crypto.aes_kw = 0;
377 memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0,
378 sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
379 break;
380 case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
381 kvm->arch.crypto.dea_kw = 0;
382 memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0,
383 sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
384 break;
385 default:
386 mutex_unlock(&kvm->lock);
387 return -ENXIO;
388 }
389
390 kvm_for_each_vcpu(i, vcpu, kvm) {
391 kvm_s390_vcpu_crypto_setup(vcpu);
392 exit_sie(vcpu);
393 }
394 mutex_unlock(&kvm->lock);
395 return 0;
396}
397
398static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
399{
400 u8 gtod_high;
401
402 if (copy_from_user(&gtod_high, (void __user *)attr->addr,
403 sizeof(gtod_high)))
404 return -EFAULT;
405
406 if (gtod_high != 0)
407 return -EINVAL;
408
409 return 0;
410}
411
412static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
413{
414 struct kvm_vcpu *cur_vcpu;
415 unsigned int vcpu_idx;
416 u64 host_tod, gtod;
417 int r;
418
419 if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
420 return -EFAULT;
421
422 r = store_tod_clock(&host_tod);
423 if (r)
424 return r;
425
426 mutex_lock(&kvm->lock);
427 kvm->arch.epoch = gtod - host_tod;
428 kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) {
429 cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
430 exit_sie(cur_vcpu);
431 }
432 mutex_unlock(&kvm->lock);
433 return 0;
434}
435
436static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
437{
438 int ret;
439
440 if (attr->flags)
441 return -EINVAL;
442
443 switch (attr->attr) {
444 case KVM_S390_VM_TOD_HIGH:
445 ret = kvm_s390_set_tod_high(kvm, attr);
446 break;
447 case KVM_S390_VM_TOD_LOW:
448 ret = kvm_s390_set_tod_low(kvm, attr);
449 break;
450 default:
451 ret = -ENXIO;
452 break;
453 }
454 return ret;
455}
456
457static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
458{
459 u8 gtod_high = 0;
460
461 if (copy_to_user((void __user *)attr->addr, &gtod_high,
462 sizeof(gtod_high)))
463 return -EFAULT;
464
465 return 0;
466}
467
468static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
469{
470 u64 host_tod, gtod;
471 int r;
472
473 r = store_tod_clock(&host_tod);
474 if (r)
475 return r;
476
477 gtod = host_tod + kvm->arch.epoch;
478 if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
479 return -EFAULT;
480
481 return 0;
482}
483
484static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr)
485{
486 int ret;
487
488 if (attr->flags)
489 return -EINVAL;
490
491 switch (attr->attr) {
492 case KVM_S390_VM_TOD_HIGH:
493 ret = kvm_s390_get_tod_high(kvm, attr);
494 break;
495 case KVM_S390_VM_TOD_LOW:
496 ret = kvm_s390_get_tod_low(kvm, attr);
497 break;
498 default:
499 ret = -ENXIO;
500 break;
501 }
502 return ret;
503}
504
505static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
506{
507 struct kvm_s390_vm_cpu_processor *proc;
508 int ret = 0;
509
510 mutex_lock(&kvm->lock);
511 if (atomic_read(&kvm->online_vcpus)) {
512 ret = -EBUSY;
513 goto out;
514 }
515 proc = kzalloc(sizeof(*proc), GFP_KERNEL);
516 if (!proc) {
517 ret = -ENOMEM;
518 goto out;
519 }
520 if (!copy_from_user(proc, (void __user *)attr->addr,
521 sizeof(*proc))) {
522 memcpy(&kvm->arch.model.cpu_id, &proc->cpuid,
523 sizeof(struct cpuid));
524 kvm->arch.model.ibc = proc->ibc;
525 memcpy(kvm->arch.model.fac->kvm, proc->fac_list,
526 S390_ARCH_FAC_LIST_SIZE_BYTE);
527 } else
528 ret = -EFAULT;
529 kfree(proc);
530out:
531 mutex_unlock(&kvm->lock);
532 return ret;
533}
534
535static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
536{
537 int ret = -ENXIO;
538
539 switch (attr->attr) {
540 case KVM_S390_VM_CPU_PROCESSOR:
541 ret = kvm_s390_set_processor(kvm, attr);
542 break;
543 }
544 return ret;
545}
546
547static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
548{
549 struct kvm_s390_vm_cpu_processor *proc;
550 int ret = 0;
551
552 proc = kzalloc(sizeof(*proc), GFP_KERNEL);
553 if (!proc) {
554 ret = -ENOMEM;
555 goto out;
556 }
557 memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid));
558 proc->ibc = kvm->arch.model.ibc;
559 memcpy(&proc->fac_list, kvm->arch.model.fac->kvm, S390_ARCH_FAC_LIST_SIZE_BYTE);
560 if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
561 ret = -EFAULT;
562 kfree(proc);
563out:
564 return ret;
565}
566
567static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
568{
569 struct kvm_s390_vm_cpu_machine *mach;
570 int ret = 0;
571
572 mach = kzalloc(sizeof(*mach), GFP_KERNEL);
573 if (!mach) {
574 ret = -ENOMEM;
575 goto out;
576 }
577 get_cpu_id((struct cpuid *) &mach->cpuid);
578 mach->ibc = sclp_get_ibc();
579 memcpy(&mach->fac_mask, kvm_s390_fac_list_mask,
580 kvm_s390_fac_list_mask_size() * sizeof(u64));
581 memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
582 S390_ARCH_FAC_LIST_SIZE_U64);
583 if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
584 ret = -EFAULT;
585 kfree(mach);
586out:
587 return ret;
588}
589
590static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
591{
592 int ret = -ENXIO;
593
594 switch (attr->attr) {
595 case KVM_S390_VM_CPU_PROCESSOR:
596 ret = kvm_s390_get_processor(kvm, attr);
597 break;
598 case KVM_S390_VM_CPU_MACHINE:
599 ret = kvm_s390_get_machine(kvm, attr);
600 break;
601 }
602 return ret;
603}
604
293static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) 605static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
294{ 606{
295 int ret; 607 int ret;
296 608
297 switch (attr->group) { 609 switch (attr->group) {
298 case KVM_S390_VM_MEM_CTRL: 610 case KVM_S390_VM_MEM_CTRL:
299 ret = kvm_s390_mem_control(kvm, attr); 611 ret = kvm_s390_set_mem_control(kvm, attr);
612 break;
613 case KVM_S390_VM_TOD:
614 ret = kvm_s390_set_tod(kvm, attr);
615 break;
616 case KVM_S390_VM_CPU_MODEL:
617 ret = kvm_s390_set_cpu_model(kvm, attr);
618 break;
619 case KVM_S390_VM_CRYPTO:
620 ret = kvm_s390_vm_set_crypto(kvm, attr);
300 break; 621 break;
301 default: 622 default:
302 ret = -ENXIO; 623 ret = -ENXIO;
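
The TOD attributes never store an absolute guest clock: setting KVM_S390_VM_TOD_LOW records the difference between the requested guest TOD and the current host TOD in kvm->arch.epoch and pushes it into every SIE block, and the get path adds the epoch back onto a fresh host TOD. A tiny worked example of that arithmetic (made-up numbers, no KVM calls):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t host_tod = 1000;       /* pretend STCK result      */
            uint64_t gtod     = 4000;       /* requested guest TOD      */
            int64_t  epoch    = (int64_t)(gtod - host_tod);   /* set    */

            host_tod += 500;                /* host clock keeps running */
            printf("guest TOD: %llu\n",     /* get: prints 4500         */
                   (unsigned long long)(host_tod + epoch));
            return 0;
    }
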
@@ -308,7 +629,24 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
308 629
309static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) 630static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
310{ 631{
311 return -ENXIO; 632 int ret;
633
634 switch (attr->group) {
635 case KVM_S390_VM_MEM_CTRL:
636 ret = kvm_s390_get_mem_control(kvm, attr);
637 break;
638 case KVM_S390_VM_TOD:
639 ret = kvm_s390_get_tod(kvm, attr);
640 break;
641 case KVM_S390_VM_CPU_MODEL:
642 ret = kvm_s390_get_cpu_model(kvm, attr);
643 break;
644 default:
645 ret = -ENXIO;
646 break;
647 }
648
649 return ret;
312} 650}
313 651
314static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) 652static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
@@ -320,6 +658,42 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
320 switch (attr->attr) { 658 switch (attr->attr) {
321 case KVM_S390_VM_MEM_ENABLE_CMMA: 659 case KVM_S390_VM_MEM_ENABLE_CMMA:
322 case KVM_S390_VM_MEM_CLR_CMMA: 660 case KVM_S390_VM_MEM_CLR_CMMA:
661 case KVM_S390_VM_MEM_LIMIT_SIZE:
662 ret = 0;
663 break;
664 default:
665 ret = -ENXIO;
666 break;
667 }
668 break;
669 case KVM_S390_VM_TOD:
670 switch (attr->attr) {
671 case KVM_S390_VM_TOD_LOW:
672 case KVM_S390_VM_TOD_HIGH:
673 ret = 0;
674 break;
675 default:
676 ret = -ENXIO;
677 break;
678 }
679 break;
680 case KVM_S390_VM_CPU_MODEL:
681 switch (attr->attr) {
682 case KVM_S390_VM_CPU_PROCESSOR:
683 case KVM_S390_VM_CPU_MACHINE:
684 ret = 0;
685 break;
686 default:
687 ret = -ENXIO;
688 break;
689 }
690 break;
691 case KVM_S390_VM_CRYPTO:
692 switch (attr->attr) {
693 case KVM_S390_VM_CRYPTO_ENABLE_AES_KW:
694 case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW:
695 case KVM_S390_VM_CRYPTO_DISABLE_AES_KW:
696 case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW:
323 ret = 0; 697 ret = 0;
324 break; 698 break;
325 default: 699 default:
@@ -401,9 +775,61 @@ long kvm_arch_vm_ioctl(struct file *filp,
401 return r; 775 return r;
402} 776}
403 777
778static int kvm_s390_query_ap_config(u8 *config)
779{
780 u32 fcn_code = 0x04000000UL;
781 u32 cc;
782
783 asm volatile(
784 "lgr 0,%1\n"
785 "lgr 2,%2\n"
786 ".long 0xb2af0000\n" /* PQAP(QCI) */
787 "ipm %0\n"
788 "srl %0,28\n"
789 : "=r" (cc)
790 : "r" (fcn_code), "r" (config)
791 : "cc", "0", "2", "memory"
792 );
793
794 return cc;
795}
796
797static int kvm_s390_apxa_installed(void)
798{
799 u8 config[128];
800 int cc;
801
802 if (test_facility(2) && test_facility(12)) {
803 cc = kvm_s390_query_ap_config(config);
804
805 if (cc)
806 pr_err("PQAP(QCI) failed with cc=%d", cc);
807 else
808 return config[0] & 0x40;
809 }
810
811 return 0;
812}
813
814static void kvm_s390_set_crycb_format(struct kvm *kvm)
815{
816 kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
817
818 if (kvm_s390_apxa_installed())
819 kvm->arch.crypto.crycbd |= CRYCB_FORMAT2;
820 else
821 kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
822}
823
824static void kvm_s390_get_cpu_id(struct cpuid *cpu_id)
825{
826 get_cpu_id(cpu_id);
827 cpu_id->version = 0xff;
828}
829
404static int kvm_s390_crypto_init(struct kvm *kvm) 830static int kvm_s390_crypto_init(struct kvm *kvm)
405{ 831{
406 if (!test_vfacility(76)) 832 if (!test_kvm_facility(kvm, 76))
407 return 0; 833 return 0;
408 834
409 kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb), 835 kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb),
@@ -411,15 +837,18 @@ static int kvm_s390_crypto_init(struct kvm *kvm)
411 if (!kvm->arch.crypto.crycb) 837 if (!kvm->arch.crypto.crycb)
412 return -ENOMEM; 838 return -ENOMEM;
413 839
414 kvm->arch.crypto.crycbd = (__u32) (unsigned long) kvm->arch.crypto.crycb | 840 kvm_s390_set_crycb_format(kvm);
415 CRYCB_FORMAT1; 841
842 /* Disable AES/DEA protected key functions by default */
843 kvm->arch.crypto.aes_kw = 0;
844 kvm->arch.crypto.dea_kw = 0;
416 845
417 return 0; 846 return 0;
418} 847}
419 848
420int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 849int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
421{ 850{
422 int rc; 851 int i, rc;
423 char debug_name[16]; 852 char debug_name[16];
424 static unsigned long sca_offset; 853 static unsigned long sca_offset;
425 854
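
AES and DEA key wrapping are now disabled by default and toggled per VM through the KVM_S390_VM_CRYPTO attribute group; enabling a wrapping key makes KVM generate a random wrapping key mask and re-run the VCPU crypto setup so ECB3_AES/ECB3_DEA take effect. A user-space sketch, assuming the crypto attribute constants from the updated headers:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* turn on AES key wrapping; KVM rolls a fresh random wrapping
     * key mask and kicks every VCPU out of SIE to pick it up */
    static int enable_aes_kw(int vm_fd)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_S390_VM_CRYPTO,
                    .attr  = KVM_S390_VM_CRYPTO_ENABLE_AES_KW,
            };

            return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
    }
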
@@ -454,6 +883,46 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
454 if (!kvm->arch.dbf) 883 if (!kvm->arch.dbf)
455 goto out_nodbf; 884 goto out_nodbf;
456 885
886 /*
887 * The architectural maximum amount of facilities is 16 kbit. To store
888 * this amount, 2 kbyte of memory is required. Thus we need a full
889 * page to hold the active copy (arch.model.fac->sie) and the current
890 * facilities set (arch.model.fac->kvm). Its address size has to be
891 * 31 bits and word aligned.
892 */
893 kvm->arch.model.fac =
894 (struct s390_model_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
895 if (!kvm->arch.model.fac)
896 goto out_nofac;
897
898 memcpy(kvm->arch.model.fac->kvm, S390_lowcore.stfle_fac_list,
899 S390_ARCH_FAC_LIST_SIZE_U64);
900
901 /*
 902	 * If this KVM host does not run in an LPAR, relax the facility bits
 903	 * of the kvm facility mask by all missing facilities. This makes it
 904	 * possible to determine the right CPU model from the remaining facilities.
 905	 * Live guest migration must prohibit the migration of KVMs running in
 906	 * an LPAR to non-LPAR hosts.
907 */
908 if (!MACHINE_IS_LPAR)
909 for (i = 0; i < kvm_s390_fac_list_mask_size(); i++)
910 kvm_s390_fac_list_mask[i] &= kvm->arch.model.fac->kvm[i];
911
912 /*
913 * Apply the kvm facility mask to limit the kvm supported/tolerated
914 * facility list.
915 */
916 for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
917 if (i < kvm_s390_fac_list_mask_size())
918 kvm->arch.model.fac->kvm[i] &= kvm_s390_fac_list_mask[i];
919 else
920 kvm->arch.model.fac->kvm[i] = 0UL;
921 }
922
923 kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
924 kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff;
925
457 if (kvm_s390_crypto_init(kvm) < 0) 926 if (kvm_s390_crypto_init(kvm) < 0)
458 goto out_crypto; 927 goto out_crypto;
459 928
@@ -477,6 +946,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
477 946
478 kvm->arch.css_support = 0; 947 kvm->arch.css_support = 0;
479 kvm->arch.use_irqchip = 0; 948 kvm->arch.use_irqchip = 0;
949 kvm->arch.epoch = 0;
480 950
481 spin_lock_init(&kvm->arch.start_stop_lock); 951 spin_lock_init(&kvm->arch.start_stop_lock);
482 952
@@ -484,6 +954,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
484out_nogmap: 954out_nogmap:
485 kfree(kvm->arch.crypto.crycb); 955 kfree(kvm->arch.crypto.crycb);
486out_crypto: 956out_crypto:
957 free_page((unsigned long)kvm->arch.model.fac);
958out_nofac:
487 debug_unregister(kvm->arch.dbf); 959 debug_unregister(kvm->arch.dbf);
488out_nodbf: 960out_nodbf:
489 free_page((unsigned long)(kvm->arch.sca)); 961 free_page((unsigned long)(kvm->arch.sca));
@@ -536,6 +1008,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
536void kvm_arch_destroy_vm(struct kvm *kvm) 1008void kvm_arch_destroy_vm(struct kvm *kvm)
537{ 1009{
538 kvm_free_vcpus(kvm); 1010 kvm_free_vcpus(kvm);
1011 free_page((unsigned long)kvm->arch.model.fac);
539 free_page((unsigned long)(kvm->arch.sca)); 1012 free_page((unsigned long)(kvm->arch.sca));
540 debug_unregister(kvm->arch.dbf); 1013 debug_unregister(kvm->arch.dbf);
541 kfree(kvm->arch.crypto.crycb); 1014 kfree(kvm->arch.crypto.crycb);
@@ -546,25 +1019,30 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
546} 1019}
547 1020
548/* Section: vcpu related */ 1021/* Section: vcpu related */
1022static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
1023{
1024 vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
1025 if (!vcpu->arch.gmap)
1026 return -ENOMEM;
1027 vcpu->arch.gmap->private = vcpu->kvm;
1028
1029 return 0;
1030}
1031
549int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 1032int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
550{ 1033{
551 vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; 1034 vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
552 kvm_clear_async_pf_completion_queue(vcpu); 1035 kvm_clear_async_pf_completion_queue(vcpu);
553 if (kvm_is_ucontrol(vcpu->kvm)) {
554 vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
555 if (!vcpu->arch.gmap)
556 return -ENOMEM;
557 vcpu->arch.gmap->private = vcpu->kvm;
558 return 0;
559 }
560
561 vcpu->arch.gmap = vcpu->kvm->arch.gmap;
562 vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | 1036 vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
563 KVM_SYNC_GPRS | 1037 KVM_SYNC_GPRS |
564 KVM_SYNC_ACRS | 1038 KVM_SYNC_ACRS |
565 KVM_SYNC_CRS | 1039 KVM_SYNC_CRS |
566 KVM_SYNC_ARCH0 | 1040 KVM_SYNC_ARCH0 |
567 KVM_SYNC_PFAULT; 1041 KVM_SYNC_PFAULT;
1042
1043 if (kvm_is_ucontrol(vcpu->kvm))
1044 return __kvm_ucontrol_vcpu_init(vcpu);
1045
568 return 0; 1046 return 0;
569} 1047}
570 1048
@@ -615,16 +1093,27 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
615 kvm_s390_clear_local_irqs(vcpu); 1093 kvm_s390_clear_local_irqs(vcpu);
616} 1094}
617 1095
618int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 1096void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
619{ 1097{
620 return 0; 1098 mutex_lock(&vcpu->kvm->lock);
1099 vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch;
1100 mutex_unlock(&vcpu->kvm->lock);
1101 if (!kvm_is_ucontrol(vcpu->kvm))
1102 vcpu->arch.gmap = vcpu->kvm->arch.gmap;
621} 1103}
622 1104
623static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) 1105static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
624{ 1106{
625 if (!test_vfacility(76)) 1107 if (!test_kvm_facility(vcpu->kvm, 76))
626 return; 1108 return;
627 1109
1110 vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
1111
1112 if (vcpu->kvm->arch.crypto.aes_kw)
1113 vcpu->arch.sie_block->ecb3 |= ECB3_AES;
1114 if (vcpu->kvm->arch.crypto.dea_kw)
1115 vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
1116
628 vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; 1117 vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
629} 1118}
630 1119
@@ -654,14 +1143,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
654 CPUSTAT_STOPPED | 1143 CPUSTAT_STOPPED |
655 CPUSTAT_GED); 1144 CPUSTAT_GED);
656 vcpu->arch.sie_block->ecb = 6; 1145 vcpu->arch.sie_block->ecb = 6;
657 if (test_vfacility(50) && test_vfacility(73)) 1146 if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
658 vcpu->arch.sie_block->ecb |= 0x10; 1147 vcpu->arch.sie_block->ecb |= 0x10;
659 1148
660 vcpu->arch.sie_block->ecb2 = 8; 1149 vcpu->arch.sie_block->ecb2 = 8;
661 vcpu->arch.sie_block->eca = 0xD1002000U; 1150 vcpu->arch.sie_block->eca = 0xC1002000U;
662 if (sclp_has_siif()) 1151 if (sclp_has_siif())
663 vcpu->arch.sie_block->eca |= 1; 1152 vcpu->arch.sie_block->eca |= 1;
664 vcpu->arch.sie_block->fac = (int) (long) vfacilities; 1153 if (sclp_has_sigpif())
1154 vcpu->arch.sie_block->eca |= 0x10000000U;
665 vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE | 1155 vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE |
666 ICTL_TPROT; 1156 ICTL_TPROT;
667 1157
@@ -670,10 +1160,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
670 if (rc) 1160 if (rc)
671 return rc; 1161 return rc;
672 } 1162 }
673 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1163 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
674 vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; 1164 vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
675 get_cpu_id(&vcpu->arch.cpu_id); 1165
676 vcpu->arch.cpu_id.version = 0xff; 1166 mutex_lock(&vcpu->kvm->lock);
1167 vcpu->arch.cpu_id = vcpu->kvm->arch.model.cpu_id;
1168 memcpy(vcpu->kvm->arch.model.fac->sie, vcpu->kvm->arch.model.fac->kvm,
1169 S390_ARCH_FAC_LIST_SIZE_BYTE);
1170 vcpu->arch.sie_block->ibc = vcpu->kvm->arch.model.ibc;
1171 mutex_unlock(&vcpu->kvm->lock);
677 1172
678 kvm_s390_vcpu_crypto_setup(vcpu); 1173 kvm_s390_vcpu_crypto_setup(vcpu);
679 1174
@@ -717,6 +1212,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
717 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 1212 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
718 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); 1213 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
719 } 1214 }
1215 vcpu->arch.sie_block->fac = (int) (long) kvm->arch.model.fac->sie;
720 1216
721 spin_lock_init(&vcpu->arch.local_int.lock); 1217 spin_lock_init(&vcpu->arch.local_int.lock);
722 vcpu->arch.local_int.float_int = &kvm->arch.float_int; 1218 vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -741,7 +1237,7 @@ out:
741 1237
742int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 1238int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
743{ 1239{
744 return kvm_cpu_has_interrupt(vcpu); 1240 return kvm_s390_vcpu_has_irq(vcpu, 0);
745} 1241}
746 1242
747void s390_vcpu_block(struct kvm_vcpu *vcpu) 1243void s390_vcpu_block(struct kvm_vcpu *vcpu)
@@ -869,6 +1365,8 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
869 case KVM_REG_S390_PFTOKEN: 1365 case KVM_REG_S390_PFTOKEN:
870 r = get_user(vcpu->arch.pfault_token, 1366 r = get_user(vcpu->arch.pfault_token,
871 (u64 __user *)reg->addr); 1367 (u64 __user *)reg->addr);
1368 if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
1369 kvm_clear_async_pf_completion_queue(vcpu);
872 break; 1370 break;
873 case KVM_REG_S390_PFCOMPARE: 1371 case KVM_REG_S390_PFCOMPARE:
874 r = get_user(vcpu->arch.pfault_compare, 1372 r = get_user(vcpu->arch.pfault_compare,
@@ -1176,7 +1674,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
1176 return 0; 1674 return 0;
1177 if (psw_extint_disabled(vcpu)) 1675 if (psw_extint_disabled(vcpu))
1178 return 0; 1676 return 0;
1179 if (kvm_cpu_has_interrupt(vcpu)) 1677 if (kvm_s390_vcpu_has_irq(vcpu, 0))
1180 return 0; 1678 return 0;
1181 if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul)) 1679 if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
1182 return 0; 1680 return 0;
@@ -1341,6 +1839,8 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1341 vcpu->arch.pfault_token = kvm_run->s.regs.pft; 1839 vcpu->arch.pfault_token = kvm_run->s.regs.pft;
1342 vcpu->arch.pfault_select = kvm_run->s.regs.pfs; 1840 vcpu->arch.pfault_select = kvm_run->s.regs.pfs;
1343 vcpu->arch.pfault_compare = kvm_run->s.regs.pfc; 1841 vcpu->arch.pfault_compare = kvm_run->s.regs.pfc;
1842 if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
1843 kvm_clear_async_pf_completion_queue(vcpu);
1344 } 1844 }
1345 kvm_run->kvm_dirty_regs = 0; 1845 kvm_run->kvm_dirty_regs = 0;
1346} 1846}
@@ -1559,15 +2059,10 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
1559 spin_lock(&vcpu->kvm->arch.start_stop_lock); 2059 spin_lock(&vcpu->kvm->arch.start_stop_lock);
1560 online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); 2060 online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
1561 2061
1562 /* Need to lock access to action_bits to avoid a SIGP race condition */
1563 spin_lock(&vcpu->arch.local_int.lock);
1564 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
1565
1566 /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ 2062 /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
1567 vcpu->arch.local_int.action_bits &= 2063 kvm_s390_clear_stop_irq(vcpu);
1568 ~(ACTION_STOP_ON_STOP | ACTION_STORE_ON_STOP);
1569 spin_unlock(&vcpu->arch.local_int.lock);
1570 2064
2065 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
1571 __disable_ibs_on_vcpu(vcpu); 2066 __disable_ibs_on_vcpu(vcpu);
1572 2067
1573 for (i = 0; i < online_vcpus; i++) { 2068 for (i = 0; i < online_vcpus; i++) {
@@ -1783,30 +2278,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
1783 2278
1784static int __init kvm_s390_init(void) 2279static int __init kvm_s390_init(void)
1785{ 2280{
1786 int ret; 2281 return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
1787 ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
1788 if (ret)
1789 return ret;
1790
1791 /*
1792 * guests can ask for up to 255+1 double words, we need a full page
1793 * to hold the maximum amount of facilities. On the other hand, we
1794 * only set facilities that are known to work in KVM.
1795 */
1796 vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
1797 if (!vfacilities) {
1798 kvm_exit();
1799 return -ENOMEM;
1800 }
1801 memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
1802 vfacilities[0] &= 0xff82fffbf47c2000UL;
1803 vfacilities[1] &= 0x005c000000000000UL;
1804 return 0;
1805} 2282}
1806 2283
1807static void __exit kvm_s390_exit(void) 2284static void __exit kvm_s390_exit(void)
1808{ 2285{
1809 free_page((unsigned long) vfacilities);
1810 kvm_exit(); 2286 kvm_exit();
1811} 2287}
1812 2288
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index a8f3d9b71c11..985c2114d7ef 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -18,12 +18,10 @@
18#include <linux/hrtimer.h> 18#include <linux/hrtimer.h>
19#include <linux/kvm.h> 19#include <linux/kvm.h>
20#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
21#include <asm/facility.h>
21 22
22typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); 23typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
23 24
24/* declare vfacilities extern */
25extern unsigned long *vfacilities;
26
27/* Transactional Memory Execution related macros */ 25/* Transactional Memory Execution related macros */
28#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10)) 26#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10))
29#define TDB_FORMAT1 1 27#define TDB_FORMAT1 1
@@ -127,6 +125,12 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
127 vcpu->arch.sie_block->gpsw.mask |= cc << 44; 125 vcpu->arch.sie_block->gpsw.mask |= cc << 44;
128} 126}
129 127
 128/* test availability of facility in a kvm instance */
129static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr)
130{
131 return __test_facility(nr, kvm->arch.model.fac->kvm);
132}
133
130/* are cpu states controlled by user space */ 134/* are cpu states controlled by user space */
131static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) 135static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
132{ 136{
@@ -183,7 +187,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
183void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); 187void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
184/* is cmma enabled */ 188/* is cmma enabled */
185bool kvm_s390_cmma_enabled(struct kvm *kvm); 189bool kvm_s390_cmma_enabled(struct kvm *kvm);
186int test_vfacility(unsigned long nr); 190unsigned long kvm_s390_fac_list_mask_size(void);
191extern unsigned long kvm_s390_fac_list_mask[];
187 192
188/* implemented in diag.c */ 193/* implemented in diag.c */
189int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 194int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
@@ -228,11 +233,13 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
228 struct kvm_s390_irq *s390irq); 233 struct kvm_s390_irq *s390irq);
229 234
230/* implemented in interrupt.c */ 235/* implemented in interrupt.c */
231int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 236int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop);
232int psw_extint_disabled(struct kvm_vcpu *vcpu); 237int psw_extint_disabled(struct kvm_vcpu *vcpu);
233void kvm_s390_destroy_adapters(struct kvm *kvm); 238void kvm_s390_destroy_adapters(struct kvm *kvm);
234int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu); 239int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu);
235extern struct kvm_device_ops kvm_flic_ops; 240extern struct kvm_device_ops kvm_flic_ops;
241int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu);
242void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu);
236 243
237/* implemented in guestdbg.c */ 244/* implemented in guestdbg.c */
238void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); 245void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 1be578d64dfc..bdd9b5b17e03 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -337,19 +337,24 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
337static int handle_stfl(struct kvm_vcpu *vcpu) 337static int handle_stfl(struct kvm_vcpu *vcpu)
338{ 338{
339 int rc; 339 int rc;
340 unsigned int fac;
340 341
341 vcpu->stat.instruction_stfl++; 342 vcpu->stat.instruction_stfl++;
342 343
343 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 344 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
344 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 345 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
345 346
347 /*
348 * We need to shift the lower 32 facility bits (bit 0-31) from a u64
349 * into a u32 memory representation. They will remain bits 0-31.
350 */
351 fac = *vcpu->kvm->arch.model.fac->sie >> 32;
346 rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list), 352 rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list),
347 vfacilities, 4); 353 &fac, sizeof(fac));
348 if (rc) 354 if (rc)
349 return rc; 355 return rc;
350 VCPU_EVENT(vcpu, 5, "store facility list value %x", 356 VCPU_EVENT(vcpu, 5, "store facility list value %x", fac);
351 *(unsigned int *) vfacilities); 357 trace_kvm_s390_handle_stfl(vcpu, fac);
352 trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
353 return 0; 358 return 0;
354} 359}
355 360
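
handle_stfl() now stores the guest's facility bits 0-31 from the per-VM list instead of the global vfacilities page. Those 32 facilities live in the most significant half of the first facility doubleword, which is why the code shifts right by 32 before writing a u32 to the lowcore. A standalone illustration of that layout (mask value borrowed from kvm_s390_fac_list_mask above):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t fac0 = 0xff82fffbf4fc2000ULL;  /* facility 0 = MSB   */
            uint32_t stfl = (uint32_t)(fac0 >> 32); /* facilities 0-31    */

            printf("STFL word:  %08x\n", stfl);             /* ff82fffb   */
            printf("facility 1: %u\n", (stfl >> 30) & 1);   /* bit 1 set  */
            return 0;
    }
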
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 6651f9f73973..23b1e86b2122 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -26,15 +26,17 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
26 struct kvm_s390_local_interrupt *li; 26 struct kvm_s390_local_interrupt *li;
27 int cpuflags; 27 int cpuflags;
28 int rc; 28 int rc;
29 int ext_call_pending;
29 30
30 li = &dst_vcpu->arch.local_int; 31 li = &dst_vcpu->arch.local_int;
31 32
32 cpuflags = atomic_read(li->cpuflags); 33 cpuflags = atomic_read(li->cpuflags);
33 if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) 34 ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu);
35 if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending)
34 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 36 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
35 else { 37 else {
36 *reg &= 0xffffffff00000000UL; 38 *reg &= 0xffffffff00000000UL;
37 if (cpuflags & CPUSTAT_ECALL_PEND) 39 if (ext_call_pending)
38 *reg |= SIGP_STATUS_EXT_CALL_PENDING; 40 *reg |= SIGP_STATUS_EXT_CALL_PENDING;
39 if (cpuflags & CPUSTAT_STOPPED) 41 if (cpuflags & CPUSTAT_STOPPED)
40 *reg |= SIGP_STATUS_STOPPED; 42 *reg |= SIGP_STATUS_STOPPED;
@@ -96,7 +98,7 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
96} 98}
97 99
98static int __sigp_external_call(struct kvm_vcpu *vcpu, 100static int __sigp_external_call(struct kvm_vcpu *vcpu,
99 struct kvm_vcpu *dst_vcpu) 101 struct kvm_vcpu *dst_vcpu, u64 *reg)
100{ 102{
101 struct kvm_s390_irq irq = { 103 struct kvm_s390_irq irq = {
102 .type = KVM_S390_INT_EXTERNAL_CALL, 104 .type = KVM_S390_INT_EXTERNAL_CALL,
@@ -105,45 +107,31 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu,
105 int rc; 107 int rc;
106 108
107 rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); 109 rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
108 if (!rc) 110 if (rc == -EBUSY) {
111 *reg &= 0xffffffff00000000UL;
112 *reg |= SIGP_STATUS_EXT_CALL_PENDING;
113 return SIGP_CC_STATUS_STORED;
114 } else if (rc == 0) {
109 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", 115 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x",
110 dst_vcpu->vcpu_id); 116 dst_vcpu->vcpu_id);
111
112 return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
113}
114
115static int __inject_sigp_stop(struct kvm_vcpu *dst_vcpu, int action)
116{
117 struct kvm_s390_local_interrupt *li = &dst_vcpu->arch.local_int;
118 int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
119
120 spin_lock(&li->lock);
121 if (li->action_bits & ACTION_STOP_ON_STOP) {
122 /* another SIGP STOP is pending */
123 rc = SIGP_CC_BUSY;
124 goto out;
125 } 117 }
126 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
127 if ((action & ACTION_STORE_ON_STOP) != 0)
128 rc = -ESHUTDOWN;
129 goto out;
130 }
131 set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
132 li->action_bits |= action;
133 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
134 kvm_s390_vcpu_wakeup(dst_vcpu);
135out:
136 spin_unlock(&li->lock);
137 118
138 return rc; 119 return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
139} 120}
140 121
141static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu) 122static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu)
142{ 123{
124 struct kvm_s390_irq irq = {
125 .type = KVM_S390_SIGP_STOP,
126 };
143 int rc; 127 int rc;
144 128
145 rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP); 129 rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
146 VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", dst_vcpu->vcpu_id); 130 if (rc == -EBUSY)
131 rc = SIGP_CC_BUSY;
132 else if (rc == 0)
133 VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x",
134 dst_vcpu->vcpu_id);
147 135
148 return rc; 136 return rc;
149} 137}
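
With SIGP STOP and EXTERNAL CALL now routed through kvm_s390_inject_vcpu(), the handlers translate the injection result into an architected condition code: 0 becomes CC 0 (order accepted), while -EBUSY becomes CC 1 (status stored) with the ext-call-pending bit for external call, or CC 2 (busy) for stop. A compact sketch of that mapping; the two helper names are invented, the constants mirror asm/sigp.h:

    #include <errno.h>
    #include <stdint.h>

    #define SIGP_CC_ORDER_CODE_ACCEPTED     0
    #define SIGP_CC_STATUS_STORED           1
    #define SIGP_CC_BUSY                    2

    /* external call: a pending ext call means "status stored" + status bit */
    static int extcall_rc_to_cc(int rc, uint64_t *reg, uint64_t pending_bit)
    {
            if (rc == -EBUSY) {
                    *reg &= 0xffffffff00000000ULL;
                    *reg |= pending_bit;
                    return SIGP_CC_STATUS_STORED;
            }
            return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED;
    }

    /* stop: a second stop while one is pending simply reports "busy" */
    static int stop_rc_to_cc(int rc)
    {
            return rc == -EBUSY ? SIGP_CC_BUSY : rc;
    }
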
@@ -151,20 +139,18 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu)
151static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, 139static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu,
152 struct kvm_vcpu *dst_vcpu, u64 *reg) 140 struct kvm_vcpu *dst_vcpu, u64 *reg)
153{ 141{
142 struct kvm_s390_irq irq = {
143 .type = KVM_S390_SIGP_STOP,
144 .u.stop.flags = KVM_S390_STOP_FLAG_STORE_STATUS,
145 };
154 int rc; 146 int rc;
155 147
156 rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP | 148 rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
157 ACTION_STORE_ON_STOP); 149 if (rc == -EBUSY)
158 VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x", 150 rc = SIGP_CC_BUSY;
159 dst_vcpu->vcpu_id); 151 else if (rc == 0)
160 152 VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x",
161 if (rc == -ESHUTDOWN) { 153 dst_vcpu->vcpu_id);
162 /* If the CPU has already been stopped, we still have
163 * to save the status when doing stop-and-store. This
164 * has to be done after unlocking all spinlocks. */
165 rc = kvm_s390_store_status_unloaded(dst_vcpu,
166 KVM_S390_STORE_STATUS_NOADDR);
167 }
168 154
169 return rc; 155 return rc;
170} 156}
@@ -197,41 +183,33 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
197static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, 183static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
198 u32 address, u64 *reg) 184 u32 address, u64 *reg)
199{ 185{
200 struct kvm_s390_local_interrupt *li; 186 struct kvm_s390_irq irq = {
187 .type = KVM_S390_SIGP_SET_PREFIX,
188 .u.prefix.address = address & 0x7fffe000u,
189 };
201 int rc; 190 int rc;
202 191
203 li = &dst_vcpu->arch.local_int;
204
205 /* 192 /*
206 * Make sure the new value is valid memory. We only need to check the 193 * Make sure the new value is valid memory. We only need to check the
207 * first page, since address is 8k aligned and memory pieces are always 194 * first page, since address is 8k aligned and memory pieces are always
208 * at least 1MB aligned and have at least a size of 1MB. 195 * at least 1MB aligned and have at least a size of 1MB.
209 */ 196 */
210 address &= 0x7fffe000u; 197 if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) {
211 if (kvm_is_error_gpa(vcpu->kvm, address)) {
212 *reg &= 0xffffffff00000000UL; 198 *reg &= 0xffffffff00000000UL;
213 *reg |= SIGP_STATUS_INVALID_PARAMETER; 199 *reg |= SIGP_STATUS_INVALID_PARAMETER;
214 return SIGP_CC_STATUS_STORED; 200 return SIGP_CC_STATUS_STORED;
215 } 201 }
216 202
217 spin_lock(&li->lock); 203 rc = kvm_s390_inject_vcpu(dst_vcpu, &irq);
218 /* cpu must be in stopped state */ 204 if (rc == -EBUSY) {
219 if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
220 *reg &= 0xffffffff00000000UL; 205 *reg &= 0xffffffff00000000UL;
221 *reg |= SIGP_STATUS_INCORRECT_STATE; 206 *reg |= SIGP_STATUS_INCORRECT_STATE;
222 rc = SIGP_CC_STATUS_STORED; 207 return SIGP_CC_STATUS_STORED;
223 goto out_li; 208 } else if (rc == 0) {
209 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x",
210 dst_vcpu->vcpu_id, irq.u.prefix.address);
224 } 211 }
225 212
226 li->irq.prefix.address = address;
227 set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs);
228 kvm_s390_vcpu_wakeup(dst_vcpu);
229 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
230
231 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", dst_vcpu->vcpu_id,
232 address);
233out_li:
234 spin_unlock(&li->lock);
235 return rc; 213 return rc;
236} 214}
237 215
@@ -242,9 +220,7 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
242 int flags; 220 int flags;
243 int rc; 221 int rc;
244 222
245 spin_lock(&dst_vcpu->arch.local_int.lock);
246 flags = atomic_read(dst_vcpu->arch.local_int.cpuflags); 223 flags = atomic_read(dst_vcpu->arch.local_int.cpuflags);
247 spin_unlock(&dst_vcpu->arch.local_int.lock);
248 if (!(flags & CPUSTAT_STOPPED)) { 224 if (!(flags & CPUSTAT_STOPPED)) {
249 *reg &= 0xffffffff00000000UL; 225 *reg &= 0xffffffff00000000UL;
250 *reg |= SIGP_STATUS_INCORRECT_STATE; 226 *reg |= SIGP_STATUS_INCORRECT_STATE;
@@ -291,8 +267,9 @@ static int __prepare_sigp_re_start(struct kvm_vcpu *vcpu,
291 /* handle (RE)START in user space */ 267 /* handle (RE)START in user space */
292 int rc = -EOPNOTSUPP; 268 int rc = -EOPNOTSUPP;
293 269
270 /* make sure we don't race with STOP irq injection */
294 spin_lock(&li->lock); 271 spin_lock(&li->lock);
295 if (li->action_bits & ACTION_STOP_ON_STOP) 272 if (kvm_s390_is_stop_irq_pending(dst_vcpu))
296 rc = SIGP_CC_BUSY; 273 rc = SIGP_CC_BUSY;
297 spin_unlock(&li->lock); 274 spin_unlock(&li->lock);
298 275
@@ -333,7 +310,7 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
333 break; 310 break;
334 case SIGP_EXTERNAL_CALL: 311 case SIGP_EXTERNAL_CALL:
335 vcpu->stat.instruction_sigp_external_call++; 312 vcpu->stat.instruction_sigp_external_call++;
336 rc = __sigp_external_call(vcpu, dst_vcpu); 313 rc = __sigp_external_call(vcpu, dst_vcpu, status_reg);
337 break; 314 break;
338 case SIGP_EMERGENCY_SIGNAL: 315 case SIGP_EMERGENCY_SIGNAL:
339 vcpu->stat.instruction_sigp_emergency++; 316 vcpu->stat.instruction_sigp_emergency++;
@@ -394,6 +371,53 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
394 return rc; 371 return rc;
395} 372}
396 373
374static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code)
375{
376 if (!vcpu->kvm->arch.user_sigp)
377 return 0;
378
379 switch (order_code) {
380 case SIGP_SENSE:
381 case SIGP_EXTERNAL_CALL:
382 case SIGP_EMERGENCY_SIGNAL:
383 case SIGP_COND_EMERGENCY_SIGNAL:
384 case SIGP_SENSE_RUNNING:
385 return 0;
386 /* update counters as we're directly dropping to user space */
387 case SIGP_STOP:
388 vcpu->stat.instruction_sigp_stop++;
389 break;
390 case SIGP_STOP_AND_STORE_STATUS:
391 vcpu->stat.instruction_sigp_stop_store_status++;
392 break;
393 case SIGP_STORE_STATUS_AT_ADDRESS:
394 vcpu->stat.instruction_sigp_store_status++;
395 break;
396 case SIGP_SET_PREFIX:
397 vcpu->stat.instruction_sigp_prefix++;
398 break;
399 case SIGP_START:
400 vcpu->stat.instruction_sigp_start++;
401 break;
402 case SIGP_RESTART:
403 vcpu->stat.instruction_sigp_restart++;
404 break;
405 case SIGP_INITIAL_CPU_RESET:
406 vcpu->stat.instruction_sigp_init_cpu_reset++;
407 break;
408 case SIGP_CPU_RESET:
409 vcpu->stat.instruction_sigp_cpu_reset++;
410 break;
411 default:
412 vcpu->stat.instruction_sigp_unknown++;
413 }
414
415 VCPU_EVENT(vcpu, 4, "sigp order %u: completely handled in user space",
416 order_code);
417
418 return 1;
419}
420
397int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) 421int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
398{ 422{
399 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 423 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
@@ -408,6 +432,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
408 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 432 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
409 433
410 order_code = kvm_s390_get_base_disp_rs(vcpu); 434 order_code = kvm_s390_get_base_disp_rs(vcpu);
435 if (handle_sigp_order_in_user_space(vcpu, order_code))
436 return -EOPNOTSUPP;
411 437
412 if (r1 % 2) 438 if (r1 % 2)
413 parameter = vcpu->run->s.regs.gprs[r1]; 439 parameter = vcpu->run->s.regs.gprs[r1];
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 647e9d6a4818..653a7ec09ef5 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -209,19 +209,21 @@ TRACE_EVENT(kvm_s390_request_resets,
209 * Trace point for a vcpu's stop requests. 209 * Trace point for a vcpu's stop requests.
210 */ 210 */
211TRACE_EVENT(kvm_s390_stop_request, 211TRACE_EVENT(kvm_s390_stop_request,
212 TP_PROTO(unsigned int action_bits), 212 TP_PROTO(unsigned char stop_irq, unsigned char flags),
213 TP_ARGS(action_bits), 213 TP_ARGS(stop_irq, flags),
214 214
215 TP_STRUCT__entry( 215 TP_STRUCT__entry(
216 __field(unsigned int, action_bits) 216 __field(unsigned char, stop_irq)
217 __field(unsigned char, flags)
217 ), 218 ),
218 219
219 TP_fast_assign( 220 TP_fast_assign(
220 __entry->action_bits = action_bits; 221 __entry->stop_irq = stop_irq;
222 __entry->flags = flags;
221 ), 223 ),
222 224
223 TP_printk("stop request, action_bits = %08x", 225 TP_printk("stop request, stop irq = %u, flags = %08x",
224 __entry->action_bits) 226 __entry->stop_irq, __entry->flags)
225 ); 227 );
226 228
227 229
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index eb181178fe0b..57a9d94fe160 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,7 @@ struct x86_emulate_ops {
208 208
209 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 209 void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
210 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 210 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
211 void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
211}; 212};
212 213
213typedef u32 __attribute__((vector_size(16))) sse128_t; 214typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..a236e39cc385 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
38#define KVM_PRIVATE_MEM_SLOTS 3 38#define KVM_PRIVATE_MEM_SLOTS 3
39#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) 39#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
40 40
41#define KVM_MMIO_SIZE 16
42
43#define KVM_PIO_PAGE_OFFSET 1 41#define KVM_PIO_PAGE_OFFSET 1
44#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 42#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
45 43
@@ -51,7 +49,7 @@
51 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 49 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
52 50
53#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL 51#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
54#define CR3_PCID_INVD (1UL << 63) 52#define CR3_PCID_INVD BIT_64(63)
55#define CR4_RESERVED_BITS \ 53#define CR4_RESERVED_BITS \
56 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 54 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
57 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 55 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -160,6 +158,18 @@ enum {
160#define DR7_FIXED_1 0x00000400 158#define DR7_FIXED_1 0x00000400
161#define DR7_VOLATILE 0xffff2bff 159#define DR7_VOLATILE 0xffff2bff
162 160
161#define PFERR_PRESENT_BIT 0
162#define PFERR_WRITE_BIT 1
163#define PFERR_USER_BIT 2
164#define PFERR_RSVD_BIT 3
165#define PFERR_FETCH_BIT 4
166
167#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
168#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
169#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
170#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
171#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
172
163/* apic attention bits */ 173/* apic attention bits */
164#define KVM_APIC_CHECK_VAPIC 0 174#define KVM_APIC_CHECK_VAPIC 0
165/* 175/*
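
The PFERR_* masks spell out the x86 page fault error code layout: bit 0 present, bit 1 write, bit 2 user, bit 3 reserved-bit violation, bit 4 instruction fetch. A small decoder using the same bit positions:

    #include <stdint.h>
    #include <stdio.h>

    #define PFERR_PRESENT_MASK (1U << 0)
    #define PFERR_WRITE_MASK   (1U << 1)
    #define PFERR_USER_MASK    (1U << 2)
    #define PFERR_RSVD_MASK    (1U << 3)
    #define PFERR_FETCH_MASK   (1U << 4)

    int main(void)
    {
            /* a user-mode write to a present (e.g. read-only) translation */
            uint32_t err = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_USER_MASK;

            printf("present=%d write=%d user=%d rsvd=%d fetch=%d\n",
                   !!(err & PFERR_PRESENT_MASK), !!(err & PFERR_WRITE_MASK),
                   !!(err & PFERR_USER_MASK), !!(err & PFERR_RSVD_MASK),
                   !!(err & PFERR_FETCH_MASK));
            return 0;
    }
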
@@ -615,6 +625,8 @@ struct kvm_arch {
615 #ifdef CONFIG_KVM_MMU_AUDIT 625 #ifdef CONFIG_KVM_MMU_AUDIT
616 int audit_point; 626 int audit_point;
617 #endif 627 #endif
628
629 bool boot_vcpu_runs_old_kvmclock;
618}; 630};
619 631
620struct kvm_vm_stat { 632struct kvm_vm_stat {
@@ -643,6 +655,7 @@ struct kvm_vcpu_stat {
643 u32 irq_window_exits; 655 u32 irq_window_exits;
644 u32 nmi_window_exits; 656 u32 nmi_window_exits;
645 u32 halt_exits; 657 u32 halt_exits;
658 u32 halt_successful_poll;
646 u32 halt_wakeup; 659 u32 halt_wakeup;
647 u32 request_irq_exits; 660 u32 request_irq_exits;
648 u32 irq_exits; 661 u32 irq_exits;
@@ -787,6 +800,31 @@ struct kvm_x86_ops {
787 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); 800 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
788 801
789 void (*sched_in)(struct kvm_vcpu *kvm, int cpu); 802 void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
803
804 /*
805 * Arch-specific dirty logging hooks. These hooks are only supposed to
 806	 * be valid if the specific arch has a hardware-accelerated dirty logging
807 * mechanism. Currently only for PML on VMX.
808 *
809 * - slot_enable_log_dirty:
810 * called when enabling log dirty mode for the slot.
811 * - slot_disable_log_dirty:
812 * called when disabling log dirty mode for the slot.
813 * also called when slot is created with log dirty disabled.
814 * - flush_log_dirty:
815 * called before reporting dirty_bitmap to userspace.
816 * - enable_log_dirty_pt_masked:
817 * called when reenabling log dirty for the GFNs in the mask after
818 * corresponding bits are cleared in slot->dirty_bitmap.
819 */
820 void (*slot_enable_log_dirty)(struct kvm *kvm,
821 struct kvm_memory_slot *slot);
822 void (*slot_disable_log_dirty)(struct kvm *kvm,
823 struct kvm_memory_slot *slot);
824 void (*flush_log_dirty)(struct kvm *kvm);
825 void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
826 struct kvm_memory_slot *slot,
827 gfn_t offset, unsigned long mask);
790}; 828};
791 829
792struct kvm_arch_async_pf { 830struct kvm_arch_async_pf {
@@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
819 u64 dirty_mask, u64 nx_mask, u64 x_mask); 857 u64 dirty_mask, u64 nx_mask, u64 x_mask);
820 858
821void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 859void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
822void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 860void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
823void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 861 struct kvm_memory_slot *memslot);
824 struct kvm_memory_slot *slot, 862void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
825 gfn_t gfn_offset, unsigned long mask); 863 struct kvm_memory_slot *memslot);
864void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
865 struct kvm_memory_slot *memslot);
866void kvm_mmu_slot_set_dirty(struct kvm *kvm,
867 struct kvm_memory_slot *memslot);
868void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
869 struct kvm_memory_slot *slot,
870 gfn_t gfn_offset, unsigned long mask);
826void kvm_mmu_zap_all(struct kvm *kvm); 871void kvm_mmu_zap_all(struct kvm *kvm);
827void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); 872void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
828unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 873unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
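
The new dirty-logging hooks split responsibility: generic code harvests dirty bits and then hands the arch a 64-page window (a gfn_offset plus a bitmask) through enable_log_dirty_pt_masked so it can re-arm tracking for exactly those pages, either by write protection or, with PML on VMX, by clearing EPT D-bits. A sketch of a typical masked walk; the function names are invented and only illustrate the mask semantics:

    #include <stdint.h>

    /* re-arm dirty tracking for every GFN whose bit is set in mask;
     * gfn_offset is the first GFN of the 64-page window within the slot */
    static void rearm_dirty_masked(uint64_t slot_base_gfn, uint64_t gfn_offset,
                                   uint64_t mask, void (*rearm_one)(uint64_t gfn))
    {
            while (mask) {
                    int bit = __builtin_ctzll(mask);        /* lowest set bit */

                    rearm_one(slot_base_gfn + gfn_offset + bit);
                    mask &= mask - 1;                       /* clear it       */
            }
    }
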
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 45afaee9555c..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
69#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 69#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
70#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 70#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
71#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 71#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
72#define SECONDARY_EXEC_ENABLE_PML 0x00020000
72#define SECONDARY_EXEC_XSAVES 0x00100000 73#define SECONDARY_EXEC_XSAVES 0x00100000
73 74
74 75
@@ -121,6 +122,7 @@ enum vmcs_field {
121 GUEST_LDTR_SELECTOR = 0x0000080c, 122 GUEST_LDTR_SELECTOR = 0x0000080c,
122 GUEST_TR_SELECTOR = 0x0000080e, 123 GUEST_TR_SELECTOR = 0x0000080e,
123 GUEST_INTR_STATUS = 0x00000810, 124 GUEST_INTR_STATUS = 0x00000810,
125 GUEST_PML_INDEX = 0x00000812,
124 HOST_ES_SELECTOR = 0x00000c00, 126 HOST_ES_SELECTOR = 0x00000c00,
125 HOST_CS_SELECTOR = 0x00000c02, 127 HOST_CS_SELECTOR = 0x00000c02,
126 HOST_SS_SELECTOR = 0x00000c04, 128 HOST_SS_SELECTOR = 0x00000c04,
@@ -140,6 +142,8 @@ enum vmcs_field {
140 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, 142 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
141 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, 143 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
142 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, 144 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
145 PML_ADDRESS = 0x0000200e,
146 PML_ADDRESS_HIGH = 0x0000200f,
143 TSC_OFFSET = 0x00002010, 147 TSC_OFFSET = 0x00002010,
144 TSC_OFFSET_HIGH = 0x00002011, 148 TSC_OFFSET_HIGH = 0x00002011,
145 VIRTUAL_APIC_PAGE_ADDR = 0x00002012, 149 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 536240fa9a95..3ce079136c11 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -364,6 +364,9 @@
364#define MSR_IA32_UCODE_WRITE 0x00000079 364#define MSR_IA32_UCODE_WRITE 0x00000079
365#define MSR_IA32_UCODE_REV 0x0000008b 365#define MSR_IA32_UCODE_REV 0x0000008b
366 366
367#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
368#define MSR_IA32_SMBASE 0x0000009e
369
367#define MSR_IA32_PERF_STATUS 0x00000198 370#define MSR_IA32_PERF_STATUS 0x00000198
368#define MSR_IA32_PERF_CTL 0x00000199 371#define MSR_IA32_PERF_CTL 0x00000199
369#define INTEL_PERF_CTL_MASK 0xffff 372#define INTEL_PERF_CTL_MASK 0xffff
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b813bf9da1e2..c5f1a1deb91a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
56#define EXIT_REASON_MSR_READ 31 56#define EXIT_REASON_MSR_READ 31
57#define EXIT_REASON_MSR_WRITE 32 57#define EXIT_REASON_MSR_WRITE 32
58#define EXIT_REASON_INVALID_STATE 33 58#define EXIT_REASON_INVALID_STATE 33
59#define EXIT_REASON_MSR_LOAD_FAIL 34
59#define EXIT_REASON_MWAIT_INSTRUCTION 36 60#define EXIT_REASON_MWAIT_INSTRUCTION 36
60#define EXIT_REASON_MONITOR_INSTRUCTION 39 61#define EXIT_REASON_MONITOR_INSTRUCTION 39
61#define EXIT_REASON_PAUSE_INSTRUCTION 40 62#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -72,6 +73,7 @@
72#define EXIT_REASON_XSETBV 55 73#define EXIT_REASON_XSETBV 55
73#define EXIT_REASON_APIC_WRITE 56 74#define EXIT_REASON_APIC_WRITE 56
74#define EXIT_REASON_INVPCID 58 75#define EXIT_REASON_INVPCID 58
76#define EXIT_REASON_PML_FULL 62
75#define EXIT_REASON_XSAVES 63 77#define EXIT_REASON_XSAVES 63
76#define EXIT_REASON_XRSTORS 64 78#define EXIT_REASON_XRSTORS 64
77 79
@@ -116,10 +118,14 @@
116 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ 118 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
117 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 119 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
118 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 120 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
121 { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
119 { EXIT_REASON_INVD, "INVD" }, \ 122 { EXIT_REASON_INVD, "INVD" }, \
120 { EXIT_REASON_INVVPID, "INVVPID" }, \ 123 { EXIT_REASON_INVVPID, "INVVPID" }, \
121 { EXIT_REASON_INVPCID, "INVPCID" }, \ 124 { EXIT_REASON_INVPCID, "INVPCID" }, \
122 { EXIT_REASON_XSAVES, "XSAVES" }, \ 125 { EXIT_REASON_XSAVES, "XSAVES" }, \
123 { EXIT_REASON_XRSTORS, "XRSTORS" } 126 { EXIT_REASON_XRSTORS, "XRSTORS" }
124 127
128#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
129#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
130
125#endif /* _UAPIVMX_H */ 131#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7dc7ba577ecd..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,6 +39,7 @@ config KVM
39 select PERF_EVENTS 39 select PERF_EVENTS
40 select HAVE_KVM_MSI 40 select HAVE_KVM_MSI
41 select HAVE_KVM_CPU_RELAX_INTERCEPT 41 select HAVE_KVM_CPU_RELAX_INTERCEPT
42 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
42 select KVM_VFIO 43 select KVM_VFIO
43 select SRCU 44 select SRCU
44 ---help--- 45 ---help---
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de12c1d379f1..e0b794a84c35 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
86#define DstAcc (OpAcc << DstShift) 86#define DstAcc (OpAcc << DstShift)
87#define DstDI (OpDI << DstShift) 87#define DstDI (OpDI << DstShift)
88#define DstMem64 (OpMem64 << DstShift) 88#define DstMem64 (OpMem64 << DstShift)
89#define DstMem16 (OpMem16 << DstShift)
89#define DstImmUByte (OpImmUByte << DstShift) 90#define DstImmUByte (OpImmUByte << DstShift)
90#define DstDX (OpDX << DstShift) 91#define DstDX (OpDX << DstShift)
91#define DstAccLo (OpAccLo << DstShift) 92#define DstAccLo (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
124#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 125#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
125#define Escape (5<<15) /* Escape to coprocessor instruction */ 126#define Escape (5<<15) /* Escape to coprocessor instruction */
126#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ 127#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
128#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
127#define Sse (1<<18) /* SSE Vector instruction */ 129#define Sse (1<<18) /* SSE Vector instruction */
128/* Generic ModRM decode. */ 130/* Generic ModRM decode. */
129#define ModRM (1<<19) 131#define ModRM (1<<19)
@@ -165,10 +167,10 @@
165#define NoMod ((u64)1 << 47) /* Mod field is ignored */ 167#define NoMod ((u64)1 << 47) /* Mod field is ignored */
166#define Intercept ((u64)1 << 48) /* Has valid intercept field */ 168#define Intercept ((u64)1 << 48) /* Has valid intercept field */
167#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ 169#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
168#define NoBigReal ((u64)1 << 50) /* No big real mode */
169#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ 170#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
170#define NearBranch ((u64)1 << 52) /* Near branches */ 171#define NearBranch ((u64)1 << 52) /* Near branches */
171#define No16 ((u64)1 << 53) /* No 16 bit operand */ 172#define No16 ((u64)1 << 53) /* No 16 bit operand */
173#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
172 174
173#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) 175#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
174 176
@@ -213,6 +215,7 @@ struct opcode {
213 const struct gprefix *gprefix; 215 const struct gprefix *gprefix;
214 const struct escape *esc; 216 const struct escape *esc;
215 const struct instr_dual *idual; 217 const struct instr_dual *idual;
218 const struct mode_dual *mdual;
216 void (*fastop)(struct fastop *fake); 219 void (*fastop)(struct fastop *fake);
217 } u; 220 } u;
218 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 221 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
240 struct opcode mod3; 243 struct opcode mod3;
241}; 244};
242 245
246struct mode_dual {
247 struct opcode mode32;
248 struct opcode mode64;
249};
250
243/* EFLAGS bit definitions. */ 251/* EFLAGS bit definitions. */
244#define EFLG_ID (1<<21) 252#define EFLG_ID (1<<21)
245#define EFLG_VIP (1<<20) 253#define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
262#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a 270#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
263#define EFLG_RESERVED_ONE_MASK 2 271#define EFLG_RESERVED_ONE_MASK 2
264 272
273enum x86_transfer_type {
274 X86_TRANSFER_NONE,
275 X86_TRANSFER_CALL_JMP,
276 X86_TRANSFER_RET,
277 X86_TRANSFER_TASK_SWITCH,
278};
279
265static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) 280static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
266{ 281{
267 if (!(ctxt->regs_valid & (1 << nr))) { 282 if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
669 } 684 }
670 if (addr.ea > lim) 685 if (addr.ea > lim)
671 goto bad; 686 goto bad;
672 *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); 687 if (lim == 0xffffffff)
673 if (size > *max_size) 688 *max_size = ~0u;
674 goto bad; 689 else {
690 *max_size = (u64)lim + 1 - addr.ea;
691 if (size > *max_size)
692 goto bad;
693 }
675 la &= (u32)-1; 694 la &= (u32)-1;
676 break; 695 break;
677 } 696 }
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
722 const struct desc_struct *cs_desc) 741 const struct desc_struct *cs_desc)
723{ 742{
724 enum x86emul_mode mode = ctxt->mode; 743 enum x86emul_mode mode = ctxt->mode;
744 int rc;
725 745
726#ifdef CONFIG_X86_64 746#ifdef CONFIG_X86_64
727 if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { 747 if (ctxt->mode >= X86EMUL_MODE_PROT16) {
728 u64 efer = 0; 748 if (cs_desc->l) {
749 u64 efer = 0;
729 750
730 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 751 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
731 if (efer & EFER_LMA) 752 if (efer & EFER_LMA)
732 mode = X86EMUL_MODE_PROT64; 753 mode = X86EMUL_MODE_PROT64;
754 } else
755 mode = X86EMUL_MODE_PROT32; /* temporary value */
733 } 756 }
734#endif 757#endif
735 if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) 758 if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
736 mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 759 mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
737 return assign_eip(ctxt, dst, mode); 760 rc = assign_eip(ctxt, dst, mode);
761 if (rc == X86EMUL_CONTINUE)
762 ctxt->mode = mode;
763 return rc;
738} 764}
739 765
740static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 766static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
1057 asm volatile("fnstcw %0": "+m"(fcw)); 1083 asm volatile("fnstcw %0": "+m"(fcw));
1058 ctxt->ops->put_fpu(ctxt); 1084 ctxt->ops->put_fpu(ctxt);
1059 1085
1060 /* force 2 byte destination */
1061 ctxt->dst.bytes = 2;
1062 ctxt->dst.val = fcw; 1086 ctxt->dst.val = fcw;
1063 1087
1064 return X86EMUL_CONTINUE; 1088 return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
1075 asm volatile("fnstsw %0": "+m"(fsw)); 1099 asm volatile("fnstsw %0": "+m"(fsw));
1076 ctxt->ops->put_fpu(ctxt); 1100 ctxt->ops->put_fpu(ctxt);
1077 1101
1078 /* force 2 byte destination */
1079 ctxt->dst.bytes = 2;
1080 ctxt->dst.val = fsw; 1102 ctxt->dst.val = fsw;
1081 1103
1082 return X86EMUL_CONTINUE; 1104 return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1223 else { 1245 else {
1224 modrm_ea += reg_read(ctxt, base_reg); 1246 modrm_ea += reg_read(ctxt, base_reg);
1225 adjust_modrm_seg(ctxt, base_reg); 1247 adjust_modrm_seg(ctxt, base_reg);
1248 /* Increment ESP on POP [ESP] */
1249 if ((ctxt->d & IncSP) &&
1250 base_reg == VCPU_REGS_RSP)
1251 modrm_ea += ctxt->op_bytes;
1226 } 1252 }
1227 if (index_reg != 4) 1253 if (index_reg != 4)
1228 modrm_ea += reg_read(ctxt, index_reg) << scale; 1254 modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1435 ops->get_gdt(ctxt, dt); 1461 ops->get_gdt(ctxt, dt);
1436} 1462}
1437 1463
1438/* allowed just for 8 bytes segments */ 1464static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
1439static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1465 u16 selector, ulong *desc_addr_p)
1440 u16 selector, struct desc_struct *desc,
1441 ulong *desc_addr_p)
1442{ 1466{
1443 struct desc_ptr dt; 1467 struct desc_ptr dt;
1444 u16 index = selector >> 3; 1468 u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1449 if (dt.size < index * 8 + 7) 1473 if (dt.size < index * 8 + 7)
1450 return emulate_gp(ctxt, selector & 0xfffc); 1474 return emulate_gp(ctxt, selector & 0xfffc);
1451 1475
1452 *desc_addr_p = addr = dt.address + index * 8; 1476 addr = dt.address + index * 8;
1453 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1477
1478#ifdef CONFIG_X86_64
1479 if (addr >> 32 != 0) {
1480 u64 efer = 0;
1481
1482 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
1483 if (!(efer & EFER_LMA))
1484 addr &= (u32)-1;
1485 }
1486#endif
1487
1488 *desc_addr_p = addr;
1489 return X86EMUL_CONTINUE;
1490}
1491
1492/* allowed just for 8 bytes segments */
1493static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1494 u16 selector, struct desc_struct *desc,
1495 ulong *desc_addr_p)
1496{
1497 int rc;
1498
1499 rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
1500 if (rc != X86EMUL_CONTINUE)
1501 return rc;
1502
1503 return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
1454 &ctxt->exception); 1504 &ctxt->exception);
1455} 1505}
1456 1506
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1458static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1508static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1459 u16 selector, struct desc_struct *desc) 1509 u16 selector, struct desc_struct *desc)
1460{ 1510{
1461 struct desc_ptr dt; 1511 int rc;
1462 u16 index = selector >> 3;
1463 ulong addr; 1512 ulong addr;
1464 1513
1465 get_descriptor_table_ptr(ctxt, selector, &dt); 1514 rc = get_descriptor_ptr(ctxt, selector, &addr);
1466 1515 if (rc != X86EMUL_CONTINUE)
1467 if (dt.size < index * 8 + 7) 1516 return rc;
1468 return emulate_gp(ctxt, selector & 0xfffc);
1469 1517
1470 addr = dt.address + index * 8;
1471 return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, 1518 return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
1472 &ctxt->exception); 1519 &ctxt->exception);
1473} 1520}
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1475/* Does not support long mode */ 1522/* Does not support long mode */
1476static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1523static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1477 u16 selector, int seg, u8 cpl, 1524 u16 selector, int seg, u8 cpl,
1478 bool in_task_switch, 1525 enum x86_transfer_type transfer,
1479 struct desc_struct *desc) 1526 struct desc_struct *desc)
1480{ 1527{
1481 struct desc_struct seg_desc, old_desc; 1528 struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1529 return ret; 1576 return ret;
1530 1577
1531 err_code = selector & 0xfffc; 1578 err_code = selector & 0xfffc;
1532 err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; 1579 err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
1580 GP_VECTOR;
1533 1581
1534 /* can't load system descriptor into segment selector */ 1582 /* can't load system descriptor into segment selector */
1535 if (seg <= VCPU_SREG_GS && !seg_desc.s) 1583 if (seg <= VCPU_SREG_GS && !seg_desc.s) {
1584 if (transfer == X86_TRANSFER_CALL_JMP)
1585 return X86EMUL_UNHANDLEABLE;
1536 goto exception; 1586 goto exception;
1587 }
1537 1588
1538 if (!seg_desc.p) { 1589 if (!seg_desc.p) {
1539 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; 1590 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1605 1656
1606 if (seg_desc.s) { 1657 if (seg_desc.s) {
1607 /* mark segment as accessed */ 1658 /* mark segment as accessed */
1608 seg_desc.type |= 1; 1659 if (!(seg_desc.type & 1)) {
1609 ret = write_segment_descriptor(ctxt, selector, &seg_desc); 1660 seg_desc.type |= 1;
1610 if (ret != X86EMUL_CONTINUE) 1661 ret = write_segment_descriptor(ctxt, selector,
1611 return ret; 1662 &seg_desc);
1663 if (ret != X86EMUL_CONTINUE)
1664 return ret;
1665 }
1612 } else if (ctxt->mode == X86EMUL_MODE_PROT64) { 1666 } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
1613 ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, 1667 ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
1614 sizeof(base3), &ctxt->exception); 1668 sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1631 u16 selector, int seg) 1685 u16 selector, int seg)
1632{ 1686{
1633 u8 cpl = ctxt->ops->cpl(ctxt); 1687 u8 cpl = ctxt->ops->cpl(ctxt);
1634 return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); 1688 return __load_segment_descriptor(ctxt, selector, seg, cpl,
1689 X86_TRANSFER_NONE, NULL);
1635} 1690}
1636 1691
1637static void write_register_operand(struct operand *op) 1692static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
1828 unsigned long selector; 1883 unsigned long selector;
1829 int rc; 1884 int rc;
1830 1885
1831 rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); 1886 rc = emulate_pop(ctxt, &selector, 2);
1832 if (rc != X86EMUL_CONTINUE) 1887 if (rc != X86EMUL_CONTINUE)
1833 return rc; 1888 return rc;
1834 1889
1835 if (ctxt->modrm_reg == VCPU_SREG_SS) 1890 if (ctxt->modrm_reg == VCPU_SREG_SS)
1836 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; 1891 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
1892 if (ctxt->op_bytes > 2)
1893 rsp_increment(ctxt, ctxt->op_bytes - 2);
1837 1894
1838 rc = load_segment_descriptor(ctxt, (u16)selector, seg); 1895 rc = load_segment_descriptor(ctxt, (u16)selector, seg);
1839 return rc; 1896 return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
2007 2064
2008 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ 2065 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
2009 ctxt->eflags |= EFLG_RESERVED_ONE_MASK; 2066 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
2067 ctxt->ops->set_nmi_mask(ctxt, false);
2010 2068
2011 return rc; 2069 return rc;
2012} 2070}
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
2041 2099
2042 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); 2100 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
2043 2101
2044 rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, 2102 rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
2103 X86_TRANSFER_CALL_JMP,
2045 &new_desc); 2104 &new_desc);
2046 if (rc != X86EMUL_CONTINUE) 2105 if (rc != X86EMUL_CONTINUE)
2047 return rc; 2106 return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
2130 /* Outer-privilege level return is not implemented */ 2189 /* Outer-privilege level return is not implemented */
2131 if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) 2190 if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
2132 return X86EMUL_UNHANDLEABLE; 2191 return X86EMUL_UNHANDLEABLE;
2133 rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, 2192 rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
2193 X86_TRANSFER_RET,
2134 &new_desc); 2194 &new_desc);
2135 if (rc != X86EMUL_CONTINUE) 2195 if (rc != X86EMUL_CONTINUE)
2136 return rc; 2196 return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2163 fastop(ctxt, em_cmp); 2223 fastop(ctxt, em_cmp);
2164 2224
2165 if (ctxt->eflags & EFLG_ZF) { 2225 if (ctxt->eflags & EFLG_ZF) {
2166 /* Success: write back to memory. */ 2226 /* Success: write back to memory; no update of EAX */
2227 ctxt->src.type = OP_NONE;
2167 ctxt->dst.val = ctxt->src.orig_val; 2228 ctxt->dst.val = ctxt->src.orig_val;
2168 } else { 2229 } else {
2169 /* Failure: write the value we saw to EAX. */ 2230 /* Failure: write the value we saw to EAX. */
2170 ctxt->dst.type = OP_REG; 2231 ctxt->src.type = OP_REG;
2171 ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); 2232 ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
2233 ctxt->src.val = ctxt->dst.orig_val;
2234 /* Create write-cycle to dest by writing the same value */
2172 ctxt->dst.val = ctxt->dst.orig_val; 2235 ctxt->dst.val = ctxt->dst.orig_val;
2173 } 2236 }
2174 return X86EMUL_CONTINUE; 2237 return X86EMUL_CONTINUE;
@@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2556 * it is handled in a context of new task 2619 * it is handled in a context of new task
2557 */ 2620 */
2558 ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, 2621 ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
2559 true, NULL); 2622 X86_TRANSFER_TASK_SWITCH, NULL);
2560 if (ret != X86EMUL_CONTINUE) 2623 if (ret != X86EMUL_CONTINUE)
2561 return ret; 2624 return ret;
2562 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, 2625 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
2563 true, NULL); 2626 X86_TRANSFER_TASK_SWITCH, NULL);
2564 if (ret != X86EMUL_CONTINUE) 2627 if (ret != X86EMUL_CONTINUE)
2565 return ret; 2628 return ret;
2566 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, 2629 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
2567 true, NULL); 2630 X86_TRANSFER_TASK_SWITCH, NULL);
2568 if (ret != X86EMUL_CONTINUE) 2631 if (ret != X86EMUL_CONTINUE)
2569 return ret; 2632 return ret;
2570 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, 2633 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
2571 true, NULL); 2634 X86_TRANSFER_TASK_SWITCH, NULL);
2572 if (ret != X86EMUL_CONTINUE) 2635 if (ret != X86EMUL_CONTINUE)
2573 return ret; 2636 return ret;
2574 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, 2637 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
2575 true, NULL); 2638 X86_TRANSFER_TASK_SWITCH, NULL);
2576 if (ret != X86EMUL_CONTINUE) 2639 if (ret != X86EMUL_CONTINUE)
2577 return ret; 2640 return ret;
2578 2641
@@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2694 * it is handled in a context of new task 2757 * it is handled in a context of new task
2695 */ 2758 */
2696 ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, 2759 ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
2697 cpl, true, NULL); 2760 cpl, X86_TRANSFER_TASK_SWITCH, NULL);
2698 if (ret != X86EMUL_CONTINUE) 2761 if (ret != X86EMUL_CONTINUE)
2699 return ret; 2762 return ret;
2700 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, 2763 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
2701 true, NULL); 2764 X86_TRANSFER_TASK_SWITCH, NULL);
2702 if (ret != X86EMUL_CONTINUE) 2765 if (ret != X86EMUL_CONTINUE)
2703 return ret; 2766 return ret;
2704 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, 2767 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
2705 true, NULL); 2768 X86_TRANSFER_TASK_SWITCH, NULL);
2706 if (ret != X86EMUL_CONTINUE) 2769 if (ret != X86EMUL_CONTINUE)
2707 return ret; 2770 return ret;
2708 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, 2771 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
2709 true, NULL); 2772 X86_TRANSFER_TASK_SWITCH, NULL);
2710 if (ret != X86EMUL_CONTINUE) 2773 if (ret != X86EMUL_CONTINUE)
2711 return ret; 2774 return ret;
2712 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, 2775 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
2713 true, NULL); 2776 X86_TRANSFER_TASK_SWITCH, NULL);
2714 if (ret != X86EMUL_CONTINUE) 2777 if (ret != X86EMUL_CONTINUE)
2715 return ret; 2778 return ret;
2716 ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, 2779 ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
2717 true, NULL); 2780 X86_TRANSFER_TASK_SWITCH, NULL);
2718 if (ret != X86EMUL_CONTINUE) 2781 if (ret != X86EMUL_CONTINUE)
2719 return ret; 2782 return ret;
2720 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, 2783 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
2721 true, NULL); 2784 X86_TRANSFER_TASK_SWITCH, NULL);
2722 if (ret != X86EMUL_CONTINUE) 2785 if (ret != X86EMUL_CONTINUE)
2723 return ret; 2786 return ret;
2724 2787
@@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2739 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2802 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2740 &ctxt->exception); 2803 &ctxt->exception);
2741 if (ret != X86EMUL_CONTINUE) 2804 if (ret != X86EMUL_CONTINUE)
2742 /* FIXME: need to provide precise fault address */
2743 return ret; 2805 return ret;
2744 2806
2745 save_state_to_tss32(ctxt, &tss_seg); 2807 save_state_to_tss32(ctxt, &tss_seg);
@@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2748 ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, 2810 ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
2749 ldt_sel_offset - eip_offset, &ctxt->exception); 2811 ldt_sel_offset - eip_offset, &ctxt->exception);
2750 if (ret != X86EMUL_CONTINUE) 2812 if (ret != X86EMUL_CONTINUE)
2751 /* FIXME: need to provide precise fault address */
2752 return ret; 2813 return ret;
2753 2814
2754 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, 2815 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2755 &ctxt->exception); 2816 &ctxt->exception);
2756 if (ret != X86EMUL_CONTINUE) 2817 if (ret != X86EMUL_CONTINUE)
2757 /* FIXME: need to provide precise fault address */
2758 return ret; 2818 return ret;
2759 2819
2760 if (old_tss_sel != 0xffff) { 2820 if (old_tss_sel != 0xffff) {
@@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2765 sizeof tss_seg.prev_task_link, 2825 sizeof tss_seg.prev_task_link,
2766 &ctxt->exception); 2826 &ctxt->exception);
2767 if (ret != X86EMUL_CONTINUE) 2827 if (ret != X86EMUL_CONTINUE)
2768 /* FIXME: need to provide precise fault address */
2769 return ret; 2828 return ret;
2770 } 2829 }
2771 2830
@@ -2999,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
2999 struct desc_struct old_desc, new_desc; 3058 struct desc_struct old_desc, new_desc;
3000 const struct x86_emulate_ops *ops = ctxt->ops; 3059 const struct x86_emulate_ops *ops = ctxt->ops;
3001 int cpl = ctxt->ops->cpl(ctxt); 3060 int cpl = ctxt->ops->cpl(ctxt);
3061 enum x86emul_mode prev_mode = ctxt->mode;
3002 3062
3003 old_eip = ctxt->_eip; 3063 old_eip = ctxt->_eip;
3004 ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); 3064 ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
3005 3065
3006 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); 3066 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
3007 rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, 3067 rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
3008 &new_desc); 3068 X86_TRANSFER_CALL_JMP, &new_desc);
3009 if (rc != X86EMUL_CONTINUE) 3069 if (rc != X86EMUL_CONTINUE)
3010 return X86EMUL_CONTINUE; 3070 return rc;
3011 3071
3012 rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); 3072 rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
3013 if (rc != X86EMUL_CONTINUE) 3073 if (rc != X86EMUL_CONTINUE)
@@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
3022 rc = em_push(ctxt); 3082 rc = em_push(ctxt);
3023 /* If we failed, we tainted the memory, but the very least we should 3083 /* If we failed, we tainted the memory, but the very least we should
3024 restore cs */ 3084 restore cs */
3025 if (rc != X86EMUL_CONTINUE) 3085 if (rc != X86EMUL_CONTINUE) {
3086 pr_warn_once("faulting far call emulation tainted memory\n");
3026 goto fail; 3087 goto fail;
3088 }
3027 return rc; 3089 return rc;
3028fail: 3090fail:
3029 ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); 3091 ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
3092 ctxt->mode = prev_mode;
3030 return rc; 3093 return rc;
3031 3094
3032} 3095}
@@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
3477 return X86EMUL_CONTINUE; 3540 return X86EMUL_CONTINUE;
3478} 3541}
3479 3542
3543static int em_movsxd(struct x86_emulate_ctxt *ctxt)
3544{
3545 ctxt->dst.val = (s32) ctxt->src.val;
3546 return X86EMUL_CONTINUE;
3547}
3548
3480static bool valid_cr(int nr) 3549static bool valid_cr(int nr)
3481{ 3550{
3482 switch (nr) { 3551 switch (nr) {
@@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3676#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3745#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3677#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3746#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3678#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } 3747#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
3748#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
3679#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } 3749#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
3680#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3750#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3681#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } 3751#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3738,7 +3808,7 @@ static const struct opcode group1[] = {
3738}; 3808};
3739 3809
3740static const struct opcode group1A[] = { 3810static const struct opcode group1A[] = {
3741 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3811 I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
3742}; 3812};
3743 3813
3744static const struct opcode group2[] = { 3814static const struct opcode group2[] = {
@@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
3854}; 3924};
3855 3925
3856static const struct escape escape_d9 = { { 3926static const struct escape escape_d9 = { {
3857 N, N, N, N, N, N, N, I(DstMem, em_fnstcw), 3927 N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
3858}, { 3928}, {
3859 /* 0xC0 - 0xC7 */ 3929 /* 0xC0 - 0xC7 */
3860 N, N, N, N, N, N, N, N, 3930 N, N, N, N, N, N, N, N,
@@ -3896,7 +3966,7 @@ static const struct escape escape_db = { {
3896} }; 3966} };
3897 3967
3898static const struct escape escape_dd = { { 3968static const struct escape escape_dd = { {
3899 N, N, N, N, N, N, N, I(DstMem, em_fnstsw), 3969 N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
3900}, { 3970}, {
3901 /* 0xC0 - 0xC7 */ 3971 /* 0xC0 - 0xC7 */
3902 N, N, N, N, N, N, N, N, 3972 N, N, N, N, N, N, N, N,
@@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
3920 I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N 3990 I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
3921}; 3991};
3922 3992
3993static const struct mode_dual mode_dual_63 = {
3994 N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
3995};
3996
3923static const struct opcode opcode_table[256] = { 3997static const struct opcode opcode_table[256] = {
3924 /* 0x00 - 0x07 */ 3998 /* 0x00 - 0x07 */
3925 F6ALU(Lock, em_add), 3999 F6ALU(Lock, em_add),
@@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
3954 /* 0x60 - 0x67 */ 4028 /* 0x60 - 0x67 */
3955 I(ImplicitOps | Stack | No64, em_pusha), 4029 I(ImplicitOps | Stack | No64, em_pusha),
3956 I(ImplicitOps | Stack | No64, em_popa), 4030 I(ImplicitOps | Stack | No64, em_popa),
3957 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , 4031 N, MD(ModRM, &mode_dual_63),
3958 N, N, N, N, 4032 N, N, N, N,
3959 /* 0x68 - 0x6F */ 4033 /* 0x68 - 0x6F */
3960 I(SrcImm | Mov | Stack, em_push), 4034 I(SrcImm | Mov | Stack, em_push),
@@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
4010 G(ByteOp, group11), G(0, group11), 4084 G(ByteOp, group11), G(0, group11),
4011 /* 0xC8 - 0xCF */ 4085 /* 0xC8 - 0xCF */
4012 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), 4086 I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
4013 I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), 4087 I(ImplicitOps | SrcImmU16, em_ret_far_imm),
4014 I(ImplicitOps | Stack, em_ret_far), 4088 I(ImplicitOps, em_ret_far),
4015 D(ImplicitOps), DI(SrcImmByte, intn), 4089 D(ImplicitOps), DI(SrcImmByte, intn),
4016 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 4090 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
4017 /* 0xD0 - 0xD7 */ 4091 /* 0xD0 - 0xD7 */
@@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
4108 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 4182 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
4109 GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), 4183 GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
4110 /* 0xB0 - 0xB7 */ 4184 /* 0xB0 - 0xB7 */
4111 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), 4185 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
4112 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), 4186 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
4113 F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), 4187 F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
4114 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 4188 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
4174#undef I 4248#undef I
4175#undef GP 4249#undef GP
4176#undef EXT 4250#undef EXT
4251#undef MD
4252#undef ID
4177 4253
4178#undef D2bv 4254#undef D2bv
4179#undef D2bvIP 4255#undef D2bvIP
@@ -4563,6 +4639,12 @@ done_prefixes:
4563 else 4639 else
4564 opcode = opcode.u.idual->mod012; 4640 opcode = opcode.u.idual->mod012;
4565 break; 4641 break;
4642 case ModeDual:
4643 if (ctxt->mode == X86EMUL_MODE_PROT64)
4644 opcode = opcode.u.mdual->mode64;
4645 else
4646 opcode = opcode.u.mdual->mode32;
4647 break;
4566 default: 4648 default:
4567 return EMULATION_FAILED; 4649 return EMULATION_FAILED;
4568 } 4650 }
@@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4860 /* optimisation - avoid slow emulated read if Mov */ 4942 /* optimisation - avoid slow emulated read if Mov */
4861 rc = segmented_read(ctxt, ctxt->dst.addr.mem, 4943 rc = segmented_read(ctxt, ctxt->dst.addr.mem,
4862 &ctxt->dst.val, ctxt->dst.bytes); 4944 &ctxt->dst.val, ctxt->dst.bytes);
4863 if (rc != X86EMUL_CONTINUE) 4945 if (rc != X86EMUL_CONTINUE) {
4946 if (!(ctxt->d & NoWrite) &&
4947 rc == X86EMUL_PROPAGATE_FAULT &&
4948 ctxt->exception.vector == PF_VECTOR)
4949 ctxt->exception.error_code |= PFERR_WRITE_MASK;
4864 goto done; 4950 goto done;
4951 }
4865 } 4952 }
4866 ctxt->dst.orig_val = ctxt->dst.val; 4953 ctxt->dst.orig_val = ctxt->dst.val;
4867 4954
@@ -4899,11 +4986,6 @@ special_insn:
4899 goto threebyte_insn; 4986 goto threebyte_insn;
4900 4987
4901 switch (ctxt->b) { 4988 switch (ctxt->b) {
4902 case 0x63: /* movsxd */
4903 if (ctxt->mode != X86EMUL_MODE_PROT64)
4904 goto cannot_emulate;
4905 ctxt->dst.val = (s32) ctxt->src.val;
4906 break;
4907 case 0x70 ... 0x7f: /* jcc (short) */ 4989 case 0x70 ... 0x7f: /* jcc (short) */
4908 if (test_cc(ctxt->b, ctxt->eflags)) 4990 if (test_cc(ctxt->b, ctxt->eflags))
4909 rc = jmp_rel(ctxt, ctxt->src.val); 4991 rc = jmp_rel(ctxt, ctxt->src.val);
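
Among the emulator changes above, opcode 0x63 moves from an inline special case to a mode_dual entry whose 64-bit handler, em_movsxd, simply assigns through a signed 32-bit cast. A standalone illustration of that sign extension (plain C, not emulator code):

/*
 * Demonstrates the MOVSXD semantics em_movsxd relies on: truncating to
 * a signed 32-bit value and widening back to 64 bits sign-extends.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t movsxd(uint64_t src)
{
	return (int32_t)src;	/* same idea as: dst.val = (s32) src.val */
}

int main(void)
{
	printf("movsxd(0x7fffffff) = 0x%016llx\n",
	       (unsigned long long)movsxd(0x7fffffffu));
	printf("movsxd(0x80000000) = 0x%016llx\n",
	       (unsigned long long)movsxd(0x80000000u));
	return 0;
}
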
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
98} 98}
99 99
100void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 100void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
101int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 101bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
102 int short_hand, unsigned int dest, int dest_mode); 102 int short_hand, unsigned int dest, int dest_mode);
103int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 103int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
104void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, 104void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
138 138
139 gfn += page_size >> PAGE_SHIFT; 139 gfn += page_size >> PAGE_SHIFT;
140 140
141 141 cond_resched();
142 } 142 }
143 143
144 return 0; 144 return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
306 kvm_unpin_pages(kvm, pfn, unmap_pages); 306 kvm_unpin_pages(kvm, pfn, unmap_pages);
307 307
308 gfn += unmap_pages; 308 gfn += unmap_pages;
309
310 cond_resched();
309 } 311 }
310} 312}
311 313
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d52dcf0776ea..e55b5fc344eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
33#include <asm/page.h> 33#include <asm/page.h>
34#include <asm/current.h> 34#include <asm/current.h>
35#include <asm/apicdef.h> 35#include <asm/apicdef.h>
36#include <asm/delay.h>
36#include <linux/atomic.h> 37#include <linux/atomic.h>
37#include <linux/jump_label.h> 38#include <linux/jump_label.h>
38#include "kvm_cache_regs.h" 39#include "kvm_cache_regs.h"
@@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap)
327 return count; 328 return count;
328} 329}
329 330
330void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) 331void __kvm_apic_update_irr(u32 *pir, void *regs)
331{ 332{
332 u32 i, pir_val; 333 u32 i, pir_val;
333 struct kvm_lapic *apic = vcpu->arch.apic;
334 334
335 for (i = 0; i <= 7; i++) { 335 for (i = 0; i <= 7; i++) {
336 pir_val = xchg(&pir[i], 0); 336 pir_val = xchg(&pir[i], 0);
337 if (pir_val) 337 if (pir_val)
338 *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; 338 *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
339 } 339 }
340} 340}
341EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
342
343void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
344{
345 struct kvm_lapic *apic = vcpu->arch.apic;
346
347 __kvm_apic_update_irr(pir, apic->regs);
348}
341EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 349EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
342 350
343static inline void apic_set_irr(int vec, struct kvm_lapic *apic) 351static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
405 * because the processor can modify ISR under the hood. Instead 413 * because the processor can modify ISR under the hood. Instead
406 * just set SVI. 414 * just set SVI.
407 */ 415 */
408 if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) 416 if (unlikely(kvm_x86_ops->hwapic_isr_update))
409 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); 417 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
410 else { 418 else {
411 ++apic->isr_count; 419 ++apic->isr_count;
@@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
453 * on the other hand isr_count and highest_isr_cache are unused 461 * on the other hand isr_count and highest_isr_cache are unused
454 * and must be left alone. 462 * and must be left alone.
455 */ 463 */
456 if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) 464 if (unlikely(kvm_x86_ops->hwapic_isr_update))
457 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 465 kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
458 apic_find_highest_isr(apic)); 466 apic_find_highest_isr(apic));
459 else { 467 else {
@@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
580 apic_update_ppr(apic); 588 apic_update_ppr(apic);
581} 589}
582 590
583static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) 591static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
584{ 592{
585 return dest == (apic_x2apic_mode(apic) ? 593 return dest == (apic_x2apic_mode(apic) ?
586 X2APIC_BROADCAST : APIC_BROADCAST); 594 X2APIC_BROADCAST : APIC_BROADCAST);
587} 595}
588 596
589int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) 597static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
590{ 598{
591 return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); 599 return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
592} 600}
593 601
594int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) 602static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
595{ 603{
596 int result = 0;
597 u32 logical_id; 604 u32 logical_id;
598 605
599 if (kvm_apic_broadcast(apic, mda)) 606 if (kvm_apic_broadcast(apic, mda))
600 return 1; 607 return true;
601 608
602 if (apic_x2apic_mode(apic)) { 609 logical_id = kvm_apic_get_reg(apic, APIC_LDR);
603 logical_id = kvm_apic_get_reg(apic, APIC_LDR);
604 return logical_id & mda;
605 }
606 610
607 logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); 611 if (apic_x2apic_mode(apic))
612 return ((logical_id >> 16) == (mda >> 16))
613 && (logical_id & mda & 0xffff) != 0;
614
615 logical_id = GET_APIC_LOGICAL_ID(logical_id);
608 616
609 switch (kvm_apic_get_reg(apic, APIC_DFR)) { 617 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
610 case APIC_DFR_FLAT: 618 case APIC_DFR_FLAT:
611 if (logical_id & mda) 619 return (logical_id & mda) != 0;
612 result = 1;
613 break;
614 case APIC_DFR_CLUSTER: 620 case APIC_DFR_CLUSTER:
615 if (((logical_id >> 4) == (mda >> 0x4)) 621 return ((logical_id >> 4) == (mda >> 4))
616 && (logical_id & mda & 0xf)) 622 && (logical_id & mda & 0xf) != 0;
617 result = 1;
618 break;
619 default: 623 default:
620 apic_debug("Bad DFR vcpu %d: %08x\n", 624 apic_debug("Bad DFR vcpu %d: %08x\n",
621 apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); 625 apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
622 break; 626 return false;
623 } 627 }
624
625 return result;
626} 628}
627 629
628int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 630bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
629 int short_hand, unsigned int dest, int dest_mode) 631 int short_hand, unsigned int dest, int dest_mode)
630{ 632{
631 int result = 0;
632 struct kvm_lapic *target = vcpu->arch.apic; 633 struct kvm_lapic *target = vcpu->arch.apic;
633 634
634 apic_debug("target %p, source %p, dest 0x%x, " 635 apic_debug("target %p, source %p, dest 0x%x, "
@@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
638 ASSERT(target); 639 ASSERT(target);
639 switch (short_hand) { 640 switch (short_hand) {
640 case APIC_DEST_NOSHORT: 641 case APIC_DEST_NOSHORT:
641 if (dest_mode == 0) 642 if (dest_mode == APIC_DEST_PHYSICAL)
642 /* Physical mode. */ 643 return kvm_apic_match_physical_addr(target, dest);
643 result = kvm_apic_match_physical_addr(target, dest);
644 else 644 else
645 /* Logical mode. */ 645 return kvm_apic_match_logical_addr(target, dest);
646 result = kvm_apic_match_logical_addr(target, dest);
647 break;
648 case APIC_DEST_SELF: 646 case APIC_DEST_SELF:
649 result = (target == source); 647 return target == source;
650 break;
651 case APIC_DEST_ALLINC: 648 case APIC_DEST_ALLINC:
652 result = 1; 649 return true;
653 break;
654 case APIC_DEST_ALLBUT: 650 case APIC_DEST_ALLBUT:
655 result = (target != source); 651 return target != source;
656 break;
657 default: 652 default:
658 apic_debug("kvm: apic: Bad dest shorthand value %x\n", 653 apic_debug("kvm: apic: Bad dest shorthand value %x\n",
659 short_hand); 654 short_hand);
660 break; 655 return false;
661 } 656 }
662
663 return result;
664} 657}
665 658
666bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 659bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
693 686
694 ret = true; 687 ret = true;
695 688
696 if (irq->dest_mode == 0) { /* physical mode */ 689 if (irq->dest_mode == APIC_DEST_PHYSICAL) {
697 if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) 690 if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
698 goto out; 691 goto out;
699 692
@@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
1076{ 1069{
1077 struct kvm_vcpu *vcpu = apic->vcpu; 1070 struct kvm_vcpu *vcpu = apic->vcpu;
1078 wait_queue_head_t *q = &vcpu->wq; 1071 wait_queue_head_t *q = &vcpu->wq;
1072 struct kvm_timer *ktimer = &apic->lapic_timer;
1079 1073
1080 /*
1081 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1082 * vcpu_enter_guest.
1083 */
1084 if (atomic_read(&apic->lapic_timer.pending)) 1074 if (atomic_read(&apic->lapic_timer.pending))
1085 return; 1075 return;
1086 1076
1087 atomic_inc(&apic->lapic_timer.pending); 1077 atomic_inc(&apic->lapic_timer.pending);
1088 /* FIXME: this code should not know anything about vcpus */ 1078 kvm_set_pending_timer(vcpu);
1089 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1090 1079
1091 if (waitqueue_active(q)) 1080 if (waitqueue_active(q))
1092 wake_up_interruptible(q); 1081 wake_up_interruptible(q);
1082
1083 if (apic_lvtt_tscdeadline(apic))
1084 ktimer->expired_tscdeadline = ktimer->tscdeadline;
1085}
1086
1087/*
1088 * On APICv, this test will cause a busy wait
1089 * during a higher-priority task.
1090 */
1091
1092static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
1093{
1094 struct kvm_lapic *apic = vcpu->arch.apic;
1095 u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
1096
1097 if (kvm_apic_hw_enabled(apic)) {
1098 int vec = reg & APIC_VECTOR_MASK;
1099 void *bitmap = apic->regs + APIC_ISR;
1100
1101 if (kvm_x86_ops->deliver_posted_interrupt)
1102 bitmap = apic->regs + APIC_IRR;
1103
1104 if (apic_test_vector(vec, bitmap))
1105 return true;
1106 }
1107 return false;
1108}
1109
1110void wait_lapic_expire(struct kvm_vcpu *vcpu)
1111{
1112 struct kvm_lapic *apic = vcpu->arch.apic;
1113 u64 guest_tsc, tsc_deadline;
1114
1115 if (!kvm_vcpu_has_lapic(vcpu))
1116 return;
1117
1118 if (apic->lapic_timer.expired_tscdeadline == 0)
1119 return;
1120
1121 if (!lapic_timer_int_injected(vcpu))
1122 return;
1123
1124 tsc_deadline = apic->lapic_timer.expired_tscdeadline;
1125 apic->lapic_timer.expired_tscdeadline = 0;
1126 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
1127 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
1128
1129 /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
1130 if (guest_tsc < tsc_deadline)
1131 __delay(tsc_deadline - guest_tsc);
1093} 1132}
1094 1133
1095static void start_apic_timer(struct kvm_lapic *apic) 1134static void start_apic_timer(struct kvm_lapic *apic)
1096{ 1135{
1097 ktime_t now; 1136 ktime_t now;
1137
1098 atomic_set(&apic->lapic_timer.pending, 0); 1138 atomic_set(&apic->lapic_timer.pending, 0);
1099 1139
1100 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { 1140 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
1140 /* lapic timer in tsc deadline mode */ 1180 /* lapic timer in tsc deadline mode */
1141 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; 1181 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
1142 u64 ns = 0; 1182 u64 ns = 0;
1183 ktime_t expire;
1143 struct kvm_vcpu *vcpu = apic->vcpu; 1184 struct kvm_vcpu *vcpu = apic->vcpu;
1144 unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; 1185 unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
1145 unsigned long flags; 1186 unsigned long flags;
@@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
1154 if (likely(tscdeadline > guest_tsc)) { 1195 if (likely(tscdeadline > guest_tsc)) {
1155 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1196 ns = (tscdeadline - guest_tsc) * 1000000ULL;
1156 do_div(ns, this_tsc_khz); 1197 do_div(ns, this_tsc_khz);
1198 expire = ktime_add_ns(now, ns);
1199 expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
1157 hrtimer_start(&apic->lapic_timer.timer, 1200 hrtimer_start(&apic->lapic_timer.timer,
1158 ktime_add_ns(now, ns), HRTIMER_MODE_ABS); 1201 expire, HRTIMER_MODE_ABS);
1159 } else 1202 } else
1160 apic_timer_expired(apic); 1203 apic_timer_expired(apic);
1161 1204
@@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1745 if (kvm_x86_ops->hwapic_irr_update) 1788 if (kvm_x86_ops->hwapic_irr_update)
1746 kvm_x86_ops->hwapic_irr_update(vcpu, 1789 kvm_x86_ops->hwapic_irr_update(vcpu,
1747 apic_find_highest_irr(apic)); 1790 apic_find_highest_irr(apic));
1748 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); 1791 if (unlikely(kvm_x86_ops->hwapic_isr_update))
1792 kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
1793 apic_find_highest_isr(apic));
1749 kvm_make_request(KVM_REQ_EVENT, vcpu); 1794 kvm_make_request(KVM_REQ_EVENT, vcpu);
1750 kvm_rtc_eoi_tracking_restore_one(vcpu); 1795 kvm_rtc_eoi_tracking_restore_one(vcpu);
1751} 1796}
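
The rewritten kvm_apic_match_logical_addr above treats an x2APIC logical ID as a 16-bit cluster plus a 16-bit member bitmask, and xAPIC cluster mode as a 4-bit cluster plus 4-bit member bits. A standalone sketch of those two matches; the example values in main() are chosen purely for illustration:

/*
 * Sketch of the logical-destination checks in the hunk above.
 * The bit layouts mirror the diff; everything else is illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* x2APIC: cluster in bits 31:16 must match, low 16 bits must intersect. */
static bool x2apic_logical_match(uint32_t ldr, uint32_t mda)
{
	return (ldr >> 16) == (mda >> 16) && (ldr & mda & 0xffff) != 0;
}

/* xAPIC cluster mode: top nibble equal, low nibble intersecting. */
static bool xapic_cluster_match(uint8_t logical_id, uint8_t mda)
{
	return (logical_id >> 4) == (mda >> 4) && (logical_id & mda & 0xf) != 0;
}

int main(void)
{
	/* cluster 1, member bit 2 vs. a message for cluster 1, members 1|2 */
	printf("x2apic match:        %d\n",
	       x2apic_logical_match(0x00010004, 0x00010006));
	/* xAPIC cluster 3, member 1 vs. cluster 3, members 1|8 */
	printf("xapic cluster match: %d\n",
	       xapic_cluster_match(0x31, 0x39));
	return 0;
}
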
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
14 u32 timer_mode; 14 u32 timer_mode;
15 u32 timer_mode_mask; 15 u32 timer_mode_mask;
16 u64 tscdeadline; 16 u64 tscdeadline;
17 u64 expired_tscdeadline;
17 atomic_t pending; /* accumulated triggered timers */ 18 atomic_t pending; /* accumulated triggered timers */
18}; 19};
19 20
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
56void kvm_apic_set_version(struct kvm_vcpu *vcpu); 57void kvm_apic_set_version(struct kvm_vcpu *vcpu);
57 58
58void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); 59void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
60void __kvm_apic_update_irr(u32 *pir, void *regs);
59void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 61void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
60int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
61int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
62int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 62int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
63 unsigned long *dest_map); 63 unsigned long *dest_map);
64int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 64int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
170 170
171bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); 171bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
172 172
173void wait_lapic_expire(struct kvm_vcpu *vcpu);
174
173#endif 175#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
63#undef MMU_DEBUG 63#undef MMU_DEBUG
64 64
65#ifdef MMU_DEBUG 65#ifdef MMU_DEBUG
66static bool dbg = 0;
67module_param(dbg, bool, 0644);
66 68
67#define pgprintk(x...) do { if (dbg) printk(x); } while (0) 69#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
68#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) 70#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
69 71#define MMU_WARN_ON(x) WARN_ON(x)
70#else 72#else
71
72#define pgprintk(x...) do { } while (0) 73#define pgprintk(x...) do { } while (0)
73#define rmap_printk(x...) do { } while (0) 74#define rmap_printk(x...) do { } while (0)
74 75#define MMU_WARN_ON(x) do { } while (0)
75#endif
76
77#ifdef MMU_DEBUG
78static bool dbg = 0;
79module_param(dbg, bool, 0644);
80#endif
81
82#ifndef MMU_DEBUG
83#define ASSERT(x) do { } while (0)
84#else
85#define ASSERT(x) \
86 if (!(x)) { \
87 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
88 __FILE__, __LINE__, #x); \
89 }
90#endif 76#endif
91 77
92#define PTE_PREFETCH_NUM 8 78#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
546 return (old_spte & bit_mask) && !(new_spte & bit_mask); 532 return (old_spte & bit_mask) && !(new_spte & bit_mask);
547} 533}
548 534
535static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
536{
537 return (old_spte & bit_mask) != (new_spte & bit_mask);
538}
539
549/* Rules for using mmu_spte_set: 540/* Rules for using mmu_spte_set:
550 * Set the sptep from nonpresent to present. 541 * Set the sptep from nonpresent to present.
551 * Note: the sptep being assigned *must* be either not present 542 * Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
596 if (!shadow_accessed_mask) 587 if (!shadow_accessed_mask)
597 return ret; 588 return ret;
598 589
590 /*
591 * Flush TLB when accessed/dirty bits are changed in the page tables,
592 * to guarantee consistency between TLB and page tables.
593 */
594 if (spte_is_bit_changed(old_spte, new_spte,
595 shadow_accessed_mask | shadow_dirty_mask))
596 ret = true;
597
599 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 598 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
600 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 599 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
601 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 600 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1216 return flush; 1215 return flush;
1217} 1216}
1218 1217
1218static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
1219{
1220 u64 spte = *sptep;
1221
1222 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1223
1224 spte &= ~shadow_dirty_mask;
1225
1226 return mmu_spte_update(sptep, spte);
1227}
1228
1229static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1230{
1231 u64 *sptep;
1232 struct rmap_iterator iter;
1233 bool flush = false;
1234
1235 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1236 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1237
1238 flush |= spte_clear_dirty(kvm, sptep);
1239 sptep = rmap_get_next(&iter);
1240 }
1241
1242 return flush;
1243}
1244
1245static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
1246{
1247 u64 spte = *sptep;
1248
1249 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1250
1251 spte |= shadow_dirty_mask;
1252
1253 return mmu_spte_update(sptep, spte);
1254}
1255
1256static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
1257{
1258 u64 *sptep;
1259 struct rmap_iterator iter;
1260 bool flush = false;
1261
1262 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1263 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1264
1265 flush |= spte_set_dirty(kvm, sptep);
1266 sptep = rmap_get_next(&iter);
1267 }
1268
1269 return flush;
1270}
1271
1219/** 1272/**
1220 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1273 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1221 * @kvm: kvm instance 1274 * @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1226 * Used when we do not need to care about huge page mappings: e.g. during dirty 1279 * Used when we do not need to care about huge page mappings: e.g. during dirty
1227 * logging we do not have any such mappings. 1280 * logging we do not have any such mappings.
1228 */ 1281 */
1229void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1282static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1230 struct kvm_memory_slot *slot, 1283 struct kvm_memory_slot *slot,
1231 gfn_t gfn_offset, unsigned long mask) 1284 gfn_t gfn_offset, unsigned long mask)
1232{ 1285{
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1242 } 1295 }
1243} 1296}
1244 1297
1298/**
1299 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
1300 * @kvm: kvm instance
 1301 * @slot: slot whose D-bits to clear
 1302 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 1303 * @mask: indicates which pages' D-bits we should clear
 1304 *
 1305 * Used for PML to re-log the dirty GPAs after userspace queries dirty_bitmap.
1306 */
1307void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1308 struct kvm_memory_slot *slot,
1309 gfn_t gfn_offset, unsigned long mask)
1310{
1311 unsigned long *rmapp;
1312
1313 while (mask) {
1314 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1315 PT_PAGE_TABLE_LEVEL, slot);
1316 __rmap_clear_dirty(kvm, rmapp);
1317
1318 /* clear the first set bit */
1319 mask &= mask - 1;
1320 }
1321}
1322EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1323
1324/**
1325 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1326 * PT level pages.
1327 *
1328 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1329 * enable dirty logging for them.
1330 *
1331 * Used when we do not need to care about huge page mappings: e.g. during dirty
1332 * logging we do not have any such mappings.
1333 */
1334void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1335 struct kvm_memory_slot *slot,
1336 gfn_t gfn_offset, unsigned long mask)
1337{
1338 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1339 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1340 mask);
1341 else
1342 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1343}
1344
1245static bool rmap_write_protect(struct kvm *kvm, u64 gfn) 1345static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1246{ 1346{
1247 struct kvm_memory_slot *slot; 1347 struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1536 1636
1537static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1637static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1538{ 1638{
1539 ASSERT(is_empty_shadow_page(sp->spt)); 1639 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1540 hlist_del(&sp->hash_link); 1640 hlist_del(&sp->hash_link);
1541 list_del(&sp->link); 1641 list_del(&sp->link);
1542 free_page((unsigned long)sp->spt); 1642 free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2501 } 2601 }
2502 } 2602 }
2503 2603
2504 if (pte_access & ACC_WRITE_MASK) 2604 if (pte_access & ACC_WRITE_MASK) {
2505 mark_page_dirty(vcpu->kvm, gfn); 2605 mark_page_dirty(vcpu->kvm, gfn);
2606 spte |= shadow_dirty_mask;
2607 }
2506 2608
2507set_pte: 2609set_pte:
2508 if (mmu_spte_update(sptep, spte)) 2610 if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2818 */ 2920 */
2819 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2921 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2820 2922
2923 /*
2924 * Theoretically we could also set dirty bit (and flush TLB) here in
2925 * order to eliminate unnecessary PML logging. See comments in
2926 * set_spte. But fast_page_fault is very unlikely to happen with PML
2927	 * enabled, so we do not do this. This might result in the same GPA
2928	 * being logged in the PML buffer again when the write really happens,
2929	 * and in mark_page_dirty eventually being called twice for it. But that
2930	 * is harmless. This also avoids the TLB flush needed after setting the
2931	 * dirty bit, so non-PML cases won't be impacted.
2932 *
2933 * Compare with set_spte where instead shadow_dirty_mask is set.
2934 */
2821 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2935 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
2822 mark_page_dirty(vcpu->kvm, gfn); 2936 mark_page_dirty(vcpu->kvm, gfn);
2823 2937
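The cmpxchg64() above is the usual lockless fixup: publish the writable bit only if the SPTE still holds the value read earlier, so a concurrent update simply makes the fast path give up. A hedged userspace sketch of the same pattern (the names and the bit value are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SPTE_WRITABLE	(1ull << 1)	/* stand-in for PT_WRITABLE_MASK */

/* Succeeds only if *sptep still equals the value observed earlier. */
static bool fix_spte_writable(_Atomic uint64_t *sptep, uint64_t old_spte)
{
	return atomic_compare_exchange_strong(sptep, &old_spte,
					      old_spte | SPTE_WRITABLE);
}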
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3041 for (i = 0; i < 4; ++i) { 3155 for (i = 0; i < 4; ++i) {
3042 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3156 hpa_t root = vcpu->arch.mmu.pae_root[i];
3043 3157
3044 ASSERT(!VALID_PAGE(root)); 3158 MMU_WARN_ON(VALID_PAGE(root));
3045 spin_lock(&vcpu->kvm->mmu_lock); 3159 spin_lock(&vcpu->kvm->mmu_lock);
3046 make_mmu_pages_available(vcpu); 3160 make_mmu_pages_available(vcpu);
3047 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3161 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3079 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3193 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
3080 hpa_t root = vcpu->arch.mmu.root_hpa; 3194 hpa_t root = vcpu->arch.mmu.root_hpa;
3081 3195
3082 ASSERT(!VALID_PAGE(root)); 3196 MMU_WARN_ON(VALID_PAGE(root));
3083 3197
3084 spin_lock(&vcpu->kvm->mmu_lock); 3198 spin_lock(&vcpu->kvm->mmu_lock);
3085 make_mmu_pages_available(vcpu); 3199 make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3104 for (i = 0; i < 4; ++i) { 3218 for (i = 0; i < 4; ++i) {
3105 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3219 hpa_t root = vcpu->arch.mmu.pae_root[i];
3106 3220
3107 ASSERT(!VALID_PAGE(root)); 3221 MMU_WARN_ON(VALID_PAGE(root));
3108 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 3222 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
3109 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); 3223 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
3110 if (!is_present_gpte(pdptr)) { 3224 if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3329 if (r) 3443 if (r)
3330 return r; 3444 return r;
3331 3445
3332 ASSERT(vcpu); 3446 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3333 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3334 3447
3335 gfn = gva >> PAGE_SHIFT; 3448 gfn = gva >> PAGE_SHIFT;
3336 3449
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3396 int write = error_code & PFERR_WRITE_MASK; 3509 int write = error_code & PFERR_WRITE_MASK;
3397 bool map_writable; 3510 bool map_writable;
3398 3511
3399 ASSERT(vcpu); 3512 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3400 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3401 3513
3402 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3514 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3403 r = handle_mmio_page_fault(vcpu, gpa, error_code, true); 3515 r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
3718 update_permission_bitmask(vcpu, context, false); 3830 update_permission_bitmask(vcpu, context, false);
3719 update_last_pte_bitmap(vcpu, context); 3831 update_last_pte_bitmap(vcpu, context);
3720 3832
3721 ASSERT(is_pae(vcpu)); 3833 MMU_WARN_ON(!is_pae(vcpu));
3722 context->page_fault = paging64_page_fault; 3834 context->page_fault = paging64_page_fault;
3723 context->gva_to_gpa = paging64_gva_to_gpa; 3835 context->gva_to_gpa = paging64_gva_to_gpa;
3724 context->sync_page = paging64_sync_page; 3836 context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
3763 3875
3764static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 3876static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3765{ 3877{
3766 struct kvm_mmu *context = vcpu->arch.walk_mmu; 3878 struct kvm_mmu *context = &vcpu->arch.mmu;
3767 3879
3768 context->base_role.word = 0; 3880 context->base_role.word = 0;
3769 context->page_fault = tdp_page_fault; 3881 context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3803 update_last_pte_bitmap(vcpu, context); 3915 update_last_pte_bitmap(vcpu, context);
3804} 3916}
3805 3917
3806void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3918void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
3807{ 3919{
3808 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3920 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3809 ASSERT(vcpu); 3921 struct kvm_mmu *context = &vcpu->arch.mmu;
3810 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3922
3923 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3811 3924
3812 if (!is_paging(vcpu)) 3925 if (!is_paging(vcpu))
3813 nonpaging_init_context(vcpu, context); 3926 nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3818 else 3931 else
3819 paging32_init_context(vcpu, context); 3932 paging32_init_context(vcpu, context);
3820 3933
3821 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); 3934 context->base_role.nxe = is_nx(vcpu);
3822 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3935 context->base_role.cr4_pae = !!is_pae(vcpu);
3823 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3936 context->base_role.cr0_wp = is_write_protection(vcpu);
3824 vcpu->arch.mmu.base_role.smep_andnot_wp 3937 context->base_role.smep_andnot_wp
3825 = smep && !is_write_protection(vcpu); 3938 = smep && !is_write_protection(vcpu);
3826} 3939}
3827EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3940EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3828 3941
3829void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 3942void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
3830 bool execonly)
3831{ 3943{
3832 ASSERT(vcpu); 3944 struct kvm_mmu *context = &vcpu->arch.mmu;
3833 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3945
3946 MMU_WARN_ON(VALID_PAGE(context->root_hpa));
3834 3947
3835 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 3948 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3836 3949
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3851 3964
3852static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 3965static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
3853{ 3966{
3854 kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3967 struct kvm_mmu *context = &vcpu->arch.mmu;
3855 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3968
3856 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3969 kvm_init_shadow_mmu(vcpu);
3857 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; 3970 context->set_cr3 = kvm_x86_ops->set_cr3;
3858 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3971 context->get_cr3 = get_cr3;
3972 context->get_pdptr = kvm_pdptr_read;
3973 context->inject_page_fault = kvm_inject_page_fault;
3859} 3974}
3860 3975
3861static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 3976static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3900static void init_kvm_mmu(struct kvm_vcpu *vcpu) 4015static void init_kvm_mmu(struct kvm_vcpu *vcpu)
3901{ 4016{
3902 if (mmu_is_nested(vcpu)) 4017 if (mmu_is_nested(vcpu))
3903 return init_kvm_nested_mmu(vcpu); 4018 init_kvm_nested_mmu(vcpu);
3904 else if (tdp_enabled) 4019 else if (tdp_enabled)
3905 return init_kvm_tdp_mmu(vcpu); 4020 init_kvm_tdp_mmu(vcpu);
3906 else 4021 else
3907 return init_kvm_softmmu(vcpu); 4022 init_kvm_softmmu(vcpu);
3908} 4023}
3909 4024
3910void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4025void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3911{ 4026{
3912 ASSERT(vcpu);
3913
3914 kvm_mmu_unload(vcpu); 4027 kvm_mmu_unload(vcpu);
3915 init_kvm_mmu(vcpu); 4028 init_kvm_mmu(vcpu);
3916} 4029}
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4266 struct page *page; 4379 struct page *page;
4267 int i; 4380 int i;
4268 4381
4269 ASSERT(vcpu);
4270
4271 /* 4382 /*
4272 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 4383 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
4273 * Therefore we need to allocate shadow page tables in the first 4384 * Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
4286 4397
4287int kvm_mmu_create(struct kvm_vcpu *vcpu) 4398int kvm_mmu_create(struct kvm_vcpu *vcpu)
4288{ 4399{
4289 ASSERT(vcpu);
4290
4291 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 4400 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
4292 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4401 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4293 vcpu->arch.mmu.translate_gpa = translate_gpa; 4402 vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
4298 4407
4299void kvm_mmu_setup(struct kvm_vcpu *vcpu) 4408void kvm_mmu_setup(struct kvm_vcpu *vcpu)
4300{ 4409{
4301 ASSERT(vcpu); 4410 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
4302 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
4303 4411
4304 init_kvm_mmu(vcpu); 4412 init_kvm_mmu(vcpu);
4305} 4413}
4306 4414
4307void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4415void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
4416 struct kvm_memory_slot *memslot)
4308{ 4417{
4309 struct kvm_memory_slot *memslot;
4310 gfn_t last_gfn; 4418 gfn_t last_gfn;
4311 int i; 4419 int i;
4420 bool flush = false;
4312 4421
4313 memslot = id_to_memslot(kvm->memslots, slot);
4314 last_gfn = memslot->base_gfn + memslot->npages - 1; 4422 last_gfn = memslot->base_gfn + memslot->npages - 1;
4315 4423
4316 spin_lock(&kvm->mmu_lock); 4424 spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4325 4433
4326 for (index = 0; index <= last_index; ++index, ++rmapp) { 4434 for (index = 0; index <= last_index; ++index, ++rmapp) {
4327 if (*rmapp) 4435 if (*rmapp)
4328 __rmap_write_protect(kvm, rmapp, false); 4436 flush |= __rmap_write_protect(kvm, rmapp,
4437 false);
4329 4438
4330 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) 4439 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4331 cond_resched_lock(&kvm->mmu_lock); 4440 cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4352 * instead of PT_WRITABLE_MASK, that means it does not depend 4461 * instead of PT_WRITABLE_MASK, that means it does not depend
4353 * on PT_WRITABLE_MASK anymore. 4462 * on PT_WRITABLE_MASK anymore.
4354 */ 4463 */
4355 kvm_flush_remote_tlbs(kvm); 4464 if (flush)
4465 kvm_flush_remote_tlbs(kvm);
4466}
4467
4468void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
4469 struct kvm_memory_slot *memslot)
4470{
4471 gfn_t last_gfn;
4472 unsigned long *rmapp;
4473 unsigned long last_index, index;
4474 bool flush = false;
4475
4476 last_gfn = memslot->base_gfn + memslot->npages - 1;
4477
4478 spin_lock(&kvm->mmu_lock);
4479
4480 rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
4481 last_index = gfn_to_index(last_gfn, memslot->base_gfn,
4482 PT_PAGE_TABLE_LEVEL);
4483
4484 for (index = 0; index <= last_index; ++index, ++rmapp) {
4485 if (*rmapp)
4486 flush |= __rmap_clear_dirty(kvm, rmapp);
4487
4488 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4489 cond_resched_lock(&kvm->mmu_lock);
4490 }
4491
4492 spin_unlock(&kvm->mmu_lock);
4493
4494 lockdep_assert_held(&kvm->slots_lock);
4495
4496 /*
4497 * It's also safe to flush TLBs out of mmu lock here as currently this
4498 * function is only used for dirty logging, in which case flushing TLB
4499 * out of mmu lock also guarantees no dirty pages will be lost in
4500 * dirty_bitmap.
4501 */
4502 if (flush)
4503 kvm_flush_remote_tlbs(kvm);
4504}
4505EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
4506
4507void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
4508 struct kvm_memory_slot *memslot)
4509{
4510 gfn_t last_gfn;
4511 int i;
4512 bool flush = false;
4513
4514 last_gfn = memslot->base_gfn + memslot->npages - 1;
4515
4516 spin_lock(&kvm->mmu_lock);
4517
4518 for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
4519 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4520 unsigned long *rmapp;
4521 unsigned long last_index, index;
4522
4523 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4524 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4525
4526 for (index = 0; index <= last_index; ++index, ++rmapp) {
4527 if (*rmapp)
4528 flush |= __rmap_write_protect(kvm, rmapp,
4529 false);
4530
4531 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4532 cond_resched_lock(&kvm->mmu_lock);
4533 }
4534 }
4535 spin_unlock(&kvm->mmu_lock);
4536
4537 /* see kvm_mmu_slot_remove_write_access */
4538 lockdep_assert_held(&kvm->slots_lock);
4539
4540 if (flush)
4541 kvm_flush_remote_tlbs(kvm);
4542}
4543EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
4544
4545void kvm_mmu_slot_set_dirty(struct kvm *kvm,
4546 struct kvm_memory_slot *memslot)
4547{
4548 gfn_t last_gfn;
4549 int i;
4550 bool flush = false;
4551
4552 last_gfn = memslot->base_gfn + memslot->npages - 1;
4553
4554 spin_lock(&kvm->mmu_lock);
4555
4556 for (i = PT_PAGE_TABLE_LEVEL;
4557 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4558 unsigned long *rmapp;
4559 unsigned long last_index, index;
4560
4561 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4562 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4563
4564 for (index = 0; index <= last_index; ++index, ++rmapp) {
4565 if (*rmapp)
4566 flush |= __rmap_set_dirty(kvm, rmapp);
4567
4568 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4569 cond_resched_lock(&kvm->mmu_lock);
4570 }
4571 }
4572
4573 spin_unlock(&kvm->mmu_lock);
4574
4575 lockdep_assert_held(&kvm->slots_lock);
4576
4577 /* see kvm_mmu_slot_leaf_clear_dirty */
4578 if (flush)
4579 kvm_flush_remote_tlbs(kvm);
4356} 4580}
4581EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
4357 4582
4358#define BATCH_ZAP_PAGES 10 4583#define BATCH_ZAP_PAGES 10
4359static void kvm_zap_obsolete_pages(struct kvm *kvm) 4584static void kvm_zap_obsolete_pages(struct kvm *kvm)
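All three slot-wide loops above derive last_index from gfn_to_index() for the level being walked. A hedged sketch of the idea, assuming the usual x86 layout of 9 gfn bits per paging level (the kernel's gfn_to_index() may differ in detail):

#include <stdint.h>

/* Index of the rmap slot covering 'gfn' at 'level' within a memslot. */
static uint64_t rmap_index(uint64_t gfn, uint64_t base_gfn, int level)
{
	unsigned int shift = (level - 1) * 9;	/* 512 entries per level */

	return (gfn >> shift) - (base_gfn >> shift);
}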
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
4606 4831
4607void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 4832void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4608{ 4833{
4609 ASSERT(vcpu);
4610
4611 kvm_mmu_unload(vcpu); 4834 kvm_mmu_unload(vcpu);
4612 free_mmu_pages(vcpu); 4835 free_mmu_pages(vcpu);
4613 mmu_free_memory_caches(vcpu); 4836 mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
44#define PT_DIRECTORY_LEVEL 2 44#define PT_DIRECTORY_LEVEL 2
45#define PT_PAGE_TABLE_LEVEL 1 45#define PT_PAGE_TABLE_LEVEL 1
46 46
47#define PFERR_PRESENT_BIT 0
48#define PFERR_WRITE_BIT 1
49#define PFERR_USER_BIT 2
50#define PFERR_RSVD_BIT 3
51#define PFERR_FETCH_BIT 4
52
53#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
54#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
55#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
56#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
57#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
58
59static inline u64 rsvd_bits(int s, int e) 47static inline u64 rsvd_bits(int s, int e)
60{ 48{
61 return ((1ULL << (e - s + 1)) - 1) << s; 49 return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
81}; 69};
82 70
83int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 71int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
84void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 72void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
85void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 73void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
86 bool execonly);
87void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 74void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
88 bool ept); 75 bool ept);
89 76
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..a17d848c6d42 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2003 2003
2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2005{ 2005{
2006 kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); 2006 WARN_ON(mmu_is_nested(vcpu));
2007 2007 kvm_init_shadow_mmu(vcpu);
2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 2008 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 2009 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; 2010 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
848 848
849#endif /* CONFIG_X86_64 */ 849#endif /* CONFIG_X86_64 */
850 850
851/*
852 * Tracepoint for PML full VMEXIT.
853 */
854TRACE_EVENT(kvm_pml_full,
855 TP_PROTO(unsigned int vcpu_id),
856 TP_ARGS(vcpu_id),
857
858 TP_STRUCT__entry(
859 __field( unsigned int, vcpu_id )
860 ),
861
862 TP_fast_assign(
863 __entry->vcpu_id = vcpu_id;
864 ),
865
866 TP_printk("vcpu %d: PML full", __entry->vcpu_id)
867);
868
851TRACE_EVENT(kvm_ple_window, 869TRACE_EVENT(kvm_ple_window,
852 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), 870 TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
853 TP_ARGS(grow, vcpu_id, new, old), 871 TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
914 __entry->flags) 932 __entry->flags)
915); 933);
916 934
935TRACE_EVENT(kvm_wait_lapic_expire,
936 TP_PROTO(unsigned int vcpu_id, s64 delta),
937 TP_ARGS(vcpu_id, delta),
938
939 TP_STRUCT__entry(
940 __field( unsigned int, vcpu_id )
941 __field( s64, delta )
942 ),
943
944 TP_fast_assign(
945 __entry->vcpu_id = vcpu_id;
946 __entry->delta = delta;
947 ),
948
949 TP_printk("vcpu %u: delta %lld (%s)",
950 __entry->vcpu_id,
951 __entry->delta,
952 __entry->delta < 0 ? "early" : "late")
953);
954
917#endif /* _TRACE_KVM_H */ 955#endif /* _TRACE_KVM_H */
918 956
919#undef TRACE_INCLUDE_PATH 957#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..3f73bfad0349 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
45#include <asm/perf_event.h> 45#include <asm/perf_event.h>
46#include <asm/debugreg.h> 46#include <asm/debugreg.h>
47#include <asm/kexec.h> 47#include <asm/kexec.h>
48#include <asm/apic.h>
48 49
49#include "trace.h" 50#include "trace.h"
50 51
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
101 102
102static u64 __read_mostly host_xss; 103static u64 __read_mostly host_xss;
103 104
105static bool __read_mostly enable_pml = 1;
106module_param_named(pml, enable_pml, bool, S_IRUGO);
107
104#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
105#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
106#define KVM_VM_CR0_ALWAYS_ON \ 110#define KVM_VM_CR0_ALWAYS_ON \
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
215 u64 tsc_offset; 219 u64 tsc_offset;
216 u64 virtual_apic_page_addr; 220 u64 virtual_apic_page_addr;
217 u64 apic_access_addr; 221 u64 apic_access_addr;
222 u64 posted_intr_desc_addr;
218 u64 ept_pointer; 223 u64 ept_pointer;
224 u64 eoi_exit_bitmap0;
225 u64 eoi_exit_bitmap1;
226 u64 eoi_exit_bitmap2;
227 u64 eoi_exit_bitmap3;
219 u64 xss_exit_bitmap; 228 u64 xss_exit_bitmap;
220 u64 guest_physical_address; 229 u64 guest_physical_address;
221 u64 vmcs_link_pointer; 230 u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
330 u32 vmx_preemption_timer_value; 339 u32 vmx_preemption_timer_value;
331 u32 padding32[7]; /* room for future expansion */ 340 u32 padding32[7]; /* room for future expansion */
332 u16 virtual_processor_id; 341 u16 virtual_processor_id;
342 u16 posted_intr_nv;
333 u16 guest_es_selector; 343 u16 guest_es_selector;
334 u16 guest_cs_selector; 344 u16 guest_cs_selector;
335 u16 guest_ss_selector; 345 u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
338 u16 guest_gs_selector; 348 u16 guest_gs_selector;
339 u16 guest_ldtr_selector; 349 u16 guest_ldtr_selector;
340 u16 guest_tr_selector; 350 u16 guest_tr_selector;
351 u16 guest_intr_status;
341 u16 host_es_selector; 352 u16 host_es_selector;
342 u16 host_cs_selector; 353 u16 host_cs_selector;
343 u16 host_ss_selector; 354 u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
401 */ 412 */
402 struct page *apic_access_page; 413 struct page *apic_access_page;
403 struct page *virtual_apic_page; 414 struct page *virtual_apic_page;
415 struct page *pi_desc_page;
416 struct pi_desc *pi_desc;
417 bool pi_pending;
418 u16 posted_intr_nv;
404 u64 msr_ia32_feature_control; 419 u64 msr_ia32_feature_control;
405 420
406 struct hrtimer preemption_timer; 421 struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
408 423
409 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
410 u64 vmcs01_debugctl; 425 u64 vmcs01_debugctl;
426
427 u32 nested_vmx_procbased_ctls_low;
428 u32 nested_vmx_procbased_ctls_high;
429 u32 nested_vmx_true_procbased_ctls_low;
430 u32 nested_vmx_secondary_ctls_low;
431 u32 nested_vmx_secondary_ctls_high;
432 u32 nested_vmx_pinbased_ctls_low;
433 u32 nested_vmx_pinbased_ctls_high;
434 u32 nested_vmx_exit_ctls_low;
435 u32 nested_vmx_exit_ctls_high;
436 u32 nested_vmx_true_exit_ctls_low;
437 u32 nested_vmx_entry_ctls_low;
438 u32 nested_vmx_entry_ctls_high;
439 u32 nested_vmx_true_entry_ctls_low;
440 u32 nested_vmx_misc_low;
441 u32 nested_vmx_misc_high;
442 u32 nested_vmx_ept_caps;
411}; 443};
412 444
413#define POSTED_INTR_ON 0 445#define POSTED_INTR_ON 0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
511 /* Dynamic PLE window. */ 543 /* Dynamic PLE window. */
512 int ple_window; 544 int ple_window;
513 bool ple_window_dirty; 545 bool ple_window_dirty;
546
547 /* Support for PML */
548#define PML_ENTITY_NUM 512
549 struct page *pml_pg;
514}; 550};
515 551
516enum segment_cache_field { 552enum segment_cache_field {
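PML_ENTITY_NUM reflects the 512 GPA slots in the 4 KiB PML page pointed to by pml_pg. A rough userspace model of draining such a buffer, assuming Intel's convention that the index counts down and the valid entries sit above it (drain_pml and mark_dirty_gfn are illustrative names, not the kernel's flush path):

#include <stdint.h>

#define PML_ENTITY_NUM	512

/* Hand every logged GPA above the current index to mark_dirty_gfn(). */
static void drain_pml(const uint64_t pml_buf[PML_ENTITY_NUM], uint16_t pml_idx,
		      void (*mark_dirty_gfn)(uint64_t gfn))
{
	if (pml_idx == PML_ENTITY_NUM - 1)
		return;				/* nothing was logged yet */

	/* the index underflows past 0 when the buffer is completely full */
	pml_idx = (pml_idx >= PML_ENTITY_NUM) ? 0 : pml_idx + 1;

	for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
		mark_dirty_gfn(pml_buf[pml_idx] >> 12);
}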
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
594 630
595static const unsigned short vmcs_field_to_offset_table[] = { 631static const unsigned short vmcs_field_to_offset_table[] = {
596 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 632 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
633 FIELD(POSTED_INTR_NV, posted_intr_nv),
597 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 634 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
598 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 635 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
599 FIELD(GUEST_SS_SELECTOR, guest_ss_selector), 636 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
602 FIELD(GUEST_GS_SELECTOR, guest_gs_selector), 639 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
603 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), 640 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
604 FIELD(GUEST_TR_SELECTOR, guest_tr_selector), 641 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
642 FIELD(GUEST_INTR_STATUS, guest_intr_status),
605 FIELD(HOST_ES_SELECTOR, host_es_selector), 643 FIELD(HOST_ES_SELECTOR, host_es_selector),
606 FIELD(HOST_CS_SELECTOR, host_cs_selector), 644 FIELD(HOST_CS_SELECTOR, host_cs_selector),
607 FIELD(HOST_SS_SELECTOR, host_ss_selector), 645 FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
618 FIELD64(TSC_OFFSET, tsc_offset), 656 FIELD64(TSC_OFFSET, tsc_offset),
619 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 657 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
620 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 658 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
659 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
621 FIELD64(EPT_POINTER, ept_pointer), 660 FIELD64(EPT_POINTER, ept_pointer),
661 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
662 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
663 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
664 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
622 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 665 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
623 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 666 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
624 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 667 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
766static void kvm_cpu_vmxoff(void); 809static void kvm_cpu_vmxoff(void);
767static bool vmx_mpx_supported(void); 810static bool vmx_mpx_supported(void);
768static bool vmx_xsaves_supported(void); 811static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm);
769static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
770static void vmx_set_segment(struct kvm_vcpu *vcpu, 814static void vmx_set_segment(struct kvm_vcpu *vcpu,
771 struct kvm_segment *var, int seg); 815 struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
793static unsigned long *vmx_msr_bitmap_longmode; 837static unsigned long *vmx_msr_bitmap_longmode;
794static unsigned long *vmx_msr_bitmap_legacy_x2apic; 838static unsigned long *vmx_msr_bitmap_legacy_x2apic;
795static unsigned long *vmx_msr_bitmap_longmode_x2apic; 839static unsigned long *vmx_msr_bitmap_longmode_x2apic;
840static unsigned long *vmx_msr_bitmap_nested;
796static unsigned long *vmx_vmread_bitmap; 841static unsigned long *vmx_vmread_bitmap;
797static unsigned long *vmx_vmwrite_bitmap; 842static unsigned long *vmx_vmwrite_bitmap;
798 843
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
959 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; 1004 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
960} 1005}
961 1006
962static inline bool cpu_has_vmx_eptp_uncacheable(void)
963{
964 return vmx_capability.ept & VMX_EPTP_UC_BIT;
965}
966
967static inline bool cpu_has_vmx_eptp_writeback(void)
968{
969 return vmx_capability.ept & VMX_EPTP_WB_BIT;
970}
971
972static inline bool cpu_has_vmx_ept_2m_page(void) 1007static inline bool cpu_has_vmx_ept_2m_page(void)
973{ 1008{
974 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; 1009 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
1073 SECONDARY_EXEC_SHADOW_VMCS; 1108 SECONDARY_EXEC_SHADOW_VMCS;
1074} 1109}
1075 1110
1111static inline bool cpu_has_vmx_pml(void)
1112{
1113 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114}
1115
1076static inline bool report_flexpriority(void) 1116static inline bool report_flexpriority(void)
1077{ 1117{
1078 return flexpriority_enabled; 1118 return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1112 vmx_xsaves_supported(); 1152 vmx_xsaves_supported();
1113} 1153}
1114 1154
1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1156{
1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158}
1159
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{
1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1163}
1164
1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1166{
1167 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1168}
1169
1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1171{
1172 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1173}
1174
1115static inline bool is_exception(u32 intr_info) 1175static inline bool is_exception(u32 intr_info)
1116{ 1176{
1117 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1177 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2284 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2344 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2285 * bit in the high half is on if the corresponding bit in the control field 2345 * bit in the high half is on if the corresponding bit in the control field
2286 * may be on. See also vmx_control_verify(). 2346 * may be on. See also vmx_control_verify().
2287 * TODO: allow these variables to be modified (downgraded) by module options
2288 * or other means.
2289 */ 2347 */
2290static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; 2348static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2291static u32 nested_vmx_true_procbased_ctls_low;
2292static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2293static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2294static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2295static u32 nested_vmx_true_exit_ctls_low;
2296static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2297static u32 nested_vmx_true_entry_ctls_low;
2298static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2299static u32 nested_vmx_ept_caps;
2300static __init void nested_vmx_setup_ctls_msrs(void)
2301{ 2349{
2302 /* 2350 /*
2303 * Note that as a general rule, the high half of the MSRs (bits in 2351 * Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2316 2364
2317 /* pin-based controls */ 2365 /* pin-based controls */
2318 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2366 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2319 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); 2367 vmx->nested.nested_vmx_pinbased_ctls_low,
2320 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2368 vmx->nested.nested_vmx_pinbased_ctls_high);
2321 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | 2369 vmx->nested.nested_vmx_pinbased_ctls_low |=
2322 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; 2370 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2323 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2371 vmx->nested.nested_vmx_pinbased_ctls_high &=
2372 PIN_BASED_EXT_INTR_MASK |
2373 PIN_BASED_NMI_EXITING |
2374 PIN_BASED_VIRTUAL_NMIS;
2375 vmx->nested.nested_vmx_pinbased_ctls_high |=
2376 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2324 PIN_BASED_VMX_PREEMPTION_TIMER; 2377 PIN_BASED_VMX_PREEMPTION_TIMER;
2378 if (vmx_vm_has_apicv(vmx->vcpu.kvm))
2379 vmx->nested.nested_vmx_pinbased_ctls_high |=
2380 PIN_BASED_POSTED_INTR;
2325 2381
2326 /* exit controls */ 2382 /* exit controls */
2327 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2383 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2328 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2384 vmx->nested.nested_vmx_exit_ctls_low,
2329 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2385 vmx->nested.nested_vmx_exit_ctls_high);
2386 vmx->nested.nested_vmx_exit_ctls_low =
2387 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2330 2388
2331 nested_vmx_exit_ctls_high &= 2389 vmx->nested.nested_vmx_exit_ctls_high &=
2332#ifdef CONFIG_X86_64 2390#ifdef CONFIG_X86_64
2333 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2391 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2334#endif 2392#endif
2335 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2393 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2336 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2394 vmx->nested.nested_vmx_exit_ctls_high |=
2395 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2337 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2396 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2338 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2397 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2339 2398
2340 if (vmx_mpx_supported()) 2399 if (vmx_mpx_supported())
2341 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2400 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2342 2401
2343 /* We support free control of debug control saving. */ 2402 /* We support free control of debug control saving. */
2344 nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & 2403 vmx->nested.nested_vmx_true_exit_ctls_low =
2404 vmx->nested.nested_vmx_exit_ctls_low &
2345 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2405 ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2346 2406
2347 /* entry controls */ 2407 /* entry controls */
2348 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2408 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2349 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2409 vmx->nested.nested_vmx_entry_ctls_low,
2350 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2410 vmx->nested.nested_vmx_entry_ctls_high);
2351 nested_vmx_entry_ctls_high &= 2411 vmx->nested.nested_vmx_entry_ctls_low =
2412 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2413 vmx->nested.nested_vmx_entry_ctls_high &=
2352#ifdef CONFIG_X86_64 2414#ifdef CONFIG_X86_64
2353 VM_ENTRY_IA32E_MODE | 2415 VM_ENTRY_IA32E_MODE |
2354#endif 2416#endif
2355 VM_ENTRY_LOAD_IA32_PAT; 2417 VM_ENTRY_LOAD_IA32_PAT;
2356 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | 2418 vmx->nested.nested_vmx_entry_ctls_high |=
2357 VM_ENTRY_LOAD_IA32_EFER); 2419 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2358 if (vmx_mpx_supported()) 2420 if (vmx_mpx_supported())
2359 nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2421 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2360 2422
2361 /* We support free control of debug control loading. */ 2423 /* We support free control of debug control loading. */
2362 nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & 2424 vmx->nested.nested_vmx_true_entry_ctls_low =
2425 vmx->nested.nested_vmx_entry_ctls_low &
2363 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2426 ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2364 2427
2365 /* cpu-based controls */ 2428 /* cpu-based controls */
2366 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2429 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2367 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); 2430 vmx->nested.nested_vmx_procbased_ctls_low,
2368 nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2431 vmx->nested.nested_vmx_procbased_ctls_high);
2369 nested_vmx_procbased_ctls_high &= 2432 vmx->nested.nested_vmx_procbased_ctls_low =
2433 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2434 vmx->nested.nested_vmx_procbased_ctls_high &=
2370 CPU_BASED_VIRTUAL_INTR_PENDING | 2435 CPU_BASED_VIRTUAL_INTR_PENDING |
2371 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2436 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2372 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2437 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2386 * can use it to avoid exits to L1 - even when L0 runs L2 2451 * can use it to avoid exits to L1 - even when L0 runs L2
2387 * without MSR bitmaps. 2452 * without MSR bitmaps.
2388 */ 2453 */
2389 nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2454 vmx->nested.nested_vmx_procbased_ctls_high |=
2455 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2390 CPU_BASED_USE_MSR_BITMAPS; 2456 CPU_BASED_USE_MSR_BITMAPS;
2391 2457
2392 /* We support free control of CR3 access interception. */ 2458 /* We support free control of CR3 access interception. */
2393 nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & 2459 vmx->nested.nested_vmx_true_procbased_ctls_low =
2460 vmx->nested.nested_vmx_procbased_ctls_low &
2394 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2461 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2395 2462
2396 /* secondary cpu-based controls */ 2463 /* secondary cpu-based controls */
2397 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2464 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2398 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2465 vmx->nested.nested_vmx_secondary_ctls_low,
2399 nested_vmx_secondary_ctls_low = 0; 2466 vmx->nested.nested_vmx_secondary_ctls_high);
2400 nested_vmx_secondary_ctls_high &= 2467 vmx->nested.nested_vmx_secondary_ctls_low = 0;
2468 vmx->nested.nested_vmx_secondary_ctls_high &=
2401 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2469 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2470 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2471 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2472 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2402 SECONDARY_EXEC_WBINVD_EXITING | 2473 SECONDARY_EXEC_WBINVD_EXITING |
2403 SECONDARY_EXEC_XSAVES; 2474 SECONDARY_EXEC_XSAVES;
2404 2475
2405 if (enable_ept) { 2476 if (enable_ept) {
2406 /* nested EPT: emulate EPT also to L1 */ 2477 /* nested EPT: emulate EPT also to L1 */
2407 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | 2478 vmx->nested.nested_vmx_secondary_ctls_high |=
2479 SECONDARY_EXEC_ENABLE_EPT |
2408 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2480 SECONDARY_EXEC_UNRESTRICTED_GUEST;
2409 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2481 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2410 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2482 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2411 VMX_EPT_INVEPT_BIT; 2483 VMX_EPT_INVEPT_BIT;
2412 nested_vmx_ept_caps &= vmx_capability.ept; 2484 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2413 /* 2485 /*
2414 * For nested guests, we don't do anything specific 2486 * For nested guests, we don't do anything specific
2415 * for single context invalidation. Hence, only advertise 2487 * for single context invalidation. Hence, only advertise
2416 * support for global context invalidation. 2488 * support for global context invalidation.
2417 */ 2489 */
2418 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2490 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2419 } else 2491 } else
2420 nested_vmx_ept_caps = 0; 2492 vmx->nested.nested_vmx_ept_caps = 0;
2421 2493
2422 /* miscellaneous data */ 2494 /* miscellaneous data */
2423 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2495 rdmsr(MSR_IA32_VMX_MISC,
2424 nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2496 vmx->nested.nested_vmx_misc_low,
2425 nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2497 vmx->nested.nested_vmx_misc_high);
2498 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2499 vmx->nested.nested_vmx_misc_low |=
2500 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2426 VMX_MISC_ACTIVITY_HLT; 2501 VMX_MISC_ACTIVITY_HLT;
2427 nested_vmx_misc_high = 0; 2502 vmx->nested.nested_vmx_misc_high = 0;
2428} 2503}
2429 2504
2430static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2505static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
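The low/high convention described in the comment above (low half: bits that must be 1, high half: bits that may be 1) boils down to a two-part check, which is what vmx_control_verify() is there to enforce. An illustrative standalone version of that check:

#include <stdbool.h>
#include <stdint.h>

/* 'low' = bits that must be set, 'high' = bits that may be set. */
static bool control_value_ok(uint32_t control, uint32_t low, uint32_t high)
{
	return (control & low) == low &&	/* every required bit present */
	       (control & ~high) == 0;		/* nothing outside the allowed set */
}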
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
2443/* Returns 0 on success, non-0 otherwise. */ 2518/* Returns 0 on success, non-0 otherwise. */
2444static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2519static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2445{ 2520{
2521 struct vcpu_vmx *vmx = to_vmx(vcpu);
2522
2446 switch (msr_index) { 2523 switch (msr_index) {
2447 case MSR_IA32_VMX_BASIC: 2524 case MSR_IA32_VMX_BASIC:
2448 /* 2525 /*
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2457 break; 2534 break;
2458 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2535 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2459 case MSR_IA32_VMX_PINBASED_CTLS: 2536 case MSR_IA32_VMX_PINBASED_CTLS:
2460 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, 2537 *pdata = vmx_control_msr(
2461 nested_vmx_pinbased_ctls_high); 2538 vmx->nested.nested_vmx_pinbased_ctls_low,
2539 vmx->nested.nested_vmx_pinbased_ctls_high);
2462 break; 2540 break;
2463 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2541 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2464 *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, 2542 *pdata = vmx_control_msr(
2465 nested_vmx_procbased_ctls_high); 2543 vmx->nested.nested_vmx_true_procbased_ctls_low,
2544 vmx->nested.nested_vmx_procbased_ctls_high);
2466 break; 2545 break;
2467 case MSR_IA32_VMX_PROCBASED_CTLS: 2546 case MSR_IA32_VMX_PROCBASED_CTLS:
2468 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, 2547 *pdata = vmx_control_msr(
2469 nested_vmx_procbased_ctls_high); 2548 vmx->nested.nested_vmx_procbased_ctls_low,
2549 vmx->nested.nested_vmx_procbased_ctls_high);
2470 break; 2550 break;
2471 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2551 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2472 *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, 2552 *pdata = vmx_control_msr(
2473 nested_vmx_exit_ctls_high); 2553 vmx->nested.nested_vmx_true_exit_ctls_low,
2554 vmx->nested.nested_vmx_exit_ctls_high);
2474 break; 2555 break;
2475 case MSR_IA32_VMX_EXIT_CTLS: 2556 case MSR_IA32_VMX_EXIT_CTLS:
2476 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, 2557 *pdata = vmx_control_msr(
2477 nested_vmx_exit_ctls_high); 2558 vmx->nested.nested_vmx_exit_ctls_low,
2559 vmx->nested.nested_vmx_exit_ctls_high);
2478 break; 2560 break;
2479 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2561 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2480 *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, 2562 *pdata = vmx_control_msr(
2481 nested_vmx_entry_ctls_high); 2563 vmx->nested.nested_vmx_true_entry_ctls_low,
2564 vmx->nested.nested_vmx_entry_ctls_high);
2482 break; 2565 break;
2483 case MSR_IA32_VMX_ENTRY_CTLS: 2566 case MSR_IA32_VMX_ENTRY_CTLS:
2484 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, 2567 *pdata = vmx_control_msr(
2485 nested_vmx_entry_ctls_high); 2568 vmx->nested.nested_vmx_entry_ctls_low,
2569 vmx->nested.nested_vmx_entry_ctls_high);
2486 break; 2570 break;
2487 case MSR_IA32_VMX_MISC: 2571 case MSR_IA32_VMX_MISC:
2488 *pdata = vmx_control_msr(nested_vmx_misc_low, 2572 *pdata = vmx_control_msr(
2489 nested_vmx_misc_high); 2573 vmx->nested.nested_vmx_misc_low,
2574 vmx->nested.nested_vmx_misc_high);
2490 break; 2575 break;
2491 /* 2576 /*
2492 * These MSRs specify bits which the guest must keep fixed (on or off) 2577 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2511 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2596 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2512 break; 2597 break;
2513 case MSR_IA32_VMX_PROCBASED_CTLS2: 2598 case MSR_IA32_VMX_PROCBASED_CTLS2:
2514 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, 2599 *pdata = vmx_control_msr(
2515 nested_vmx_secondary_ctls_high); 2600 vmx->nested.nested_vmx_secondary_ctls_low,
2601 vmx->nested.nested_vmx_secondary_ctls_high);
2516 break; 2602 break;
2517 case MSR_IA32_VMX_EPT_VPID_CAP: 2603 case MSR_IA32_VMX_EPT_VPID_CAP:
2518 /* Currently, no nested vpid support */ 2604 /* Currently, no nested vpid support */
2519 *pdata = nested_vmx_ept_caps; 2605 *pdata = vmx->nested.nested_vmx_ept_caps;
2520 break; 2606 break;
2521 default: 2607 default:
2522 return 1; 2608 return 1;
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2929 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3015 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2930 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3016 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2931 SECONDARY_EXEC_SHADOW_VMCS | 3017 SECONDARY_EXEC_SHADOW_VMCS |
2932 SECONDARY_EXEC_XSAVES; 3018 SECONDARY_EXEC_XSAVES |
3019 SECONDARY_EXEC_ENABLE_PML;
2933 if (adjust_vmx_controls(min2, opt2, 3020 if (adjust_vmx_controls(min2, opt2,
2934 MSR_IA32_VMX_PROCBASED_CTLS2, 3021 MSR_IA32_VMX_PROCBASED_CTLS2,
2935 &_cpu_based_2nd_exec_control) < 0) 3022 &_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4159 } 4246 }
4160} 4247}
4161 4248
4249/*
4250	 * If an MSR is allowed by L0, we should check whether it is also allowed
4251	 * by L1. The corresponding bit will be cleared unless both L0 and L1 allow it.
4252 */
4253static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4254 unsigned long *msr_bitmap_nested,
4255 u32 msr, int type)
4256{
4257 int f = sizeof(unsigned long);
4258
4259 if (!cpu_has_vmx_msr_bitmap()) {
4260 WARN_ON(1);
4261 return;
4262 }
4263
4264 /*
4265 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4266 * have the write-low and read-high bitmap offsets the wrong way round.
4267 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4268 */
4269 if (msr <= 0x1fff) {
4270 if (type & MSR_TYPE_R &&
4271 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
4272 /* read-low */
4273 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
4274
4275 if (type & MSR_TYPE_W &&
4276 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
4277 /* write-low */
4278 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
4279
4280 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4281 msr &= 0x1fff;
4282 if (type & MSR_TYPE_R &&
4283 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
4284 /* read-high */
4285 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
4286
4287 if (type & MSR_TYPE_W &&
4288 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
4289 /* write-high */
4290 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
4291
4292 }
4293}
4294
4162static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4295static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4163{ 4296{
4164 if (!longmode_only) 4297 if (!longmode_only)
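The offsets used in nested_vmx_disable_intercept_for_msr() above come from the VMX MSR-bitmap layout: a 4 KiB page split into four 1 KiB quadrants, read-low (0x000) and write-low (0x800) for MSRs 0x0-0x1fff, read-high (0x400) and write-high (0xc00) for 0xc0000000-0xc0001fff, one bit per MSR. A small self-contained sketch of the lookup (msr_intercepted is an illustrative name):

#include <stdbool.h>
#include <stdint.h>

static bool msr_intercepted(const uint8_t *bitmap, uint32_t msr, bool write)
{
	uint32_t base;

	if (msr <= 0x1fff) {
		base = write ? 0x800 : 0x000;
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		base = write ? 0xc00 : 0x400;
		msr &= 0x1fff;
	} else {
		return true;	/* out-of-range MSRs always cause an exit */
	}

	return bitmap[base + msr / 8] & (1u << (msr % 8));
}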
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
4197 return enable_apicv && irqchip_in_kernel(kvm); 4330 return enable_apicv && irqchip_in_kernel(kvm);
4198} 4331}
4199 4332
4333static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4334{
4335 struct vcpu_vmx *vmx = to_vmx(vcpu);
4336 int max_irr;
4337 void *vapic_page;
4338 u16 status;
4339
4340 if (vmx->nested.pi_desc &&
4341 vmx->nested.pi_pending) {
4342 vmx->nested.pi_pending = false;
4343 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4344 return 0;
4345
4346 max_irr = find_last_bit(
4347 (unsigned long *)vmx->nested.pi_desc->pir, 256);
4348
4349 if (max_irr == 256)
4350 return 0;
4351
4352 vapic_page = kmap(vmx->nested.virtual_apic_page);
4353 if (!vapic_page) {
4354 WARN_ON(1);
4355 return -ENOMEM;
4356 }
4357 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4358 kunmap(vmx->nested.virtual_apic_page);
4359
4360 status = vmcs_read16(GUEST_INTR_STATUS);
4361 if ((u8)max_irr > ((u8)status & 0xff)) {
4362 status &= ~0xff;
4363 status |= (u8)max_irr;
4364 vmcs_write16(GUEST_INTR_STATUS, status);
4365 }
4366 }
4367 return 0;
4368}
4369
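The function above scans the 256-bit posted-interrupt request bitmap for the highest pending vector and raises the low byte of GUEST_INTR_STATUS (the RVI) if that vector is higher. A userspace sketch of the same computation (update_rvi is an illustrative name, not kernel API):

#include <stdint.h>

/* Return the new RVI given a 256-bit PIR and the current 8-bit RVI. */
static uint8_t update_rvi(const uint64_t pir[4], uint8_t rvi)
{
	for (int word = 3; word >= 0; word--) {
		if (!pir[word])
			continue;
		/* highest set bit in this 64-bit word */
		uint8_t vec = (uint8_t)(word * 64 + 63 -
					__builtin_clzll(pir[word]));

		return vec > rvi ? vec : rvi;
	}
	return rvi;	/* nothing pending */
}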
4370static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4371 int vector)
4372{
4373 struct vcpu_vmx *vmx = to_vmx(vcpu);
4374
4375 if (is_guest_mode(vcpu) &&
4376 vector == vmx->nested.posted_intr_nv) {
4377 /* the PIR and ON have been set by L1. */
4378 if (vcpu->mode == IN_GUEST_MODE)
4379 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4380 POSTED_INTR_VECTOR);
4381 /*
4382	 * If the posted interrupt is not recognized by hardware,
4383	 * the injection is completed on the next vmentry instead.
4384 */
4385 vmx->nested.pi_pending = true;
4386 kvm_make_request(KVM_REQ_EVENT, vcpu);
4387 return 0;
4388 }
4389 return -1;
4390}
4200/* 4391/*
4201 * Send interrupt to vcpu via posted interrupt way. 4392 * Send interrupt to vcpu via posted interrupt way.
4202 * 1. If target vcpu is running(non-root mode), send posted interrupt 4393 * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4209 struct vcpu_vmx *vmx = to_vmx(vcpu); 4400 struct vcpu_vmx *vmx = to_vmx(vcpu);
4210 int r; 4401 int r;
4211 4402
4403 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4404 if (!r)
4405 return;
4406
4212 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4407 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4213 return; 4408 return;
4214 4409
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4360 a current VMCS12 4555 a current VMCS12
4361 */ 4556 */
4362 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4557 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4558	 /* PML is enabled/disabled when creating/destroying the vcpu */
4559 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4560
4363 return exec_control; 4561 return exec_control;
4364} 4562}
4365 4563
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4986 hypercall[2] = 0xc1; 5184 hypercall[2] = 0xc1;
4987} 5185}
4988 5186
4989static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) 5187static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4990{ 5188{
4991 unsigned long always_on = VMXON_CR0_ALWAYSON; 5189 unsigned long always_on = VMXON_CR0_ALWAYSON;
5190 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4992 5191
4993 if (nested_vmx_secondary_ctls_high & 5192 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
4994 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5193 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4995 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5194 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4996 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5195 always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5015 val = (val & ~vmcs12->cr0_guest_host_mask) | 5214 val = (val & ~vmcs12->cr0_guest_host_mask) |
5016 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5215 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5017 5216
5018 if (!nested_cr0_valid(vmcs12, val)) 5217 if (!nested_cr0_valid(vcpu, val))
5019 return 1; 5218 return 1;
5020 5219
5021 if (kvm_set_cr0(vcpu, val)) 5220 if (kvm_set_cr0(vcpu, val))
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void)
5817 (unsigned long *)__get_free_page(GFP_KERNEL); 6016 (unsigned long *)__get_free_page(GFP_KERNEL);
5818 if (!vmx_msr_bitmap_longmode_x2apic) 6017 if (!vmx_msr_bitmap_longmode_x2apic)
5819 goto out4; 6018 goto out4;
6019
6020 if (nested) {
6021 vmx_msr_bitmap_nested =
6022 (unsigned long *)__get_free_page(GFP_KERNEL);
6023 if (!vmx_msr_bitmap_nested)
6024 goto out5;
6025 }
6026
5820 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6027 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5821 if (!vmx_vmread_bitmap) 6028 if (!vmx_vmread_bitmap)
5822 goto out5; 6029 goto out6;
5823 6030
5824 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6031 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
5825 if (!vmx_vmwrite_bitmap) 6032 if (!vmx_vmwrite_bitmap)
5826 goto out6; 6033 goto out7;
5827 6034
5828 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6035 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
5829 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6036 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void)
5839 6046
5840 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6047 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
5841 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6048 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
6049 if (nested)
6050 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
5842 6051
5843 if (setup_vmcs_config(&vmcs_config) < 0) { 6052 if (setup_vmcs_config(&vmcs_config) < 0) {
5844 r = -EIO; 6053 r = -EIO;
5845 goto out7; 6054 goto out8;
5846 } 6055 }
5847 6056
5848 if (boot_cpu_has(X86_FEATURE_NX)) 6057 if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void)
5868 if (!cpu_has_vmx_unrestricted_guest()) 6077 if (!cpu_has_vmx_unrestricted_guest())
5869 enable_unrestricted_guest = 0; 6078 enable_unrestricted_guest = 0;
5870 6079
5871 if (!cpu_has_vmx_flexpriority()) { 6080 if (!cpu_has_vmx_flexpriority())
5872 flexpriority_enabled = 0; 6081 flexpriority_enabled = 0;
5873 6082
5874 /* 6083 /*
5875 * set_apic_access_page_addr() is used to reload apic access 6084 * set_apic_access_page_addr() is used to reload apic access
5876 * page upon invalidation. No need to do anything if the 6085 * page upon invalidation. No need to do anything if not
5877 * processor does not have the APIC_ACCESS_ADDR VMCS field. 6086 * using the APIC_ACCESS_ADDR VMCS field.
5878 */ 6087 */
6088 if (!flexpriority_enabled)
5879 kvm_x86_ops->set_apic_access_page_addr = NULL; 6089 kvm_x86_ops->set_apic_access_page_addr = NULL;
5880 }
5881 6090
5882 if (!cpu_has_vmx_tpr_shadow()) 6091 if (!cpu_has_vmx_tpr_shadow())
5883 kvm_x86_ops->update_cr8_intercept = NULL; 6092 kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void)
5895 kvm_x86_ops->update_cr8_intercept = NULL; 6104 kvm_x86_ops->update_cr8_intercept = NULL;
5896 else { 6105 else {
5897 kvm_x86_ops->hwapic_irr_update = NULL; 6106 kvm_x86_ops->hwapic_irr_update = NULL;
6107 kvm_x86_ops->hwapic_isr_update = NULL;
5898 kvm_x86_ops->deliver_posted_interrupt = NULL; 6108 kvm_x86_ops->deliver_posted_interrupt = NULL;
5899 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 6109 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
5900 } 6110 }
5901 6111
5902 if (nested)
5903 nested_vmx_setup_ctls_msrs();
5904
5905 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 6112 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
5906 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 6113 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
5907 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 6114 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void)
5945 6152
5946 update_ple_window_actual_max(); 6153 update_ple_window_actual_max();
5947 6154
6155 /*
6156	 * Only enable PML when the hardware supports it and both the EPT and
6157	 * EPT A/D bit features are enabled -- PML depends on them to work.
6158 */
6159 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6160 enable_pml = 0;
6161
6162 if (!enable_pml) {
6163 kvm_x86_ops->slot_enable_log_dirty = NULL;
6164 kvm_x86_ops->slot_disable_log_dirty = NULL;
6165 kvm_x86_ops->flush_log_dirty = NULL;
6166 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6167 }
6168
5948 return alloc_kvm_area(); 6169 return alloc_kvm_area();
5949 6170
5950out7: 6171out8:
5951 free_page((unsigned long)vmx_vmwrite_bitmap); 6172 free_page((unsigned long)vmx_vmwrite_bitmap);
5952out6: 6173out7:
5953 free_page((unsigned long)vmx_vmread_bitmap); 6174 free_page((unsigned long)vmx_vmread_bitmap);
6175out6:
6176 if (nested)
6177 free_page((unsigned long)vmx_msr_bitmap_nested);
5954out5: 6178out5:
5955 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6179 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
5956out4: 6180out4:
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void)
5977 free_page((unsigned long)vmx_io_bitmap_a); 6201 free_page((unsigned long)vmx_io_bitmap_a);
5978 free_page((unsigned long)vmx_vmwrite_bitmap); 6202 free_page((unsigned long)vmx_vmwrite_bitmap);
5979 free_page((unsigned long)vmx_vmread_bitmap); 6203 free_page((unsigned long)vmx_vmread_bitmap);
6204 if (nested)
6205 free_page((unsigned long)vmx_msr_bitmap_nested);
5980 6206
5981 free_kvm_area(); 6207 free_kvm_area();
5982} 6208}
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
6143 */ 6369 */
6144} 6370}
6145 6371
6372static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
6373{
 6374 /* TODO: should not simply reset the guest here. */
6375 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6376 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
6377}
6378
6146static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6379static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
6147{ 6380{
6148 struct vcpu_vmx *vmx = 6381 struct vcpu_vmx *vmx =
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6665 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6433 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6666 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6434 } 6667 }
6668 vmx->nested.posted_intr_nv = -1;
6435 kunmap(vmx->nested.current_vmcs12_page); 6669 kunmap(vmx->nested.current_vmcs12_page);
6436 nested_release_page(vmx->nested.current_vmcs12_page); 6670 nested_release_page(vmx->nested.current_vmcs12_page);
6437 vmx->nested.current_vmptr = -1ull; 6671 vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx)
6460 nested_release_page(vmx->nested.virtual_apic_page); 6694 nested_release_page(vmx->nested.virtual_apic_page);
6461 vmx->nested.virtual_apic_page = NULL; 6695 vmx->nested.virtual_apic_page = NULL;
6462 } 6696 }
6697 if (vmx->nested.pi_desc_page) {
6698 kunmap(vmx->nested.pi_desc_page);
6699 nested_release_page(vmx->nested.pi_desc_page);
6700 vmx->nested.pi_desc_page = NULL;
6701 vmx->nested.pi_desc = NULL;
6702 }
6463 6703
6464 nested_free_all_saved_vmcss(vmx); 6704 nested_free_all_saved_vmcss(vmx);
6465} 6705}
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6893/* Emulate the INVEPT instruction */ 7133/* Emulate the INVEPT instruction */
6894static int handle_invept(struct kvm_vcpu *vcpu) 7134static int handle_invept(struct kvm_vcpu *vcpu)
6895{ 7135{
7136 struct vcpu_vmx *vmx = to_vmx(vcpu);
6896 u32 vmx_instruction_info, types; 7137 u32 vmx_instruction_info, types;
6897 unsigned long type; 7138 unsigned long type;
6898 gva_t gva; 7139 gva_t gva;
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6901 u64 eptp, gpa; 7142 u64 eptp, gpa;
6902 } operand; 7143 } operand;
6903 7144
6904 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 7145 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
6905 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7146 SECONDARY_EXEC_ENABLE_EPT) ||
7147 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6906 kvm_queue_exception(vcpu, UD_VECTOR); 7148 kvm_queue_exception(vcpu, UD_VECTOR);
6907 return 1; 7149 return 1;
6908 } 7150 }
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6918 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7160 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6919 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7161 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
6920 7162
6921 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7163 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6922 7164
6923 if (!(types & (1UL << type))) { 7165 if (!(types & (1UL << type))) {
6924 nested_vmx_failValid(vcpu, 7166 nested_vmx_failValid(vcpu,
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
6960 return 1; 7202 return 1;
6961} 7203}
6962 7204
7205static int handle_pml_full(struct kvm_vcpu *vcpu)
7206{
7207 unsigned long exit_qualification;
7208
7209 trace_kvm_pml_full(vcpu->vcpu_id);
7210
7211 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7212
7213 /*
 7214 * If the PML buffer became full while executing IRET from an NMI,
 7215 * the "blocked by NMI" bit has to be set before the next VM entry.
7216 */
7217 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7218 cpu_has_virtual_nmis() &&
7219 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7220 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7221 GUEST_INTR_STATE_NMI);
7222
7223 /*
 7224 * The PML buffer was already flushed at the beginning of the VMEXIT.
 7225 * Nothing to do here, and no userspace involvement is needed for PML.
7226 */
7227 return 1;
7228}
7229
6963/* 7230/*
6964 * The exit handlers return 1 if the exit was handled fully and guest execution 7231 * The exit handlers return 1 if the exit was handled fully and guest execution
6965 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7232 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7008 [EXIT_REASON_INVVPID] = handle_invvpid, 7275 [EXIT_REASON_INVVPID] = handle_invvpid,
7009 [EXIT_REASON_XSAVES] = handle_xsaves, 7276 [EXIT_REASON_XSAVES] = handle_xsaves,
7010 [EXIT_REASON_XRSTORS] = handle_xrstors, 7277 [EXIT_REASON_XRSTORS] = handle_xrstors,
7278 [EXIT_REASON_PML_FULL] = handle_pml_full,
7011}; 7279};
7012 7280
7013static const int kvm_vmx_max_exit_handlers = 7281static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7275 case EXIT_REASON_APIC_ACCESS: 7543 case EXIT_REASON_APIC_ACCESS:
7276 return nested_cpu_has2(vmcs12, 7544 return nested_cpu_has2(vmcs12,
7277 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7545 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
7546 case EXIT_REASON_APIC_WRITE:
7547 case EXIT_REASON_EOI_INDUCED:
7548 /* apic_write and eoi_induced should exit unconditionally. */
7549 return 1;
7278 case EXIT_REASON_EPT_VIOLATION: 7550 case EXIT_REASON_EPT_VIOLATION:
7279 /* 7551 /*
7280 * L0 always deals with the EPT violation. If nested EPT is 7552 * L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7314 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7586 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7315} 7587}
7316 7588
7589static int vmx_enable_pml(struct vcpu_vmx *vmx)
7590{
7591 struct page *pml_pg;
7592 u32 exec_control;
7593
7594 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7595 if (!pml_pg)
7596 return -ENOMEM;
7597
7598 vmx->pml_pg = pml_pg;
7599
7600 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7601 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7602
7603 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7604 exec_control |= SECONDARY_EXEC_ENABLE_PML;
7605 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7606
7607 return 0;
7608}
7609
7610static void vmx_disable_pml(struct vcpu_vmx *vmx)
7611{
7612 u32 exec_control;
7613
7614 ASSERT(vmx->pml_pg);
7615 __free_page(vmx->pml_pg);
7616 vmx->pml_pg = NULL;
7617
7618 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7619 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7620 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7621}
7622
7623static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7624{
7625 struct kvm *kvm = vmx->vcpu.kvm;
7626 u64 *pml_buf;
7627 u16 pml_idx;
7628
7629 pml_idx = vmcs_read16(GUEST_PML_INDEX);
7630
7631 /* Do nothing if PML buffer is empty */
7632 if (pml_idx == (PML_ENTITY_NUM - 1))
7633 return;
7634
7635 /* PML index always points to next available PML buffer entity */
7636 if (pml_idx >= PML_ENTITY_NUM)
7637 pml_idx = 0;
7638 else
7639 pml_idx++;
7640
7641 pml_buf = page_address(vmx->pml_pg);
7642 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7643 u64 gpa;
7644
7645 gpa = pml_buf[pml_idx];
7646 WARN_ON(gpa & (PAGE_SIZE - 1));
7647 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
7648 }
7649
7650 /* reset PML index */
7651 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7652}
7653
7654/*
 7655 * Flush all vcpus' PML buffers and record the logged GPAs in dirty_bitmap.
7656 * Called before reporting dirty_bitmap to userspace.
7657 */
7658static void kvm_flush_pml_buffers(struct kvm *kvm)
7659{
7660 int i;
7661 struct kvm_vcpu *vcpu;
7662 /*
 7663 * We only need to kick each vcpu out of guest mode here: the PML
 7664 * buffer is flushed at the beginning of every VMEXIT, so only vcpus
 7665 * currently running in guest mode can have unflushed GPAs in their
 7666 * PML buffer.
7667 */
7668 kvm_for_each_vcpu(i, vcpu, kvm)
7669 kvm_vcpu_kick(vcpu);
7670}
7671
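For readers skimming the hunk above, a standalone sketch of the PML index convention (illustrative userspace code, not part of the patch; PML_ENTITY_NUM and a page shift of 12 are taken as given): GUEST_PML_INDEX starts at 511 and the CPU decrements it as it logs GPAs, so 511 means the buffer is empty, a value at or above 512 means it wrapped and is full, and otherwise entries pml_idx+1..511 hold valid GPAs.

/* Standalone sketch of the PML buffer walk; names mirror the patch above. */
#include <stdio.h>
#include <stdint.h>

#define PML_ENTITY_NUM 512

static void flush_pml_sketch(const uint64_t *pml_buf, uint16_t pml_idx)
{
	if (pml_idx == PML_ENTITY_NUM - 1)
		return;			/* buffer empty, nothing was logged */

	if (pml_idx >= PML_ENTITY_NUM)	/* index wrapped: buffer is full */
		pml_idx = 0;
	else				/* entries pml_idx+1 .. 511 are valid */
		pml_idx++;

	for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
		printf("dirty gfn 0x%llx\n",
		       (unsigned long long)(pml_buf[pml_idx] >> 12));
}

int main(void)
{
	uint64_t buf[PML_ENTITY_NUM] = { 0 };

	/* Pretend the CPU logged two GPAs into entries 511 and 510. */
	buf[511] = 0x1000;
	buf[510] = 0x2000;
	flush_pml_sketch(buf, 509);	/* index now points at the next free entry */
	return 0;
}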
7317/* 7672/*
7318 * The guest has exited. See if we can fix it or if we need userspace 7673 * The guest has exited. See if we can fix it or if we need userspace
7319 * assistance. 7674 * assistance.
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7324 u32 exit_reason = vmx->exit_reason; 7679 u32 exit_reason = vmx->exit_reason;
7325 u32 vectoring_info = vmx->idt_vectoring_info; 7680 u32 vectoring_info = vmx->idt_vectoring_info;
7326 7681
7682 /*
 7683 * Flush the GPAs logged in the PML buffer so that dirty_bitmap stays
 7684 * up to date. A further benefit is that kvm_vm_ioctl_get_dirty_log
 7685 * only needs to kick all vcpus out of guest mode before querying
 7686 * dirty_bitmap: once a vcpu is back in root mode, its PML buffer has
 7687 * already been flushed.
7688 */
7689 if (enable_pml)
7690 vmx_flush_pml_buffer(vmx);
7691
7327 /* If guest state is invalid, start emulating */ 7692 /* If guest state is invalid, start emulating */
7328 if (vmx->emulation_required) 7693 if (vmx->emulation_required)
7329 return handle_invalid_guest_state(vcpu); 7694 return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
7471 u16 status; 7836 u16 status;
7472 u8 old; 7837 u8 old;
7473 7838
7474 if (!vmx_vm_has_apicv(kvm))
7475 return;
7476
7477 if (isr == -1) 7839 if (isr == -1)
7478 isr = 0; 7840 isr = 0;
7479 7841
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7973{ 8335{
7974 struct vcpu_vmx *vmx = to_vmx(vcpu); 8336 struct vcpu_vmx *vmx = to_vmx(vcpu);
7975 8337
8338 if (enable_pml)
8339 vmx_disable_pml(vmx);
7976 free_vpid(vmx); 8340 free_vpid(vmx);
7977 leave_guest_mode(vcpu); 8341 leave_guest_mode(vcpu);
7978 vmx_load_vmcs01(vcpu); 8342 vmx_load_vmcs01(vcpu);
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8040 goto free_vmcs; 8404 goto free_vmcs;
8041 } 8405 }
8042 8406
8407 if (nested)
8408 nested_vmx_setup_ctls_msrs(vmx);
8409
8410 vmx->nested.posted_intr_nv = -1;
8043 vmx->nested.current_vmptr = -1ull; 8411 vmx->nested.current_vmptr = -1ull;
8044 vmx->nested.current_vmcs12 = NULL; 8412 vmx->nested.current_vmcs12 = NULL;
8045 8413
8414 /*
 8415 * If PML is turned on, failure to enable PML simply fails creation of
 8416 * the vcpu. This keeps the PML logic simple (no need to handle cases
 8417 * such as PML being enabled on only some of the guest's vcpus).
8419 */
8420 if (enable_pml) {
8421 err = vmx_enable_pml(vmx);
8422 if (err)
8423 goto free_vmcs;
8424 }
8425
8046 return &vmx->vcpu; 8426 return &vmx->vcpu;
8047 8427
8048free_vmcs: 8428free_vmcs:
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
8184 8564
8185static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 8565static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
8186{ 8566{
8187 kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, 8567 WARN_ON(mmu_is_nested(vcpu));
8188 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); 8568 kvm_init_shadow_ept_mmu(vcpu,
8189 8569 to_vmx(vcpu)->nested.nested_vmx_ept_caps &
8570 VMX_EPT_EXECUTE_ONLY_BIT);
8190 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 8571 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
8191 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 8572 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
8192 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 8573 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
8199 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 8580 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
8200} 8581}
8201 8582
8583static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
8584 u16 error_code)
8585{
8586 bool inequality, bit;
8587
8588 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
8589 inequality =
8590 (error_code & vmcs12->page_fault_error_code_mask) !=
8591 vmcs12->page_fault_error_code_match;
8592 return inequality ^ bit;
8593}
8594
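A hedged, self-contained illustration of the PFEC_MASK/PFEC_MATCH rule that nested_vmx_is_page_fault_vmexit() encodes (the mask/match values below are made up): with EB.PF set, the fault is reflected to L1 only when the masked error code equals PFEC_MATCH; with EB.PF clear, only when it differs.

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* Same decision as the patch: vmexit iff (masked ec != match) XOR EB.PF. */
static bool pf_vmexit(uint32_t eb_pf, uint32_t mask, uint32_t match, uint16_t ec)
{
	bool bit = eb_pf != 0;
	bool inequality = (ec & mask) != match;

	return inequality ^ bit;
}

int main(void)
{
	/* EB.PF set, mask/match zero: every page fault exits to L1. */
	printf("%d\n", pf_vmexit(1, 0x0, 0x0, 0x6));	/* 1 */
	/* EB.PF set, but only write faults (error code bit 1) should exit. */
	printf("%d\n", pf_vmexit(1, 0x2, 0x2, 0x4));	/* read fault: 0 */
	printf("%d\n", pf_vmexit(1, 0x2, 0x2, 0x6));	/* write fault: 1 */
	return 0;
}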
8202static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 8595static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8203 struct x86_exception *fault) 8596 struct x86_exception *fault)
8204{ 8597{
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8206 8599
8207 WARN_ON(!is_guest_mode(vcpu)); 8600 WARN_ON(!is_guest_mode(vcpu));
8208 8601
8209 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 8602 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
8210 if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
8211 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 8603 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
8212 vmcs_read32(VM_EXIT_INTR_INFO), 8604 vmcs_read32(VM_EXIT_INTR_INFO),
8213 vmcs_readl(EXIT_QUALIFICATION)); 8605 vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8261 return false; 8653 return false;
8262 } 8654 }
8263 8655
8656 if (nested_cpu_has_posted_intr(vmcs12)) {
8657 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
8658 return false;
8659
8660 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
8661 kunmap(vmx->nested.pi_desc_page);
8662 nested_release_page(vmx->nested.pi_desc_page);
8663 }
8664 vmx->nested.pi_desc_page =
8665 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
8666 if (!vmx->nested.pi_desc_page)
8667 return false;
8668
8669 vmx->nested.pi_desc =
8670 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
8671 if (!vmx->nested.pi_desc) {
8672 nested_release_page_clean(vmx->nested.pi_desc_page);
8673 return false;
8674 }
8675 vmx->nested.pi_desc =
8676 (struct pi_desc *)((void *)vmx->nested.pi_desc +
8677 (unsigned long)(vmcs12->posted_intr_desc_addr &
8678 (PAGE_SIZE - 1)));
8679 }
8680
8264 return true; 8681 return true;
8265} 8682}
8266 8683
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
8286 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8703 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
8287} 8704}
8288 8705
8706static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
8707 struct vmcs12 *vmcs12)
8708{
8709 int maxphyaddr;
8710 u64 addr;
8711
8712 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8713 return 0;
8714
8715 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
8716 WARN_ON(1);
8717 return -EINVAL;
8718 }
8719 maxphyaddr = cpuid_maxphyaddr(vcpu);
8720
8721 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
8722 ((addr + PAGE_SIZE) >> maxphyaddr))
8723 return -EINVAL;
8724
8725 return 0;
8726}
8727
8728/*
 8729 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 8730 * we do not use the hardware MSR bitmap.
8731 */
8732static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
8733 struct vmcs12 *vmcs12)
8734{
8735 int msr;
8736 struct page *page;
8737 unsigned long *msr_bitmap;
8738
8739 if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
8740 return false;
8741
8742 page = nested_get_page(vcpu, vmcs12->msr_bitmap);
8743 if (!page) {
8744 WARN_ON(1);
8745 return false;
8746 }
8747 msr_bitmap = (unsigned long *)kmap(page);
8748 if (!msr_bitmap) {
8749 nested_release_page_clean(page);
8750 WARN_ON(1);
8751 return false;
8752 }
8753
8754 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
8755 if (nested_cpu_has_apic_reg_virt(vmcs12))
8756 for (msr = 0x800; msr <= 0x8ff; msr++)
8757 nested_vmx_disable_intercept_for_msr(
8758 msr_bitmap,
8759 vmx_msr_bitmap_nested,
8760 msr, MSR_TYPE_R);
8761 /* TPR is allowed */
8762 nested_vmx_disable_intercept_for_msr(msr_bitmap,
8763 vmx_msr_bitmap_nested,
8764 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8765 MSR_TYPE_R | MSR_TYPE_W);
8766 if (nested_cpu_has_vid(vmcs12)) {
8767 /* EOI and self-IPI are allowed */
8768 nested_vmx_disable_intercept_for_msr(
8769 msr_bitmap,
8770 vmx_msr_bitmap_nested,
8771 APIC_BASE_MSR + (APIC_EOI >> 4),
8772 MSR_TYPE_W);
8773 nested_vmx_disable_intercept_for_msr(
8774 msr_bitmap,
8775 vmx_msr_bitmap_nested,
8776 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8777 MSR_TYPE_W);
8778 }
8779 } else {
8780 /*
 8781 * Enable the read intercept for all x2apic
 8782 * MSRs. We should not rely on vmcs12 for any
 8783 * optimizations here, since it may have been
 8784 * modified by L1.
8785 */
8786 for (msr = 0x800; msr <= 0x8ff; msr++)
8787 __vmx_enable_intercept_for_msr(
8788 vmx_msr_bitmap_nested,
8789 msr,
8790 MSR_TYPE_R);
8791
8792 __vmx_enable_intercept_for_msr(
8793 vmx_msr_bitmap_nested,
8794 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8795 MSR_TYPE_W);
8796 __vmx_enable_intercept_for_msr(
8797 vmx_msr_bitmap_nested,
8798 APIC_BASE_MSR + (APIC_EOI >> 4),
8799 MSR_TYPE_W);
8800 __vmx_enable_intercept_for_msr(
8801 vmx_msr_bitmap_nested,
8802 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8803 MSR_TYPE_W);
8804 }
8805 kunmap(page);
8806 nested_release_page_clean(page);
8807
8808 return true;
8809}
8810
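As background for the merge above, a small standalone sketch of the VMX MSR bitmap layout (per the SDM: read bitmaps for low and high MSRs in the first two 1 KB chunks of the 4 KB page, write bitmaps in the last two). The helper below is illustrative only and is not the kernel's __vmx_disable_intercept_for_msr().

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Clear the read-intercept bit for one MSR in a VMX MSR bitmap page. */
static void disable_read_intercept(uint8_t *bitmap, uint32_t msr)
{
	uint8_t *read_map;

	if (msr <= 0x1fff) {
		read_map = bitmap;		/* low MSRs, read half */
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr &= 0x1fff;
		read_map = bitmap + 1024;	/* high MSRs, read half */
	} else {
		return;				/* outside both ranges: always intercepted */
	}

	read_map[msr / 8] &= ~(1u << (msr % 8));
}

int main(void)
{
	static uint8_t bitmap[PAGE_SIZE];

	memset(bitmap, 0xff, sizeof(bitmap));	/* start with everything intercepted */
	disable_read_intercept(bitmap, 0x808);	/* x2APIC TPR, as in the patch */
	printf("byte 0x%x is now 0x%02x\n", 0x808 / 8, bitmap[0x808 / 8]);
	return 0;
}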
8811static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8812 struct vmcs12 *vmcs12)
8813{
8814 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8815 !nested_cpu_has_apic_reg_virt(vmcs12) &&
8816 !nested_cpu_has_vid(vmcs12) &&
8817 !nested_cpu_has_posted_intr(vmcs12))
8818 return 0;
8819
8820 /*
8821 * If virtualize x2apic mode is enabled,
8822 * virtualize apic access must be disabled.
8823 */
8824 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8825 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
8826 return -EINVAL;
8827
8828 /*
8829 * If virtual interrupt delivery is enabled,
8830 * we must exit on external interrupts.
8831 */
8832 if (nested_cpu_has_vid(vmcs12) &&
8833 !nested_exit_on_intr(vcpu))
8834 return -EINVAL;
8835
8836 /*
 8837 * Bits 15:8 must be zero in posted_intr_nv;
 8838 * the descriptor address has already been checked
 8839 * in nested_get_vmcs12_pages.
8840 */
8841 if (nested_cpu_has_posted_intr(vmcs12) &&
8842 (!nested_cpu_has_vid(vmcs12) ||
8843 !nested_exit_intr_ack_set(vcpu) ||
8844 vmcs12->posted_intr_nv & 0xff00))
8845 return -EINVAL;
8846
8847 /* tpr shadow is needed by all apicv features. */
8848 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8849 return -EINVAL;
8850
8851 return 0;
8852}
8853
8854static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8855 unsigned long count_field,
8856 unsigned long addr_field,
8857 int maxphyaddr)
8858{
8859 u64 count, addr;
8860
8861 if (vmcs12_read_any(vcpu, count_field, &count) ||
8862 vmcs12_read_any(vcpu, addr_field, &addr)) {
8863 WARN_ON(1);
8864 return -EINVAL;
8865 }
8866 if (count == 0)
8867 return 0;
8868 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8869 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8870 pr_warn_ratelimited(
8871 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
8872 addr_field, maxphyaddr, count, addr);
8873 return -EINVAL;
8874 }
8875 return 0;
8876}
8877
8878static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8879 struct vmcs12 *vmcs12)
8880{
8881 int maxphyaddr;
8882
8883 if (vmcs12->vm_exit_msr_load_count == 0 &&
8884 vmcs12->vm_exit_msr_store_count == 0 &&
8885 vmcs12->vm_entry_msr_load_count == 0)
8886 return 0; /* Fast path */
8887 maxphyaddr = cpuid_maxphyaddr(vcpu);
8888 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8889 VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
8890 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8891 VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
8892 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8893 VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
8894 return -EINVAL;
8895 return 0;
8896}
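A minimal standalone sketch of the address check nested_vmx_check_msr_switch() performs (16 is sizeof(struct vmx_msr_entry); maxphyaddr would come from CPUID as in the patch): the area must be 16-byte aligned and the whole count * 16-byte region must fit below 1 << maxphyaddr.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative check, mirroring nested_vmx_check_msr_switch() above. */
static bool msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
{
	uint64_t entry_size = 16;	/* sizeof(struct vmx_msr_entry) */

	if (count == 0)
		return true;
	if (addr & 0xf)				/* not 16-byte aligned */
		return false;
	if (addr >> maxphyaddr)			/* start beyond physical width */
		return false;
	if ((addr + count * entry_size - 1) >> maxphyaddr)	/* end beyond it */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", msr_switch_area_ok(0x10000, 4, 36));		/* 1: valid */
	printf("%d\n", msr_switch_area_ok(0x10008, 4, 36));		/* 0: misaligned */
	printf("%d\n", msr_switch_area_ok((1ull << 36) - 16, 2, 36));	/* 0: spills over */
	return 0;
}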
8897
8898static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8899 struct vmx_msr_entry *e)
8900{
8901 /* x2APIC MSR accesses are not allowed */
8902 if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
8903 return -EINVAL;
8904 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8905 e->index == MSR_IA32_UCODE_REV)
8906 return -EINVAL;
8907 if (e->reserved != 0)
8908 return -EINVAL;
8909 return 0;
8910}
8911
8912static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
8913 struct vmx_msr_entry *e)
8914{
8915 if (e->index == MSR_FS_BASE ||
8916 e->index == MSR_GS_BASE ||
8917 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
8918 nested_vmx_msr_check_common(vcpu, e))
8919 return -EINVAL;
8920 return 0;
8921}
8922
8923static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
8924 struct vmx_msr_entry *e)
8925{
8926 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
8927 nested_vmx_msr_check_common(vcpu, e))
8928 return -EINVAL;
8929 return 0;
8930}
8931
8932/*
 8933 * Load the guest's/host's MSRs at nested entry/exit.
 8934 * Returns 0 on success, or the (1-based) index of the failing entry.
8935 */
8936static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8937{
8938 u32 i;
8939 struct vmx_msr_entry e;
8940 struct msr_data msr;
8941
8942 msr.host_initiated = false;
8943 for (i = 0; i < count; i++) {
8944 if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
8945 &e, sizeof(e))) {
8946 pr_warn_ratelimited(
8947 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8948 __func__, i, gpa + i * sizeof(e));
8949 goto fail;
8950 }
8951 if (nested_vmx_load_msr_check(vcpu, &e)) {
8952 pr_warn_ratelimited(
8953 "%s check failed (%u, 0x%x, 0x%x)\n",
8954 __func__, i, e.index, e.reserved);
8955 goto fail;
8956 }
8957 msr.index = e.index;
8958 msr.data = e.value;
8959 if (kvm_set_msr(vcpu, &msr)) {
8960 pr_warn_ratelimited(
8961 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
8962 __func__, i, e.index, e.value);
8963 goto fail;
8964 }
8965 }
8966 return 0;
8967fail:
8968 return i + 1;
8969}
8970
8971static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8972{
8973 u32 i;
8974 struct vmx_msr_entry e;
8975
8976 for (i = 0; i < count; i++) {
8977 if (kvm_read_guest(vcpu->kvm,
8978 gpa + i * sizeof(e),
8979 &e, 2 * sizeof(u32))) {
8980 pr_warn_ratelimited(
8981 "%s cannot read MSR entry (%u, 0x%08llx)\n",
8982 __func__, i, gpa + i * sizeof(e));
8983 return -EINVAL;
8984 }
8985 if (nested_vmx_store_msr_check(vcpu, &e)) {
8986 pr_warn_ratelimited(
8987 "%s check failed (%u, 0x%x, 0x%x)\n",
8988 __func__, i, e.index, e.reserved);
8989 return -EINVAL;
8990 }
8991 if (kvm_get_msr(vcpu, e.index, &e.value)) {
8992 pr_warn_ratelimited(
8993 "%s cannot read MSR (%u, 0x%x)\n",
8994 __func__, i, e.index);
8995 return -EINVAL;
8996 }
8997 if (kvm_write_guest(vcpu->kvm,
8998 gpa + i * sizeof(e) +
8999 offsetof(struct vmx_msr_entry, value),
9000 &e.value, sizeof(e.value))) {
9001 pr_warn_ratelimited(
9002 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9003 __func__, i, e.index, e.value);
9004 return -EINVAL;
9005 }
9006 }
9007 return 0;
9008}
9009
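A hedged sketch of the guest-memory format these two helpers walk: struct vmx_msr_entry is 16 bytes (a 32-bit MSR index, 32 reserved bits that must be zero, a 64-bit value), and on failure the 1-based index of the offending entry is returned, matching nested_vmx_load_msr() above. Guest-memory access and the MSR write are simulated with a plain array and a printf.

#include <stdint.h>
#include <stdio.h>

struct vmx_msr_entry {
	uint32_t index;
	uint32_t reserved;	/* must be zero */
	uint64_t value;
};

/* Returns 0 on success, or the 1-based index of the first bad entry. */
static uint32_t load_msrs_sketch(const struct vmx_msr_entry *area, uint32_t count)
{
	uint32_t i;

	for (i = 0; i < count; i++) {
		if (area[i].reserved != 0)
			return i + 1;
		printf("wrmsr(0x%x, 0x%llx)\n", area[i].index,
		       (unsigned long long)area[i].value);
	}
	return 0;
}

int main(void)
{
	struct vmx_msr_entry area[2] = {
		{ .index = 0x277, .value = 0x7040600070406ull },	/* IA32_PAT */
		{ .index = 0x10,  .reserved = 1, .value = 0 },		/* bad entry */
	};

	printf("failed at entry %u\n", load_msrs_sketch(area, 2));	/* prints 2 */
	return 0;
}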
8289/* 9010/*
8290 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9011 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
8291 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9012 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8365 9086
8366 exec_control = vmcs12->pin_based_vm_exec_control; 9087 exec_control = vmcs12->pin_based_vm_exec_control;
8367 exec_control |= vmcs_config.pin_based_exec_ctrl; 9088 exec_control |= vmcs_config.pin_based_exec_ctrl;
8368 exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | 9089 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
8369 PIN_BASED_POSTED_INTR); 9090
9091 if (nested_cpu_has_posted_intr(vmcs12)) {
9092 /*
9093 * Note that we use L0's vector here and in
9094 * vmx_deliver_nested_posted_interrupt.
9095 */
9096 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
9097 vmx->nested.pi_pending = false;
9098 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
9099 vmcs_write64(POSTED_INTR_DESC_ADDR,
9100 page_to_phys(vmx->nested.pi_desc_page) +
9101 (unsigned long)(vmcs12->posted_intr_desc_addr &
9102 (PAGE_SIZE - 1)));
9103 } else
9104 exec_control &= ~PIN_BASED_POSTED_INTR;
9105
8370 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9106 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
8371 9107
8372 vmx->nested.preemption_timer_expired = false; 9108 vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8423 else 9159 else
8424 vmcs_write64(APIC_ACCESS_ADDR, 9160 vmcs_write64(APIC_ACCESS_ADDR,
8425 page_to_phys(vmx->nested.apic_access_page)); 9161 page_to_phys(vmx->nested.apic_access_page));
8426 } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { 9162 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9163 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
8427 exec_control |= 9164 exec_control |=
8428 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9165 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
8429 kvm_vcpu_reload_apic_access_page(vcpu); 9166 kvm_vcpu_reload_apic_access_page(vcpu);
8430 } 9167 }
8431 9168
9169 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
9170 vmcs_write64(EOI_EXIT_BITMAP0,
9171 vmcs12->eoi_exit_bitmap0);
9172 vmcs_write64(EOI_EXIT_BITMAP1,
9173 vmcs12->eoi_exit_bitmap1);
9174 vmcs_write64(EOI_EXIT_BITMAP2,
9175 vmcs12->eoi_exit_bitmap2);
9176 vmcs_write64(EOI_EXIT_BITMAP3,
9177 vmcs12->eoi_exit_bitmap3);
9178 vmcs_write16(GUEST_INTR_STATUS,
9179 vmcs12->guest_intr_status);
9180 }
9181
8432 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9182 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
8433 } 9183 }
8434 9184
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8462 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9212 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
8463 } 9213 }
8464 9214
9215 if (cpu_has_vmx_msr_bitmap() &&
9216 exec_control & CPU_BASED_USE_MSR_BITMAPS &&
9217 nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
9218 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
9219 } else
9220 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
9221
8465 /* 9222 /*
8466 * Merging of IO and MSR bitmaps not currently supported. 9223 * Merging of IO bitmap not currently supported.
8467 * Rather, exit every time. 9224 * Rather, exit every time.
8468 */ 9225 */
8469 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
8470 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9226 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
8471 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9227 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
8472 9228
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8582 int cpu; 9338 int cpu;
8583 struct loaded_vmcs *vmcs02; 9339 struct loaded_vmcs *vmcs02;
8584 bool ia32e; 9340 bool ia32e;
9341 u32 msr_entry_idx;
8585 9342
8586 if (!nested_vmx_check_permission(vcpu) || 9343 if (!nested_vmx_check_permission(vcpu) ||
8587 !nested_vmx_check_vmcs12(vcpu)) 9344 !nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8616 return 1; 9373 return 1;
8617 } 9374 }
8618 9375
8619 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 9376 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
8620 !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
8621 /*TODO: Also verify bits beyond physical address width are 0*/ 9377 /*TODO: Also verify bits beyond physical address width are 0*/
8622 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9378 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8623 return 1; 9379 return 1;
8624 } 9380 }
8625 9381
8626 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9382 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
8627 /*TODO: Also verify bits beyond physical address width are 0*/
8628 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9383 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8629 return 1; 9384 return 1;
8630 } 9385 }
8631 9386
8632 if (vmcs12->vm_entry_msr_load_count > 0 || 9387 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
8633 vmcs12->vm_exit_msr_load_count > 0 || 9388 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8634 vmcs12->vm_exit_msr_store_count > 0) { 9389 return 1;
8635 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", 9390 }
8636 __func__); 9391
9392 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
8637 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9393 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8638 return 1; 9394 return 1;
8639 } 9395 }
8640 9396
8641 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9397 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
8642 nested_vmx_true_procbased_ctls_low, 9398 vmx->nested.nested_vmx_true_procbased_ctls_low,
8643 nested_vmx_procbased_ctls_high) || 9399 vmx->nested.nested_vmx_procbased_ctls_high) ||
8644 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9400 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
8645 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || 9401 vmx->nested.nested_vmx_secondary_ctls_low,
9402 vmx->nested.nested_vmx_secondary_ctls_high) ||
8646 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9403 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
8647 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || 9404 vmx->nested.nested_vmx_pinbased_ctls_low,
9405 vmx->nested.nested_vmx_pinbased_ctls_high) ||
8648 !vmx_control_verify(vmcs12->vm_exit_controls, 9406 !vmx_control_verify(vmcs12->vm_exit_controls,
8649 nested_vmx_true_exit_ctls_low, 9407 vmx->nested.nested_vmx_true_exit_ctls_low,
8650 nested_vmx_exit_ctls_high) || 9408 vmx->nested.nested_vmx_exit_ctls_high) ||
8651 !vmx_control_verify(vmcs12->vm_entry_controls, 9409 !vmx_control_verify(vmcs12->vm_entry_controls,
8652 nested_vmx_true_entry_ctls_low, 9410 vmx->nested.nested_vmx_true_entry_ctls_low,
8653 nested_vmx_entry_ctls_high)) 9411 vmx->nested.nested_vmx_entry_ctls_high))
8654 { 9412 {
8655 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9413 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
8656 return 1; 9414 return 1;
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8663 return 1; 9421 return 1;
8664 } 9422 }
8665 9423
8666 if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || 9424 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
8667 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9425 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
8668 nested_vmx_entry_failure(vcpu, vmcs12, 9426 nested_vmx_entry_failure(vcpu, vmcs12,
8669 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9427 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
8739 9497
8740 vmx_segment_cache_clear(vmx); 9498 vmx_segment_cache_clear(vmx);
8741 9499
8742 vmcs12->launch_state = 1;
8743
8744 prepare_vmcs02(vcpu, vmcs12); 9500 prepare_vmcs02(vcpu, vmcs12);
8745 9501
9502 msr_entry_idx = nested_vmx_load_msr(vcpu,
9503 vmcs12->vm_entry_msr_load_addr,
9504 vmcs12->vm_entry_msr_load_count);
9505 if (msr_entry_idx) {
9506 leave_guest_mode(vcpu);
9507 vmx_load_vmcs01(vcpu);
9508 nested_vmx_entry_failure(vcpu, vmcs12,
9509 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9510 return 1;
9511 }
9512
9513 vmcs12->launch_state = 1;
9514
8746 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9515 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
8747 return kvm_emulate_halt(vcpu); 9516 return kvm_emulate_halt(vcpu);
8748 9517
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
8869 if (vmx->nested.nested_run_pending) 9638 if (vmx->nested.nested_run_pending)
8870 return -EBUSY; 9639 return -EBUSY;
8871 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 9640 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
9641 return 0;
8872 } 9642 }
8873 9643
8874 return 0; 9644 return vmx_complete_nested_posted_interrupt(vcpu);
8875} 9645}
8876 9646
8877static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 9647static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
8981 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 9751 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8982 } 9752 }
8983 9753
9754 if (nested_cpu_has_vid(vmcs12))
9755 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9756
8984 vmcs12->vm_entry_controls = 9757 vmcs12->vm_entry_controls =
8985 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 9758 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
8986 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 9759 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
9172 9945
9173 kvm_set_dr(vcpu, 7, 0x400); 9946 kvm_set_dr(vcpu, 7, 0x400);
9174 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 9947 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
9948
9949 if (cpu_has_vmx_msr_bitmap())
9950 vmx_set_msr_bitmap(vcpu);
9951
9952 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
9953 vmcs12->vm_exit_msr_load_count))
9954 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
9175} 9955}
9176 9956
9177/* 9957/*
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9193 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 9973 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
9194 exit_qualification); 9974 exit_qualification);
9195 9975
9976 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
9977 vmcs12->vm_exit_msr_store_count))
9978 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
9979
9196 vmx_load_vmcs01(vcpu); 9980 vmx_load_vmcs01(vcpu);
9197 9981
9198 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 9982 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9235 nested_release_page(vmx->nested.virtual_apic_page); 10019 nested_release_page(vmx->nested.virtual_apic_page);
9236 vmx->nested.virtual_apic_page = NULL; 10020 vmx->nested.virtual_apic_page = NULL;
9237 } 10021 }
10022 if (vmx->nested.pi_desc_page) {
10023 kunmap(vmx->nested.pi_desc_page);
10024 nested_release_page(vmx->nested.pi_desc_page);
10025 vmx->nested.pi_desc_page = NULL;
10026 vmx->nested.pi_desc = NULL;
10027 }
9238 10028
9239 /* 10029 /*
9240 * We are now running in L2, mmu_notifier will force to reload the 10030 * We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
9301 shrink_ple_window(vcpu); 10091 shrink_ple_window(vcpu);
9302} 10092}
9303 10093
10094static void vmx_slot_enable_log_dirty(struct kvm *kvm,
10095 struct kvm_memory_slot *slot)
10096{
10097 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
10098 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
10099}
10100
10101static void vmx_slot_disable_log_dirty(struct kvm *kvm,
10102 struct kvm_memory_slot *slot)
10103{
10104 kvm_mmu_slot_set_dirty(kvm, slot);
10105}
10106
10107static void vmx_flush_log_dirty(struct kvm *kvm)
10108{
10109 kvm_flush_pml_buffers(kvm);
10110}
10111
10112static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10113 struct kvm_memory_slot *memslot,
10114 gfn_t offset, unsigned long mask)
10115{
10116 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10117}
10118
9304static struct kvm_x86_ops vmx_x86_ops = { 10119static struct kvm_x86_ops vmx_x86_ops = {
9305 .cpu_has_kvm_support = cpu_has_kvm_support, 10120 .cpu_has_kvm_support = cpu_has_kvm_support,
9306 .disabled_by_bios = vmx_disabled_by_bios, 10121 .disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
9409 .check_nested_events = vmx_check_nested_events, 10224 .check_nested_events = vmx_check_nested_events,
9410 10225
9411 .sched_in = vmx_sched_in, 10226 .sched_in = vmx_sched_in,
10227
10228 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
10229 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10230 .flush_log_dirty = vmx_flush_log_dirty,
10231 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
9412}; 10232};
9413 10233
9414static int __init vmx_init(void) 10234static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
108static u32 tsc_tolerance_ppm = 250; 108static u32 tsc_tolerance_ppm = 250;
109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 109module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
110 110
111/* lapic timer advance (tscdeadline mode only) in nanoseconds */
112unsigned int lapic_timer_advance_ns = 0;
113module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
114
111static bool backwards_tsc_observed = false; 115static bool backwards_tsc_observed = false;
112 116
113#define KVM_NR_SHARED_MSRS 16 117#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
141 { "irq_window", VCPU_STAT(irq_window_exits) }, 145 { "irq_window", VCPU_STAT(irq_window_exits) },
142 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 146 { "nmi_window", VCPU_STAT(nmi_window_exits) },
143 { "halt_exits", VCPU_STAT(halt_exits) }, 147 { "halt_exits", VCPU_STAT(halt_exits) },
148 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
144 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 149 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
145 { "hypercalls", VCPU_STAT(hypercalls) }, 150 { "hypercalls", VCPU_STAT(hypercalls) },
146 { "request_irq", VCPU_STAT(request_irq_exits) }, 151 { "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
492} 497}
493EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); 498EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
494 499
495int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 500static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
496 void *data, int offset, int len, u32 access) 501 void *data, int offset, int len, u32 access)
497{ 502{
498 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, 503 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
643 } 648 }
644} 649}
645 650
646int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 651static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
647{ 652{
648 u64 xcr0 = xcr; 653 u64 xcr0 = xcr;
649 u64 old_xcr0 = vcpu->arch.xcr0; 654 u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
1083} 1088}
1084#endif 1089#endif
1085 1090
1091void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1092{
1093 /*
1094 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1095 * vcpu_enter_guest. This function is only called from
1096 * the physical CPU that is running vcpu.
1097 */
1098 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1099}
1086 1100
1087static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 1101static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1088{ 1102{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1180#endif 1194#endif
1181 1195
1182static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1196static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1183unsigned long max_tsc_khz; 1197static unsigned long max_tsc_khz;
1184 1198
1185static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 1199static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1186{ 1200{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1234 return tsc; 1248 return tsc;
1235} 1249}
1236 1250
1237void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1251static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1238{ 1252{
1239#ifdef CONFIG_X86_64 1253#ifdef CONFIG_X86_64
1240 bool vcpus_matched; 1254 bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1529 &ka->master_cycle_now); 1543 &ka->master_cycle_now);
1530 1544
1531 ka->use_master_clock = host_tsc_clocksource && vcpus_matched 1545 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1532 && !backwards_tsc_observed; 1546 && !backwards_tsc_observed
1547 && !ka->boot_vcpu_runs_old_kvmclock;
1533 1548
1534 if (ka->use_master_clock) 1549 if (ka->use_master_clock)
1535 atomic_set(&kvm_guest_has_master_clock, 1); 1550 atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2161 case MSR_KVM_SYSTEM_TIME_NEW: 2176 case MSR_KVM_SYSTEM_TIME_NEW:
2162 case MSR_KVM_SYSTEM_TIME: { 2177 case MSR_KVM_SYSTEM_TIME: {
2163 u64 gpa_offset; 2178 u64 gpa_offset;
2179 struct kvm_arch *ka = &vcpu->kvm->arch;
2180
2164 kvmclock_reset(vcpu); 2181 kvmclock_reset(vcpu);
2165 2182
2183 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2184 bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2185
2186 if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2187 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
2188 &vcpu->requests);
2189
2190 ka->boot_vcpu_runs_old_kvmclock = tmp;
2191 }
2192
2166 vcpu->arch.time = data; 2193 vcpu->arch.time = data;
2167 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 2194 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2168 2195
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2324{ 2351{
2325 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 2352 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2326} 2353}
2354EXPORT_SYMBOL_GPL(kvm_get_msr);
2327 2355
2328static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2356static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2329{ 2357{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2738 case KVM_CAP_READONLY_MEM: 2766 case KVM_CAP_READONLY_MEM:
2739 case KVM_CAP_HYPERV_TIME: 2767 case KVM_CAP_HYPERV_TIME:
2740 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 2768 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2769 case KVM_CAP_TSC_DEADLINE_TIMER:
2741#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2770#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2742 case KVM_CAP_ASSIGN_DEV_IRQ: 2771 case KVM_CAP_ASSIGN_DEV_IRQ:
2743 case KVM_CAP_PCI_2_3: 2772 case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2776 case KVM_CAP_TSC_CONTROL: 2805 case KVM_CAP_TSC_CONTROL:
2777 r = kvm_has_tsc_control; 2806 r = kvm_has_tsc_control;
2778 break; 2807 break;
2779 case KVM_CAP_TSC_DEADLINE_TIMER:
2780 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2781 break;
2782 default: 2808 default:
2783 r = 0; 2809 r = 0;
2784 break; 2810 break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3734 * @kvm: kvm instance 3760 * @kvm: kvm instance
3735 * @log: slot id and address to which we copy the log 3761 * @log: slot id and address to which we copy the log
3736 * 3762 *
3737 * We need to keep it in mind that VCPU threads can write to the bitmap 3763 * Steps 1-4 below provide general overview of dirty page logging. See
3738 * concurrently. So, to avoid losing data, we keep the following order for 3764 * kvm_get_dirty_log_protect() function description for additional details.
3739 * each bit: 3765 *
 3766 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
 3767 * always flush the TLB (step 4), even if a previous step failed and the
 3768 * dirty bitmap may be corrupt. Regardless of the outcome, the KVM logging
 3769 * API does not preclude a subsequent dirty log read by user space. Flushing
 3770 * the TLB ensures that writes will be marked dirty for the next log read.
3740 * 3771 *
3741 * 1. Take a snapshot of the bit and clear it if needed. 3772 * 1. Take a snapshot of the bit and clear it if needed.
3742 * 2. Write protect the corresponding page. 3773 * 2. Write protect the corresponding page.
3743 * 3. Flush TLB's if needed. 3774 * 3. Copy the snapshot to the userspace.
3744 * 4. Copy the snapshot to the userspace. 3775 * 4. Flush TLB's if needed.
3745 *
3746 * Between 2 and 3, the guest may write to the page using the remaining TLB
3747 * entry. This is not a problem because the page will be reported dirty at
3748 * step 4 using the snapshot taken before and step 3 ensures that successive
3749 * writes will be logged for the next call.
3750 */ 3776 */
3751int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 3777int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3752{ 3778{
3753 int r;
3754 struct kvm_memory_slot *memslot;
3755 unsigned long n, i;
3756 unsigned long *dirty_bitmap;
3757 unsigned long *dirty_bitmap_buffer;
3758 bool is_dirty = false; 3779 bool is_dirty = false;
3780 int r;
3759 3781
3760 mutex_lock(&kvm->slots_lock); 3782 mutex_lock(&kvm->slots_lock);
3761 3783
3762 r = -EINVAL; 3784 /*
3763 if (log->slot >= KVM_USER_MEM_SLOTS) 3785 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
3764 goto out; 3786 */
3765 3787 if (kvm_x86_ops->flush_log_dirty)
3766 memslot = id_to_memslot(kvm->memslots, log->slot); 3788 kvm_x86_ops->flush_log_dirty(kvm);
3767
3768 dirty_bitmap = memslot->dirty_bitmap;
3769 r = -ENOENT;
3770 if (!dirty_bitmap)
3771 goto out;
3772
3773 n = kvm_dirty_bitmap_bytes(memslot);
3774
3775 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3776 memset(dirty_bitmap_buffer, 0, n);
3777
3778 spin_lock(&kvm->mmu_lock);
3779
3780 for (i = 0; i < n / sizeof(long); i++) {
3781 unsigned long mask;
3782 gfn_t offset;
3783
3784 if (!dirty_bitmap[i])
3785 continue;
3786
3787 is_dirty = true;
3788
3789 mask = xchg(&dirty_bitmap[i], 0);
3790 dirty_bitmap_buffer[i] = mask;
3791
3792 offset = i * BITS_PER_LONG;
3793 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3794 }
3795
3796 spin_unlock(&kvm->mmu_lock);
3797 3789
3798 /* See the comments in kvm_mmu_slot_remove_write_access(). */ 3790 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
3799 lockdep_assert_held(&kvm->slots_lock);
3800 3791
3801 /* 3792 /*
3802 * All the TLBs can be flushed out of mmu lock, see the comments in 3793 * All the TLBs can be flushed out of mmu lock, see the comments in
3803 * kvm_mmu_slot_remove_write_access(). 3794 * kvm_mmu_slot_remove_write_access().
3804 */ 3795 */
3796 lockdep_assert_held(&kvm->slots_lock);
3805 if (is_dirty) 3797 if (is_dirty)
3806 kvm_flush_remote_tlbs(kvm); 3798 kvm_flush_remote_tlbs(kvm);
3807 3799
3808 r = -EFAULT;
3809 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3810 goto out;
3811
3812 r = 0;
3813out:
3814 mutex_unlock(&kvm->slots_lock); 3800 mutex_unlock(&kvm->slots_lock);
3815 return r; 3801 return r;
3816} 3802}
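From user space the reworked path above is still driven by the same KVM_GET_DIRTY_LOG ioctl; a rough usage sketch follows (error handling trimmed; vm_fd, slot and memory_size are placeholders, and a 4 KB page size is assumed).

#include <linux/kvm.h>
#include <string.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Fetch the dirty bitmap for one memslot; one bit per guest page. */
static void *get_dirty_log(int vm_fd, unsigned int slot, size_t memory_size)
{
	size_t npages = memory_size / 4096;
	size_t bitmap_size = ((npages + 63) / 64) * 8;	/* KVM rounds up to 64-bit words */
	struct kvm_dirty_log log;

	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = calloc(1, bitmap_size);

	/* With PML enabled, this is where flush_log_dirty() kicks in. */
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(log.dirty_bitmap);
		return NULL;
	}
	return log.dirty_bitmap;
}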
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4516 if (rc != X86EMUL_CONTINUE) 4502 if (rc != X86EMUL_CONTINUE)
4517 return rc; 4503 return rc;
4518 addr += now; 4504 addr += now;
4505 if (ctxt->mode != X86EMUL_MODE_PROT64)
4506 addr = (u32)addr;
4519 val += now; 4507 val += now;
4520 bytes -= now; 4508 bytes -= now;
4521 } 4509 }
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
4984 kvm_register_write(emul_to_vcpu(ctxt), reg, val); 4972 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4985} 4973}
4986 4974
4975static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
4976{
4977 kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
4978}
4979
4987static const struct x86_emulate_ops emulate_ops = { 4980static const struct x86_emulate_ops emulate_ops = {
4988 .read_gpr = emulator_read_gpr, 4981 .read_gpr = emulator_read_gpr,
4989 .write_gpr = emulator_write_gpr, 4982 .write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
5019 .put_fpu = emulator_put_fpu, 5012 .put_fpu = emulator_put_fpu,
5020 .intercept = emulator_intercept, 5013 .intercept = emulator_intercept,
5021 .get_cpuid = emulator_get_cpuid, 5014 .get_cpuid = emulator_get_cpuid,
5015 .set_nmi_mask = emulator_set_nmi_mask,
5022}; 5016};
5023 5017
5024static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 5018static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6311 } 6305 }
6312 6306
6313 trace_kvm_entry(vcpu->vcpu_id); 6307 trace_kvm_entry(vcpu->vcpu_id);
6308 wait_lapic_expire(vcpu);
6314 kvm_x86_ops->run(vcpu); 6309 kvm_x86_ops->run(vcpu);
6315 6310
6316 /* 6311 /*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
7041 return r; 7036 return r;
7042} 7037}
7043 7038
7044int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 7039void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7045{ 7040{
7046 int r;
7047 struct msr_data msr; 7041 struct msr_data msr;
7048 struct kvm *kvm = vcpu->kvm; 7042 struct kvm *kvm = vcpu->kvm;
7049 7043
7050 r = vcpu_load(vcpu); 7044 if (vcpu_load(vcpu))
7051 if (r) 7045 return;
7052 return r;
7053 msr.data = 0x0; 7046 msr.data = 0x0;
7054 msr.index = MSR_IA32_TSC; 7047 msr.index = MSR_IA32_TSC;
7055 msr.host_initiated = true; 7048 msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7058 7051
7059 schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 7052 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
7060 KVMCLOCK_SYNC_PERIOD); 7053 KVMCLOCK_SYNC_PERIOD);
7061
7062 return r;
7063} 7054}
7064 7055
7065void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 7056void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
7549 return 0; 7540 return 0;
7550} 7541}
7551 7542
7543static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
7544 struct kvm_memory_slot *new)
7545{
7546 /* Still write protect RO slot */
7547 if (new->flags & KVM_MEM_READONLY) {
7548 kvm_mmu_slot_remove_write_access(kvm, new);
7549 return;
7550 }
7551
7552 /*
7553 * Call kvm_x86_ops dirty logging hooks when they are valid.
7554 *
7555 * kvm_x86_ops->slot_disable_log_dirty is called when:
7556 *
 7557 * - KVM_MR_CREATE with dirty logging disabled
 7558 * - KVM_MR_FLAGS_ONLY with dirty logging disabled in the new flags
7559 *
 7560 * The reason is that, in the PML case, we need to set the D-bit for any
 7561 * slot with dirty logging disabled in order to eliminate unnecessary GPA
 7562 * logging in the PML buffer (and potential PML-buffer-full VMEXITs). This
 7563 * guarantees that leaving PML enabled for the guest's lifetime adds no
 7564 * additional overhead when the guest runs with dirty logging disabled for
 7565 * its memory slots.
7566 *
7567 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
7568 * to dirty logging mode.
7569 *
7570 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
7571 *
7572 * In case of write protect:
7573 *
7574 * Write protect all pages for dirty logging.
7575 *
7576 * All the sptes including the large sptes which point to this
7577 * slot are set to readonly. We can not create any new large
7578 * spte on this slot until the end of the logging.
7579 *
7580 * See the comments in fast_page_fault().
7581 */
7582 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
7583 if (kvm_x86_ops->slot_enable_log_dirty)
7584 kvm_x86_ops->slot_enable_log_dirty(kvm, new);
7585 else
7586 kvm_mmu_slot_remove_write_access(kvm, new);
7587 } else {
7588 if (kvm_x86_ops->slot_disable_log_dirty)
7589 kvm_x86_ops->slot_disable_log_dirty(kvm, new);
7590 }
7591}
7592
7552void kvm_arch_commit_memory_region(struct kvm *kvm, 7593void kvm_arch_commit_memory_region(struct kvm *kvm,
7553 struct kvm_userspace_memory_region *mem, 7594 struct kvm_userspace_memory_region *mem,
7554 const struct kvm_memory_slot *old, 7595 const struct kvm_memory_slot *old,
7555 enum kvm_mr_change change) 7596 enum kvm_mr_change change)
7556{ 7597{
7557 7598 struct kvm_memory_slot *new;
7558 int nr_mmu_pages = 0; 7599 int nr_mmu_pages = 0;
7559 7600
7560 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { 7601 if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7573 7614
7574 if (nr_mmu_pages) 7615 if (nr_mmu_pages)
7575 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7617
7618 /* It's OK to get 'new' slot here as it has already been installed */
7619 new = id_to_memslot(kvm->memslots, mem->slot);
7620
7576 /* 7621 /*
7577 * Write protect all pages for dirty logging. 7622 * Set up write protection and/or dirty logging for the new slot.
7578 * 7623 *
7579 * All the sptes including the large sptes which point to this 7624 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
7580 * slot are set to readonly. We can not create any new large 7625 * been zapped so no dirty logging staff is needed for old slot. For
7581 * spte on this slot until the end of the logging. 7626 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
7582 * 7627 * new and it's also covered when dealing with the new slot.
7583 * See the comments in fast_page_fault().
7584 */ 7628 */
7585 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7629 if (change != KVM_MR_DELETE)
7586 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7630 kvm_mmu_slot_apply_flags(kvm, new);
7587} 7631}
7588 7632
7589void kvm_arch_flush_shadow_all(struct kvm *kvm) 7633void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7837EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7881EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7838EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 7882EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 7883EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
7884EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
147 147
148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 148void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 149void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
150void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
150int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 151int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
151 152
152void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); 153void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
170 171
171extern unsigned int min_timer_period_us; 172extern unsigned int min_timer_period_us;
172 173
174extern unsigned int lapic_timer_advance_ns;
175
173extern struct static_key kvm_no_apic_vcpu; 176extern struct static_key kvm_no_apic_vcpu;
174#endif 177#endif
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 1a146ccee701..2ab290bec655 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -481,15 +481,19 @@ out:
481 return tlist; 481 return tlist;
482} 482}
483 483
484#define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \
485 (MPIDR_AFFINITY_LEVEL(cluster_id, level) \
486 << ICC_SGI1R_AFFINITY_## level ##_SHIFT)
487
484static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) 488static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq)
485{ 489{
486 u64 val; 490 u64 val;
487 491
488 val = (MPIDR_AFFINITY_LEVEL(cluster_id, 3) << 48 | 492 val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3) |
489 MPIDR_AFFINITY_LEVEL(cluster_id, 2) << 32 | 493 MPIDR_TO_SGI_AFFINITY(cluster_id, 2) |
490 irq << 24 | 494 irq << ICC_SGI1R_SGI_ID_SHIFT |
491 MPIDR_AFFINITY_LEVEL(cluster_id, 1) << 16 | 495 MPIDR_TO_SGI_AFFINITY(cluster_id, 1) |
492 tlist); 496 tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
493 497
494 pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); 498 pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
495 gic_write_sgi1r(val); 499 gic_write_sgi1r(val);
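A stand-alone illustration of the ICC_SGI1R_EL1 layout that the rewritten gic_send_sgi() now spells out with named shifts. The ICC_SGI1R_*_SHIFT values match the defines added to include/linux/irqchip/arm-gic-v3.h later in this diff; the MPIDR helpers below restate the arm64 affinity layout (Aff0/1/2 at bits 0/8/16, Aff3 at bit 32) so the sketch builds on its own, and the whole program is only an illustration:

#include <stdint.h>
#include <stdio.h>

/* Shift values as added to arm-gic-v3.h by this series. */
#define ICC_SGI1R_TARGET_LIST_SHIFT	0
#define ICC_SGI1R_AFFINITY_1_SHIFT	16
#define ICC_SGI1R_SGI_ID_SHIFT		24
#define ICC_SGI1R_AFFINITY_2_SHIFT	32
#define ICC_SGI1R_AFFINITY_3_SHIFT	48

/* Aff0..Aff2 live at bits 0/8/16 of an MPIDR, Aff3 at bit 32. */
#define MPIDR_LEVEL_SHIFT(level)	(((1 << (level)) >> 1) << 3)
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
	(((mpidr) >> MPIDR_LEVEL_SHIFT(level)) & 0xffULL)

static uint64_t make_sgi1r(uint64_t cluster_id, uint16_t tlist, unsigned int irq)
{
	return (MPIDR_AFFINITY_LEVEL(cluster_id, 3) << ICC_SGI1R_AFFINITY_3_SHIFT) |
	       (MPIDR_AFFINITY_LEVEL(cluster_id, 2) << ICC_SGI1R_AFFINITY_2_SHIFT) |
	       ((uint64_t)irq << ICC_SGI1R_SGI_ID_SHIFT) |
	       (MPIDR_AFFINITY_LEVEL(cluster_id, 1) << ICC_SGI1R_AFFINITY_1_SHIFT) |
	       ((uint64_t)tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
}

int main(void)
{
	/* SGI 3 to target list 0x5 in the cluster with Aff1 = 1. */
	printf("ICC_SGI1R_EL1 = 0x%llx\n",
	       (unsigned long long)make_sgi1r(0x100, 0x5, 3));
	return 0;
}

Using named shifts instead of bare constants gives the host driver and the GICv3 emulation introduced by this series one shared description of the register layout when guest SGI requests have to be decoded.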
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index daf6cd5079ec..1efa4fdb7fe2 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -54,6 +54,7 @@ static unsigned long sclp_hsa_size;
54static unsigned int sclp_max_cpu; 54static unsigned int sclp_max_cpu;
55static struct sclp_ipl_info sclp_ipl_info; 55static struct sclp_ipl_info sclp_ipl_info;
56static unsigned char sclp_siif; 56static unsigned char sclp_siif;
57static unsigned char sclp_sigpif;
57static u32 sclp_ibc; 58static u32 sclp_ibc;
58static unsigned int sclp_mtid; 59static unsigned int sclp_mtid;
59static unsigned int sclp_mtid_cp; 60static unsigned int sclp_mtid_cp;
@@ -140,6 +141,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
140 if (boot_cpu_address != cpue->core_id) 141 if (boot_cpu_address != cpue->core_id)
141 continue; 142 continue;
142 sclp_siif = cpue->siif; 143 sclp_siif = cpue->siif;
144 sclp_sigpif = cpue->sigpif;
143 break; 145 break;
144 } 146 }
145 147
@@ -186,6 +188,12 @@ int sclp_has_siif(void)
186} 188}
187EXPORT_SYMBOL(sclp_has_siif); 189EXPORT_SYMBOL(sclp_has_siif);
188 190
191int sclp_has_sigpif(void)
192{
193 return sclp_sigpif;
194}
195EXPORT_SYMBOL(sclp_has_sigpif);
196
189unsigned int sclp_get_ibc(void) 197unsigned int sclp_get_ibc(void)
190{ 198{
191 return sclp_ibc; 199 return sclp_ibc;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ac4888dc86bc..7c55dd5dd2c9 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -33,10 +33,11 @@
33#define VGIC_V2_MAX_LRS (1 << 6) 33#define VGIC_V2_MAX_LRS (1 << 6)
34#define VGIC_V3_MAX_LRS 16 34#define VGIC_V3_MAX_LRS 16
35#define VGIC_MAX_IRQS 1024 35#define VGIC_MAX_IRQS 1024
36#define VGIC_V2_MAX_CPUS 8
36 37
37/* Sanity checks... */ 38/* Sanity checks... */
38#if (KVM_MAX_VCPUS > 8) 39#if (KVM_MAX_VCPUS > 255)
39#error Invalid number of CPU interfaces 40#error Too many KVM VCPUs, the VGIC only supports up to 255 VCPUs for now
40#endif 41#endif
41 42
42#if (VGIC_NR_IRQS_LEGACY & 31) 43#if (VGIC_NR_IRQS_LEGACY & 31)
@@ -132,6 +133,18 @@ struct vgic_params {
132 unsigned int maint_irq; 133 unsigned int maint_irq;
133 /* Virtual control interface base address */ 134 /* Virtual control interface base address */
134 void __iomem *vctrl_base; 135 void __iomem *vctrl_base;
136 int max_gic_vcpus;
137 /* Only needed for the legacy KVM_CREATE_IRQCHIP */
138 bool can_emulate_gicv2;
139};
140
141struct vgic_vm_ops {
142 bool (*handle_mmio)(struct kvm_vcpu *, struct kvm_run *,
143 struct kvm_exit_mmio *);
144 bool (*queue_sgi)(struct kvm_vcpu *, int irq);
145 void (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
146 int (*init_model)(struct kvm *);
147 int (*map_resources)(struct kvm *, const struct vgic_params *);
135}; 148};
136 149
137struct vgic_dist { 150struct vgic_dist {
@@ -140,6 +153,9 @@ struct vgic_dist {
140 bool in_kernel; 153 bool in_kernel;
141 bool ready; 154 bool ready;
142 155
156 /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
157 u32 vgic_model;
158
143 int nr_cpus; 159 int nr_cpus;
144 int nr_irqs; 160 int nr_irqs;
145 161
@@ -148,7 +164,11 @@ struct vgic_dist {
148 164
149 /* Distributor and vcpu interface mapping in the guest */ 165 /* Distributor and vcpu interface mapping in the guest */
150 phys_addr_t vgic_dist_base; 166 phys_addr_t vgic_dist_base;
151 phys_addr_t vgic_cpu_base; 167 /* GICv2 and GICv3 use different mapped register blocks */
168 union {
169 phys_addr_t vgic_cpu_base;
170 phys_addr_t vgic_redist_base;
171 };
152 172
153 /* Distributor enabled */ 173 /* Distributor enabled */
154 u32 enabled; 174 u32 enabled;
@@ -210,8 +230,13 @@ struct vgic_dist {
210 */ 230 */
211 struct vgic_bitmap *irq_spi_target; 231 struct vgic_bitmap *irq_spi_target;
212 232
233 /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
234 u32 *irq_spi_mpidr;
235
213 /* Bitmap indicating which CPU has something pending */ 236 /* Bitmap indicating which CPU has something pending */
214 unsigned long *irq_pending_on_cpu; 237 unsigned long *irq_pending_on_cpu;
238
239 struct vgic_vm_ops vm_ops;
215#endif 240#endif
216}; 241};
217 242
@@ -229,6 +254,7 @@ struct vgic_v3_cpu_if {
229#ifdef CONFIG_ARM_GIC_V3 254#ifdef CONFIG_ARM_GIC_V3
230 u32 vgic_hcr; 255 u32 vgic_hcr;
231 u32 vgic_vmcr; 256 u32 vgic_vmcr;
257 u32 vgic_sre; /* Restored only, change ignored */
232 u32 vgic_misr; /* Saved only */ 258 u32 vgic_misr; /* Saved only */
233 u32 vgic_eisr; /* Saved only */ 259 u32 vgic_eisr; /* Saved only */
234 u32 vgic_elrsr; /* Saved only */ 260 u32 vgic_elrsr; /* Saved only */
@@ -275,13 +301,15 @@ struct kvm_exit_mmio;
275int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); 301int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
276int kvm_vgic_hyp_init(void); 302int kvm_vgic_hyp_init(void);
277int kvm_vgic_map_resources(struct kvm *kvm); 303int kvm_vgic_map_resources(struct kvm *kvm);
278int kvm_vgic_create(struct kvm *kvm); 304int kvm_vgic_get_max_vcpus(void);
305int kvm_vgic_create(struct kvm *kvm, u32 type);
279void kvm_vgic_destroy(struct kvm *kvm); 306void kvm_vgic_destroy(struct kvm *kvm);
280void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); 307void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
281void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); 308void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
282void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); 309void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
283int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, 310int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
284 bool level); 311 bool level);
312void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
285int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); 313int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
286bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, 314bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
287 struct kvm_exit_mmio *mmio); 315 struct kvm_exit_mmio *mmio);
@@ -327,7 +355,7 @@ static inline int kvm_vgic_map_resources(struct kvm *kvm)
327 return 0; 355 return 0;
328} 356}
329 357
330static inline int kvm_vgic_create(struct kvm *kvm) 358static inline int kvm_vgic_create(struct kvm *kvm, u32 type)
331{ 359{
332 return 0; 360 return 0;
333} 361}
@@ -379,6 +407,11 @@ static inline bool vgic_ready(struct kvm *kvm)
379{ 407{
380 return true; 408 return true;
381} 409}
410
411static inline int kvm_vgic_get_max_vcpus(void)
412{
413 return KVM_MAX_VCPUS;
414}
382#endif 415#endif
383 416
384#endif 417#endif
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 1e8b0cf30792..800544bc7bfd 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -33,6 +33,7 @@
33#define GICD_SETSPI_SR 0x0050 33#define GICD_SETSPI_SR 0x0050
34#define GICD_CLRSPI_SR 0x0058 34#define GICD_CLRSPI_SR 0x0058
35#define GICD_SEIR 0x0068 35#define GICD_SEIR 0x0068
36#define GICD_IGROUPR 0x0080
36#define GICD_ISENABLER 0x0100 37#define GICD_ISENABLER 0x0100
37#define GICD_ICENABLER 0x0180 38#define GICD_ICENABLER 0x0180
38#define GICD_ISPENDR 0x0200 39#define GICD_ISPENDR 0x0200
@@ -41,14 +42,37 @@
41#define GICD_ICACTIVER 0x0380 42#define GICD_ICACTIVER 0x0380
42#define GICD_IPRIORITYR 0x0400 43#define GICD_IPRIORITYR 0x0400
43#define GICD_ICFGR 0x0C00 44#define GICD_ICFGR 0x0C00
45#define GICD_IGRPMODR 0x0D00
46#define GICD_NSACR 0x0E00
44#define GICD_IROUTER 0x6000 47#define GICD_IROUTER 0x6000
48#define GICD_IDREGS 0xFFD0
45#define GICD_PIDR2 0xFFE8 49#define GICD_PIDR2 0xFFE8
46 50
51/*
52 * Those registers are actually from GICv2, but the spec demands that they
53 * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3).
54 */
55#define GICD_ITARGETSR 0x0800
56#define GICD_SGIR 0x0F00
57#define GICD_CPENDSGIR 0x0F10
58#define GICD_SPENDSGIR 0x0F20
59
47#define GICD_CTLR_RWP (1U << 31) 60#define GICD_CTLR_RWP (1U << 31)
61#define GICD_CTLR_DS (1U << 6)
48#define GICD_CTLR_ARE_NS (1U << 4) 62#define GICD_CTLR_ARE_NS (1U << 4)
49#define GICD_CTLR_ENABLE_G1A (1U << 1) 63#define GICD_CTLR_ENABLE_G1A (1U << 1)
50#define GICD_CTLR_ENABLE_G1 (1U << 0) 64#define GICD_CTLR_ENABLE_G1 (1U << 0)
51 65
66/*
67 * In systems with a single security state (what we emulate in KVM)
68 * the meaning of the interrupt group enable bits is slightly different
69 */
70#define GICD_CTLR_ENABLE_SS_G1 (1U << 1)
71#define GICD_CTLR_ENABLE_SS_G0 (1U << 0)
72
73#define GICD_TYPER_LPIS (1U << 17)
74#define GICD_TYPER_MBIS (1U << 16)
75
52#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) 76#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1)
53#define GICD_TYPER_IRQS(typer) ((((typer) & 0x1f) + 1) * 32) 77#define GICD_TYPER_IRQS(typer) ((((typer) & 0x1f) + 1) * 32)
54#define GICD_TYPER_LPIS (1U << 17) 78#define GICD_TYPER_LPIS (1U << 17)
@@ -60,6 +84,8 @@
60#define GIC_PIDR2_ARCH_GICv3 0x30 84#define GIC_PIDR2_ARCH_GICv3 0x30
61#define GIC_PIDR2_ARCH_GICv4 0x40 85#define GIC_PIDR2_ARCH_GICv4 0x40
62 86
87#define GIC_V3_DIST_SIZE 0x10000
88
63/* 89/*
64 * Re-Distributor registers, offsets from RD_base 90 * Re-Distributor registers, offsets from RD_base
65 */ 91 */
@@ -78,6 +104,7 @@
78#define GICR_SYNCR 0x00C0 104#define GICR_SYNCR 0x00C0
79#define GICR_MOVLPIR 0x0100 105#define GICR_MOVLPIR 0x0100
80#define GICR_MOVALLR 0x0110 106#define GICR_MOVALLR 0x0110
107#define GICR_IDREGS GICD_IDREGS
81#define GICR_PIDR2 GICD_PIDR2 108#define GICR_PIDR2 GICD_PIDR2
82 109
83#define GICR_CTLR_ENABLE_LPIS (1UL << 0) 110#define GICR_CTLR_ENABLE_LPIS (1UL << 0)
@@ -104,6 +131,7 @@
104/* 131/*
105 * Re-Distributor registers, offsets from SGI_base 132 * Re-Distributor registers, offsets from SGI_base
106 */ 133 */
134#define GICR_IGROUPR0 GICD_IGROUPR
107#define GICR_ISENABLER0 GICD_ISENABLER 135#define GICR_ISENABLER0 GICD_ISENABLER
108#define GICR_ICENABLER0 GICD_ICENABLER 136#define GICR_ICENABLER0 GICD_ICENABLER
109#define GICR_ISPENDR0 GICD_ISPENDR 137#define GICR_ISPENDR0 GICD_ISPENDR
@@ -112,11 +140,15 @@
112#define GICR_ICACTIVER0 GICD_ICACTIVER 140#define GICR_ICACTIVER0 GICD_ICACTIVER
113#define GICR_IPRIORITYR0 GICD_IPRIORITYR 141#define GICR_IPRIORITYR0 GICD_IPRIORITYR
114#define GICR_ICFGR0 GICD_ICFGR 142#define GICR_ICFGR0 GICD_ICFGR
143#define GICR_IGRPMODR0 GICD_IGRPMODR
144#define GICR_NSACR GICD_NSACR
115 145
116#define GICR_TYPER_PLPIS (1U << 0) 146#define GICR_TYPER_PLPIS (1U << 0)
117#define GICR_TYPER_VLPIS (1U << 1) 147#define GICR_TYPER_VLPIS (1U << 1)
118#define GICR_TYPER_LAST (1U << 4) 148#define GICR_TYPER_LAST (1U << 4)
119 149
150#define GIC_V3_REDIST_SIZE 0x20000
151
120#define LPI_PROP_GROUP1 (1 << 1) 152#define LPI_PROP_GROUP1 (1 << 1)
121#define LPI_PROP_ENABLED (1 << 0) 153#define LPI_PROP_ENABLED (1 << 0)
122 154
@@ -248,6 +280,18 @@
248#define ICC_SRE_EL2_SRE (1 << 0) 280#define ICC_SRE_EL2_SRE (1 << 0)
249#define ICC_SRE_EL2_ENABLE (1 << 3) 281#define ICC_SRE_EL2_ENABLE (1 << 3)
250 282
283#define ICC_SGI1R_TARGET_LIST_SHIFT 0
284#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT)
285#define ICC_SGI1R_AFFINITY_1_SHIFT 16
286#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
287#define ICC_SGI1R_SGI_ID_SHIFT 24
288#define ICC_SGI1R_SGI_ID_MASK (0xf << ICC_SGI1R_SGI_ID_SHIFT)
289#define ICC_SGI1R_AFFINITY_2_SHIFT 32
290#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT)
291#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40
292#define ICC_SGI1R_AFFINITY_3_SHIFT 48
293#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT)
294
251/* 295/*
252 * System register definitions 296 * System register definitions
253 */ 297 */
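The GICD_TYPER helpers kept as context above are how both guests and the emulation later in this diff size the distributor. A quick stand-alone round trip through the same macros; the sample values mirror the 1024 IRQs and 10 interrupt ID bits that handle_mmio_typer() in vgic-v3-emul.c reports, and the program is purely illustrative:

#include <stdio.h>

/* Same definitions as in arm-gic-v3.h above. */
#define GICD_TYPER_ID_BITS(typer)	((((typer) >> 19) & 0x1f) + 1)
#define GICD_TYPER_IRQS(typer)		((((typer) & 0x1f) + 1) * 32)

int main(void)
{
	unsigned int nr_irqs = 1024, id_bits = 10;
	/* Encoded the way the GICv3 distributor emulation builds it. */
	unsigned int typer = ((nr_irqs >> 5) - 1) | ((id_bits - 1) << 19);

	printf("TYPER=0x%x -> %u IRQs, %u interrupt ID bits\n",
	       typer, GICD_TYPER_IRQS(typer), GICD_TYPER_ID_BITS(typer));
	return 0;
}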
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d189ee098aa2..d12b2104d19b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -33,10 +33,6 @@
33 33
34#include <asm/kvm_host.h> 34#include <asm/kvm_host.h>
35 35
36#ifndef KVM_MMIO_SIZE
37#define KVM_MMIO_SIZE 8
38#endif
39
40/* 36/*
41 * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used 37 * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
42 * in kvm, other bits are visible for userspace which are defined in 38 * in kvm, other bits are visible for userspace which are defined in
@@ -600,6 +596,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
600 596
601int kvm_get_dirty_log(struct kvm *kvm, 597int kvm_get_dirty_log(struct kvm *kvm,
602 struct kvm_dirty_log *log, int *is_dirty); 598 struct kvm_dirty_log *log, int *is_dirty);
599
600int kvm_get_dirty_log_protect(struct kvm *kvm,
601 struct kvm_dirty_log *log, bool *is_dirty);
602
603void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
604 struct kvm_memory_slot *slot,
605 gfn_t gfn_offset,
606 unsigned long mask);
607
603int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 608int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
604 struct kvm_dirty_log *log); 609 struct kvm_dirty_log *log);
605 610
@@ -641,7 +646,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
641void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); 646void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
642struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); 647struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
643int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); 648int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
644int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); 649void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
645void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); 650void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
646 651
647int kvm_arch_hardware_enable(void); 652int kvm_arch_hardware_enable(void);
@@ -1031,6 +1036,8 @@ void kvm_unregister_device_ops(u32 type);
1031 1036
1032extern struct kvm_device_ops kvm_mpic_ops; 1037extern struct kvm_device_ops kvm_mpic_ops;
1033extern struct kvm_device_ops kvm_xics_ops; 1038extern struct kvm_device_ops kvm_xics_ops;
1039extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
1040extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
1034 1041
1035#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 1042#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
1036 1043
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 86b399c66c3d..a44062da684b 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -37,6 +37,25 @@ TRACE_EVENT(kvm_userspace_exit,
37 __entry->errno < 0 ? -__entry->errno : __entry->reason) 37 __entry->errno < 0 ? -__entry->errno : __entry->reason)
38); 38);
39 39
40TRACE_EVENT(kvm_vcpu_wakeup,
41 TP_PROTO(__u64 ns, bool waited),
42 TP_ARGS(ns, waited),
43
44 TP_STRUCT__entry(
45 __field( __u64, ns )
46 __field( bool, waited )
47 ),
48
49 TP_fast_assign(
50 __entry->ns = ns;
51 __entry->waited = waited;
52 ),
53
54 TP_printk("%s time %lld ns",
55 __entry->waited ? "wait" : "poll",
56 __entry->ns)
57);
58
40#if defined(CONFIG_HAVE_KVM_IRQFD) 59#if defined(CONFIG_HAVE_KVM_IRQFD)
41TRACE_EVENT(kvm_set_irq, 60TRACE_EVENT(kvm_set_irq,
42 TP_PROTO(unsigned int gsi, int level, int irq_source_id), 61 TP_PROTO(unsigned int gsi, int level, int irq_source_id),
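For reference, with the TP_printk() format above the new kvm_vcpu_wakeup tracepoint renders as one of two event lines, depending on whether the vCPU was woken while still polling or after it had gone to sleep. The timestamps below are made up; only the wording comes from the format string:

  kvm_vcpu_wakeup: poll time 18340 ns
  kvm_vcpu_wakeup: wait time 2104512 ns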
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a37fd1224f36..805570650062 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -491,6 +491,11 @@ struct kvm_s390_emerg_info {
491 __u16 code; 491 __u16 code;
492}; 492};
493 493
494#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01
495struct kvm_s390_stop_info {
496 __u32 flags;
497};
498
494struct kvm_s390_mchk_info { 499struct kvm_s390_mchk_info {
495 __u64 cr14; 500 __u64 cr14;
496 __u64 mcic; 501 __u64 mcic;
@@ -509,6 +514,7 @@ struct kvm_s390_irq {
509 struct kvm_s390_emerg_info emerg; 514 struct kvm_s390_emerg_info emerg;
510 struct kvm_s390_extcall_info extcall; 515 struct kvm_s390_extcall_info extcall;
511 struct kvm_s390_prefix_info prefix; 516 struct kvm_s390_prefix_info prefix;
517 struct kvm_s390_stop_info stop;
512 struct kvm_s390_mchk_info mchk; 518 struct kvm_s390_mchk_info mchk;
513 char reserved[64]; 519 char reserved[64];
514 } u; 520 } u;
@@ -753,6 +759,7 @@ struct kvm_ppc_smmu_info {
753#define KVM_CAP_PPC_FIXUP_HCALL 103 759#define KVM_CAP_PPC_FIXUP_HCALL 103
754#define KVM_CAP_PPC_ENABLE_HCALL 104 760#define KVM_CAP_PPC_ENABLE_HCALL 104
755#define KVM_CAP_CHECK_EXTENSION_VM 105 761#define KVM_CAP_CHECK_EXTENSION_VM 105
762#define KVM_CAP_S390_USER_SIGP 106
756 763
757#ifdef KVM_CAP_IRQ_ROUTING 764#ifdef KVM_CAP_IRQ_ROUTING
758 765
@@ -952,6 +959,8 @@ enum kvm_device_type {
952#define KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_ARM_VGIC_V2 959#define KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_ARM_VGIC_V2
953 KVM_DEV_TYPE_FLIC, 960 KVM_DEV_TYPE_FLIC,
954#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC 961#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC
962 KVM_DEV_TYPE_ARM_VGIC_V3,
963#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3
955 KVM_DEV_TYPE_MAX, 964 KVM_DEV_TYPE_MAX,
956}; 965};
957 966
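The new KVM_DEV_TYPE_ARM_VGIC_V3 value is what userspace passes to KVM_CREATE_DEVICE to request the GICv3 distributor/redistributor emulation. A minimal sketch (error handling trimmed; it assumes an arm64 host whose kernel already carries this series and therefore exposes the new device type):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int sys_fd = open("/dev/kvm", O_RDWR);
	int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_ARM_VGIC_V3,	/* added by this series */
	};

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0)
		printf("vGICv3 device fd: %d\n", cd.fd);
	else
		perror("KVM_CREATE_DEVICE");
	return 0;
}

On a host whose GIC cannot back a GICv3 guest the device type is never registered, so the ioctl is expected to fail with ENODEV.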
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index fc0c5e603eb4..e2c876d5a03b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -37,3 +37,13 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT
37 37
38config KVM_VFIO 38config KVM_VFIO
39 bool 39 bool
40
41config HAVE_KVM_ARCH_TLB_FLUSH_ALL
42 bool
43
44config KVM_GENERIC_DIRTYLOG_READ_PROTECT
45 bool
46
47config KVM_COMPAT
48 def_bool y
49 depends on COMPAT && !S390
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
new file mode 100644
index 000000000000..19c6210f02cf
--- /dev/null
+++ b/virt/kvm/arm/vgic-v2-emul.c
@@ -0,0 +1,847 @@
1/*
2 * Contains GICv2 specific emulation code, was in vgic.c before.
3 *
4 * Copyright (C) 2012 ARM Ltd.
5 * Author: Marc Zyngier <marc.zyngier@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include <linux/cpu.h>
21#include <linux/kvm.h>
22#include <linux/kvm_host.h>
23#include <linux/interrupt.h>
24#include <linux/io.h>
25#include <linux/uaccess.h>
26
27#include <linux/irqchip/arm-gic.h>
28
29#include <asm/kvm_emulate.h>
30#include <asm/kvm_arm.h>
31#include <asm/kvm_mmu.h>
32
33#include "vgic.h"
34
35#define GICC_ARCH_VERSION_V2 0x2
36
37static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
38static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
39{
40 return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
41}
42
43static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
44 struct kvm_exit_mmio *mmio, phys_addr_t offset)
45{
46 u32 reg;
47 u32 word_offset = offset & 3;
48
49 switch (offset & ~3) {
50 case 0: /* GICD_CTLR */
51 reg = vcpu->kvm->arch.vgic.enabled;
52 vgic_reg_access(mmio, &reg, word_offset,
53 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
54 if (mmio->is_write) {
55 vcpu->kvm->arch.vgic.enabled = reg & 1;
56 vgic_update_state(vcpu->kvm);
57 return true;
58 }
59 break;
60
61 case 4: /* GICD_TYPER */
62 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
63 reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
64 vgic_reg_access(mmio, &reg, word_offset,
65 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
66 break;
67
68 case 8: /* GICD_IIDR */
69 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
70 vgic_reg_access(mmio, &reg, word_offset,
71 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
72 break;
73 }
74
75 return false;
76}
77
78static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
79 struct kvm_exit_mmio *mmio,
80 phys_addr_t offset)
81{
82 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
83 vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
84}
85
86static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
87 struct kvm_exit_mmio *mmio,
88 phys_addr_t offset)
89{
90 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
91 vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
92}
93
94static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
95 struct kvm_exit_mmio *mmio,
96 phys_addr_t offset)
97{
98 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
99 vcpu->vcpu_id);
100}
101
102static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
103 struct kvm_exit_mmio *mmio,
104 phys_addr_t offset)
105{
106 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
107 vcpu->vcpu_id);
108}
109
110static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
111 struct kvm_exit_mmio *mmio,
112 phys_addr_t offset)
113{
114 u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
115 vcpu->vcpu_id, offset);
116 vgic_reg_access(mmio, reg, offset,
117 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
118 return false;
119}
120
121#define GICD_ITARGETSR_SIZE 32
122#define GICD_CPUTARGETS_BITS 8
123#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
124static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
125{
126 struct vgic_dist *dist = &kvm->arch.vgic;
127 int i;
128 u32 val = 0;
129
130 irq -= VGIC_NR_PRIVATE_IRQS;
131
132 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
133 val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
134
135 return val;
136}
137
138static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
139{
140 struct vgic_dist *dist = &kvm->arch.vgic;
141 struct kvm_vcpu *vcpu;
142 int i, c;
143 unsigned long *bmap;
144 u32 target;
145
146 irq -= VGIC_NR_PRIVATE_IRQS;
147
148 /*
149 * Pick the LSB in each byte. This ensures we target exactly
150 * one vcpu per IRQ. If the byte is null, assume we target
151 * CPU0.
152 */
153 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
154 int shift = i * GICD_CPUTARGETS_BITS;
155
156 target = ffs((val >> shift) & 0xffU);
157 target = target ? (target - 1) : 0;
158 dist->irq_spi_cpu[irq + i] = target;
159 kvm_for_each_vcpu(c, vcpu, kvm) {
160 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
161 if (c == target)
162 set_bit(irq + i, bmap);
163 else
164 clear_bit(irq + i, bmap);
165 }
166 }
167}
168
169static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
170 struct kvm_exit_mmio *mmio,
171 phys_addr_t offset)
172{
173 u32 reg;
174
175 /* We treat the banked interrupts targets as read-only */
176 if (offset < 32) {
177 u32 roreg;
178
179 roreg = 1 << vcpu->vcpu_id;
180 roreg |= roreg << 8;
181 roreg |= roreg << 16;
182
183 vgic_reg_access(mmio, &roreg, offset,
184 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
185 return false;
186 }
187
188 reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
189 vgic_reg_access(mmio, &reg, offset,
190 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
191 if (mmio->is_write) {
192 vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
193 vgic_update_state(vcpu->kvm);
194 return true;
195 }
196
197 return false;
198}
199
200static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
201 struct kvm_exit_mmio *mmio, phys_addr_t offset)
202{
203 u32 *reg;
204
205 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
206 vcpu->vcpu_id, offset >> 1);
207
208 return vgic_handle_cfg_reg(reg, mmio, offset);
209}
210
211static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
212 struct kvm_exit_mmio *mmio, phys_addr_t offset)
213{
214 u32 reg;
215
216 vgic_reg_access(mmio, &reg, offset,
217 ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
218 if (mmio->is_write) {
219 vgic_dispatch_sgi(vcpu, reg);
220 vgic_update_state(vcpu->kvm);
221 return true;
222 }
223
224 return false;
225}
226
227/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
228static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
229 struct kvm_exit_mmio *mmio,
230 phys_addr_t offset)
231{
232 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
233 int sgi;
234 int min_sgi = (offset & ~0x3);
235 int max_sgi = min_sgi + 3;
236 int vcpu_id = vcpu->vcpu_id;
237 u32 reg = 0;
238
239 /* Copy source SGIs from distributor side */
240 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
241 u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
242
243 reg |= ((u32)sources) << (8 * (sgi - min_sgi));
244 }
245
246 mmio_data_write(mmio, ~0, reg);
247 return false;
248}
249
250static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
251 struct kvm_exit_mmio *mmio,
252 phys_addr_t offset, bool set)
253{
254 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
255 int sgi;
256 int min_sgi = (offset & ~0x3);
257 int max_sgi = min_sgi + 3;
258 int vcpu_id = vcpu->vcpu_id;
259 u32 reg;
260 bool updated = false;
261
262 reg = mmio_data_read(mmio, ~0);
263
264 /* Clear pending SGIs on the distributor */
265 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
266 u8 mask = reg >> (8 * (sgi - min_sgi));
267 u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
268
269 if (set) {
270 if ((*src & mask) != mask)
271 updated = true;
272 *src |= mask;
273 } else {
274 if (*src & mask)
275 updated = true;
276 *src &= ~mask;
277 }
278 }
279
280 if (updated)
281 vgic_update_state(vcpu->kvm);
282
283 return updated;
284}
285
286static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
287 struct kvm_exit_mmio *mmio,
288 phys_addr_t offset)
289{
290 if (!mmio->is_write)
291 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
292 else
293 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
294}
295
296static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
297 struct kvm_exit_mmio *mmio,
298 phys_addr_t offset)
299{
300 if (!mmio->is_write)
301 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
302 else
303 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
304}
305
306static const struct kvm_mmio_range vgic_dist_ranges[] = {
307 {
308 .base = GIC_DIST_CTRL,
309 .len = 12,
310 .bits_per_irq = 0,
311 .handle_mmio = handle_mmio_misc,
312 },
313 {
314 .base = GIC_DIST_IGROUP,
315 .len = VGIC_MAX_IRQS / 8,
316 .bits_per_irq = 1,
317 .handle_mmio = handle_mmio_raz_wi,
318 },
319 {
320 .base = GIC_DIST_ENABLE_SET,
321 .len = VGIC_MAX_IRQS / 8,
322 .bits_per_irq = 1,
323 .handle_mmio = handle_mmio_set_enable_reg,
324 },
325 {
326 .base = GIC_DIST_ENABLE_CLEAR,
327 .len = VGIC_MAX_IRQS / 8,
328 .bits_per_irq = 1,
329 .handle_mmio = handle_mmio_clear_enable_reg,
330 },
331 {
332 .base = GIC_DIST_PENDING_SET,
333 .len = VGIC_MAX_IRQS / 8,
334 .bits_per_irq = 1,
335 .handle_mmio = handle_mmio_set_pending_reg,
336 },
337 {
338 .base = GIC_DIST_PENDING_CLEAR,
339 .len = VGIC_MAX_IRQS / 8,
340 .bits_per_irq = 1,
341 .handle_mmio = handle_mmio_clear_pending_reg,
342 },
343 {
344 .base = GIC_DIST_ACTIVE_SET,
345 .len = VGIC_MAX_IRQS / 8,
346 .bits_per_irq = 1,
347 .handle_mmio = handle_mmio_raz_wi,
348 },
349 {
350 .base = GIC_DIST_ACTIVE_CLEAR,
351 .len = VGIC_MAX_IRQS / 8,
352 .bits_per_irq = 1,
353 .handle_mmio = handle_mmio_raz_wi,
354 },
355 {
356 .base = GIC_DIST_PRI,
357 .len = VGIC_MAX_IRQS,
358 .bits_per_irq = 8,
359 .handle_mmio = handle_mmio_priority_reg,
360 },
361 {
362 .base = GIC_DIST_TARGET,
363 .len = VGIC_MAX_IRQS,
364 .bits_per_irq = 8,
365 .handle_mmio = handle_mmio_target_reg,
366 },
367 {
368 .base = GIC_DIST_CONFIG,
369 .len = VGIC_MAX_IRQS / 4,
370 .bits_per_irq = 2,
371 .handle_mmio = handle_mmio_cfg_reg,
372 },
373 {
374 .base = GIC_DIST_SOFTINT,
375 .len = 4,
376 .handle_mmio = handle_mmio_sgi_reg,
377 },
378 {
379 .base = GIC_DIST_SGI_PENDING_CLEAR,
380 .len = VGIC_NR_SGIS,
381 .handle_mmio = handle_mmio_sgi_clear,
382 },
383 {
384 .base = GIC_DIST_SGI_PENDING_SET,
385 .len = VGIC_NR_SGIS,
386 .handle_mmio = handle_mmio_sgi_set,
387 },
388 {}
389};
390
391static bool vgic_v2_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
392 struct kvm_exit_mmio *mmio)
393{
394 unsigned long base = vcpu->kvm->arch.vgic.vgic_dist_base;
395
396 if (!is_in_range(mmio->phys_addr, mmio->len, base,
397 KVM_VGIC_V2_DIST_SIZE))
398 return false;
399
400 /* GICv2 does not support accesses wider than 32 bits */
401 if (mmio->len > 4) {
402 kvm_inject_dabt(vcpu, mmio->phys_addr);
403 return true;
404 }
405
406 return vgic_handle_mmio_range(vcpu, run, mmio, vgic_dist_ranges, base);
407}
408
409static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
410{
411 struct kvm *kvm = vcpu->kvm;
412 struct vgic_dist *dist = &kvm->arch.vgic;
413 int nrcpus = atomic_read(&kvm->online_vcpus);
414 u8 target_cpus;
415 int sgi, mode, c, vcpu_id;
416
417 vcpu_id = vcpu->vcpu_id;
418
419 sgi = reg & 0xf;
420 target_cpus = (reg >> 16) & 0xff;
421 mode = (reg >> 24) & 3;
422
423 switch (mode) {
424 case 0:
425 if (!target_cpus)
426 return;
427 break;
428
429 case 1:
430 target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
431 break;
432
433 case 2:
434 target_cpus = 1 << vcpu_id;
435 break;
436 }
437
438 kvm_for_each_vcpu(c, vcpu, kvm) {
439 if (target_cpus & 1) {
440 /* Flag the SGI as pending */
441 vgic_dist_irq_set_pending(vcpu, sgi);
442 *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
443 kvm_debug("SGI%d from CPU%d to CPU%d\n",
444 sgi, vcpu_id, c);
445 }
446
447 target_cpus >>= 1;
448 }
449}
450
451static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
452{
453 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
454 unsigned long sources;
455 int vcpu_id = vcpu->vcpu_id;
456 int c;
457
458 sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
459
460 for_each_set_bit(c, &sources, dist->nr_cpus) {
461 if (vgic_queue_irq(vcpu, c, irq))
462 clear_bit(c, &sources);
463 }
464
465 *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
466
467 /*
468 * If the sources bitmap has been cleared it means that we
469 * could queue all the SGIs onto link registers (see the
470 * clear_bit above), and therefore we are done with them in
471 * our emulated gic and can get rid of them.
472 */
473 if (!sources) {
474 vgic_dist_irq_clear_pending(vcpu, irq);
475 vgic_cpu_irq_clear(vcpu, irq);
476 return true;
477 }
478
479 return false;
480}
481
482/**
483 * vgic_v2_map_resources - Configure global VGIC state before running any VCPUs
484 * @kvm: pointer to the kvm struct
485 *
486 * Map the virtual CPU interface into the VM before running any VCPUs. We
487 * can't do this at creation time, because user space must first set the
488 * virtual CPU interface address in the guest physical address space.
489 */
490static int vgic_v2_map_resources(struct kvm *kvm,
491 const struct vgic_params *params)
492{
493 int ret = 0;
494
495 if (!irqchip_in_kernel(kvm))
496 return 0;
497
498 mutex_lock(&kvm->lock);
499
500 if (vgic_ready(kvm))
501 goto out;
502
503 if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) ||
504 IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) {
505 kvm_err("Need to set vgic cpu and dist addresses first\n");
506 ret = -ENXIO;
507 goto out;
508 }
509
510 /*
511 * Initialize the vgic if this hasn't already been done on demand by
512 * accessing the vgic state from userspace.
513 */
514 ret = vgic_init(kvm);
515 if (ret) {
516 kvm_err("Unable to allocate maps\n");
517 goto out;
518 }
519
520 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
521 params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
522 true);
523 if (ret) {
524 kvm_err("Unable to remap VGIC CPU to VCPU\n");
525 goto out;
526 }
527
528 kvm->arch.vgic.ready = true;
529out:
530 if (ret)
531 kvm_vgic_destroy(kvm);
532 mutex_unlock(&kvm->lock);
533 return ret;
534}
535
536static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
537{
538 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
539
540 *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
541}
542
543static int vgic_v2_init_model(struct kvm *kvm)
544{
545 int i;
546
547 for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
548 vgic_set_target_reg(kvm, 0, i);
549
550 return 0;
551}
552
553void vgic_v2_init_emulation(struct kvm *kvm)
554{
555 struct vgic_dist *dist = &kvm->arch.vgic;
556
557 dist->vm_ops.handle_mmio = vgic_v2_handle_mmio;
558 dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
559 dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
560 dist->vm_ops.init_model = vgic_v2_init_model;
561 dist->vm_ops.map_resources = vgic_v2_map_resources;
562
563 kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
564}
565
566static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
567 struct kvm_exit_mmio *mmio, phys_addr_t offset)
568{
569 bool updated = false;
570 struct vgic_vmcr vmcr;
571 u32 *vmcr_field;
572 u32 reg;
573
574 vgic_get_vmcr(vcpu, &vmcr);
575
576 switch (offset & ~0x3) {
577 case GIC_CPU_CTRL:
578 vmcr_field = &vmcr.ctlr;
579 break;
580 case GIC_CPU_PRIMASK:
581 vmcr_field = &vmcr.pmr;
582 break;
583 case GIC_CPU_BINPOINT:
584 vmcr_field = &vmcr.bpr;
585 break;
586 case GIC_CPU_ALIAS_BINPOINT:
587 vmcr_field = &vmcr.abpr;
588 break;
589 default:
590 BUG();
591 }
592
593 if (!mmio->is_write) {
594 reg = *vmcr_field;
595 mmio_data_write(mmio, ~0, reg);
596 } else {
597 reg = mmio_data_read(mmio, ~0);
598 if (reg != *vmcr_field) {
599 *vmcr_field = reg;
600 vgic_set_vmcr(vcpu, &vmcr);
601 updated = true;
602 }
603 }
604 return updated;
605}
606
607static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
608 struct kvm_exit_mmio *mmio, phys_addr_t offset)
609{
610 return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
611}
612
613static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
614 struct kvm_exit_mmio *mmio,
615 phys_addr_t offset)
616{
617 u32 reg;
618
619 if (mmio->is_write)
620 return false;
621
622 /* GICC_IIDR */
623 reg = (PRODUCT_ID_KVM << 20) |
624 (GICC_ARCH_VERSION_V2 << 16) |
625 (IMPLEMENTER_ARM << 0);
626 mmio_data_write(mmio, ~0, reg);
627 return false;
628}
629
630/*
631 * CPU Interface Registers - these are not accessed by the VM, but by
632 * user space for saving and restoring VGIC state.
633 */
634static const struct kvm_mmio_range vgic_cpu_ranges[] = {
635 {
636 .base = GIC_CPU_CTRL,
637 .len = 12,
638 .handle_mmio = handle_cpu_mmio_misc,
639 },
640 {
641 .base = GIC_CPU_ALIAS_BINPOINT,
642 .len = 4,
643 .handle_mmio = handle_mmio_abpr,
644 },
645 {
646 .base = GIC_CPU_ACTIVEPRIO,
647 .len = 16,
648 .handle_mmio = handle_mmio_raz_wi,
649 },
650 {
651 .base = GIC_CPU_IDENT,
652 .len = 4,
653 .handle_mmio = handle_cpu_mmio_ident,
654 },
655};
656
657static int vgic_attr_regs_access(struct kvm_device *dev,
658 struct kvm_device_attr *attr,
659 u32 *reg, bool is_write)
660{
661 const struct kvm_mmio_range *r = NULL, *ranges;
662 phys_addr_t offset;
663 int ret, cpuid, c;
664 struct kvm_vcpu *vcpu, *tmp_vcpu;
665 struct vgic_dist *vgic;
666 struct kvm_exit_mmio mmio;
667
668 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
669 cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
670 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
671
672 mutex_lock(&dev->kvm->lock);
673
674 ret = vgic_init(dev->kvm);
675 if (ret)
676 goto out;
677
678 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
679 ret = -EINVAL;
680 goto out;
681 }
682
683 vcpu = kvm_get_vcpu(dev->kvm, cpuid);
684 vgic = &dev->kvm->arch.vgic;
685
686 mmio.len = 4;
687 mmio.is_write = is_write;
688 if (is_write)
689 mmio_data_write(&mmio, ~0, *reg);
690 switch (attr->group) {
691 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
692 mmio.phys_addr = vgic->vgic_dist_base + offset;
693 ranges = vgic_dist_ranges;
694 break;
695 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
696 mmio.phys_addr = vgic->vgic_cpu_base + offset;
697 ranges = vgic_cpu_ranges;
698 break;
699 default:
700 BUG();
701 }
702 r = vgic_find_range(ranges, &mmio, offset);
703
704 if (unlikely(!r || !r->handle_mmio)) {
705 ret = -ENXIO;
706 goto out;
707 }
708
709
710 spin_lock(&vgic->lock);
711
712 /*
713 * Ensure that no other VCPU is running by checking the vcpu->cpu
714 * field. If no other VCPUs are running we can safely access the VGIC
715 * state, because even if another VCPU is run after this point, that
716 * VCPU will not touch the vgic state, because it will block on
717 * getting the vgic->lock in kvm_vgic_sync_hwstate().
718 */
719 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
720 if (unlikely(tmp_vcpu->cpu != -1)) {
721 ret = -EBUSY;
722 goto out_vgic_unlock;
723 }
724 }
725
726 /*
727 * Move all pending IRQs from the LRs on all VCPUs so the pending
728 * state can be properly represented in the register state accessible
729 * through this API.
730 */
731 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
732 vgic_unqueue_irqs(tmp_vcpu);
733
734 offset -= r->base;
735 r->handle_mmio(vcpu, &mmio, offset);
736
737 if (!is_write)
738 *reg = mmio_data_read(&mmio, ~0);
739
740 ret = 0;
741out_vgic_unlock:
742 spin_unlock(&vgic->lock);
743out:
744 mutex_unlock(&dev->kvm->lock);
745 return ret;
746}
747
748static int vgic_v2_create(struct kvm_device *dev, u32 type)
749{
750 return kvm_vgic_create(dev->kvm, type);
751}
752
753static void vgic_v2_destroy(struct kvm_device *dev)
754{
755 kfree(dev);
756}
757
758static int vgic_v2_set_attr(struct kvm_device *dev,
759 struct kvm_device_attr *attr)
760{
761 int ret;
762
763 ret = vgic_set_common_attr(dev, attr);
764 if (ret != -ENXIO)
765 return ret;
766
767 switch (attr->group) {
768 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
769 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
770 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
771 u32 reg;
772
773 if (get_user(reg, uaddr))
774 return -EFAULT;
775
776 return vgic_attr_regs_access(dev, attr, &reg, true);
777 }
778
779 }
780
781 return -ENXIO;
782}
783
784static int vgic_v2_get_attr(struct kvm_device *dev,
785 struct kvm_device_attr *attr)
786{
787 int ret;
788
789 ret = vgic_get_common_attr(dev, attr);
790 if (ret != -ENXIO)
791 return ret;
792
793 switch (attr->group) {
794 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
795 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
796 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
797 u32 reg = 0;
798
799 ret = vgic_attr_regs_access(dev, attr, &reg, false);
800 if (ret)
801 return ret;
802 return put_user(reg, uaddr);
803 }
804
805 }
806
807 return -ENXIO;
808}
809
810static int vgic_v2_has_attr(struct kvm_device *dev,
811 struct kvm_device_attr *attr)
812{
813 phys_addr_t offset;
814
815 switch (attr->group) {
816 case KVM_DEV_ARM_VGIC_GRP_ADDR:
817 switch (attr->attr) {
818 case KVM_VGIC_V2_ADDR_TYPE_DIST:
819 case KVM_VGIC_V2_ADDR_TYPE_CPU:
820 return 0;
821 }
822 break;
823 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
824 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
825 return vgic_has_attr_regs(vgic_dist_ranges, offset);
826 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
827 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
828 return vgic_has_attr_regs(vgic_cpu_ranges, offset);
829 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
830 return 0;
831 case KVM_DEV_ARM_VGIC_GRP_CTRL:
832 switch (attr->attr) {
833 case KVM_DEV_ARM_VGIC_CTRL_INIT:
834 return 0;
835 }
836 }
837 return -ENXIO;
838}
839
840struct kvm_device_ops kvm_arm_vgic_v2_ops = {
841 .name = "kvm-arm-vgic-v2",
842 .create = vgic_v2_create,
843 .destroy = vgic_v2_destroy,
844 .set_attr = vgic_v2_set_attr,
845 .get_attr = vgic_v2_get_attr,
846 .has_attr = vgic_v2_has_attr,
847};
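vgic_dispatch_sgi() above decodes a guest write to GICD_SGIR into an SGI number, a CPU target list and a target-list filter. A stand-alone sketch of just that decoding, with the field layout taken from the function (the GICv2 field names in the comments are for orientation only):

#include <stdint.h>
#include <stdio.h>

static void decode_sgir(uint32_t reg)
{
	unsigned int sgi = reg & 0xf;			/* SGIINTID */
	unsigned int targets = (reg >> 16) & 0xff;	/* CPUTargetList */
	unsigned int mode = (reg >> 24) & 3;		/* TargetListFilter */
	static const char *const filter[] = {
		"use target list", "all but self", "self only", "reserved"
	};

	printf("SGI%u, targets 0x%02x, mode: %s\n", sgi, targets, filter[mode]);
}

int main(void)
{
	decode_sgir(0x00020003);	/* SGI 3 to CPU1 via the target list */
	decode_sgir(0x01000007);	/* SGI 7 to every CPU but the sender */
	return 0;
}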
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index 2935405ad22f..a0a7b5d1a070 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -229,12 +229,16 @@ int vgic_v2_probe(struct device_node *vgic_node,
229 goto out_unmap; 229 goto out_unmap;
230 } 230 }
231 231
232 vgic->can_emulate_gicv2 = true;
233 kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
234
232 vgic->vcpu_base = vcpu_res.start; 235 vgic->vcpu_base = vcpu_res.start;
233 236
234 kvm_info("%s@%llx IRQ%d\n", vgic_node->name, 237 kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
235 vctrl_res.start, vgic->maint_irq); 238 vctrl_res.start, vgic->maint_irq);
236 239
237 vgic->type = VGIC_V2; 240 vgic->type = VGIC_V2;
241 vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
238 *ops = &vgic_v2_ops; 242 *ops = &vgic_v2_ops;
239 *params = vgic; 243 *params = vgic;
240 goto out; 244 goto out;
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
new file mode 100644
index 000000000000..b3f154631515
--- /dev/null
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -0,0 +1,1036 @@
1/*
2 * GICv3 distributor and redistributor emulation
3 *
4 * GICv3 emulation is currently only supported on a GICv3 host (because
5 * we rely on the hardware's CPU interface virtualization support), but
6 * supports both hardware with or without the optional GICv2 backwards
7 * compatibility features.
8 *
9 * Limitations of the emulation:
10 * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
11 * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
12 * - We do not support the message based interrupts (MBIs) triggered by
13 * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
14 * - We do not support the (optional) backwards compatibility feature.
15 * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
16 * the compatibility feature, you can use a GICv2 in the guest, though.
17 * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
18 * - Priorities are not emulated (same as the GICv2 emulation). Linux
19 * as a guest is fine with this, because it does not use priorities.
20 * - We only support Group1 interrupts. Again Linux uses only those.
21 *
22 * Copyright (C) 2014 ARM Ltd.
23 * Author: Andre Przywara <andre.przywara@arm.com>
24 *
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License version 2 as
27 * published by the Free Software Foundation.
28 *
29 * This program is distributed in the hope that it will be useful,
30 * but WITHOUT ANY WARRANTY; without even the implied warranty of
31 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32 * GNU General Public License for more details.
33 *
34 * You should have received a copy of the GNU General Public License
35 * along with this program. If not, see <http://www.gnu.org/licenses/>.
36 */
37
38#include <linux/cpu.h>
39#include <linux/kvm.h>
40#include <linux/kvm_host.h>
41#include <linux/interrupt.h>
42
43#include <linux/irqchip/arm-gic-v3.h>
44#include <kvm/arm_vgic.h>
45
46#include <asm/kvm_emulate.h>
47#include <asm/kvm_arm.h>
48#include <asm/kvm_mmu.h>
49
50#include "vgic.h"
51
52static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
53 struct kvm_exit_mmio *mmio, phys_addr_t offset)
54{
55 u32 reg = 0xffffffff;
56
57 vgic_reg_access(mmio, &reg, offset,
58 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
59
60 return false;
61}
62
63static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
64 struct kvm_exit_mmio *mmio, phys_addr_t offset)
65{
66 u32 reg = 0;
67
68 /*
69 * Force ARE and DS to 1, the guest cannot change this.
70 * For the time being we only support Group1 interrupts.
71 */
72 if (vcpu->kvm->arch.vgic.enabled)
73 reg = GICD_CTLR_ENABLE_SS_G1;
74 reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
75
76 vgic_reg_access(mmio, &reg, offset,
77 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
78 if (mmio->is_write) {
79 if (reg & GICD_CTLR_ENABLE_SS_G0)
80 kvm_info("guest tried to enable unsupported Group0 interrupts\n");
81 vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
82 vgic_update_state(vcpu->kvm);
83 return true;
84 }
85 return false;
86}
87
88/*
89 * As this implementation does not provide compatibility
90 * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
91 * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
92 * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
93 */
94#define INTERRUPT_ID_BITS 10
95static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
96 struct kvm_exit_mmio *mmio, phys_addr_t offset)
97{
98 u32 reg;
99
100 reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
101
102 reg |= (INTERRUPT_ID_BITS - 1) << 19;
103
104 vgic_reg_access(mmio, &reg, offset,
105 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
106
107 return false;
108}
109
110static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
111 struct kvm_exit_mmio *mmio, phys_addr_t offset)
112{
113 u32 reg;
114
115 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
116 vgic_reg_access(mmio, &reg, offset,
117 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
118
119 return false;
120}
121
122static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
123 struct kvm_exit_mmio *mmio,
124 phys_addr_t offset)
125{
126 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
127 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
128 vcpu->vcpu_id,
129 ACCESS_WRITE_SETBIT);
130
131 vgic_reg_access(mmio, NULL, offset,
132 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
133 return false;
134}
135
136static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
137 struct kvm_exit_mmio *mmio,
138 phys_addr_t offset)
139{
140 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
141 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
142 vcpu->vcpu_id,
143 ACCESS_WRITE_CLEARBIT);
144
145 vgic_reg_access(mmio, NULL, offset,
146 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
147 return false;
148}
149
150static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
151 struct kvm_exit_mmio *mmio,
152 phys_addr_t offset)
153{
154 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
155 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
156 vcpu->vcpu_id);
157
158 vgic_reg_access(mmio, NULL, offset,
159 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
160 return false;
161}
162
163static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
164 struct kvm_exit_mmio *mmio,
165 phys_addr_t offset)
166{
167 if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
168 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
169 vcpu->vcpu_id);
170
171 vgic_reg_access(mmio, NULL, offset,
172 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
173 return false;
174}
175
176static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
177 struct kvm_exit_mmio *mmio,
178 phys_addr_t offset)
179{
180 u32 *reg;
181
182 if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
183 vgic_reg_access(mmio, NULL, offset,
184 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
185 return false;
186 }
187
188 reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
189 vcpu->vcpu_id, offset);
190 vgic_reg_access(mmio, reg, offset,
191 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
192 return false;
193}
194
195static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
196 struct kvm_exit_mmio *mmio,
197 phys_addr_t offset)
198{
199 u32 *reg;
200
201 if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
202 vgic_reg_access(mmio, NULL, offset,
203 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
204 return false;
205 }
206
207 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
208 vcpu->vcpu_id, offset >> 1);
209
210 return vgic_handle_cfg_reg(reg, mmio, offset);
211}
212
213/*
214 * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
215 * when we store the target MPIDR written by the guest.
216 */
217static u32 compress_mpidr(unsigned long mpidr)
218{
219 u32 ret;
220
221 ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
222 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
223 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
224 ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
225
226 return ret;
227}
228
229static unsigned long uncompress_mpidr(u32 value)
230{
231 unsigned long mpidr;
232
233 mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
234 mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
235 mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
236 mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
237
238 return mpidr;
239}
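A stand-alone round-trip check of the packing scheme implemented by compress_mpidr()/uncompress_mpidr() above. The MPIDR helpers below restate the arm64 affinity layout (Aff0/1/2 at bits 0/8/16, Aff3 at bit 32) so the sketch builds on its own; it is an illustration, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define MPIDR_LEVEL_SHIFT(level)	(((1 << (level)) >> 1) << 3)
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
	(((mpidr) >> MPIDR_LEVEL_SHIFT(level)) & 0xffULL)

/* Same packing as compress_mpidr(): all four affinity bytes in one 32-bit word. */
static uint32_t pack_mpidr(uint64_t mpidr)
{
	return MPIDR_AFFINITY_LEVEL(mpidr, 0) |
	       MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 |
	       MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 |
	       MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
}

static uint64_t unpack_mpidr(uint32_t value)
{
	return ((uint64_t)(value & 0xff) << MPIDR_LEVEL_SHIFT(0)) |
	       ((uint64_t)((value >> 8) & 0xff) << MPIDR_LEVEL_SHIFT(1)) |
	       ((uint64_t)((value >> 16) & 0xff) << MPIDR_LEVEL_SHIFT(2)) |
	       ((uint64_t)((value >> 24) & 0xff) << MPIDR_LEVEL_SHIFT(3));
}

int main(void)
{
	uint64_t mpidr = 0x200030100ULL;	/* Aff3=2 Aff2=3 Aff1=1 Aff0=0 */
	uint32_t packed = pack_mpidr(mpidr);

	printf("mpidr 0x%llx -> packed 0x%x -> 0x%llx\n",
	       (unsigned long long)mpidr, packed,
	       (unsigned long long)unpack_mpidr(packed));
	return 0;
}

The 32-bit packed form is what ends up in irq_spi_mpidr[], keeping the per-SPI routing state to a single word while still carrying an Aff3 value.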
240
241/*
242 * Lookup the given MPIDR value to get the vcpu_id (if there is one)
243 * and store that in the irq_spi_cpu[] array.
244 * This limits the number of VCPUs to 255 for now; extending the data
245 * type (or storing kvm_vcpu pointers) should lift the limit.
246 * Store the original MPIDR value in an extra array to support read-as-written.
247 * Unallocated MPIDRs are translated to a special value and caught
248 * before any array accesses.
249 */
250static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
251 struct kvm_exit_mmio *mmio,
252 phys_addr_t offset)
253{
254 struct kvm *kvm = vcpu->kvm;
255 struct vgic_dist *dist = &kvm->arch.vgic;
256 int spi;
257 u32 reg;
258 int vcpu_id;
259 unsigned long *bmap, mpidr;
260
261 /*
262 * The upper 32 bits of each 64 bit register are zero,
263 * as we don't support Aff3.
264 */
265 if ((offset & 4)) {
266 vgic_reg_access(mmio, NULL, offset,
267 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
268 return false;
269 }
270
271 /* This region only covers SPIs, so no handling of private IRQs here. */
272 spi = offset / 8;
273
274 /* get the stored MPIDR for this IRQ */
275 mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
276 reg = mpidr;
277
278 vgic_reg_access(mmio, &reg, offset,
279 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
280
281 if (!mmio->is_write)
282 return false;
283
284 /*
285 * Now clear the currently assigned vCPU from the map, making room
286 * for the new one to be written below
287 */
288 vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
289 if (likely(vcpu)) {
290 vcpu_id = vcpu->vcpu_id;
291 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
292 __clear_bit(spi, bmap);
293 }
294
295 dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
296 vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
297
298 /*
299 * The spec says that non-existent MPIDR values should not be
300 * forwarded to any existing (v)CPU, but should be able to become
301 * pending anyway. We simply keep the irq_spi_target[] array empty, so
302 * the interrupt will never be injected.
303 * irq_spi_cpu[irq] gets a magic value in this case.
304 */
305 if (likely(vcpu)) {
306 vcpu_id = vcpu->vcpu_id;
307 dist->irq_spi_cpu[spi] = vcpu_id;
308 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
309 __set_bit(spi, bmap);
310 } else {
311 dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
312 }
313
314 vgic_update_state(kvm);
315
316 return true;
317}
318
319/*
320 * We should be careful about promising too much when a guest reads
321 * this register. Don't claim to be like any hardware implementation,
322 * but just report the GIC as version 3 - which is what a Linux guest
323 * would check.
324 */
325static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
326 struct kvm_exit_mmio *mmio,
327 phys_addr_t offset)
328{
329 u32 reg = 0;
330
331 switch (offset + GICD_IDREGS) {
332 case GICD_PIDR2:
333 reg = 0x3b;
334 break;
335 }
336
337 vgic_reg_access(mmio, &reg, offset,
338 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
339
340 return false;
341}
342
343static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
344 {
345 .base = GICD_CTLR,
346 .len = 0x04,
347 .bits_per_irq = 0,
348 .handle_mmio = handle_mmio_ctlr,
349 },
350 {
351 .base = GICD_TYPER,
352 .len = 0x04,
353 .bits_per_irq = 0,
354 .handle_mmio = handle_mmio_typer,
355 },
356 {
357 .base = GICD_IIDR,
358 .len = 0x04,
359 .bits_per_irq = 0,
360 .handle_mmio = handle_mmio_iidr,
361 },
362 {
363 /* this register is optional, it is RAZ/WI if not implemented */
364 .base = GICD_STATUSR,
365 .len = 0x04,
366 .bits_per_irq = 0,
367 .handle_mmio = handle_mmio_raz_wi,
368 },
369 {
370 /* this write only register is WI when TYPER.MBIS=0 */
371 .base = GICD_SETSPI_NSR,
372 .len = 0x04,
373 .bits_per_irq = 0,
374 .handle_mmio = handle_mmio_raz_wi,
375 },
376 {
377 /* this write only register is WI when TYPER.MBIS=0 */
378 .base = GICD_CLRSPI_NSR,
379 .len = 0x04,
380 .bits_per_irq = 0,
381 .handle_mmio = handle_mmio_raz_wi,
382 },
383 {
384 /* this is RAZ/WI when DS=1 */
385 .base = GICD_SETSPI_SR,
386 .len = 0x04,
387 .bits_per_irq = 0,
388 .handle_mmio = handle_mmio_raz_wi,
389 },
390 {
391 /* this is RAZ/WI when DS=1 */
392 .base = GICD_CLRSPI_SR,
393 .len = 0x04,
394 .bits_per_irq = 0,
395 .handle_mmio = handle_mmio_raz_wi,
396 },
397 {
398 .base = GICD_IGROUPR,
399 .len = 0x80,
400 .bits_per_irq = 1,
401 .handle_mmio = handle_mmio_rao_wi,
402 },
403 {
404 .base = GICD_ISENABLER,
405 .len = 0x80,
406 .bits_per_irq = 1,
407 .handle_mmio = handle_mmio_set_enable_reg_dist,
408 },
409 {
410 .base = GICD_ICENABLER,
411 .len = 0x80,
412 .bits_per_irq = 1,
413 .handle_mmio = handle_mmio_clear_enable_reg_dist,
414 },
415 {
416 .base = GICD_ISPENDR,
417 .len = 0x80,
418 .bits_per_irq = 1,
419 .handle_mmio = handle_mmio_set_pending_reg_dist,
420 },
421 {
422 .base = GICD_ICPENDR,
423 .len = 0x80,
424 .bits_per_irq = 1,
425 .handle_mmio = handle_mmio_clear_pending_reg_dist,
426 },
427 {
428 .base = GICD_ISACTIVER,
429 .len = 0x80,
430 .bits_per_irq = 1,
431 .handle_mmio = handle_mmio_raz_wi,
432 },
433 {
434 .base = GICD_ICACTIVER,
435 .len = 0x80,
436 .bits_per_irq = 1,
437 .handle_mmio = handle_mmio_raz_wi,
438 },
439 {
440 .base = GICD_IPRIORITYR,
441 .len = 0x400,
442 .bits_per_irq = 8,
443 .handle_mmio = handle_mmio_priority_reg_dist,
444 },
445 {
446 /* TARGETSRn is RES0 when ARE=1 */
447 .base = GICD_ITARGETSR,
448 .len = 0x400,
449 .bits_per_irq = 8,
450 .handle_mmio = handle_mmio_raz_wi,
451 },
452 {
453 .base = GICD_ICFGR,
454 .len = 0x100,
455 .bits_per_irq = 2,
456 .handle_mmio = handle_mmio_cfg_reg_dist,
457 },
458 {
459 /* this is RAZ/WI when DS=1 */
460 .base = GICD_IGRPMODR,
461 .len = 0x80,
462 .bits_per_irq = 1,
463 .handle_mmio = handle_mmio_raz_wi,
464 },
465 {
466 /* this is RAZ/WI when DS=1 */
467 .base = GICD_NSACR,
468 .len = 0x100,
469 .bits_per_irq = 2,
470 .handle_mmio = handle_mmio_raz_wi,
471 },
472 {
473 /* this is RAZ/WI when ARE=1 */
474 .base = GICD_SGIR,
475 .len = 0x04,
476 .handle_mmio = handle_mmio_raz_wi,
477 },
478 {
479 /* this is RAZ/WI when ARE=1 */
480 .base = GICD_CPENDSGIR,
481 .len = 0x10,
482 .handle_mmio = handle_mmio_raz_wi,
483 },
484 {
485 /* this is RAZ/WI when ARE=1 */
486 .base = GICD_SPENDSGIR,
487 .len = 0x10,
488 .handle_mmio = handle_mmio_raz_wi,
489 },
490 {
491 .base = GICD_IROUTER + 0x100,
492 .len = 0x1ee0,
493 .bits_per_irq = 64,
494 .handle_mmio = handle_mmio_route_reg,
495 },
496 {
497 .base = GICD_IDREGS,
498 .len = 0x30,
499 .bits_per_irq = 0,
500 .handle_mmio = handle_mmio_idregs,
501 },
502 {},
503};
504
505static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
506 struct kvm_exit_mmio *mmio,
507 phys_addr_t offset)
508{
509 struct kvm_vcpu *redist_vcpu = mmio->private;
510
511 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
512 redist_vcpu->vcpu_id,
513 ACCESS_WRITE_SETBIT);
514}
515
516static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
517 struct kvm_exit_mmio *mmio,
518 phys_addr_t offset)
519{
520 struct kvm_vcpu *redist_vcpu = mmio->private;
521
522 return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
523 redist_vcpu->vcpu_id,
524 ACCESS_WRITE_CLEARBIT);
525}
526
527static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
528 struct kvm_exit_mmio *mmio,
529 phys_addr_t offset)
530{
531 struct kvm_vcpu *redist_vcpu = mmio->private;
532
533 return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
534 redist_vcpu->vcpu_id);
535}
536
537static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
538 struct kvm_exit_mmio *mmio,
539 phys_addr_t offset)
540{
541 struct kvm_vcpu *redist_vcpu = mmio->private;
542
543 return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
544 redist_vcpu->vcpu_id);
545}
546
547static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
548 struct kvm_exit_mmio *mmio,
549 phys_addr_t offset)
550{
551 struct kvm_vcpu *redist_vcpu = mmio->private;
552 u32 *reg;
553
554 reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
555 redist_vcpu->vcpu_id, offset);
556 vgic_reg_access(mmio, reg, offset,
557 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
558 return false;
559}
560
561static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
562 struct kvm_exit_mmio *mmio,
563 phys_addr_t offset)
564{
565 struct kvm_vcpu *redist_vcpu = mmio->private;
566
567 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
568 redist_vcpu->vcpu_id, offset >> 1);
569
570 return vgic_handle_cfg_reg(reg, mmio, offset);
571}
572
573static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
574 {
575 .base = GICR_IGROUPR0,
576 .len = 0x04,
577 .bits_per_irq = 1,
578 .handle_mmio = handle_mmio_rao_wi,
579 },
580 {
581 .base = GICR_ISENABLER0,
582 .len = 0x04,
583 .bits_per_irq = 1,
584 .handle_mmio = handle_mmio_set_enable_reg_redist,
585 },
586 {
587 .base = GICR_ICENABLER0,
588 .len = 0x04,
589 .bits_per_irq = 1,
590 .handle_mmio = handle_mmio_clear_enable_reg_redist,
591 },
592 {
593 .base = GICR_ISPENDR0,
594 .len = 0x04,
595 .bits_per_irq = 1,
596 .handle_mmio = handle_mmio_set_pending_reg_redist,
597 },
598 {
599 .base = GICR_ICPENDR0,
600 .len = 0x04,
601 .bits_per_irq = 1,
602 .handle_mmio = handle_mmio_clear_pending_reg_redist,
603 },
604 {
605 .base = GICR_ISACTIVER0,
606 .len = 0x04,
607 .bits_per_irq = 1,
608 .handle_mmio = handle_mmio_raz_wi,
609 },
610 {
611 .base = GICR_ICACTIVER0,
612 .len = 0x04,
613 .bits_per_irq = 1,
614 .handle_mmio = handle_mmio_raz_wi,
615 },
616 {
617 .base = GICR_IPRIORITYR0,
618 .len = 0x20,
619 .bits_per_irq = 8,
620 .handle_mmio = handle_mmio_priority_reg_redist,
621 },
622 {
623 .base = GICR_ICFGR0,
624 .len = 0x08,
625 .bits_per_irq = 2,
626 .handle_mmio = handle_mmio_cfg_reg_redist,
627 },
628 {
629 .base = GICR_IGRPMODR0,
630 .len = 0x04,
631 .bits_per_irq = 1,
632 .handle_mmio = handle_mmio_raz_wi,
633 },
634 {
635 .base = GICR_NSACR,
636 .len = 0x04,
637 .handle_mmio = handle_mmio_raz_wi,
638 },
639 {},
640};
641
642static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
643 struct kvm_exit_mmio *mmio,
644 phys_addr_t offset)
645{
646 /* since we don't support LPIs, this register is zero for now */
647 vgic_reg_access(mmio, NULL, offset,
648 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
649 return false;
650}
651
652static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
653 struct kvm_exit_mmio *mmio,
654 phys_addr_t offset)
655{
656 u32 reg;
657 u64 mpidr;
658 struct kvm_vcpu *redist_vcpu = mmio->private;
659 int target_vcpu_id = redist_vcpu->vcpu_id;
660
661 /* the upper 32 bits contain the affinity value */
662 if ((offset & ~3) == 4) {
663 mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
664 reg = compress_mpidr(mpidr);
665
666 vgic_reg_access(mmio, &reg, offset,
667 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
668 return false;
669 }
670
671 reg = redist_vcpu->vcpu_id << 8;
672 if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
673 reg |= GICR_TYPER_LAST;
674 vgic_reg_access(mmio, &reg, offset,
675 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
676 return false;
677}
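
GICR_TYPER as emulated here packs the processor number into bits [23:8] of the lower word, sets the Last flag for the final redistributor in the region, and places the compressed affinity value into the upper 32 bits. A minimal decoding sketch, assuming exactly that layout (decode_gicr_typer() and the example value are illustrative only, not part of the kernel sources):

#include <stdint.h>
#include <stdio.h>

#define GICR_TYPER_LAST		(1U << 4)	/* "last redistributor" flag */

/* Decode the fields emulated by handle_mmio_typer_redist() above,
 * assuming: processor number in bits [23:8], Last flag in bit 4,
 * compressed affinity value in the upper 32 bits.
 */
static void decode_gicr_typer(uint64_t typer)
{
	uint32_t lower = (uint32_t)typer;
	uint32_t aff = (uint32_t)(typer >> 32);

	printf("processor number: %u\n", (unsigned)((lower >> 8) & 0xffff));
	printf("last in region:   %s\n", (lower & GICR_TYPER_LAST) ? "yes" : "no");
	printf("compressed aff:   0x%08x\n", (unsigned)aff);
}

int main(void)
{
	/* Example: VCPU 3, last redistributor, compressed affinity 0x3 */
	decode_gicr_typer(((uint64_t)0x3 << 32) | (3u << 8) | GICR_TYPER_LAST);
	return 0;
}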
678
679static const struct kvm_mmio_range vgic_redist_ranges[] = {
680 {
681 .base = GICR_CTLR,
682 .len = 0x04,
683 .bits_per_irq = 0,
684 .handle_mmio = handle_mmio_ctlr_redist,
685 },
686 {
687 .base = GICR_TYPER,
688 .len = 0x08,
689 .bits_per_irq = 0,
690 .handle_mmio = handle_mmio_typer_redist,
691 },
692 {
693 .base = GICR_IIDR,
694 .len = 0x04,
695 .bits_per_irq = 0,
696 .handle_mmio = handle_mmio_iidr,
697 },
698 {
699 .base = GICR_WAKER,
700 .len = 0x04,
701 .bits_per_irq = 0,
702 .handle_mmio = handle_mmio_raz_wi,
703 },
704 {
705 .base = GICR_IDREGS,
706 .len = 0x30,
707 .bits_per_irq = 0,
708 .handle_mmio = handle_mmio_idregs,
709 },
710 {},
711};
712
713/*
 714 * This function splits accesses between the distributor and the two
 715 * per-VCPU redistributor frames. As each redistributor is accessible
716 * from any CPU, we have to determine the affected VCPU by taking the faulting
717 * address into account. We then pass this VCPU to the handler function via
718 * the private parameter.
719 */
720#define SGI_BASE_OFFSET SZ_64K
721static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
722 struct kvm_exit_mmio *mmio)
723{
724 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
725 unsigned long dbase = dist->vgic_dist_base;
726 unsigned long rdbase = dist->vgic_redist_base;
727 int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
728 int vcpu_id;
729 const struct kvm_mmio_range *mmio_range;
730
731 if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
732 return vgic_handle_mmio_range(vcpu, run, mmio,
733 vgic_v3_dist_ranges, dbase);
734 }
735
736 if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
737 GIC_V3_REDIST_SIZE * nrcpus))
738 return false;
739
740 vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
741 rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
742 mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
743
744 if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
745 rdbase += SGI_BASE_OFFSET;
746 mmio_range = vgic_redist_sgi_ranges;
747 } else {
748 mmio_range = vgic_redist_ranges;
749 }
750 return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
751}
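
The decode above assumes each emulated redistributor occupies two contiguous 64K frames (RD_base followed by SGI_base at SGI_BASE_OFFSET), so the faulting address directly yields both the VCPU index and the frame. A standalone sketch of that arithmetic, with the constants assumed here rather than taken from the kernel headers:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Assumed constants matching the handler above: each emulated
 * redistributor occupies two 64K frames.
 */
#define SZ_64K			0x10000UL
#define GIC_V3_REDIST_SIZE	(2 * SZ_64K)
#define SGI_BASE_OFFSET		SZ_64K

struct rdist_decode {
	int vcpu_id;
	bool sgi_frame;		/* true: SGI_base frame, false: RD_base frame */
	unsigned long offset;	/* offset within the selected frame */
};

/* Hypothetical re-statement of the decode logic in vgic_v3_handle_mmio(). */
static struct rdist_decode decode_redist_addr(unsigned long rdbase,
					      unsigned long addr)
{
	struct rdist_decode d;
	unsigned long frame;

	d.vcpu_id = (addr - rdbase) / GIC_V3_REDIST_SIZE;
	frame = rdbase + d.vcpu_id * GIC_V3_REDIST_SIZE;
	d.sgi_frame = addr >= frame + SGI_BASE_OFFSET;
	d.offset = addr - frame - (d.sgi_frame ? SGI_BASE_OFFSET : 0);
	return d;
}

int main(void)
{
	struct rdist_decode d = decode_redist_addr(0x08100000UL, 0x08130100UL);

	printf("vcpu=%d frame=%s offset=0x%lx\n",
	       d.vcpu_id, d.sgi_frame ? "SGI" : "RD", d.offset);
	return 0;
}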
752
753static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
754{
755 if (vgic_queue_irq(vcpu, 0, irq)) {
756 vgic_dist_irq_clear_pending(vcpu, irq);
757 vgic_cpu_irq_clear(vcpu, irq);
758 return true;
759 }
760
761 return false;
762}
763
764static int vgic_v3_map_resources(struct kvm *kvm,
765 const struct vgic_params *params)
766{
767 int ret = 0;
768 struct vgic_dist *dist = &kvm->arch.vgic;
769
770 if (!irqchip_in_kernel(kvm))
771 return 0;
772
773 mutex_lock(&kvm->lock);
774
775 if (vgic_ready(kvm))
776 goto out;
777
778 if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
779 IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
780 kvm_err("Need to set vgic distributor addresses first\n");
781 ret = -ENXIO;
782 goto out;
783 }
784
785 /*
 786	 * For a GICv3 we require userspace to explicitly initialize the
 787	 * VGIC before it can be used.
788 */
789 if (!vgic_initialized(kvm)) {
790 ret = -EBUSY;
791 goto out;
792 }
793
794 kvm->arch.vgic.ready = true;
795out:
796 if (ret)
797 kvm_vgic_destroy(kvm);
798 mutex_unlock(&kvm->lock);
799 return ret;
800}
801
802static int vgic_v3_init_model(struct kvm *kvm)
803{
804 int i;
805 u32 mpidr;
806 struct vgic_dist *dist = &kvm->arch.vgic;
807 int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
808
809 dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
810 GFP_KERNEL);
811
812 if (!dist->irq_spi_mpidr)
813 return -ENOMEM;
814
815 /* Initialize the target VCPUs for each IRQ to VCPU 0 */
816 mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
817 for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
818 dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
819 dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
820 vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
821 }
822
823 return 0;
824}
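
compress_mpidr() itself is not part of this hunk; a plausible stand-in, assuming the usual MPIDR layout with Aff0..Aff2 in bits [23:0] and Aff3 in bits [39:32] packed into one 32-bit word (compress_mpidr_sketch() is a hypothetical name, not the kernel's implementation):

#include <stdint.h>
#include <stdio.h>

/* Assumed helper, not taken from this diff: pack the four 8-bit
 * affinity fields of a 64-bit MPIDR (Aff0..Aff2 in bits [23:0],
 * Aff3 in bits [39:32]) into a single 32-bit value.
 */
static uint32_t compress_mpidr_sketch(uint64_t mpidr)
{
	return (uint32_t)(mpidr & 0xffffff) |
	       ((uint32_t)((mpidr >> 32) & 0xff) << 24);
}

int main(void)
{
	/* Aff3=1, Aff2=0, Aff1=2, Aff0=3 */
	uint64_t mpidr = (1ULL << 32) | (2ULL << 8) | 3ULL;

	printf("compressed: 0x%08x\n", (unsigned)compress_mpidr_sketch(mpidr));
	return 0;
}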
825
826/* GICv3 does not keep track of SGI sources anymore. */
827static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
828{
829}
830
831void vgic_v3_init_emulation(struct kvm *kvm)
832{
833 struct vgic_dist *dist = &kvm->arch.vgic;
834
835 dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
836 dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
837 dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
838 dist->vm_ops.init_model = vgic_v3_init_model;
839 dist->vm_ops.map_resources = vgic_v3_map_resources;
840
841 kvm->arch.max_vcpus = KVM_MAX_VCPUS;
842}
843
844/*
845 * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
846 * generation register ICC_SGI1R_EL1) with a given VCPU.
847 * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
848 * return -1.
849 */
850static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
851{
852 unsigned long affinity;
853 int level0;
854
855 /*
856 * Split the current VCPU's MPIDR into affinity level 0 and the
857 * rest as this is what we have to compare against.
858 */
859 affinity = kvm_vcpu_get_mpidr_aff(vcpu);
860 level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
861 affinity &= ~MPIDR_LEVEL_MASK;
862
863 /* bail out if the upper three levels don't match */
864 if (sgi_aff != affinity)
865 return -1;
866
 867 /* Is this VCPU's bit set in the mask? */
868 if (!(sgi_cpu_mask & BIT(level0)))
869 return -1;
870
871 return level0;
872}
873
874#define SGI_AFFINITY_LEVEL(reg, level) \
875 ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
876 >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
877
878/**
879 * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
880 * @vcpu: The VCPU requesting a SGI
881 * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
882 *
883 * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
884 * This will trap in sys_regs.c and call this function.
 885 * The ICC_SGI1R_EL1 register contains the upper three affinity levels of the
 886 * target processors as well as a bitmask of 16 Aff0 CPUs.
 887 * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
 888 * check for matching ones. If that bit is set, we signal all VCPUs except
 889 * the calling one.
890 */
891void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
892{
893 struct kvm *kvm = vcpu->kvm;
894 struct kvm_vcpu *c_vcpu;
895 struct vgic_dist *dist = &kvm->arch.vgic;
896 u16 target_cpus;
897 u64 mpidr;
898 int sgi, c;
899 int vcpu_id = vcpu->vcpu_id;
900 bool broadcast;
901 int updated = 0;
902
903 sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
904 broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
905 target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
906 mpidr = SGI_AFFINITY_LEVEL(reg, 3);
907 mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
908 mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
909
910 /*
911 * We take the dist lock here, because we come from the sysregs
912 * code path and not from the MMIO one (which already takes the lock).
913 */
914 spin_lock(&dist->lock);
915
916 /*
917 * We iterate over all VCPUs to find the MPIDRs matching the request.
 918 * If we have handled one CPU, we clear its bit to detect early
 919 * whether we are already finished. This avoids iterating through all
 920 * VCPUs when most of the time we just signal a single VCPU.
921 */
922 kvm_for_each_vcpu(c, c_vcpu, kvm) {
923
924 /* Exit early if we have dealt with all requested CPUs */
925 if (!broadcast && target_cpus == 0)
926 break;
927
928 /* Don't signal the calling VCPU */
929 if (broadcast && c == vcpu_id)
930 continue;
931
932 if (!broadcast) {
933 int level0;
934
935 level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
936 if (level0 == -1)
937 continue;
938
939 /* remove this matching VCPU from the mask */
940 target_cpus &= ~BIT(level0);
941 }
942
943 /* Flag the SGI as pending */
944 vgic_dist_irq_set_pending(c_vcpu, sgi);
945 updated = 1;
946 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
947 }
948 if (updated)
949 vgic_update_state(vcpu->kvm);
950 spin_unlock(&dist->lock);
951 if (updated)
952 vgic_kick_vcpus(vcpu->kvm);
953}
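
The dispatch routine above splits ICC_SGI1R_EL1 into an SGI number, a routing mode bit, three upper affinity levels and a 16-bit Aff0 target list. A small sketch decoding such a value, with the field positions assumed from the architectural layout rather than taken from the ICC_SGI1R_* macros used in the code:

#include <stdint.h>
#include <stdio.h>

/* Field layout of ICC_SGI1R_EL1 as assumed here: target list [15:0],
 * Aff1 [23:16], SGI ID [27:24], Aff2 [39:32], IRM bit 40, Aff3 [55:48].
 */
static void decode_sgi1r(uint64_t reg)
{
	unsigned int sgi  = (reg >> 24) & 0xf;
	unsigned int irm  = (reg >> 40) & 0x1;
	unsigned int aff3 = (reg >> 48) & 0xff;
	unsigned int aff2 = (reg >> 32) & 0xff;
	unsigned int aff1 = (reg >> 16) & 0xff;
	unsigned int tl   = reg & 0xffff;

	printf("SGI%u, routing=%s, Aff3.Aff2.Aff1=%u.%u.%u, target list=0x%04x\n",
	       sgi, irm ? "all-but-self" : "target-list", aff3, aff2, aff1, tl);
}

int main(void)
{
	/* SGI 5 to the Aff1=1 cluster, Aff0 CPUs 0 and 2 (mask 0b101) */
	decode_sgi1r((5ULL << 24) | (1ULL << 16) | 0x5);
	return 0;
}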
954
955static int vgic_v3_create(struct kvm_device *dev, u32 type)
956{
957 return kvm_vgic_create(dev->kvm, type);
958}
959
960static void vgic_v3_destroy(struct kvm_device *dev)
961{
962 kfree(dev);
963}
964
965static int vgic_v3_set_attr(struct kvm_device *dev,
966 struct kvm_device_attr *attr)
967{
968 int ret;
969
970 ret = vgic_set_common_attr(dev, attr);
971 if (ret != -ENXIO)
972 return ret;
973
974 switch (attr->group) {
975 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
976 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
977 return -ENXIO;
978 }
979
980 return -ENXIO;
981}
982
983static int vgic_v3_get_attr(struct kvm_device *dev,
984 struct kvm_device_attr *attr)
985{
986 int ret;
987
988 ret = vgic_get_common_attr(dev, attr);
989 if (ret != -ENXIO)
990 return ret;
991
992 switch (attr->group) {
993 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
994 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
995 return -ENXIO;
996 }
997
998 return -ENXIO;
999}
1000
1001static int vgic_v3_has_attr(struct kvm_device *dev,
1002 struct kvm_device_attr *attr)
1003{
1004 switch (attr->group) {
1005 case KVM_DEV_ARM_VGIC_GRP_ADDR:
1006 switch (attr->attr) {
1007 case KVM_VGIC_V2_ADDR_TYPE_DIST:
1008 case KVM_VGIC_V2_ADDR_TYPE_CPU:
1009 return -ENXIO;
1010 case KVM_VGIC_V3_ADDR_TYPE_DIST:
1011 case KVM_VGIC_V3_ADDR_TYPE_REDIST:
1012 return 0;
1013 }
1014 break;
1015 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
1016 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
1017 return -ENXIO;
1018 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
1019 return 0;
1020 case KVM_DEV_ARM_VGIC_GRP_CTRL:
1021 switch (attr->attr) {
1022 case KVM_DEV_ARM_VGIC_CTRL_INIT:
1023 return 0;
1024 }
1025 }
1026 return -ENXIO;
1027}
1028
1029struct kvm_device_ops kvm_arm_vgic_v3_ops = {
1030 .name = "kvm-arm-vgic-v3",
1031 .create = vgic_v3_create,
1032 .destroy = vgic_v3_destroy,
1033 .set_attr = vgic_v3_set_attr,
1034 .get_attr = vgic_v3_get_attr,
1035 .has_attr = vgic_v3_has_attr,
1036};
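
From userspace this device is reached through the generic KVM device API: create a KVM_DEV_TYPE_ARM_VGIC_V3 device on the VM fd, program the base addresses through KVM_DEV_ARM_VGIC_GRP_ADDR, and finish with KVM_DEV_ARM_VGIC_CTRL_INIT. A hedged sketch, assuming an arm64 host with the uapi headers installed and a VM fd already obtained from /dev/kvm; the guest addresses are example values and error handling is trimmed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the v3 device on an existing VM fd, program example guest
 * physical base addresses for the distributor and redistributors,
 * then mark the VGIC as initialized.
 */
static void setup_vgic_v3(int vm_fd)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
	uint64_t dist_base = 0x08000000ULL;	/* example addresses */
	uint64_t redist_base = 0x080a0000ULL;
	struct kvm_device_attr attr;

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0) {
		perror("KVM_CREATE_DEVICE");
		return;
	}

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_ARM_VGIC_GRP_ADDR;
	attr.attr = KVM_VGIC_V3_ADDR_TYPE_DIST;
	attr.addr = (uint64_t)(unsigned long)&dist_base;
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);

	attr.attr = KVM_VGIC_V3_ADDR_TYPE_REDIST;
	attr.addr = (uint64_t)(unsigned long)&redist_base;
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);

	/* Hand the configuration over to the kernel side. */
	attr.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
	attr.attr = KVM_DEV_ARM_VGIC_CTRL_INIT;
	attr.addr = 0;
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}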
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 1c2c8eef0599..3a62d8a9a2c6 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -34,6 +34,7 @@
34#define GICH_LR_VIRTUALID (0x3ffUL << 0) 34#define GICH_LR_VIRTUALID (0x3ffUL << 0)
35#define GICH_LR_PHYSID_CPUID_SHIFT (10) 35#define GICH_LR_PHYSID_CPUID_SHIFT (10)
36#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) 36#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
37#define ICH_LR_VIRTUALID_MASK (BIT_ULL(32) - 1)
37 38
38/* 39/*
39 * LRs are stored in reverse order in memory. make sure we index them 40 * LRs are stored in reverse order in memory. make sure we index them
@@ -48,12 +49,17 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
48 struct vgic_lr lr_desc; 49 struct vgic_lr lr_desc;
49 u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)]; 50 u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)];
50 51
51 lr_desc.irq = val & GICH_LR_VIRTUALID; 52 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
52 if (lr_desc.irq <= 15) 53 lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
53 lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
54 else 54 else
55 lr_desc.source = 0; 55 lr_desc.irq = val & GICH_LR_VIRTUALID;
56 lr_desc.state = 0; 56
57 lr_desc.source = 0;
58 if (lr_desc.irq <= 15 &&
59 vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
60 lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
61
62 lr_desc.state = 0;
57 63
58 if (val & ICH_LR_PENDING_BIT) 64 if (val & ICH_LR_PENDING_BIT)
59 lr_desc.state |= LR_STATE_PENDING; 65 lr_desc.state |= LR_STATE_PENDING;
@@ -68,8 +74,20 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
68static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, 74static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
69 struct vgic_lr lr_desc) 75 struct vgic_lr lr_desc)
70{ 76{
71 u64 lr_val = (((u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | 77 u64 lr_val;
72 lr_desc.irq); 78
79 lr_val = lr_desc.irq;
80
81 /*
82 * Currently all guest IRQs are Group1, as Group0 would result
83 * in a FIQ in the guest, which it wouldn't expect.
84 * Eventually we want to make this configurable, so we may revisit
85 * this in the future.
86 */
87 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
88 lr_val |= ICH_LR_GROUP;
89 else
90 lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
73 91
74 if (lr_desc.state & LR_STATE_PENDING) 92 if (lr_desc.state & LR_STATE_PENDING)
75 lr_val |= ICH_LR_PENDING_BIT; 93 lr_val |= ICH_LR_PENDING_BIT;
@@ -145,15 +163,27 @@ static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
145 163
146static void vgic_v3_enable(struct kvm_vcpu *vcpu) 164static void vgic_v3_enable(struct kvm_vcpu *vcpu)
147{ 165{
166 struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
167
148 /* 168 /*
149 * By forcing VMCR to zero, the GIC will restore the binary 169 * By forcing VMCR to zero, the GIC will restore the binary
150 * points to their reset values. Anything else resets to zero 170 * points to their reset values. Anything else resets to zero
151 * anyway. 171 * anyway.
152 */ 172 */
153 vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = 0; 173 vgic_v3->vgic_vmcr = 0;
174
175 /*
 176 * If we are emulating a GICv3, we do it in a non-GICv2-compatible
177 * way, so we force SRE to 1 to demonstrate this to the guest.
178 * This goes with the spec allowing the value to be RAO/WI.
179 */
180 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
181 vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
182 else
183 vgic_v3->vgic_sre = 0;
154 184
155 /* Get the show on the road... */ 185 /* Get the show on the road... */
156 vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr = ICH_HCR_EN; 186 vgic_v3->vgic_hcr = ICH_HCR_EN;
157} 187}
158 188
159static const struct vgic_ops vgic_v3_ops = { 189static const struct vgic_ops vgic_v3_ops = {
@@ -205,35 +235,37 @@ int vgic_v3_probe(struct device_node *vgic_node,
205 * maximum of 16 list registers. Just ignore bit 4... 235 * maximum of 16 list registers. Just ignore bit 4...
206 */ 236 */
207 vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; 237 vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
238 vgic->can_emulate_gicv2 = false;
208 239
209 if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx)) 240 if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx))
210 gicv_idx = 1; 241 gicv_idx = 1;
211 242
212 gicv_idx += 3; /* Also skip GICD, GICC, GICH */ 243 gicv_idx += 3; /* Also skip GICD, GICC, GICH */
213 if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) { 244 if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) {
214 kvm_err("Cannot obtain GICV region\n"); 245 kvm_info("GICv3: no GICV resource entry\n");
215 ret = -ENXIO; 246 vgic->vcpu_base = 0;
216 goto out; 247 } else if (!PAGE_ALIGNED(vcpu_res.start)) {
217 } 248 pr_warn("GICV physical address 0x%llx not page aligned\n",
218
219 if (!PAGE_ALIGNED(vcpu_res.start)) {
220 kvm_err("GICV physical address 0x%llx not page aligned\n",
221 (unsigned long long)vcpu_res.start); 249 (unsigned long long)vcpu_res.start);
222 ret = -ENXIO; 250 vgic->vcpu_base = 0;
223 goto out; 251 } else if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
224 } 252 pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
225
226 if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
227 kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
228 (unsigned long long)resource_size(&vcpu_res), 253 (unsigned long long)resource_size(&vcpu_res),
229 PAGE_SIZE); 254 PAGE_SIZE);
230 ret = -ENXIO; 255 vgic->vcpu_base = 0;
231 goto out; 256 } else {
257 vgic->vcpu_base = vcpu_res.start;
258 vgic->can_emulate_gicv2 = true;
259 kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
260 KVM_DEV_TYPE_ARM_VGIC_V2);
232 } 261 }
262 if (vgic->vcpu_base == 0)
263 kvm_info("disabling GICv2 emulation\n");
264 kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
233 265
234 vgic->vcpu_base = vcpu_res.start;
235 vgic->vctrl_base = NULL; 266 vgic->vctrl_base = NULL;
236 vgic->type = VGIC_V3; 267 vgic->type = VGIC_V3;
268 vgic->max_gic_vcpus = KVM_MAX_VCPUS;
237 269
238 kvm_info("%s@%llx IRQ%d\n", vgic_node->name, 270 kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
239 vcpu_res.start, vgic->maint_irq); 271 vcpu_res.start, vgic->maint_irq);
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 03affc7bf453..0cc6ab6005a0 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -75,37 +75,31 @@
75 * inactive as long as the external input line is held high. 75 * inactive as long as the external input line is held high.
76 */ 76 */
77 77
78#define VGIC_ADDR_UNDEF (-1) 78#include "vgic.h"
79#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) 79
80
81#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
82#define IMPLEMENTER_ARM 0x43b
83#define GICC_ARCH_VERSION_V2 0x2
84
85#define ACCESS_READ_VALUE (1 << 0)
86#define ACCESS_READ_RAZ (0 << 0)
87#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
88#define ACCESS_WRITE_IGNORED (0 << 1)
89#define ACCESS_WRITE_SETBIT (1 << 1)
90#define ACCESS_WRITE_CLEARBIT (2 << 1)
91#define ACCESS_WRITE_VALUE (3 << 1)
92#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
93
94static int vgic_init(struct kvm *kvm);
95static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); 80static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
96static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); 81static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
97static void vgic_update_state(struct kvm *kvm);
98static void vgic_kick_vcpus(struct kvm *kvm);
99static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi);
100static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
101static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); 82static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
102static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); 83static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
103static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
104static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
105 84
106static const struct vgic_ops *vgic_ops; 85static const struct vgic_ops *vgic_ops;
107static const struct vgic_params *vgic; 86static const struct vgic_params *vgic;
108 87
88static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
89{
90 vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
91}
92
93static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
94{
95 return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
96}
97
98int kvm_vgic_map_resources(struct kvm *kvm)
99{
100 return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
101}
102
109/* 103/*
110 * struct vgic_bitmap contains a bitmap made of unsigned longs, but 104 * struct vgic_bitmap contains a bitmap made of unsigned longs, but
111 * extracts u32s out of them. 105 * extracts u32s out of them.
@@ -160,8 +154,7 @@ static unsigned long *u64_to_bitmask(u64 *val)
160 return (unsigned long *)val; 154 return (unsigned long *)val;
161} 155}
162 156
163static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, 157u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
164 int cpuid, u32 offset)
165{ 158{
166 offset >>= 2; 159 offset >>= 2;
167 if (!offset) 160 if (!offset)
@@ -179,8 +172,8 @@ static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
179 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared); 172 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
180} 173}
181 174
182static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, 175void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
183 int irq, int val) 176 int irq, int val)
184{ 177{
185 unsigned long *reg; 178 unsigned long *reg;
186 179
@@ -202,7 +195,7 @@ static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
202 return x->private + cpuid; 195 return x->private + cpuid;
203} 196}
204 197
205static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) 198unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
206{ 199{
207 return x->shared; 200 return x->shared;
208} 201}
@@ -229,7 +222,7 @@ static void vgic_free_bytemap(struct vgic_bytemap *b)
229 b->shared = NULL; 222 b->shared = NULL;
230} 223}
231 224
232static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) 225u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
233{ 226{
234 u32 *reg; 227 u32 *reg;
235 228
@@ -326,14 +319,14 @@ static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
326 return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq); 319 return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
327} 320}
328 321
329static void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq) 322void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
330{ 323{
331 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 324 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
332 325
333 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1); 326 vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
334} 327}
335 328
336static void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq) 329void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
337{ 330{
338 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 331 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
339 332
@@ -349,7 +342,7 @@ static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
349 vcpu->arch.vgic_cpu.pending_shared); 342 vcpu->arch.vgic_cpu.pending_shared);
350} 343}
351 344
352static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) 345void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
353{ 346{
354 if (irq < VGIC_NR_PRIVATE_IRQS) 347 if (irq < VGIC_NR_PRIVATE_IRQS)
355 clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); 348 clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
@@ -363,16 +356,6 @@ static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
363 return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq); 356 return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
364} 357}
365 358
366static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
367{
368 return le32_to_cpu(*((u32 *)mmio->data)) & mask;
369}
370
371static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
372{
373 *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
374}
375
376/** 359/**
377 * vgic_reg_access - access vgic register 360 * vgic_reg_access - access vgic register
378 * @mmio: pointer to the data describing the mmio access 361 * @mmio: pointer to the data describing the mmio access
@@ -384,8 +367,8 @@ static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
384 * modes defined for vgic register access 367 * modes defined for vgic register access
385 * (read,raz,write-ignored,setbit,clearbit,write) 368 * (read,raz,write-ignored,setbit,clearbit,write)
386 */ 369 */
387static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, 370void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
388 phys_addr_t offset, int mode) 371 phys_addr_t offset, int mode)
389{ 372{
390 int word_offset = (offset & 3) * 8; 373 int word_offset = (offset & 3) * 8;
391 u32 mask = (1UL << (mmio->len * 8)) - 1; 374 u32 mask = (1UL << (mmio->len * 8)) - 1;
@@ -434,107 +417,58 @@ static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
434 } 417 }
435} 418}
436 419
437static bool handle_mmio_misc(struct kvm_vcpu *vcpu, 420bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
438 struct kvm_exit_mmio *mmio, phys_addr_t offset) 421 phys_addr_t offset)
439{
440 u32 reg;
441 u32 word_offset = offset & 3;
442
443 switch (offset & ~3) {
444 case 0: /* GICD_CTLR */
445 reg = vcpu->kvm->arch.vgic.enabled;
446 vgic_reg_access(mmio, &reg, word_offset,
447 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
448 if (mmio->is_write) {
449 vcpu->kvm->arch.vgic.enabled = reg & 1;
450 vgic_update_state(vcpu->kvm);
451 return true;
452 }
453 break;
454
455 case 4: /* GICD_TYPER */
456 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
457 reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
458 vgic_reg_access(mmio, &reg, word_offset,
459 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
460 break;
461
462 case 8: /* GICD_IIDR */
463 reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
464 vgic_reg_access(mmio, &reg, word_offset,
465 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
466 break;
467 }
468
469 return false;
470}
471
472static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu,
473 struct kvm_exit_mmio *mmio, phys_addr_t offset)
474{ 422{
475 vgic_reg_access(mmio, NULL, offset, 423 vgic_reg_access(mmio, NULL, offset,
476 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); 424 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
477 return false; 425 return false;
478} 426}
479 427
480static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, 428bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
481 struct kvm_exit_mmio *mmio, 429 phys_addr_t offset, int vcpu_id, int access)
482 phys_addr_t offset)
483{ 430{
484 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, 431 u32 *reg;
485 vcpu->vcpu_id, offset); 432 int mode = ACCESS_READ_VALUE | access;
486 vgic_reg_access(mmio, reg, offset, 433 struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
487 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
488 if (mmio->is_write) {
489 vgic_update_state(vcpu->kvm);
490 return true;
491 }
492
493 return false;
494}
495 434
496static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, 435 reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
497 struct kvm_exit_mmio *mmio, 436 vgic_reg_access(mmio, reg, offset, mode);
498 phys_addr_t offset)
499{
500 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
501 vcpu->vcpu_id, offset);
502 vgic_reg_access(mmio, reg, offset,
503 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
504 if (mmio->is_write) { 437 if (mmio->is_write) {
505 if (offset < 4) /* Force SGI enabled */ 438 if (access & ACCESS_WRITE_CLEARBIT) {
506 *reg |= 0xffff; 439 if (offset < 4) /* Force SGI enabled */
507 vgic_retire_disabled_irqs(vcpu); 440 *reg |= 0xffff;
508 vgic_update_state(vcpu->kvm); 441 vgic_retire_disabled_irqs(target_vcpu);
442 }
443 vgic_update_state(kvm);
509 return true; 444 return true;
510 } 445 }
511 446
512 return false; 447 return false;
513} 448}
514 449
515static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, 450bool vgic_handle_set_pending_reg(struct kvm *kvm,
516 struct kvm_exit_mmio *mmio, 451 struct kvm_exit_mmio *mmio,
517 phys_addr_t offset) 452 phys_addr_t offset, int vcpu_id)
518{ 453{
519 u32 *reg, orig; 454 u32 *reg, orig;
520 u32 level_mask; 455 u32 level_mask;
521 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 456 int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
457 struct vgic_dist *dist = &kvm->arch.vgic;
522 458
523 reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu->vcpu_id, offset); 459 reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
524 level_mask = (~(*reg)); 460 level_mask = (~(*reg));
525 461
526 /* Mark both level and edge triggered irqs as pending */ 462 /* Mark both level and edge triggered irqs as pending */
527 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); 463 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
528 orig = *reg; 464 orig = *reg;
529 vgic_reg_access(mmio, reg, offset, 465 vgic_reg_access(mmio, reg, offset, mode);
530 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
531 466
532 if (mmio->is_write) { 467 if (mmio->is_write) {
533 /* Set the soft-pending flag only for level-triggered irqs */ 468 /* Set the soft-pending flag only for level-triggered irqs */
534 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, 469 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
535 vcpu->vcpu_id, offset); 470 vcpu_id, offset);
536 vgic_reg_access(mmio, reg, offset, 471 vgic_reg_access(mmio, reg, offset, mode);
537 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
538 *reg &= level_mask; 472 *reg &= level_mask;
539 473
540 /* Ignore writes to SGIs */ 474 /* Ignore writes to SGIs */
@@ -543,31 +477,30 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
543 *reg |= orig & 0xffff; 477 *reg |= orig & 0xffff;
544 } 478 }
545 479
546 vgic_update_state(vcpu->kvm); 480 vgic_update_state(kvm);
547 return true; 481 return true;
548 } 482 }
549 483
550 return false; 484 return false;
551} 485}
552 486
553static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, 487bool vgic_handle_clear_pending_reg(struct kvm *kvm,
554 struct kvm_exit_mmio *mmio, 488 struct kvm_exit_mmio *mmio,
555 phys_addr_t offset) 489 phys_addr_t offset, int vcpu_id)
556{ 490{
557 u32 *level_active; 491 u32 *level_active;
558 u32 *reg, orig; 492 u32 *reg, orig;
559 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 493 int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
494 struct vgic_dist *dist = &kvm->arch.vgic;
560 495
561 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); 496 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
562 orig = *reg; 497 orig = *reg;
563 vgic_reg_access(mmio, reg, offset, 498 vgic_reg_access(mmio, reg, offset, mode);
564 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
565 if (mmio->is_write) { 499 if (mmio->is_write) {
566 /* Re-set level triggered level-active interrupts */ 500 /* Re-set level triggered level-active interrupts */
567 level_active = vgic_bitmap_get_reg(&dist->irq_level, 501 level_active = vgic_bitmap_get_reg(&dist->irq_level,
568 vcpu->vcpu_id, offset); 502 vcpu_id, offset);
569 reg = vgic_bitmap_get_reg(&dist->irq_pending, 503 reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
570 vcpu->vcpu_id, offset);
571 *reg |= *level_active; 504 *reg |= *level_active;
572 505
573 /* Ignore writes to SGIs */ 506 /* Ignore writes to SGIs */
@@ -578,101 +511,12 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
578 511
579 /* Clear soft-pending flags */ 512 /* Clear soft-pending flags */
580 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, 513 reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
581 vcpu->vcpu_id, offset); 514 vcpu_id, offset);
582 vgic_reg_access(mmio, reg, offset, 515 vgic_reg_access(mmio, reg, offset, mode);
583 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
584 516
585 vgic_update_state(vcpu->kvm); 517 vgic_update_state(kvm);
586 return true; 518 return true;
587 } 519 }
588
589 return false;
590}
591
592static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
593 struct kvm_exit_mmio *mmio,
594 phys_addr_t offset)
595{
596 u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
597 vcpu->vcpu_id, offset);
598 vgic_reg_access(mmio, reg, offset,
599 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
600 return false;
601}
602
603#define GICD_ITARGETSR_SIZE 32
604#define GICD_CPUTARGETS_BITS 8
605#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
606static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
607{
608 struct vgic_dist *dist = &kvm->arch.vgic;
609 int i;
610 u32 val = 0;
611
612 irq -= VGIC_NR_PRIVATE_IRQS;
613
614 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
615 val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
616
617 return val;
618}
619
620static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
621{
622 struct vgic_dist *dist = &kvm->arch.vgic;
623 struct kvm_vcpu *vcpu;
624 int i, c;
625 unsigned long *bmap;
626 u32 target;
627
628 irq -= VGIC_NR_PRIVATE_IRQS;
629
630 /*
631 * Pick the LSB in each byte. This ensures we target exactly
632 * one vcpu per IRQ. If the byte is null, assume we target
633 * CPU0.
634 */
635 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
636 int shift = i * GICD_CPUTARGETS_BITS;
637 target = ffs((val >> shift) & 0xffU);
638 target = target ? (target - 1) : 0;
639 dist->irq_spi_cpu[irq + i] = target;
640 kvm_for_each_vcpu(c, vcpu, kvm) {
641 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
642 if (c == target)
643 set_bit(irq + i, bmap);
644 else
645 clear_bit(irq + i, bmap);
646 }
647 }
648}
649
650static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
651 struct kvm_exit_mmio *mmio,
652 phys_addr_t offset)
653{
654 u32 reg;
655
656 /* We treat the banked interrupts targets as read-only */
657 if (offset < 32) {
658 u32 roreg = 1 << vcpu->vcpu_id;
659 roreg |= roreg << 8;
660 roreg |= roreg << 16;
661
662 vgic_reg_access(mmio, &roreg, offset,
663 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
664 return false;
665 }
666
667 reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
668 vgic_reg_access(mmio, &reg, offset,
669 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
670 if (mmio->is_write) {
671 vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
672 vgic_update_state(vcpu->kvm);
673 return true;
674 }
675
676 return false; 520 return false;
677} 521}
678 522
@@ -711,14 +555,10 @@ static u16 vgic_cfg_compress(u32 val)
711 * LSB is always 0. As such, we only keep the upper bit, and use the 555 * LSB is always 0. As such, we only keep the upper bit, and use the
712 * two above functions to compress/expand the bits 556 * two above functions to compress/expand the bits
713 */ 557 */
714static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, 558bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
715 struct kvm_exit_mmio *mmio, phys_addr_t offset) 559 phys_addr_t offset)
716{ 560{
717 u32 val; 561 u32 val;
718 u32 *reg;
719
720 reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
721 vcpu->vcpu_id, offset >> 1);
722 562
723 if (offset & 4) 563 if (offset & 4)
724 val = *reg >> 16; 564 val = *reg >> 16;
@@ -747,21 +587,6 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
747 return false; 587 return false;
748} 588}
749 589
750static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
751 struct kvm_exit_mmio *mmio, phys_addr_t offset)
752{
753 u32 reg;
754 vgic_reg_access(mmio, &reg, offset,
755 ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
756 if (mmio->is_write) {
757 vgic_dispatch_sgi(vcpu, reg);
758 vgic_update_state(vcpu->kvm);
759 return true;
760 }
761
762 return false;
763}
764
765/** 590/**
766 * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor 591 * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
767 * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs 592 * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
@@ -774,11 +599,9 @@ static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
774 * to the distributor but the active state stays in the LRs, because we don't 599 * to the distributor but the active state stays in the LRs, because we don't
775 * track the active state on the distributor side. 600 * track the active state on the distributor side.
776 */ 601 */
777static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) 602void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
778{ 603{
779 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
780 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 604 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
781 int vcpu_id = vcpu->vcpu_id;
782 int i; 605 int i;
783 606
784 for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { 607 for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
@@ -805,7 +628,7 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
805 */ 628 */
806 vgic_dist_irq_set_pending(vcpu, lr.irq); 629 vgic_dist_irq_set_pending(vcpu, lr.irq);
807 if (lr.irq < VGIC_NR_SGIS) 630 if (lr.irq < VGIC_NR_SGIS)
808 *vgic_get_sgi_sources(dist, vcpu_id, lr.irq) |= 1 << lr.source; 631 add_sgi_source(vcpu, lr.irq, lr.source);
809 lr.state &= ~LR_STATE_PENDING; 632 lr.state &= ~LR_STATE_PENDING;
810 vgic_set_lr(vcpu, i, lr); 633 vgic_set_lr(vcpu, i, lr);
811 634
@@ -824,188 +647,12 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
824 } 647 }
825} 648}
826 649
827/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ 650const
828static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, 651struct kvm_mmio_range *vgic_find_range(const struct kvm_mmio_range *ranges,
829 struct kvm_exit_mmio *mmio,
830 phys_addr_t offset)
831{
832 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
833 int sgi;
834 int min_sgi = (offset & ~0x3);
835 int max_sgi = min_sgi + 3;
836 int vcpu_id = vcpu->vcpu_id;
837 u32 reg = 0;
838
839 /* Copy source SGIs from distributor side */
840 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
841 int shift = 8 * (sgi - min_sgi);
842 reg |= ((u32)*vgic_get_sgi_sources(dist, vcpu_id, sgi)) << shift;
843 }
844
845 mmio_data_write(mmio, ~0, reg);
846 return false;
847}
848
849static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
850 struct kvm_exit_mmio *mmio,
851 phys_addr_t offset, bool set)
852{
853 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
854 int sgi;
855 int min_sgi = (offset & ~0x3);
856 int max_sgi = min_sgi + 3;
857 int vcpu_id = vcpu->vcpu_id;
858 u32 reg;
859 bool updated = false;
860
861 reg = mmio_data_read(mmio, ~0);
862
863 /* Clear pending SGIs on the distributor */
864 for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
865 u8 mask = reg >> (8 * (sgi - min_sgi));
866 u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
867 if (set) {
868 if ((*src & mask) != mask)
869 updated = true;
870 *src |= mask;
871 } else {
872 if (*src & mask)
873 updated = true;
874 *src &= ~mask;
875 }
876 }
877
878 if (updated)
879 vgic_update_state(vcpu->kvm);
880
881 return updated;
882}
883
884static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
885 struct kvm_exit_mmio *mmio,
886 phys_addr_t offset)
887{
888 if (!mmio->is_write)
889 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
890 else
891 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
892}
893
894static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
895 struct kvm_exit_mmio *mmio,
896 phys_addr_t offset)
897{
898 if (!mmio->is_write)
899 return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
900 else
901 return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
902}
903
904/*
905 * I would have liked to use the kvm_bus_io_*() API instead, but it
906 * cannot cope with banked registers (only the VM pointer is passed
907 * around, and we need the vcpu). One of these days, someone please
908 * fix it!
909 */
910struct mmio_range {
911 phys_addr_t base;
912 unsigned long len;
913 int bits_per_irq;
914 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
915 phys_addr_t offset);
916};
917
918static const struct mmio_range vgic_dist_ranges[] = {
919 {
920 .base = GIC_DIST_CTRL,
921 .len = 12,
922 .bits_per_irq = 0,
923 .handle_mmio = handle_mmio_misc,
924 },
925 {
926 .base = GIC_DIST_IGROUP,
927 .len = VGIC_MAX_IRQS / 8,
928 .bits_per_irq = 1,
929 .handle_mmio = handle_mmio_raz_wi,
930 },
931 {
932 .base = GIC_DIST_ENABLE_SET,
933 .len = VGIC_MAX_IRQS / 8,
934 .bits_per_irq = 1,
935 .handle_mmio = handle_mmio_set_enable_reg,
936 },
937 {
938 .base = GIC_DIST_ENABLE_CLEAR,
939 .len = VGIC_MAX_IRQS / 8,
940 .bits_per_irq = 1,
941 .handle_mmio = handle_mmio_clear_enable_reg,
942 },
943 {
944 .base = GIC_DIST_PENDING_SET,
945 .len = VGIC_MAX_IRQS / 8,
946 .bits_per_irq = 1,
947 .handle_mmio = handle_mmio_set_pending_reg,
948 },
949 {
950 .base = GIC_DIST_PENDING_CLEAR,
951 .len = VGIC_MAX_IRQS / 8,
952 .bits_per_irq = 1,
953 .handle_mmio = handle_mmio_clear_pending_reg,
954 },
955 {
956 .base = GIC_DIST_ACTIVE_SET,
957 .len = VGIC_MAX_IRQS / 8,
958 .bits_per_irq = 1,
959 .handle_mmio = handle_mmio_raz_wi,
960 },
961 {
962 .base = GIC_DIST_ACTIVE_CLEAR,
963 .len = VGIC_MAX_IRQS / 8,
964 .bits_per_irq = 1,
965 .handle_mmio = handle_mmio_raz_wi,
966 },
967 {
968 .base = GIC_DIST_PRI,
969 .len = VGIC_MAX_IRQS,
970 .bits_per_irq = 8,
971 .handle_mmio = handle_mmio_priority_reg,
972 },
973 {
974 .base = GIC_DIST_TARGET,
975 .len = VGIC_MAX_IRQS,
976 .bits_per_irq = 8,
977 .handle_mmio = handle_mmio_target_reg,
978 },
979 {
980 .base = GIC_DIST_CONFIG,
981 .len = VGIC_MAX_IRQS / 4,
982 .bits_per_irq = 2,
983 .handle_mmio = handle_mmio_cfg_reg,
984 },
985 {
986 .base = GIC_DIST_SOFTINT,
987 .len = 4,
988 .handle_mmio = handle_mmio_sgi_reg,
989 },
990 {
991 .base = GIC_DIST_SGI_PENDING_CLEAR,
992 .len = VGIC_NR_SGIS,
993 .handle_mmio = handle_mmio_sgi_clear,
994 },
995 {
996 .base = GIC_DIST_SGI_PENDING_SET,
997 .len = VGIC_NR_SGIS,
998 .handle_mmio = handle_mmio_sgi_set,
999 },
1000 {}
1001};
1002
1003static const
1004struct mmio_range *find_matching_range(const struct mmio_range *ranges,
1005 struct kvm_exit_mmio *mmio, 652 struct kvm_exit_mmio *mmio,
1006 phys_addr_t offset) 653 phys_addr_t offset)
1007{ 654{
1008 const struct mmio_range *r = ranges; 655 const struct kvm_mmio_range *r = ranges;
1009 656
1010 while (r->len) { 657 while (r->len) {
1011 if (offset >= r->base && 658 if (offset >= r->base &&
@@ -1018,7 +665,7 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges,
1018} 665}
1019 666
1020static bool vgic_validate_access(const struct vgic_dist *dist, 667static bool vgic_validate_access(const struct vgic_dist *dist,
1021 const struct mmio_range *range, 668 const struct kvm_mmio_range *range,
1022 unsigned long offset) 669 unsigned long offset)
1023{ 670{
1024 int irq; 671 int irq;
@@ -1033,37 +680,76 @@ static bool vgic_validate_access(const struct vgic_dist *dist,
1033 return true; 680 return true;
1034} 681}
1035 682
683/*
684 * Call the respective handler function for the given range.
 685 * We split up any 64-bit accesses into two consecutive 32-bit
 686 * handler calls and merge the results afterwards.
 687 * We do this in a little-endian fashion regardless of the host's
 688 * or guest's endianness, because the GIC is always LE and the rest of
 689 * the code (vgic_reg_access) already operates on LE values.
 690 * At this point we have already identified the handler function, so
 691 * range points to that one entry and offset is relative to it.
692 */
693static bool call_range_handler(struct kvm_vcpu *vcpu,
694 struct kvm_exit_mmio *mmio,
695 unsigned long offset,
696 const struct kvm_mmio_range *range)
697{
698 u32 *data32 = (void *)mmio->data;
699 struct kvm_exit_mmio mmio32;
700 bool ret;
701
702 if (likely(mmio->len <= 4))
703 return range->handle_mmio(vcpu, mmio, offset);
704
705 /*
 706 * Any access bigger than 4 bytes that we currently handle in KVM
 707 * is an 8-byte access caused by a 64-bit load or store.
708 */
709
710 mmio32.len = 4;
711 mmio32.is_write = mmio->is_write;
712 mmio32.private = mmio->private;
713
714 mmio32.phys_addr = mmio->phys_addr + 4;
715 if (mmio->is_write)
716 *(u32 *)mmio32.data = data32[1];
717 ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
718 if (!mmio->is_write)
719 data32[1] = *(u32 *)mmio32.data;
720
721 mmio32.phys_addr = mmio->phys_addr;
722 if (mmio->is_write)
723 *(u32 *)mmio32.data = data32[0];
724 ret |= range->handle_mmio(vcpu, &mmio32, offset);
725 if (!mmio->is_write)
726 data32[0] = *(u32 *)mmio32.data;
727
728 return ret;
729}
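
The helper above treats an 8-byte access as two consecutive 32-bit words and invokes the 32-bit handler once per word, upper half first. A toy model of that split outside any kernel context (handle32()/handle64() are illustrative names only):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Stand-in for a 32-bit range handler: just report what it was given. */
static void handle32(uint32_t *word, unsigned long offset)
{
	printf("32-bit access at offset 0x%lx: 0x%08x\n", offset, (unsigned)*word);
}

/* Split an 8-byte buffer into two consecutive 32-bit words and hand
 * each to the 32-bit handler, upper word first, as call_range_handler()
 * does above.
 */
static void handle64(uint8_t data[8], unsigned long offset)
{
	uint32_t lo, hi;

	memcpy(&lo, data, 4);		/* bytes 0..3 -> lower word */
	memcpy(&hi, data + 4, 4);	/* bytes 4..7 -> upper word */
	handle32(&hi, offset + 4);
	handle32(&lo, offset);
}

int main(void)
{
	uint8_t buf[8] = { 0x78, 0x56, 0x34, 0x12, 0xef, 0xbe, 0xad, 0xde };

	handle64(buf, 0x100);
	return 0;
}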
730
1036/** 731/**
1037 * vgic_handle_mmio - handle an in-kernel MMIO access 732 * vgic_handle_mmio_range - handle an in-kernel MMIO access
1038 * @vcpu: pointer to the vcpu performing the access 733 * @vcpu: pointer to the vcpu performing the access
1039 * @run: pointer to the kvm_run structure 734 * @run: pointer to the kvm_run structure
1040 * @mmio: pointer to the data describing the access 735 * @mmio: pointer to the data describing the access
736 * @ranges: array of MMIO ranges in a given region
737 * @mmio_base: base address of that region
1041 * 738 *
1042 * returns true if the MMIO access has been performed in kernel space, 739 * returns true if the MMIO access could be performed
1043 * and false if it needs to be emulated in user space.
1044 */ 740 */
1045bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, 741bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run *run,
1046 struct kvm_exit_mmio *mmio) 742 struct kvm_exit_mmio *mmio,
743 const struct kvm_mmio_range *ranges,
744 unsigned long mmio_base)
1047{ 745{
1048 const struct mmio_range *range; 746 const struct kvm_mmio_range *range;
1049 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 747 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1050 unsigned long base = dist->vgic_dist_base;
1051 bool updated_state; 748 bool updated_state;
1052 unsigned long offset; 749 unsigned long offset;
1053 750
1054 if (!irqchip_in_kernel(vcpu->kvm) || 751 offset = mmio->phys_addr - mmio_base;
1055 mmio->phys_addr < base || 752 range = vgic_find_range(ranges, mmio, offset);
1056 (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE))
1057 return false;
1058
1059 /* We don't support ldrd / strd or ldm / stm to the emulated vgic */
1060 if (mmio->len > 4) {
1061 kvm_inject_dabt(vcpu, mmio->phys_addr);
1062 return true;
1063 }
1064
1065 offset = mmio->phys_addr - base;
1066 range = find_matching_range(vgic_dist_ranges, mmio, offset);
1067 if (unlikely(!range || !range->handle_mmio)) { 753 if (unlikely(!range || !range->handle_mmio)) {
1068 pr_warn("Unhandled access %d %08llx %d\n", 754 pr_warn("Unhandled access %d %08llx %d\n",
1069 mmio->is_write, mmio->phys_addr, mmio->len); 755 mmio->is_write, mmio->phys_addr, mmio->len);
@@ -1071,12 +757,12 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
1071 } 757 }
1072 758
1073 spin_lock(&vcpu->kvm->arch.vgic.lock); 759 spin_lock(&vcpu->kvm->arch.vgic.lock);
1074 offset = mmio->phys_addr - range->base - base; 760 offset -= range->base;
1075 if (vgic_validate_access(dist, range, offset)) { 761 if (vgic_validate_access(dist, range, offset)) {
1076 updated_state = range->handle_mmio(vcpu, mmio, offset); 762 updated_state = call_range_handler(vcpu, mmio, offset, range);
1077 } else { 763 } else {
1078 vgic_reg_access(mmio, NULL, offset, 764 if (!mmio->is_write)
1079 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); 765 memset(mmio->data, 0, mmio->len);
1080 updated_state = false; 766 updated_state = false;
1081 } 767 }
1082 spin_unlock(&vcpu->kvm->arch.vgic.lock); 768 spin_unlock(&vcpu->kvm->arch.vgic.lock);
@@ -1089,50 +775,28 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
1089 return true; 775 return true;
1090} 776}
1091 777
1092static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) 778/**
1093{ 779 * vgic_handle_mmio - handle an in-kernel MMIO access for the GIC emulation
1094 return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; 780 * @vcpu: pointer to the vcpu performing the access
1095} 781 * @run: pointer to the kvm_run structure
1096 782 * @mmio: pointer to the data describing the access
1097static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) 783 *
784 * returns true if the MMIO access has been performed in kernel space,
785 * and false if it needs to be emulated in user space.
786 * Calls the actual handling routine for the selected VGIC model.
787 */
788bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
789 struct kvm_exit_mmio *mmio)
1098{ 790{
1099 struct kvm *kvm = vcpu->kvm; 791 if (!irqchip_in_kernel(vcpu->kvm))
1100 struct vgic_dist *dist = &kvm->arch.vgic; 792 return false;
1101 int nrcpus = atomic_read(&kvm->online_vcpus);
1102 u8 target_cpus;
1103 int sgi, mode, c, vcpu_id;
1104
1105 vcpu_id = vcpu->vcpu_id;
1106
1107 sgi = reg & 0xf;
1108 target_cpus = (reg >> 16) & 0xff;
1109 mode = (reg >> 24) & 3;
1110
1111 switch (mode) {
1112 case 0:
1113 if (!target_cpus)
1114 return;
1115 break;
1116
1117 case 1:
1118 target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
1119 break;
1120
1121 case 2:
1122 target_cpus = 1 << vcpu_id;
1123 break;
1124 }
1125
1126 kvm_for_each_vcpu(c, vcpu, kvm) {
1127 if (target_cpus & 1) {
1128 /* Flag the SGI as pending */
1129 vgic_dist_irq_set_pending(vcpu, sgi);
1130 *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
1131 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
1132 }
1133 793
1134 target_cpus >>= 1; 794 /*
1135 } 795 * This will currently call either vgic_v2_handle_mmio() or
796 * vgic_v3_handle_mmio(), which in turn will call
797 * vgic_handle_mmio_range() defined above.
798 */
799 return vcpu->kvm->arch.vgic.vm_ops.handle_mmio(vcpu, run, mmio);
1136} 800}
1137 801
1138static int vgic_nr_shared_irqs(struct vgic_dist *dist) 802static int vgic_nr_shared_irqs(struct vgic_dist *dist)
@@ -1173,7 +837,7 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
1173 * Update the interrupt state and determine which CPUs have pending 837 * Update the interrupt state and determine which CPUs have pending
1174 * interrupts. Must be called with distributor lock held. 838 * interrupts. Must be called with distributor lock held.
1175 */ 839 */
1176static void vgic_update_state(struct kvm *kvm) 840void vgic_update_state(struct kvm *kvm)
1177{ 841{
1178 struct vgic_dist *dist = &kvm->arch.vgic; 842 struct vgic_dist *dist = &kvm->arch.vgic;
1179 struct kvm_vcpu *vcpu; 843 struct kvm_vcpu *vcpu;
@@ -1234,12 +898,12 @@ static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
1234 vgic_ops->disable_underflow(vcpu); 898 vgic_ops->disable_underflow(vcpu);
1235} 899}
1236 900
1237static inline void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) 901void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
1238{ 902{
1239 vgic_ops->get_vmcr(vcpu, vmcr); 903 vgic_ops->get_vmcr(vcpu, vmcr);
1240} 904}
1241 905
1242static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) 906void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
1243{ 907{
1244 vgic_ops->set_vmcr(vcpu, vmcr); 908 vgic_ops->set_vmcr(vcpu, vmcr);
1245} 909}
@@ -1288,8 +952,9 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
1288/* 952/*
1289 * Queue an interrupt to a CPU virtual interface. Return true on success, 953 * Queue an interrupt to a CPU virtual interface. Return true on success,
1290 * or false if it wasn't possible to queue it. 954 * or false if it wasn't possible to queue it.
955 * sgi_source must be zero for any non-SGI interrupts.
1291 */ 956 */
1292static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) 957bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1293{ 958{
1294 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 959 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1295 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 960 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -1338,37 +1003,6 @@ static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
1338 return true; 1003 return true;
1339} 1004}
1340 1005
1341static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
1342{
1343 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1344 unsigned long sources;
1345 int vcpu_id = vcpu->vcpu_id;
1346 int c;
1347
1348 sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
1349
1350 for_each_set_bit(c, &sources, dist->nr_cpus) {
1351 if (vgic_queue_irq(vcpu, c, irq))
1352 clear_bit(c, &sources);
1353 }
1354
1355 *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
1356
1357 /*
1358 * If the sources bitmap has been cleared it means that we
1359 * could queue all the SGIs onto link registers (see the
1360 * clear_bit above), and therefore we are done with them in
1361 * our emulated gic and can get rid of them.
1362 */
1363 if (!sources) {
1364 vgic_dist_irq_clear_pending(vcpu, irq);
1365 vgic_cpu_irq_clear(vcpu, irq);
1366 return true;
1367 }
1368
1369 return false;
1370}
1371
1372static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) 1006static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
1373{ 1007{
1374 if (!vgic_can_sample_irq(vcpu, irq)) 1008 if (!vgic_can_sample_irq(vcpu, irq))
@@ -1413,7 +1047,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
1413 1047
1414 /* SGIs */ 1048 /* SGIs */
1415 for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) { 1049 for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) {
1416 if (!vgic_queue_sgi(vcpu, i)) 1050 if (!queue_sgi(vcpu, i))
1417 overflow = 1; 1051 overflow = 1;
1418 } 1052 }
1419 1053
@@ -1575,7 +1209,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
1575 return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); 1209 return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
1576} 1210}
1577 1211
1578static void vgic_kick_vcpus(struct kvm *kvm) 1212void vgic_kick_vcpus(struct kvm *kvm)
1579{ 1213{
1580 struct kvm_vcpu *vcpu; 1214 struct kvm_vcpu *vcpu;
1581 int c; 1215 int c;
@@ -1615,7 +1249,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1615 struct kvm_vcpu *vcpu; 1249 struct kvm_vcpu *vcpu;
1616 int edge_triggered, level_triggered; 1250 int edge_triggered, level_triggered;
1617 int enabled; 1251 int enabled;
1618 bool ret = true; 1252 bool ret = true, can_inject = true;
1619 1253
1620 spin_lock(&dist->lock); 1254 spin_lock(&dist->lock);
1621 1255
@@ -1630,6 +1264,11 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1630 1264
1631 if (irq_num >= VGIC_NR_PRIVATE_IRQS) { 1265 if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
1632 cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; 1266 cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
1267 if (cpuid == VCPU_NOT_ALLOCATED) {
1268 /* Pretend we use CPU0, and prevent injection */
1269 cpuid = 0;
1270 can_inject = false;
1271 }
1633 vcpu = kvm_get_vcpu(kvm, cpuid); 1272 vcpu = kvm_get_vcpu(kvm, cpuid);
1634 } 1273 }
1635 1274
@@ -1652,7 +1291,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
1652 1291
1653 enabled = vgic_irq_is_enabled(vcpu, irq_num); 1292 enabled = vgic_irq_is_enabled(vcpu, irq_num);
1654 1293
1655 if (!enabled) { 1294 if (!enabled || !can_inject) {
1656 ret = false; 1295 ret = false;
1657 goto out; 1296 goto out;
1658 } 1297 }
@@ -1698,6 +1337,16 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
1698 int vcpu_id; 1337 int vcpu_id;
1699 1338
1700 if (unlikely(!vgic_initialized(kvm))) { 1339 if (unlikely(!vgic_initialized(kvm))) {
1340 /*
1341 * We only provide the automatic initialization of the VGIC
1342 * for the legacy case of a GICv2. Any other type must
1343 * be explicitly initialized once setup with the respective
1344 * KVM device call.
1345 */
1346 if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
1347 ret = -EBUSY;
1348 goto out;
1349 }
1701 mutex_lock(&kvm->lock); 1350 mutex_lock(&kvm->lock);
1702 ret = vgic_init(kvm); 1351 ret = vgic_init(kvm);
1703 mutex_unlock(&kvm->lock); 1352 mutex_unlock(&kvm->lock);
@@ -1762,6 +1411,17 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
1762 return 0; 1411 return 0;
1763} 1412}
1764 1413
1414/**
1415 * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
1416 *
1417 * The host's GIC naturally limits the maximum number of VCPUs a guest
1418 * can use.
1419 */
1420int kvm_vgic_get_max_vcpus(void)
1421{
1422 return vgic->max_gic_vcpus;
1423}
1424
1765void kvm_vgic_destroy(struct kvm *kvm) 1425void kvm_vgic_destroy(struct kvm *kvm)
1766{ 1426{
1767 struct vgic_dist *dist = &kvm->arch.vgic; 1427 struct vgic_dist *dist = &kvm->arch.vgic;
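kvm_vgic_get_max_vcpus() added above lets the architecture code report the GIC-imposed VCPU ceiling. Userspace would normally discover such a limit through the generic KVM_CHECK_EXTENSION interface; the sketch below assumes the value ends up behind KVM_CAP_MAX_VCPUS, which is not spelled out in this hunk:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);

        if (kvm < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        /* KVM_CHECK_EXTENSION returns the capability's value, 0 if absent. */
        int recommended = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
        int maximum     = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);

        printf("recommended VCPUs: %d, maximum VCPUs: %d\n", recommended, maximum);
        return 0;
}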
@@ -1784,6 +1444,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
1784 } 1444 }
1785 kfree(dist->irq_sgi_sources); 1445 kfree(dist->irq_sgi_sources);
1786 kfree(dist->irq_spi_cpu); 1446 kfree(dist->irq_spi_cpu);
1447 kfree(dist->irq_spi_mpidr);
1787 kfree(dist->irq_spi_target); 1448 kfree(dist->irq_spi_target);
1788 kfree(dist->irq_pending_on_cpu); 1449 kfree(dist->irq_pending_on_cpu);
1789 dist->irq_sgi_sources = NULL; 1450 dist->irq_sgi_sources = NULL;
@@ -1797,7 +1458,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
1797 * Allocate and initialize the various data structures. Must be called 1458 * Allocate and initialize the various data structures. Must be called
1798 * with kvm->lock held! 1459 * with kvm->lock held!
1799 */ 1460 */
1800static int vgic_init(struct kvm *kvm) 1461int vgic_init(struct kvm *kvm)
1801{ 1462{
1802 struct vgic_dist *dist = &kvm->arch.vgic; 1463 struct vgic_dist *dist = &kvm->arch.vgic;
1803 struct kvm_vcpu *vcpu; 1464 struct kvm_vcpu *vcpu;
@@ -1809,7 +1470,7 @@ static int vgic_init(struct kvm *kvm)
1809 1470
1810 nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus); 1471 nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
1811 if (!nr_cpus) /* No vcpus? Can't be good... */ 1472 if (!nr_cpus) /* No vcpus? Can't be good... */
1812 return -EINVAL; 1473 return -ENODEV;
1813 1474
1814 /* 1475 /*
1815 * If nobody configured the number of interrupts, use the 1476 * If nobody configured the number of interrupts, use the
@@ -1852,8 +1513,9 @@ static int vgic_init(struct kvm *kvm)
1852 if (ret) 1513 if (ret)
1853 goto out; 1514 goto out;
1854 1515
1855 for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4) 1516 ret = kvm->arch.vgic.vm_ops.init_model(kvm);
1856 vgic_set_target_reg(kvm, 0, i); 1517 if (ret)
1518 goto out;
1857 1519
1858 kvm_for_each_vcpu(vcpu_id, vcpu, kvm) { 1520 kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
1859 ret = vgic_vcpu_init_maps(vcpu, nr_irqs); 1521 ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
@@ -1882,72 +1544,49 @@ out:
1882 return ret; 1544 return ret;
1883} 1545}
1884 1546
1885/** 1547static int init_vgic_model(struct kvm *kvm, int type)
1886 * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
1887 * @kvm: pointer to the kvm struct
1888 *
1889 * Map the virtual CPU interface into the VM before running any VCPUs. We
1890 * can't do this at creation time, because user space must first set the
1891 * virtual CPU interface address in the guest physical address space.
1892 */
1893int kvm_vgic_map_resources(struct kvm *kvm)
1894{ 1548{
1895 int ret = 0; 1549 switch (type) {
1896 1550 case KVM_DEV_TYPE_ARM_VGIC_V2:
1897 if (!irqchip_in_kernel(kvm)) 1551 vgic_v2_init_emulation(kvm);
1898 return 0; 1552 break;
1899 1553#ifdef CONFIG_ARM_GIC_V3
1900 mutex_lock(&kvm->lock); 1554 case KVM_DEV_TYPE_ARM_VGIC_V3:
1901 1555 vgic_v3_init_emulation(kvm);
1902 if (vgic_ready(kvm)) 1556 break;
1903 goto out; 1557#endif
1904 1558 default:
1905 if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) || 1559 return -ENODEV;
1906 IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) {
1907 kvm_err("Need to set vgic cpu and dist addresses first\n");
1908 ret = -ENXIO;
1909 goto out;
1910 }
1911
1912 /*
1913 * Initialize the vgic if this hasn't already been done on demand by
1914 * accessing the vgic state from userspace.
1915 */
1916 ret = vgic_init(kvm);
1917 if (ret) {
1918 kvm_err("Unable to allocate maps\n");
1919 goto out;
1920 } 1560 }
1921 1561
1922 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, 1562 if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
1923 vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE, 1563 return -E2BIG;
1924 true);
1925 if (ret) {
1926 kvm_err("Unable to remap VGIC CPU to VCPU\n");
1927 goto out;
1928 }
1929 1564
1930 kvm->arch.vgic.ready = true; 1565 return 0;
1931out:
1932 if (ret)
1933 kvm_vgic_destroy(kvm);
1934 mutex_unlock(&kvm->lock);
1935 return ret;
1936} 1566}
1937 1567
1938int kvm_vgic_create(struct kvm *kvm) 1568int kvm_vgic_create(struct kvm *kvm, u32 type)
1939{ 1569{
1940 int i, vcpu_lock_idx = -1, ret; 1570 int i, vcpu_lock_idx = -1, ret;
1941 struct kvm_vcpu *vcpu; 1571 struct kvm_vcpu *vcpu;
1942 1572
1943 mutex_lock(&kvm->lock); 1573 mutex_lock(&kvm->lock);
1944 1574
1945 if (kvm->arch.vgic.vctrl_base) { 1575 if (irqchip_in_kernel(kvm)) {
1946 ret = -EEXIST; 1576 ret = -EEXIST;
1947 goto out; 1577 goto out;
1948 } 1578 }
1949 1579
1950 /* 1580 /*
1581 * This function is also called by the KVM_CREATE_IRQCHIP handler,
1582 * which had no chance yet to check the availability of the GICv2
1583 * emulation. So check this here again. KVM_CREATE_DEVICE does
1584 * the proper checks already.
1585 */
1586 if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2)
1587 return -ENODEV;
1588
1589 /*
1951 * Any time a vcpu is run, vcpu_load is called which tries to grab the 1590 * Any time a vcpu is run, vcpu_load is called which tries to grab the
1952 * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure 1591 * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
1953 * that no other VCPUs are run while we create the vgic. 1592 * that no other VCPUs are run while we create the vgic.
@@ -1965,11 +1604,17 @@ int kvm_vgic_create(struct kvm *kvm)
1965 } 1604 }
1966 ret = 0; 1605 ret = 0;
1967 1606
1607 ret = init_vgic_model(kvm, type);
1608 if (ret)
1609 goto out_unlock;
1610
1968 spin_lock_init(&kvm->arch.vgic.lock); 1611 spin_lock_init(&kvm->arch.vgic.lock);
1969 kvm->arch.vgic.in_kernel = true; 1612 kvm->arch.vgic.in_kernel = true;
1613 kvm->arch.vgic.vgic_model = type;
1970 kvm->arch.vgic.vctrl_base = vgic->vctrl_base; 1614 kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
1971 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; 1615 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
1972 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; 1616 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
1617 kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
1973 1618
1974out_unlock: 1619out_unlock:
1975 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { 1620 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
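Since kvm_vgic_create() now carries the requested device type, a GICv3 guest interrupt controller is created from userspace with KVM_CREATE_DEVICE rather than the GICv2-only KVM_CREATE_IRQCHIP path mentioned in the comment above. A sketch of that call sequence, assuming UAPI headers new enough to define KVM_DEV_TYPE_ARM_VGIC_V3 and a host GIC that can back it; error handling is kept minimal:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        if (kvm < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        int vm_fd = ioctl(kvm, KVM_CREATE_VM, 0);
        if (vm_fd < 0) {
                perror("KVM_CREATE_VM");
                return 1;
        }

        struct kvm_create_device cd = {
                .type = KVM_DEV_TYPE_ARM_VGIC_V3,
                .flags = 0,
        };

        /* The returned fd is used for all further KVM_{SET,GET,HAS}_DEVICE_ATTR
         * calls against the distributor. */
        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0) {
                perror("KVM_CREATE_DEVICE (vgic-v3)");
                return 1;
        }

        printf("vgic-v3 device fd: %d\n", cd.fd);
        return 0;
}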
@@ -2022,7 +1667,7 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
2022/** 1667/**
2023 * kvm_vgic_addr - set or get vgic VM base addresses 1668 * kvm_vgic_addr - set or get vgic VM base addresses
2024 * @kvm: pointer to the vm struct 1669 * @kvm: pointer to the vm struct
2025 * @type: the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX 1670 * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
2026 * @addr: pointer to address value 1671 * @addr: pointer to address value
2027 * @write: if true set the address in the VM address space, if false read the 1672 * @write: if true set the address in the VM address space, if false read the
2028 * address 1673 * address
@@ -2036,216 +1681,64 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
2036{ 1681{
2037 int r = 0; 1682 int r = 0;
2038 struct vgic_dist *vgic = &kvm->arch.vgic; 1683 struct vgic_dist *vgic = &kvm->arch.vgic;
1684 int type_needed;
1685 phys_addr_t *addr_ptr, block_size;
1686 phys_addr_t alignment;
2039 1687
2040 mutex_lock(&kvm->lock); 1688 mutex_lock(&kvm->lock);
2041 switch (type) { 1689 switch (type) {
2042 case KVM_VGIC_V2_ADDR_TYPE_DIST: 1690 case KVM_VGIC_V2_ADDR_TYPE_DIST:
2043 if (write) { 1691 type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
2044 r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, 1692 addr_ptr = &vgic->vgic_dist_base;
2045 *addr, KVM_VGIC_V2_DIST_SIZE); 1693 block_size = KVM_VGIC_V2_DIST_SIZE;
2046 } else { 1694 alignment = SZ_4K;
2047 *addr = vgic->vgic_dist_base;
2048 }
2049 break; 1695 break;
2050 case KVM_VGIC_V2_ADDR_TYPE_CPU: 1696 case KVM_VGIC_V2_ADDR_TYPE_CPU:
2051 if (write) { 1697 type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
2052 r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, 1698 addr_ptr = &vgic->vgic_cpu_base;
2053 *addr, KVM_VGIC_V2_CPU_SIZE); 1699 block_size = KVM_VGIC_V2_CPU_SIZE;
2054 } else { 1700 alignment = SZ_4K;
2055 *addr = vgic->vgic_cpu_base;
2056 }
2057 break; 1701 break;
2058 default: 1702#ifdef CONFIG_ARM_GIC_V3
2059 r = -ENODEV; 1703 case KVM_VGIC_V3_ADDR_TYPE_DIST:
2060 } 1704 type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
2061 1705 addr_ptr = &vgic->vgic_dist_base;
2062 mutex_unlock(&kvm->lock); 1706 block_size = KVM_VGIC_V3_DIST_SIZE;
2063 return r; 1707 alignment = SZ_64K;
2064}
2065
2066static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
2067 struct kvm_exit_mmio *mmio, phys_addr_t offset)
2068{
2069 bool updated = false;
2070 struct vgic_vmcr vmcr;
2071 u32 *vmcr_field;
2072 u32 reg;
2073
2074 vgic_get_vmcr(vcpu, &vmcr);
2075
2076 switch (offset & ~0x3) {
2077 case GIC_CPU_CTRL:
2078 vmcr_field = &vmcr.ctlr;
2079 break;
2080 case GIC_CPU_PRIMASK:
2081 vmcr_field = &vmcr.pmr;
2082 break; 1708 break;
2083 case GIC_CPU_BINPOINT: 1709 case KVM_VGIC_V3_ADDR_TYPE_REDIST:
2084 vmcr_field = &vmcr.bpr; 1710 type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
2085 break; 1711 addr_ptr = &vgic->vgic_redist_base;
2086 case GIC_CPU_ALIAS_BINPOINT: 1712 block_size = KVM_VGIC_V3_REDIST_SIZE;
2087 vmcr_field = &vmcr.abpr; 1713 alignment = SZ_64K;
2088 break; 1714 break;
1715#endif
2089 default: 1716 default:
2090 BUG(); 1717 r = -ENODEV;
2091 }
2092
2093 if (!mmio->is_write) {
2094 reg = *vmcr_field;
2095 mmio_data_write(mmio, ~0, reg);
2096 } else {
2097 reg = mmio_data_read(mmio, ~0);
2098 if (reg != *vmcr_field) {
2099 *vmcr_field = reg;
2100 vgic_set_vmcr(vcpu, &vmcr);
2101 updated = true;
2102 }
2103 }
2104 return updated;
2105}
2106
2107static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
2108 struct kvm_exit_mmio *mmio, phys_addr_t offset)
2109{
2110 return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
2111}
2112
2113static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
2114 struct kvm_exit_mmio *mmio,
2115 phys_addr_t offset)
2116{
2117 u32 reg;
2118
2119 if (mmio->is_write)
2120 return false;
2121
2122 /* GICC_IIDR */
2123 reg = (PRODUCT_ID_KVM << 20) |
2124 (GICC_ARCH_VERSION_V2 << 16) |
2125 (IMPLEMENTER_ARM << 0);
2126 mmio_data_write(mmio, ~0, reg);
2127 return false;
2128}
2129
2130/*
2131 * CPU Interface Register accesses - these are not accessed by the VM, but by
2132 * user space for saving and restoring VGIC state.
2133 */
2134static const struct mmio_range vgic_cpu_ranges[] = {
2135 {
2136 .base = GIC_CPU_CTRL,
2137 .len = 12,
2138 .handle_mmio = handle_cpu_mmio_misc,
2139 },
2140 {
2141 .base = GIC_CPU_ALIAS_BINPOINT,
2142 .len = 4,
2143 .handle_mmio = handle_mmio_abpr,
2144 },
2145 {
2146 .base = GIC_CPU_ACTIVEPRIO,
2147 .len = 16,
2148 .handle_mmio = handle_mmio_raz_wi,
2149 },
2150 {
2151 .base = GIC_CPU_IDENT,
2152 .len = 4,
2153 .handle_mmio = handle_cpu_mmio_ident,
2154 },
2155};
2156
2157static int vgic_attr_regs_access(struct kvm_device *dev,
2158 struct kvm_device_attr *attr,
2159 u32 *reg, bool is_write)
2160{
2161 const struct mmio_range *r = NULL, *ranges;
2162 phys_addr_t offset;
2163 int ret, cpuid, c;
2164 struct kvm_vcpu *vcpu, *tmp_vcpu;
2165 struct vgic_dist *vgic;
2166 struct kvm_exit_mmio mmio;
2167
2168 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2169 cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
2170 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
2171
2172 mutex_lock(&dev->kvm->lock);
2173
2174 ret = vgic_init(dev->kvm);
2175 if (ret)
2176 goto out;
2177
2178 if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
2179 ret = -EINVAL;
2180 goto out; 1718 goto out;
2181 } 1719 }
2182 1720
2183 vcpu = kvm_get_vcpu(dev->kvm, cpuid); 1721 if (vgic->vgic_model != type_needed) {
2184 vgic = &dev->kvm->arch.vgic; 1722 r = -ENODEV;
2185
2186 mmio.len = 4;
2187 mmio.is_write = is_write;
2188 if (is_write)
2189 mmio_data_write(&mmio, ~0, *reg);
2190 switch (attr->group) {
2191 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
2192 mmio.phys_addr = vgic->vgic_dist_base + offset;
2193 ranges = vgic_dist_ranges;
2194 break;
2195 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
2196 mmio.phys_addr = vgic->vgic_cpu_base + offset;
2197 ranges = vgic_cpu_ranges;
2198 break;
2199 default:
2200 BUG();
2201 }
2202 r = find_matching_range(ranges, &mmio, offset);
2203
2204 if (unlikely(!r || !r->handle_mmio)) {
2205 ret = -ENXIO;
2206 goto out; 1723 goto out;
2207 } 1724 }
2208 1725
2209 1726 if (write) {
2210 spin_lock(&vgic->lock); 1727 if (!IS_ALIGNED(*addr, alignment))
2211 1728 r = -EINVAL;
2212 /* 1729 else
2213 * Ensure that no other VCPU is running by checking the vcpu->cpu 1730 r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
2214 * field. If no other VCPUs are running we can safely access the VGIC
2215 * state, because even if another VCPU is run after this point, that
2216 * VCPU will not touch the vgic state, because it will block on 1733 *addr = *addr_ptr;
2217 * getting the vgic->lock in kvm_vgic_sync_hwstate().
2218 */
2219 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
2220 if (unlikely(tmp_vcpu->cpu != -1)) {
2221 ret = -EBUSY;
2222 goto out_vgic_unlock;
2223 }
2224 } 1734 }
2225 1735
2226 /*
2227 * Move all pending IRQs from the LRs on all VCPUs so the pending
2228 * state can be properly represented in the register state accessible
2229 * through this API.
2230 */
2231 kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
2232 vgic_unqueue_irqs(tmp_vcpu);
2233
2234 offset -= r->base;
2235 r->handle_mmio(vcpu, &mmio, offset);
2236
2237 if (!is_write)
2238 *reg = mmio_data_read(&mmio, ~0);
2239
2240 ret = 0;
2241out_vgic_unlock:
2242 spin_unlock(&vgic->lock);
2243out: 1736out:
2244 mutex_unlock(&dev->kvm->lock); 1737 mutex_unlock(&kvm->lock);
2245 return ret; 1738 return r;
2246} 1739}
2247 1740
2248static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1741int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2249{ 1742{
2250 int r; 1743 int r;
2251 1744
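After the kvm_vgic_addr() rework above, a single KVM_DEV_ARM_VGIC_GRP_ADDR attribute group serves both GIC models, with the GICv3 distributor and redistributor bases required to be 64K aligned. A sketch of how userspace would program a GICv3 layout through KVM_SET_DEVICE_ATTR on the device fd from the previous example, assuming arm64 UAPI headers that define the KVM_VGIC_V3_ADDR_TYPE_* constants; the guest-physical addresses are purely illustrative:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_vgic_addr(int vgic_fd, uint64_t attr_type, uint64_t gpa)
{
        struct kvm_device_attr attr = {
                .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
                .attr  = attr_type,
                .addr  = (uint64_t)(unsigned long)&gpa, /* pointer to the 64-bit GPA */
        };

        return ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);
}

/* Illustrative guest-physical layout; both bases are 64K aligned, as the
 * reworked kvm_vgic_addr() now enforces for the GICv3 regions. */
int program_gicv3_layout(int vgic_fd)
{
        if (set_vgic_addr(vgic_fd, KVM_VGIC_V3_ADDR_TYPE_DIST,   0x08000000ULL))
                return -1;
        if (set_vgic_addr(vgic_fd, KVM_VGIC_V3_ADDR_TYPE_REDIST, 0x080a0000ULL))
                return -1;
        return 0;
}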
@@ -2261,17 +1754,6 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2261 r = kvm_vgic_addr(dev->kvm, type, &addr, true); 1754 r = kvm_vgic_addr(dev->kvm, type, &addr, true);
2262 return (r == -ENODEV) ? -ENXIO : r; 1755 return (r == -ENODEV) ? -ENXIO : r;
2263 } 1756 }
2264
2265 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
2266 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
2267 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
2268 u32 reg;
2269
2270 if (get_user(reg, uaddr))
2271 return -EFAULT;
2272
2273 return vgic_attr_regs_access(dev, attr, &reg, true);
2274 }
2275 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { 1757 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
2276 u32 __user *uaddr = (u32 __user *)(long)attr->addr; 1758 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
2277 u32 val; 1759 u32 val;
@@ -2302,13 +1784,20 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2302 1784
2303 return ret; 1785 return ret;
2304 } 1786 }
2305 1787 case KVM_DEV_ARM_VGIC_GRP_CTRL: {
1788 switch (attr->attr) {
1789 case KVM_DEV_ARM_VGIC_CTRL_INIT:
1790 r = vgic_init(dev->kvm);
1791 return r;
1792 }
1793 break;
1794 }
2306 } 1795 }
2307 1796
2308 return -ENXIO; 1797 return -ENXIO;
2309} 1798}
2310 1799
2311static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1800int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2312{ 1801{
2313 int r = -ENXIO; 1802 int r = -ENXIO;
2314 1803
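The new KVM_DEV_ARM_VGIC_GRP_CTRL / KVM_DEV_ARM_VGIC_CTRL_INIT attribute handled above gives userspace an explicit point at which to finalize the distributor, which the GICv3 model needs since only GICv2 keeps the automatic on-first-injection initialization (see the comment added to kvm_vgic_inject_irq() earlier in this diff). A sketch of the call, again against the vgic device fd; the attribute takes no payload:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Finalize the distributor once all VCPUs exist and the base addresses are
 * programmed; afterwards the VGIC configuration is frozen. */
int vgic_finalize(int vgic_fd)
{
        struct kvm_device_attr attr = {
                .group = KVM_DEV_ARM_VGIC_GRP_CTRL,
                .attr  = KVM_DEV_ARM_VGIC_CTRL_INIT,
        };

        return ioctl(vgic_fd, KVM_SET_DEVICE_ATTR, &attr);
}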
@@ -2326,20 +1815,9 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2326 return -EFAULT; 1815 return -EFAULT;
2327 break; 1816 break;
2328 } 1817 }
2329
2330 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
2331 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
2332 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
2333 u32 reg = 0;
2334
2335 r = vgic_attr_regs_access(dev, attr, &reg, false);
2336 if (r)
2337 return r;
2338 r = put_user(reg, uaddr);
2339 break;
2340 }
2341 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { 1818 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
2342 u32 __user *uaddr = (u32 __user *)(long)attr->addr; 1819 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
1820
2343 r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr); 1821 r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
2344 break; 1822 break;
2345 } 1823 }
@@ -2349,61 +1827,17 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2349 return r; 1827 return r;
2350} 1828}
2351 1829
2352static int vgic_has_attr_regs(const struct mmio_range *ranges, 1830int vgic_has_attr_regs(const struct kvm_mmio_range *ranges, phys_addr_t offset)
2353 phys_addr_t offset)
2354{ 1831{
2355 struct kvm_exit_mmio dev_attr_mmio; 1832 struct kvm_exit_mmio dev_attr_mmio;
2356 1833
2357 dev_attr_mmio.len = 4; 1834 dev_attr_mmio.len = 4;
2358 if (find_matching_range(ranges, &dev_attr_mmio, offset)) 1835 if (vgic_find_range(ranges, &dev_attr_mmio, offset))
2359 return 0; 1836 return 0;
2360 else 1837 else
2361 return -ENXIO; 1838 return -ENXIO;
2362} 1839}
2363 1840
2364static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2365{
2366 phys_addr_t offset;
2367
2368 switch (attr->group) {
2369 case KVM_DEV_ARM_VGIC_GRP_ADDR:
2370 switch (attr->attr) {
2371 case KVM_VGIC_V2_ADDR_TYPE_DIST:
2372 case KVM_VGIC_V2_ADDR_TYPE_CPU:
2373 return 0;
2374 }
2375 break;
2376 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
2377 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2378 return vgic_has_attr_regs(vgic_dist_ranges, offset);
2379 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
2380 offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
2381 return vgic_has_attr_regs(vgic_cpu_ranges, offset);
2382 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
2383 return 0;
2384 }
2385 return -ENXIO;
2386}
2387
2388static void vgic_destroy(struct kvm_device *dev)
2389{
2390 kfree(dev);
2391}
2392
2393static int vgic_create(struct kvm_device *dev, u32 type)
2394{
2395 return kvm_vgic_create(dev->kvm);
2396}
2397
2398static struct kvm_device_ops kvm_arm_vgic_v2_ops = {
2399 .name = "kvm-arm-vgic",
2400 .create = vgic_create,
2401 .destroy = vgic_destroy,
2402 .set_attr = vgic_set_attr,
2403 .get_attr = vgic_get_attr,
2404 .has_attr = vgic_has_attr,
2405};
2406
2407static void vgic_init_maintenance_interrupt(void *info) 1841static void vgic_init_maintenance_interrupt(void *info)
2408{ 1842{
2409 enable_percpu_irq(vgic->maint_irq, 0); 1843 enable_percpu_irq(vgic->maint_irq, 0);
@@ -2474,8 +1908,7 @@ int kvm_vgic_hyp_init(void)
2474 1908
2475 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); 1909 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
2476 1910
2477 return kvm_register_device_ops(&kvm_arm_vgic_v2_ops, 1911 return 0;
2478 KVM_DEV_TYPE_ARM_VGIC_V2);
2479 1912
2480out_free_irq: 1913out_free_irq:
2481 free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); 1914 free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
new file mode 100644
index 000000000000..1e83bdf5f499
--- /dev/null
+++ b/virt/kvm/arm/vgic.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) 2012-2014 ARM Ltd.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * Derived from virt/kvm/arm/vgic.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#ifndef __KVM_VGIC_H__
21#define __KVM_VGIC_H__
22
23#define VGIC_ADDR_UNDEF (-1)
24#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
25
26#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
27#define IMPLEMENTER_ARM 0x43b
28
29#define ACCESS_READ_VALUE (1 << 0)
30#define ACCESS_READ_RAZ (0 << 0)
31#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
32#define ACCESS_WRITE_IGNORED (0 << 1)
33#define ACCESS_WRITE_SETBIT (1 << 1)
34#define ACCESS_WRITE_CLEARBIT (2 << 1)
35#define ACCESS_WRITE_VALUE (3 << 1)
36#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
37
38#define VCPU_NOT_ALLOCATED ((u8)-1)
39
40unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
41
42void vgic_update_state(struct kvm *kvm);
43int vgic_init_common_maps(struct kvm *kvm);
44
45u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
46u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
47
48void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
49void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
50void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
51void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
52 int irq, int val);
53
54void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
55void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
56
57bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
58void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
59
60void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
61 phys_addr_t offset, int mode);
62bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
63 phys_addr_t offset);
64
65static inline
66u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
67{
68 return le32_to_cpu(*((u32 *)mmio->data)) & mask;
69}
70
71static inline
72void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
73{
74 *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
75}
76
77struct kvm_mmio_range {
78 phys_addr_t base;
79 unsigned long len;
80 int bits_per_irq;
81 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
82 phys_addr_t offset);
83};
84
85static inline bool is_in_range(phys_addr_t addr, unsigned long len,
86 phys_addr_t baseaddr, unsigned long size)
87{
88 return (addr >= baseaddr) && (addr + len <= baseaddr + size);
89}
90
91const
92struct kvm_mmio_range *vgic_find_range(const struct kvm_mmio_range *ranges,
93 struct kvm_exit_mmio *mmio,
94 phys_addr_t offset);
95
96bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run *run,
97 struct kvm_exit_mmio *mmio,
98 const struct kvm_mmio_range *ranges,
99 unsigned long mmio_base);
100
101bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
102 phys_addr_t offset, int vcpu_id, int access);
103
104bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
105 phys_addr_t offset, int vcpu_id);
106
107bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
108 phys_addr_t offset, int vcpu_id);
109
110bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
111 phys_addr_t offset);
112
113void vgic_kick_vcpus(struct kvm *kvm);
114
115int vgic_has_attr_regs(const struct kvm_mmio_range *ranges, phys_addr_t offset);
116int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
117int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
118
119int vgic_init(struct kvm *kvm);
120void vgic_v2_init_emulation(struct kvm *kvm);
121void vgic_v3_init_emulation(struct kvm *kvm);
122
123#endif
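The mmio_data_read()/mmio_data_write() helpers in this new header pin the distributor register contract to little-endian regardless of host byte order. A standalone illustration of the same conversion done with explicit byte accesses instead of the kernel's le32 helpers; the buffer plays the role of kvm_exit_mmio::data and the value is arbitrary:

#include <stdint.h>
#include <stdio.h>

/* Userspace analogue of mmio_data_write(): store a register value into the
 * MMIO data buffer in little-endian order, applying the access mask. */
static void le32_write(uint8_t *data, uint32_t mask, uint32_t value)
{
        value &= mask;
        data[0] = value & 0xff;
        data[1] = (value >> 8) & 0xff;
        data[2] = (value >> 16) & 0xff;
        data[3] = (value >> 24) & 0xff;
}

/* Userspace analogue of mmio_data_read(): assemble the value from the
 * little-endian buffer, then apply the mask. */
static uint32_t le32_read(const uint8_t *data, uint32_t mask)
{
        return (data[0] | data[1] << 8 | data[2] << 16 |
                (uint32_t)data[3] << 24) & mask;
}

int main(void)
{
        uint8_t buf[8] = { 0 };

        le32_write(buf, ~0u, 0x4b00043bu);      /* arbitrary 32-bit register value */
        printf("read back: %#x\n", le32_read(buf, ~0u));
        return 0;
}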
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 458b9b14b15c..a1093700f3a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,6 +66,9 @@
66MODULE_AUTHOR("Qumranet"); 66MODULE_AUTHOR("Qumranet");
67MODULE_LICENSE("GPL"); 67MODULE_LICENSE("GPL");
68 68
69unsigned int halt_poll_ns = 0;
70module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
71
69/* 72/*
70 * Ordering of locks: 73 * Ordering of locks:
71 * 74 *
@@ -89,7 +92,7 @@ struct dentry *kvm_debugfs_dir;
89 92
90static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 93static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
91 unsigned long arg); 94 unsigned long arg);
92#ifdef CONFIG_COMPAT 95#ifdef CONFIG_KVM_COMPAT
93static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 96static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
94 unsigned long arg); 97 unsigned long arg);
95#endif 98#endif
@@ -176,6 +179,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
176 return called; 179 return called;
177} 180}
178 181
182#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
179void kvm_flush_remote_tlbs(struct kvm *kvm) 183void kvm_flush_remote_tlbs(struct kvm *kvm)
180{ 184{
181 long dirty_count = kvm->tlbs_dirty; 185 long dirty_count = kvm->tlbs_dirty;
@@ -186,6 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
186 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 190 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
187} 191}
188EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 192EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
193#endif
189 194
190void kvm_reload_remote_mmus(struct kvm *kvm) 195void kvm_reload_remote_mmus(struct kvm *kvm)
191{ 196{
@@ -673,6 +678,7 @@ static void update_memslots(struct kvm_memslots *slots,
673 if (!new->npages) { 678 if (!new->npages) {
674 WARN_ON(!mslots[i].npages); 679 WARN_ON(!mslots[i].npages);
675 new->base_gfn = 0; 680 new->base_gfn = 0;
681 new->flags = 0;
676 if (mslots[i].npages) 682 if (mslots[i].npages)
677 slots->used_slots--; 683 slots->used_slots--;
678 } else { 684 } else {
@@ -993,6 +999,86 @@ out:
993} 999}
994EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1000EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
995 1001
1002#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1003/**
1004 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
1005 * are dirty write protect them for next write.
1006 * @kvm: pointer to kvm instance
1007 * @log: slot id and address to which we copy the log
1008 * @is_dirty: flag set if any page is dirty
1009 *
1010 * We need to keep in mind that VCPU threads can write to the bitmap
1011 * concurrently. So, to avoid losing track of dirty pages we keep the
1012 * following order:
1013 *
1014 * 1. Take a snapshot of the bit and clear it if needed.
1015 * 2. Write protect the corresponding page.
1016 * 3. Copy the snapshot to the userspace.
1017 * 4. Upon return, the caller flushes TLBs if needed.
1018 *
1019 * Between 2 and 4, the guest may write to the page using the remaining TLB
1020 * entry. This is not a problem because the page is reported dirty using
1021 * the snapshot taken before and step 4 ensures that writes done after
1022 * exiting to userspace will be logged for the next call.
1023 *
1024 */
1025int kvm_get_dirty_log_protect(struct kvm *kvm,
1026 struct kvm_dirty_log *log, bool *is_dirty)
1027{
1028 struct kvm_memory_slot *memslot;
1029 int r, i;
1030 unsigned long n;
1031 unsigned long *dirty_bitmap;
1032 unsigned long *dirty_bitmap_buffer;
1033
1034 r = -EINVAL;
1035 if (log->slot >= KVM_USER_MEM_SLOTS)
1036 goto out;
1037
1038 memslot = id_to_memslot(kvm->memslots, log->slot);
1039
1040 dirty_bitmap = memslot->dirty_bitmap;
1041 r = -ENOENT;
1042 if (!dirty_bitmap)
1043 goto out;
1044
1045 n = kvm_dirty_bitmap_bytes(memslot);
1046
1047 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
1048 memset(dirty_bitmap_buffer, 0, n);
1049
1050 spin_lock(&kvm->mmu_lock);
1051 *is_dirty = false;
1052 for (i = 0; i < n / sizeof(long); i++) {
1053 unsigned long mask;
1054 gfn_t offset;
1055
1056 if (!dirty_bitmap[i])
1057 continue;
1058
1059 *is_dirty = true;
1060
1061 mask = xchg(&dirty_bitmap[i], 0);
1062 dirty_bitmap_buffer[i] = mask;
1063
1064 offset = i * BITS_PER_LONG;
1065 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
1066 mask);
1067 }
1068
1069 spin_unlock(&kvm->mmu_lock);
1070
1071 r = -EFAULT;
1072 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1073 goto out;
1074
1075 r = 0;
1076out:
1077 return r;
1078}
1079EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1080#endif
1081
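kvm_get_dirty_log_protect() depends on atomically swapping each bitmap word to zero, so dirty bits set concurrently by VCPU threads are either captured in this snapshot or left for the next round, never lost. A user-level model of that snapshot step using C11 atomics; the write-protect step is only a placeholder comment here, since it has no userspace equivalent:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BITMAP_WORDS 4

/* Shared dirty bitmap: "VCPU" threads OR bits in, the logging side drains it. */
static _Atomic unsigned long dirty_bitmap[BITMAP_WORDS];

static void mark_dirty(unsigned long bit)
{
        atomic_fetch_or(&dirty_bitmap[bit / (8 * sizeof(unsigned long))],
                        1UL << (bit % (8 * sizeof(unsigned long))));
}

/* Snapshot-and-clear, mirroring the xchg() loop in the kernel helper: any bit
 * set after the swap belongs to the next round and is not lost. */
static bool get_dirty_snapshot(unsigned long *snapshot)
{
        bool any = false;

        for (int i = 0; i < BITMAP_WORDS; i++) {
                snapshot[i] = atomic_exchange(&dirty_bitmap[i], 0UL);
                if (snapshot[i])
                        any = true;
                /* here the kernel would write-protect the pages in snapshot[i] */
        }
        return any;
}

int main(void)
{
        unsigned long snap[BITMAP_WORDS];

        mark_dirty(3);
        mark_dirty(70);
        printf("dirty=%d snap[0]=%#lx snap[1]=%#lx\n",
               get_dirty_snapshot(snap), snap[0], snap[1]);
        return 0;
}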
996bool kvm_largepages_enabled(void) 1082bool kvm_largepages_enabled(void)
997{ 1083{
998 return largepages_enabled; 1084 return largepages_enabled;
@@ -1551,6 +1637,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1551 } 1637 }
1552 return 0; 1638 return 0;
1553} 1639}
1640EXPORT_SYMBOL_GPL(kvm_write_guest);
1554 1641
1555int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1642int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1556 gpa_t gpa, unsigned long len) 1643 gpa_t gpa, unsigned long len)
@@ -1687,29 +1774,60 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1687} 1774}
1688EXPORT_SYMBOL_GPL(mark_page_dirty); 1775EXPORT_SYMBOL_GPL(mark_page_dirty);
1689 1776
1777static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
1778{
1779 if (kvm_arch_vcpu_runnable(vcpu)) {
1780 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1781 return -EINTR;
1782 }
1783 if (kvm_cpu_has_pending_timer(vcpu))
1784 return -EINTR;
1785 if (signal_pending(current))
1786 return -EINTR;
1787
1788 return 0;
1789}
1790
1690/* 1791/*
1691 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1792 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1692 */ 1793 */
1693void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1794void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1694{ 1795{
1796 ktime_t start, cur;
1695 DEFINE_WAIT(wait); 1797 DEFINE_WAIT(wait);
1798 bool waited = false;
1799
1800 start = cur = ktime_get();
1801 if (halt_poll_ns) {
1802 ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
1803 do {
1804 /*
1805 * This sets KVM_REQ_UNHALT if an interrupt
1806 * arrives.
1807 */
1808 if (kvm_vcpu_check_block(vcpu) < 0) {
1809 ++vcpu->stat.halt_successful_poll;
1810 goto out;
1811 }
1812 cur = ktime_get();
1813 } while (single_task_running() && ktime_before(cur, stop));
1814 }
1696 1815
1697 for (;;) { 1816 for (;;) {
1698 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1817 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1699 1818
1700 if (kvm_arch_vcpu_runnable(vcpu)) { 1819 if (kvm_vcpu_check_block(vcpu) < 0)
1701 kvm_make_request(KVM_REQ_UNHALT, vcpu);
1702 break;
1703 }
1704 if (kvm_cpu_has_pending_timer(vcpu))
1705 break;
1706 if (signal_pending(current))
1707 break; 1820 break;
1708 1821
1822 waited = true;
1709 schedule(); 1823 schedule();
1710 } 1824 }
1711 1825
1712 finish_wait(&vcpu->wq, &wait); 1826 finish_wait(&vcpu->wq, &wait);
1827 cur = ktime_get();
1828
1829out:
1830 trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
1713} 1831}
1714EXPORT_SYMBOL_GPL(kvm_vcpu_block); 1832EXPORT_SYMBOL_GPL(kvm_vcpu_block);
1715 1833
@@ -1892,7 +2010,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1892static struct file_operations kvm_vcpu_fops = { 2010static struct file_operations kvm_vcpu_fops = {
1893 .release = kvm_vcpu_release, 2011 .release = kvm_vcpu_release,
1894 .unlocked_ioctl = kvm_vcpu_ioctl, 2012 .unlocked_ioctl = kvm_vcpu_ioctl,
1895#ifdef CONFIG_COMPAT 2013#ifdef CONFIG_KVM_COMPAT
1896 .compat_ioctl = kvm_vcpu_compat_ioctl, 2014 .compat_ioctl = kvm_vcpu_compat_ioctl,
1897#endif 2015#endif
1898 .mmap = kvm_vcpu_mmap, 2016 .mmap = kvm_vcpu_mmap,
@@ -2182,7 +2300,7 @@ out:
2182 return r; 2300 return r;
2183} 2301}
2184 2302
2185#ifdef CONFIG_COMPAT 2303#ifdef CONFIG_KVM_COMPAT
2186static long kvm_vcpu_compat_ioctl(struct file *filp, 2304static long kvm_vcpu_compat_ioctl(struct file *filp,
2187 unsigned int ioctl, unsigned long arg) 2305 unsigned int ioctl, unsigned long arg)
2188{ 2306{
@@ -2274,7 +2392,7 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
2274 2392
2275static const struct file_operations kvm_device_fops = { 2393static const struct file_operations kvm_device_fops = {
2276 .unlocked_ioctl = kvm_device_ioctl, 2394 .unlocked_ioctl = kvm_device_ioctl,
2277#ifdef CONFIG_COMPAT 2395#ifdef CONFIG_KVM_COMPAT
2278 .compat_ioctl = kvm_device_ioctl, 2396 .compat_ioctl = kvm_device_ioctl,
2279#endif 2397#endif
2280 .release = kvm_device_release, 2398 .release = kvm_device_release,
@@ -2561,7 +2679,7 @@ out:
2561 return r; 2679 return r;
2562} 2680}
2563 2681
2564#ifdef CONFIG_COMPAT 2682#ifdef CONFIG_KVM_COMPAT
2565struct compat_kvm_dirty_log { 2683struct compat_kvm_dirty_log {
2566 __u32 slot; 2684 __u32 slot;
2567 __u32 padding1; 2685 __u32 padding1;
@@ -2608,7 +2726,7 @@ out:
2608static struct file_operations kvm_vm_fops = { 2726static struct file_operations kvm_vm_fops = {
2609 .release = kvm_vm_release, 2727 .release = kvm_vm_release,
2610 .unlocked_ioctl = kvm_vm_ioctl, 2728 .unlocked_ioctl = kvm_vm_ioctl,
2611#ifdef CONFIG_COMPAT 2729#ifdef CONFIG_KVM_COMPAT
2612 .compat_ioctl = kvm_vm_compat_ioctl, 2730 .compat_ioctl = kvm_vm_compat_ioctl,
2613#endif 2731#endif
2614 .llseek = noop_llseek, 2732 .llseek = noop_llseek,