aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 13:14:24 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 13:14:24 -0500
commit55065bc52795faae549abfb912aacc622dd63876 (patch)
tree63683547e41ed459a2a8747eeafb5e969633d54f
parent008d23e4852d78bb2618f2035f8b2110b6a6b968 (diff)
parente5c301428294cb8925667c9ee39f817c4ab1c2c9 (diff)
Merge branch 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits) KVM: Initialize fpu state in preemptible context KVM: VMX: when entering real mode align segment base to 16 bytes KVM: MMU: handle 'map_writable' in set_spte() function KVM: MMU: audit: allow audit more guests at the same time KVM: Fetch guest cr3 from hardware on demand KVM: Replace reads of vcpu->arch.cr3 by an accessor KVM: MMU: only write protect mappings at pagetable level KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear() KVM: MMU: Initialize base_role for tdp mmus KVM: VMX: Optimize atomic EFER load KVM: VMX: Add definitions for more vm entry/exit control bits KVM: SVM: copy instruction bytes from VMCB KVM: SVM: implement enhanced INVLPG intercept KVM: SVM: enhance mov DR intercept handler KVM: SVM: enhance MOV CR intercept handler KVM: SVM: add new SVM feature bit names KVM: cleanup emulate_instruction KVM: move complete_insn_gp() into x86.c KVM: x86: fix CR8 handling KVM guest: Fix kvm clock initialization when it's configured out ...
-rw-r--r--Documentation/kernel-parameters.txt3
-rw-r--r--Documentation/kvm/api.txt178
-rw-r--r--Documentation/kvm/cpuid.txt3
-rw-r--r--Documentation/kvm/msr.txt36
-rw-r--r--arch/ia64/include/asm/kvm_host.h4
-rw-r--r--arch/ia64/kvm/kvm-ia64.c30
-rw-r--r--arch/powerpc/kvm/book3s.c4
-rw-r--r--arch/powerpc/kvm/powerpc.c20
-rw-r--r--arch/s390/kvm/kvm-s390.c23
-rw-r--r--arch/x86/include/asm/kvm_emulate.h35
-rw-r--r--arch/x86/include/asm/kvm_host.h99
-rw-r--r--arch/x86/include/asm/kvm_para.h24
-rw-r--r--arch/x86/include/asm/svm.h57
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/vmx.h15
-rw-r--r--arch/x86/kernel/entry_32.S10
-rw-r--r--arch/x86/kernel/entry_64.S3
-rw-r--r--arch/x86/kernel/i387.c1
-rw-r--r--arch/x86/kernel/kvm.c317
-rw-r--r--arch/x86/kernel/kvmclock.c13
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c367
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h22
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/mmu.c251
-rw-r--r--arch/x86/kvm/mmu_audit.c39
-rw-r--r--arch/x86/kvm/paging_tmpl.h147
-rw-r--r--arch/x86/kvm/svm.c865
-rw-r--r--arch/x86/kvm/trace.h17
-rw-r--r--arch/x86/kvm/vmx.c156
-rw-r--r--arch/x86/kvm/x86.c474
-rw-r--r--include/linux/kvm.h1
-rw-r--r--include/linux/kvm_host.h101
-rw-r--r--include/linux/kvm_types.h7
-rw-r--r--include/trace/events/kvm.h121
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/assigned-dev.c125
-rw-r--r--virt/kvm/async_pf.c216
-rw-r--r--virt/kvm/async_pf.h36
-rw-r--r--virt/kvm/eventfd.c91
-rw-r--r--virt/kvm/irq_comm.c7
-rw-r--r--virt/kvm/kvm_main.c334
43 files changed, 3078 insertions, 1185 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 338c96ea0855..55fe7599bc8e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1705,6 +1705,9 @@ and is between 256 and 4096 characters. It is defined in the file
1705 1705
1706 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver 1706 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
1707 1707
1708 no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
1709 fault handling.
1710
1708 nolapic [X86-32,APIC] Do not enable or use the local APIC. 1711 nolapic [X86-32,APIC] Do not enable or use the local APIC.
1709 1712
1710 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 1713 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 50713e37c695..ad85797c1cf0 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1085,6 +1085,184 @@ of 4 instructions that make up a hypercall.
1085If any additional field gets added to this structure later on, a bit for that 1085If any additional field gets added to this structure later on, a bit for that
1086additional piece of information will be set in the flags bitmap. 1086additional piece of information will be set in the flags bitmap.
1087 1087
10884.47 KVM_ASSIGN_PCI_DEVICE
1089
1090Capability: KVM_CAP_DEVICE_ASSIGNMENT
1091Architectures: x86 ia64
1092Type: vm ioctl
1093Parameters: struct kvm_assigned_pci_dev (in)
1094Returns: 0 on success, -1 on error
1095
1096Assigns a host PCI device to the VM.
1097
1098struct kvm_assigned_pci_dev {
1099 __u32 assigned_dev_id;
1100 __u32 busnr;
1101 __u32 devfn;
1102 __u32 flags;
1103 __u32 segnr;
1104 union {
1105 __u32 reserved[11];
1106 };
1107};
1108
1109The PCI device is specified by the triple segnr, busnr, and devfn.
1110Identification in succeeding service requests is done via assigned_dev_id. The
1111following flags are specified:
1112
1113/* Depends on KVM_CAP_IOMMU */
1114#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
1115
11164.48 KVM_DEASSIGN_PCI_DEVICE
1117
1118Capability: KVM_CAP_DEVICE_DEASSIGNMENT
1119Architectures: x86 ia64
1120Type: vm ioctl
1121Parameters: struct kvm_assigned_pci_dev (in)
1122Returns: 0 on success, -1 on error
1123
1124Ends PCI device assignment, releasing all associated resources.
1125
1126See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is
1127used in kvm_assigned_pci_dev to identify the device.
1128
11294.49 KVM_ASSIGN_DEV_IRQ
1130
1131Capability: KVM_CAP_ASSIGN_DEV_IRQ
1132Architectures: x86 ia64
1133Type: vm ioctl
1134Parameters: struct kvm_assigned_irq (in)
1135Returns: 0 on success, -1 on error
1136
1137Assigns an IRQ to a passed-through device.
1138
1139struct kvm_assigned_irq {
1140 __u32 assigned_dev_id;
1141 __u32 host_irq;
1142 __u32 guest_irq;
1143 __u32 flags;
1144 union {
1145 struct {
1146 __u32 addr_lo;
1147 __u32 addr_hi;
1148 __u32 data;
1149 } guest_msi;
1150 __u32 reserved[12];
1151 };
1152};
1153
1154The following flags are defined:
1155
1156#define KVM_DEV_IRQ_HOST_INTX (1 << 0)
1157#define KVM_DEV_IRQ_HOST_MSI (1 << 1)
1158#define KVM_DEV_IRQ_HOST_MSIX (1 << 2)
1159
1160#define KVM_DEV_IRQ_GUEST_INTX (1 << 8)
1161#define KVM_DEV_IRQ_GUEST_MSI (1 << 9)
1162#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10)
1163
1164It is not valid to specify multiple types per host or guest IRQ. However, the
1165IRQ type of host and guest can differ or can even be null.
1166
11674.50 KVM_DEASSIGN_DEV_IRQ
1168
1169Capability: KVM_CAP_ASSIGN_DEV_IRQ
1170Architectures: x86 ia64
1171Type: vm ioctl
1172Parameters: struct kvm_assigned_irq (in)
1173Returns: 0 on success, -1 on error
1174
1175Ends an IRQ assignment to a passed-through device.
1176
1177See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
1178by assigned_dev_id, flags must correspond to the IRQ type specified on
1179KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
1180
11814.51 KVM_SET_GSI_ROUTING
1182
1183Capability: KVM_CAP_IRQ_ROUTING
1184Architectures: x86 ia64
1185Type: vm ioctl
1186Parameters: struct kvm_irq_routing (in)
1187Returns: 0 on success, -1 on error
1188
1189Sets the GSI routing table entries, overwriting any previously set entries.
1190
1191struct kvm_irq_routing {
1192 __u32 nr;
1193 __u32 flags;
1194 struct kvm_irq_routing_entry entries[0];
1195};
1196
1197No flags are specified so far, the corresponding field must be set to zero.
1198
1199struct kvm_irq_routing_entry {
1200 __u32 gsi;
1201 __u32 type;
1202 __u32 flags;
1203 __u32 pad;
1204 union {
1205 struct kvm_irq_routing_irqchip irqchip;
1206 struct kvm_irq_routing_msi msi;
1207 __u32 pad[8];
1208 } u;
1209};
1210
1211/* gsi routing entry types */
1212#define KVM_IRQ_ROUTING_IRQCHIP 1
1213#define KVM_IRQ_ROUTING_MSI 2
1214
1215No flags are specified so far, the corresponding field must be set to zero.
1216
1217struct kvm_irq_routing_irqchip {
1218 __u32 irqchip;
1219 __u32 pin;
1220};
1221
1222struct kvm_irq_routing_msi {
1223 __u32 address_lo;
1224 __u32 address_hi;
1225 __u32 data;
1226 __u32 pad;
1227};
1228
12294.52 KVM_ASSIGN_SET_MSIX_NR
1230
1231Capability: KVM_CAP_DEVICE_MSIX
1232Architectures: x86 ia64
1233Type: vm ioctl
1234Parameters: struct kvm_assigned_msix_nr (in)
1235Returns: 0 on success, -1 on error
1236
1237Set the number of MSI-X interrupts for an assigned device. This service can
1238only be called once in the lifetime of an assigned device.
1239
1240struct kvm_assigned_msix_nr {
1241 __u32 assigned_dev_id;
1242 __u16 entry_nr;
1243 __u16 padding;
1244};
1245
1246#define KVM_MAX_MSIX_PER_DEV 256
1247
12484.53 KVM_ASSIGN_SET_MSIX_ENTRY
1249
1250Capability: KVM_CAP_DEVICE_MSIX
1251Architectures: x86 ia64
1252Type: vm ioctl
1253Parameters: struct kvm_assigned_msix_entry (in)
1254Returns: 0 on success, -1 on error
1255
1256Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
1257the GSI vector to zero means disabling the interrupt.
1258
1259struct kvm_assigned_msix_entry {
1260 __u32 assigned_dev_id;
1261 __u32 gsi;
1262 __u16 entry; /* The index of entry in the MSI-X table */
1263 __u16 padding[3];
1264};
1265
10885. The kvm_run structure 12665. The kvm_run structure
1089 1267
1090Application code obtains a pointer to the kvm_run structure by 1268Application code obtains a pointer to the kvm_run structure by
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
index 14a12ea92b7f..882068538c9c 100644
--- a/Documentation/kvm/cpuid.txt
+++ b/Documentation/kvm/cpuid.txt
@@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP || 2 || deprecated.
36KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs 36KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
37 || || 0x4b564d00 and 0x4b564d01 37 || || 0x4b564d00 and 0x4b564d01
38------------------------------------------------------------------------------ 38------------------------------------------------------------------------------
39KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by
40 || || writing to msr 0x4b564d02
41------------------------------------------------------------------------------
39KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side 42KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
40 || || per-cpu warps are expected in 43 || || per-cpu warps are expected in
41 || || kvmclock. 44 || || kvmclock.
diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
index 8ddcfe84c09a..d079aed27e03 100644
--- a/Documentation/kvm/msr.txt
+++ b/Documentation/kvm/msr.txt
@@ -3,7 +3,6 @@ Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
3===================================================== 3=====================================================
4 4
5KVM makes use of some custom MSRs to service some requests. 5KVM makes use of some custom MSRs to service some requests.
6At present, this facility is only used by kvmclock.
7 6
8Custom MSRs have a range reserved for them, that goes from 7Custom MSRs have a range reserved for them, that goes from
90x4b564d00 to 0x4b564dff. There are MSRs outside this area, 80x4b564d00 to 0x4b564dff. There are MSRs outside this area,
@@ -151,3 +150,38 @@ MSR_KVM_SYSTEM_TIME: 0x12
151 return PRESENT; 150 return PRESENT;
152 } else 151 } else
153 return NON_PRESENT; 152 return NON_PRESENT;
153
154MSR_KVM_ASYNC_PF_EN: 0x4b564d02
155 data: Bits 63-6 hold 64-byte aligned physical address of a
156 64 byte memory area which must be in guest RAM and must be
157 zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
158 when asynchronous page faults are enabled on the vcpu 0 when
159 disabled. Bit 2 is 1 if asynchronous page faults can be injected
160 when vcpu is in cpl == 0.
161
162 First 4 byte of 64 byte memory location will be written to by
163 the hypervisor at the time of asynchronous page fault (APF)
164 injection to indicate type of asynchronous page fault. Value
165 of 1 means that the page referred to by the page fault is not
166 present. Value 2 means that the page is now available. Disabling
167 interrupt inhibits APFs. Guest must not enable interrupt
168 before the reason is read, or it may be overwritten by another
169 APF. Since APF uses the same exception vector as regular page
170 fault guest must reset the reason to 0 before it does
171 something that can generate normal page fault. If during page
172 fault APF reason is 0 it means that this is regular page
173 fault.
174
175 During delivery of type 1 APF cr2 contains a token that will
176 be used to notify a guest when missing page becomes
177 available. When page becomes available type 2 APF is sent with
178 cr2 set to the token associated with the page. There is special
179 kind of token 0xffffffff which tells vcpu that it should wake
180 up all processes waiting for APFs and no individual type 2 APFs
181 will be sent.
182
183 If APF is disabled while there are outstanding APFs, they will
184 not be delivered.
185
186 Currently type 2 APF will be always delivered on the same vcpu as
187 type 1 was, but guest should not rely on that.
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2f229e5de498..2689ee54a1c9 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
590int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); 590int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
591void kvm_sal_emul(struct kvm_vcpu *vcpu); 591void kvm_sal_emul(struct kvm_vcpu *vcpu);
592 592
593#define __KVM_HAVE_ARCH_VM_ALLOC 1
594struct kvm *kvm_arch_alloc_vm(void);
595void kvm_arch_free_vm(struct kvm *kvm);
596
593#endif /* __ASSEMBLY__*/ 597#endif /* __ASSEMBLY__*/
594 598
595#endif 599#endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f56a6316e134..70d224d4264c 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -749,7 +749,7 @@ out:
749 return r; 749 return r;
750} 750}
751 751
752static struct kvm *kvm_alloc_kvm(void) 752struct kvm *kvm_arch_alloc_vm(void)
753{ 753{
754 754
755 struct kvm *kvm; 755 struct kvm *kvm;
@@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void)
760 vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); 760 vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
761 761
762 if (!vm_base) 762 if (!vm_base)
763 return ERR_PTR(-ENOMEM); 763 return NULL;
764 764
765 memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); 765 memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
766 kvm = (struct kvm *)(vm_base + 766 kvm = (struct kvm *)(vm_base +
@@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
806#define GUEST_PHYSICAL_RR4 0x2739 806#define GUEST_PHYSICAL_RR4 0x2739
807#define VMM_INIT_RR 0x1660 807#define VMM_INIT_RR 0x1660
808 808
809static void kvm_init_vm(struct kvm *kvm) 809int kvm_arch_init_vm(struct kvm *kvm)
810{ 810{
811 BUG_ON(!kvm); 811 BUG_ON(!kvm);
812 812
813 kvm->arch.is_sn2 = ia64_platform_is("sn2");
814
813 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; 815 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
814 kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; 816 kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
815 kvm->arch.vmm_init_rr = VMM_INIT_RR; 817 kvm->arch.vmm_init_rr = VMM_INIT_RR;
@@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm)
823 825
824 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 826 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
825 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 827 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
826}
827
828struct kvm *kvm_arch_create_vm(void)
829{
830 struct kvm *kvm = kvm_alloc_kvm();
831
832 if (IS_ERR(kvm))
833 return ERR_PTR(-ENOMEM);
834
835 kvm->arch.is_sn2 = ia64_platform_is("sn2");
836
837 kvm_init_vm(kvm);
838
839 return kvm;
840 828
829 return 0;
841} 830}
842 831
843static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, 832static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
@@ -962,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
962 goto out; 951 goto out;
963 r = kvm_setup_default_irq_routing(kvm); 952 r = kvm_setup_default_irq_routing(kvm);
964 if (r) { 953 if (r) {
954 mutex_lock(&kvm->slots_lock);
965 kvm_ioapic_destroy(kvm); 955 kvm_ioapic_destroy(kvm);
956 mutex_unlock(&kvm->slots_lock);
966 goto out; 957 goto out;
967 } 958 }
968 break; 959 break;
@@ -1357,7 +1348,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
1357 return -EINVAL; 1348 return -EINVAL;
1358} 1349}
1359 1350
1360static void free_kvm(struct kvm *kvm) 1351void kvm_arch_free_vm(struct kvm *kvm)
1361{ 1352{
1362 unsigned long vm_base = kvm->arch.vm_base; 1353 unsigned long vm_base = kvm->arch.vm_base;
1363 1354
@@ -1399,9 +1390,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
1399#endif 1390#endif
1400 kfree(kvm->arch.vioapic); 1391 kfree(kvm->arch.vioapic);
1401 kvm_release_vm_pages(kvm); 1392 kvm_release_vm_pages(kvm);
1402 kvm_free_physmem(kvm);
1403 cleanup_srcu_struct(&kvm->srcu);
1404 free_kvm(kvm);
1405} 1393}
1406 1394
1407void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1395void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index e316847c08c0..badc983031b3 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
1307 int err = -ENOMEM; 1307 int err = -ENOMEM;
1308 unsigned long p; 1308 unsigned long p;
1309 1309
1310 vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s)); 1310 vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
1311 if (!vcpu_book3s) 1311 if (!vcpu_book3s)
1312 goto out; 1312 goto out;
1313 1313
1314 memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
1315
1316 vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) 1314 vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
1317 kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); 1315 kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
1318 if (!vcpu_book3s->shadow_vcpu) 1316 if (!vcpu_book3s->shadow_vcpu)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 38f756f25053..99758460efde 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn)
145 *(int *)rtn = kvmppc_core_check_processor_compat(); 145 *(int *)rtn = kvmppc_core_check_processor_compat();
146} 146}
147 147
148struct kvm *kvm_arch_create_vm(void) 148int kvm_arch_init_vm(struct kvm *kvm)
149{ 149{
150 struct kvm *kvm; 150 return 0;
151
152 kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
153 if (!kvm)
154 return ERR_PTR(-ENOMEM);
155
156 return kvm;
157} 151}
158 152
159static void kvmppc_free_vcpus(struct kvm *kvm) 153void kvm_arch_destroy_vm(struct kvm *kvm)
160{ 154{
161 unsigned int i; 155 unsigned int i;
162 struct kvm_vcpu *vcpu; 156 struct kvm_vcpu *vcpu;
@@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
176{ 170{
177} 171}
178 172
179void kvm_arch_destroy_vm(struct kvm *kvm)
180{
181 kvmppc_free_vcpus(kvm);
182 kvm_free_physmem(kvm);
183 cleanup_srcu_struct(&kvm->srcu);
184 kfree(kvm);
185}
186
187int kvm_dev_ioctl_check_extension(long ext) 173int kvm_dev_ioctl_check_extension(long ext)
188{ 174{
189 int r; 175 int r;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 985d825494f1..bade533ba288 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
164 return r; 164 return r;
165} 165}
166 166
167struct kvm *kvm_arch_create_vm(void) 167int kvm_arch_init_vm(struct kvm *kvm)
168{ 168{
169 struct kvm *kvm;
170 int rc; 169 int rc;
171 char debug_name[16]; 170 char debug_name[16];
172 171
173 rc = s390_enable_sie(); 172 rc = s390_enable_sie();
174 if (rc) 173 if (rc)
175 goto out_nokvm; 174 goto out_err;
176
177 rc = -ENOMEM;
178 kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
179 if (!kvm)
180 goto out_nokvm;
181 175
182 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); 176 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
183 if (!kvm->arch.sca) 177 if (!kvm->arch.sca)
184 goto out_nosca; 178 goto out_err;
185 179
186 sprintf(debug_name, "kvm-%u", current->pid); 180 sprintf(debug_name, "kvm-%u", current->pid);
187 181
@@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void)
195 debug_register_view(kvm->arch.dbf, &debug_sprintf_view); 189 debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
196 VM_EVENT(kvm, 3, "%s", "vm created"); 190 VM_EVENT(kvm, 3, "%s", "vm created");
197 191
198 return kvm; 192 return 0;
199out_nodbf: 193out_nodbf:
200 free_page((unsigned long)(kvm->arch.sca)); 194 free_page((unsigned long)(kvm->arch.sca));
201out_nosca: 195out_err:
202 kfree(kvm); 196 return rc;
203out_nokvm:
204 return ERR_PTR(rc);
205} 197}
206 198
207void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 199void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm)
240void kvm_arch_destroy_vm(struct kvm *kvm) 232void kvm_arch_destroy_vm(struct kvm *kvm)
241{ 233{
242 kvm_free_vcpus(kvm); 234 kvm_free_vcpus(kvm);
243 kvm_free_physmem(kvm);
244 free_page((unsigned long)(kvm->arch.sca)); 235 free_page((unsigned long)(kvm->arch.sca));
245 debug_unregister(kvm->arch.dbf); 236 debug_unregister(kvm->arch.dbf);
246 cleanup_srcu_struct(&kvm->srcu);
247 kfree(kvm);
248} 237}
249 238
250/* Section: vcpu related */ 239/* Section: vcpu related */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17 17
18struct x86_exception {
19 u8 vector;
20 bool error_code_valid;
21 u16 error_code;
22 bool nested_page_fault;
23 u64 address; /* cr2 or nested page fault gpa */
24};
25
18/* 26/*
19 * x86_emulate_ops: 27 * x86_emulate_ops:
20 * 28 *
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
64 * @bytes: [IN ] Number of bytes to read from memory. 72 * @bytes: [IN ] Number of bytes to read from memory.
65 */ 73 */
66 int (*read_std)(unsigned long addr, void *val, 74 int (*read_std)(unsigned long addr, void *val,
67 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 75 unsigned int bytes, struct kvm_vcpu *vcpu,
76 struct x86_exception *fault);
68 77
69 /* 78 /*
70 * write_std: Write bytes of standard (non-emulated/special) memory. 79 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
74 * @bytes: [IN ] Number of bytes to write to memory. 83 * @bytes: [IN ] Number of bytes to write to memory.
75 */ 84 */
76 int (*write_std)(unsigned long addr, void *val, 85 int (*write_std)(unsigned long addr, void *val,
77 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 86 unsigned int bytes, struct kvm_vcpu *vcpu,
87 struct x86_exception *fault);
78 /* 88 /*
79 * fetch: Read bytes of standard (non-emulated/special) memory. 89 * fetch: Read bytes of standard (non-emulated/special) memory.
80 * Used for instruction fetch. 90 * Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
83 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
84 */ 94 */
85 int (*fetch)(unsigned long addr, void *val, 95 int (*fetch)(unsigned long addr, void *val,
86 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 96 unsigned int bytes, struct kvm_vcpu *vcpu,
97 struct x86_exception *fault);
87 98
88 /* 99 /*
89 * read_emulated: Read bytes from emulated/special memory area. 100 * read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
94 int (*read_emulated)(unsigned long addr, 105 int (*read_emulated)(unsigned long addr,
95 void *val, 106 void *val,
96 unsigned int bytes, 107 unsigned int bytes,
97 unsigned int *error, 108 struct x86_exception *fault,
98 struct kvm_vcpu *vcpu); 109 struct kvm_vcpu *vcpu);
99 110
100 /* 111 /*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
107 int (*write_emulated)(unsigned long addr, 118 int (*write_emulated)(unsigned long addr,
108 const void *val, 119 const void *val,
109 unsigned int bytes, 120 unsigned int bytes,
110 unsigned int *error, 121 struct x86_exception *fault,
111 struct kvm_vcpu *vcpu); 122 struct kvm_vcpu *vcpu);
112 123
113 /* 124 /*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
122 const void *old, 133 const void *old,
123 const void *new, 134 const void *new,
124 unsigned int bytes, 135 unsigned int bytes,
125 unsigned int *error, 136 struct x86_exception *fault,
126 struct kvm_vcpu *vcpu); 137 struct kvm_vcpu *vcpu);
127 138
128 int (*pio_in_emulated)(int size, unsigned short port, void *val, 139 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
159 }; 170 };
160 union { 171 union {
161 unsigned long *reg; 172 unsigned long *reg;
162 unsigned long mem; 173 struct segmented_address {
174 ulong ea;
175 unsigned seg;
176 } mem;
163 } addr; 177 } addr;
164 union { 178 union {
165 unsigned long val; 179 unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
226 240
227 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
228 242
229 int exception; /* exception that happens during emulation or -1 */ 243 bool have_exception;
230 u32 error_code; /* error code for exception */ 244 struct x86_exception exception;
231 bool error_code_valid;
232 245
233 /* decode cache */ 246 /* decode cache */
234 struct decode_cache decode; 247 struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 265#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
253#endif 266#endif
254 267
255int x86_decode_insn(struct x86_emulate_ctxt *ctxt); 268int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
256#define EMULATION_FAILED -1 269#define EMULATION_FAILED -1
257#define EMULATION_OK 0 270#define EMULATION_OK 0
258#define EMULATION_RESTART 1 271#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..aa75f21a9fba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
83#define KVM_NR_FIXED_MTRR_REGION 88 83#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 84#define KVM_NR_VAR_MTRR 8
85 85
86#define ASYNC_PF_PER_VCPU 64
87
86extern spinlock_t kvm_lock; 88extern spinlock_t kvm_lock;
87extern struct list_head vm_list; 89extern struct list_head vm_list;
88 90
89struct kvm_vcpu; 91struct kvm_vcpu;
90struct kvm; 92struct kvm;
93struct kvm_async_pf;
91 94
92enum kvm_reg { 95enum kvm_reg {
93 VCPU_REGS_RAX = 0, 96 VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
114 117
115enum kvm_reg_ex { 118enum kvm_reg_ex {
116 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 119 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3,
117}; 121};
118 122
119enum { 123enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 242 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 243 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 244 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 245 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu); 246 bool prefault);
247 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
248 struct x86_exception *fault);
243 void (*free)(struct kvm_vcpu *vcpu); 249 void (*free)(struct kvm_vcpu *vcpu);
244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 250 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
245 u32 *error); 251 struct x86_exception *exception);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 252 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
247 void (*prefetch_page)(struct kvm_vcpu *vcpu, 253 void (*prefetch_page)(struct kvm_vcpu *vcpu,
248 struct kvm_mmu_page *page); 254 struct kvm_mmu_page *page);
249 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
250 struct kvm_mmu_page *sp, bool clear_unsync); 256 struct kvm_mmu_page *sp);
251 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
252 hpa_t root_hpa; 258 hpa_t root_hpa;
253 int root_level; 259 int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
315 */ 321 */
316 struct kvm_mmu *walk_mmu; 322 struct kvm_mmu *walk_mmu;
317 323
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
328 /* only needed in kvm_pv_mmu_op() path, but it's hot so 324 /* only needed in kvm_pv_mmu_op() path, but it's hot so
329 * put it here to avoid allocation */ 325 * put it here to avoid allocation */
330 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 326 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
412 u64 hv_vapic; 408 u64 hv_vapic;
413 409
414 cpumask_var_t wbinvd_dirty_mask; 410 cpumask_var_t wbinvd_dirty_mask;
411
412 struct {
413 bool halted;
414 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
415 struct gfn_to_hva_cache data;
416 u64 msr_val;
417 u32 id;
418 bool send_user_only;
419 } apf;
415}; 420};
416 421
417struct kvm_arch { 422struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
456 /* fields used by HYPER-V emulation */ 461 /* fields used by HYPER-V emulation */
457 u64 hv_guest_os_id; 462 u64 hv_guest_os_id;
458 u64 hv_hypercall; 463 u64 hv_hypercall;
464
465 #ifdef CONFIG_KVM_MMU_AUDIT
466 int audit_point;
467 #endif
459}; 468};
460 469
461struct kvm_vm_stat { 470struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
529 struct kvm_segment *var, int seg); 538 struct kvm_segment *var, int seg);
530 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 539 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
531 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); 540 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
541 void (*decache_cr3)(struct kvm_vcpu *vcpu);
532 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 542 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
533 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 543 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
534 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 544 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
582 592
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 593 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584 594
595 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
585 const struct trace_print_flags *exit_reasons_str; 596 const struct trace_print_flags *exit_reasons_str;
586}; 597};
587 598
599struct kvm_arch_async_pf {
600 u32 token;
601 gfn_t gfn;
602 unsigned long cr3;
603 bool direct_map;
604};
605
588extern struct kvm_x86_ops *kvm_x86_ops; 606extern struct kvm_x86_ops *kvm_x86_ops;
589 607
590int kvm_mmu_module_init(void); 608int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
594int kvm_mmu_create(struct kvm_vcpu *vcpu); 612int kvm_mmu_create(struct kvm_vcpu *vcpu);
595int kvm_mmu_setup(struct kvm_vcpu *vcpu); 613int kvm_mmu_setup(struct kvm_vcpu *vcpu);
596void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 614void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
597void kvm_mmu_set_base_ptes(u64 base_pte);
598void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 615void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
599 u64 dirty_mask, u64 nx_mask, u64 x_mask); 616 u64 dirty_mask, u64 nx_mask, u64 x_mask);
600 617
@@ -623,8 +640,15 @@ enum emulation_result {
623#define EMULTYPE_NO_DECODE (1 << 0) 640#define EMULTYPE_NO_DECODE (1 << 0)
624#define EMULTYPE_TRAP_UD (1 << 1) 641#define EMULTYPE_TRAP_UD (1 << 1)
625#define EMULTYPE_SKIP (1 << 2) 642#define EMULTYPE_SKIP (1 << 2)
626int emulate_instruction(struct kvm_vcpu *vcpu, 643int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
627 unsigned long cr2, u16 error_code, int emulation_type); 644 int emulation_type, void *insn, int insn_len);
645
646static inline int emulate_instruction(struct kvm_vcpu *vcpu,
647 int emulation_type)
648{
649 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
650}
651
628void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 652void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
629void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 653void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
630 654
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
650int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 674int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
651int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 675int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
652int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 676int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
653void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 677int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
654int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 678int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
655int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 679int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
656unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 680unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 692void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 693void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 694void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
671void kvm_inject_page_fault(struct kvm_vcpu *vcpu); 695void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 696int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len, 697 gfn_t gfn, void *data, int offset, int len,
674 u32 access); 698 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu); 699void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 700bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
677 701
678int kvm_pic_set_irq(void *opaque, int irq, int level); 702int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
690int kvm_mmu_load(struct kvm_vcpu *vcpu); 714int kvm_mmu_load(struct kvm_vcpu *vcpu);
691void kvm_mmu_unload(struct kvm_vcpu *vcpu); 715void kvm_mmu_unload(struct kvm_vcpu *vcpu);
692void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 716void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
693gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 717gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
694gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 718 struct x86_exception *exception);
695gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 719gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
696gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 720 struct x86_exception *exception);
721gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
722 struct x86_exception *exception);
723gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
724 struct x86_exception *exception);
697 725
698int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 726int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
699 727
700int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 728int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
701 729
702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); 730int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
731 void *insn, int insn_len);
703void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 732void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
704 733
705void kvm_enable_tdp(void); 734void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
766#define HF_VINTR_MASK (1 << 2) 795#define HF_VINTR_MASK (1 << 2)
767#define HF_NMI_MASK (1 << 3) 796#define HF_NMI_MASK (1 << 3)
768#define HF_IRET_MASK (1 << 4) 797#define HF_IRET_MASK (1 << 4)
798#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
769 799
770/* 800/*
771 * Hardware virtualization extension instructions may fault if a 801 * Hardware virtualization extension instructions may fault if a
772 * reboot turns off virtualization while processes are running. 802 * reboot turns off virtualization while processes are running.
773 * Trap the fault and ignore the instruction if that happens. 803 * Trap the fault and ignore the instruction if that happens.
774 */ 804 */
775asmlinkage void kvm_handle_fault_on_reboot(void); 805asmlinkage void kvm_spurious_fault(void);
806extern bool kvm_rebooting;
776 807
777#define __kvm_handle_fault_on_reboot(insn) \ 808#define __kvm_handle_fault_on_reboot(insn) \
778 "666: " insn "\n\t" \ 809 "666: " insn "\n\t" \
810 "668: \n\t" \
779 ".pushsection .fixup, \"ax\" \n" \ 811 ".pushsection .fixup, \"ax\" \n" \
780 "667: \n\t" \ 812 "667: \n\t" \
813 "cmpb $0, kvm_rebooting \n\t" \
814 "jne 668b \n\t" \
781 __ASM_SIZE(push) " $666b \n\t" \ 815 __ASM_SIZE(push) " $666b \n\t" \
782 "jmp kvm_handle_fault_on_reboot \n\t" \ 816 "call kvm_spurious_fault \n\t" \
783 ".popsection \n\t" \ 817 ".popsection \n\t" \
784 ".pushsection __ex_table, \"a\" \n\t" \ 818 ".pushsection __ex_table, \"a\" \n\t" \
785 _ASM_PTR " 666b, 667b \n\t" \ 819 _ASM_PTR " 666b, 667b \n\t" \
@@ -799,4 +833,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
799 833
800bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 834bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
801 835
836void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
837 struct kvm_async_pf *work);
838void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
839 struct kvm_async_pf *work);
840void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
841 struct kvm_async_pf *work);
842bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
843extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
844
845void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
846
802#endif /* _ASM_X86_KVM_HOST_H */ 847#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
20 * are available. The use of 0x11 and 0x12 is deprecated 20 * are available. The use of 0x11 and 0x12 is deprecated
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4
23 24
24/* The last 8 bits are used to indicate how to interpret the flags field 25/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored. 26 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ 33/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
35 37
36#define KVM_MAX_MMU_OP_BATCH 32 38#define KVM_MAX_MMU_OP_BATCH 32
37 39
40#define KVM_ASYNC_PF_ENABLED (1 << 0)
41#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
42
38/* Operations for KVM_HC_MMU_OP */ 43/* Operations for KVM_HC_MMU_OP */
39#define KVM_MMU_OP_WRITE_PTE 1 44#define KVM_MMU_OP_WRITE_PTE 1
40#define KVM_MMU_OP_FLUSH_TLB 2 45#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
61 __u64 pt_phys; 66 __u64 pt_phys;
62}; 67};
63 68
69#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
70#define KVM_PV_REASON_PAGE_READY 2
71
72struct kvm_vcpu_pv_apf_data {
73 __u32 reason;
74 __u8 pad[60];
75 __u32 enabled;
76};
77
64#ifdef __KERNEL__ 78#ifdef __KERNEL__
65#include <asm/processor.h> 79#include <asm/processor.h>
66 80
67extern void kvmclock_init(void); 81extern void kvmclock_init(void);
82extern int kvm_register_clock(char *txt);
68 83
69 84
70/* This instruction is vmcall. On non-VT architectures, it will generate a 85/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
160 175
161#ifdef CONFIG_KVM_GUEST 176#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void); 177void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void);
163#else 181#else
164#define kvm_guest_init() do { } while (0) 182#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0)
184#define kvm_async_pf_task_wake(T) do {} while(0)
185static inline u32 kvm_read_and_reset_pf_reason(void)
186{
187 return 0;
188}
165#endif 189#endif
166 190
167#endif /* __KERNEL__ */ 191#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
47 INTERCEPT_MONITOR, 47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT, 48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND, 49 INTERCEPT_MWAIT_COND,
50 INTERCEPT_XSETBV,
50}; 51};
51 52
52 53
53struct __attribute__ ((__packed__)) vmcb_control_area { 54struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read; 55 u32 intercept_cr;
55 u16 intercept_cr_write; 56 u32 intercept_dr;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 57 u32 intercept_exceptions;
59 u64 intercept; 58 u64 intercept;
60 u8 reserved_1[42]; 59 u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 80 u32 event_inj_err;
82 u64 nested_cr3; 81 u64 nested_cr3;
83 u64 lbr_ctl; 82 u64 lbr_ctl;
84 u64 reserved_5; 83 u32 clean;
84 u32 reserved_5;
85 u64 next_rip; 85 u64 next_rip;
86 u8 reserved_6[816]; 86 u8 insn_len;
87 u8 insn_bytes[15];
88 u8 reserved_6[800];
87}; 89};
88 90
89 91
90#define TLB_CONTROL_DO_NOTHING 0 92#define TLB_CONTROL_DO_NOTHING 0
91#define TLB_CONTROL_FLUSH_ALL_ASID 1 93#define TLB_CONTROL_FLUSH_ALL_ASID 1
94#define TLB_CONTROL_FLUSH_ASID 3
95#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
92 96
93#define V_TPR_MASK 0x0f 97#define V_TPR_MASK 0x0f
94 98
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
204#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK 208#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
205#define SVM_SELECTOR_CODE_MASK (1 << 3) 209#define SVM_SELECTOR_CODE_MASK (1 << 3)
206 210
207#define INTERCEPT_CR0_MASK 1 211#define INTERCEPT_CR0_READ 0
208#define INTERCEPT_CR3_MASK (1 << 3) 212#define INTERCEPT_CR3_READ 3
209#define INTERCEPT_CR4_MASK (1 << 4) 213#define INTERCEPT_CR4_READ 4
210#define INTERCEPT_CR8_MASK (1 << 8) 214#define INTERCEPT_CR8_READ 8
211 215#define INTERCEPT_CR0_WRITE (16 + 0)
212#define INTERCEPT_DR0_MASK 1 216#define INTERCEPT_CR3_WRITE (16 + 3)
213#define INTERCEPT_DR1_MASK (1 << 1) 217#define INTERCEPT_CR4_WRITE (16 + 4)
214#define INTERCEPT_DR2_MASK (1 << 2) 218#define INTERCEPT_CR8_WRITE (16 + 8)
215#define INTERCEPT_DR3_MASK (1 << 3) 219
216#define INTERCEPT_DR4_MASK (1 << 4) 220#define INTERCEPT_DR0_READ 0
217#define INTERCEPT_DR5_MASK (1 << 5) 221#define INTERCEPT_DR1_READ 1
218#define INTERCEPT_DR6_MASK (1 << 6) 222#define INTERCEPT_DR2_READ 2
219#define INTERCEPT_DR7_MASK (1 << 7) 223#define INTERCEPT_DR3_READ 3
224#define INTERCEPT_DR4_READ 4
225#define INTERCEPT_DR5_READ 5
226#define INTERCEPT_DR6_READ 6
227#define INTERCEPT_DR7_READ 7
228#define INTERCEPT_DR0_WRITE (16 + 0)
229#define INTERCEPT_DR1_WRITE (16 + 1)
230#define INTERCEPT_DR2_WRITE (16 + 2)
231#define INTERCEPT_DR3_WRITE (16 + 3)
232#define INTERCEPT_DR4_WRITE (16 + 4)
233#define INTERCEPT_DR5_WRITE (16 + 5)
234#define INTERCEPT_DR6_WRITE (16 + 6)
235#define INTERCEPT_DR7_WRITE (16 + 7)
220 236
221#define SVM_EVTINJ_VEC_MASK 0xff 237#define SVM_EVTINJ_VEC_MASK 0xff
222 238
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 262#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 263#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
248 264
265#define SVM_EXITINFO_REG_MASK 0x0F
266
249#define SVM_EXIT_READ_CR0 0x000 267#define SVM_EXIT_READ_CR0 0x000
250#define SVM_EXIT_READ_CR3 0x003 268#define SVM_EXIT_READ_CR3 0x003
251#define SVM_EXIT_READ_CR4 0x004 269#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
316#define SVM_EXIT_MONITOR 0x08a 334#define SVM_EXIT_MONITOR 0x08a
317#define SVM_EXIT_MWAIT 0x08b 335#define SVM_EXIT_MWAIT 0x08b
318#define SVM_EXIT_MWAIT_COND 0x08c 336#define SVM_EXIT_MWAIT_COND 0x08c
337#define SVM_EXIT_XSETBV 0x08d
319#define SVM_EXIT_NPF 0x400 338#define SVM_EXIT_NPF 0x400
320 339
321#define SVM_EXIT_ERR -1 340#define SVM_EXIT_ERR -1
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
30asmlinkage void stack_segment(void); 30asmlinkage void stack_segment(void);
31asmlinkage void general_protection(void); 31asmlinkage void general_protection(void);
32asmlinkage void page_fault(void); 32asmlinkage void page_fault(void);
33asmlinkage void async_page_fault(void);
33asmlinkage void spurious_interrupt_bug(void); 34asmlinkage void spurious_interrupt_bug(void);
34asmlinkage void coprocessor_error(void); 35asmlinkage void coprocessor_error(void);
35asmlinkage void alignment_check(void); 36asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
66#define PIN_BASED_NMI_EXITING 0x00000008 66#define PIN_BASED_NMI_EXITING 0x00000008
67#define PIN_BASED_VIRTUAL_NMIS 0x00000020 67#define PIN_BASED_VIRTUAL_NMIS 0x00000020
68 68
69#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
69#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 70#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
71#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
70#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 72#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
71#define VM_EXIT_SAVE_IA32_PAT 0x00040000 73#define VM_EXIT_SAVE_IA32_PAT 0x00040000
72#define VM_EXIT_LOAD_IA32_PAT 0x00080000 74#define VM_EXIT_LOAD_IA32_PAT 0x00080000
75#define VM_EXIT_SAVE_IA32_EFER 0x00100000
76#define VM_EXIT_LOAD_IA32_EFER 0x00200000
77#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
73 78
79#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
74#define VM_ENTRY_IA32E_MODE 0x00000200 80#define VM_ENTRY_IA32E_MODE 0x00000200
75#define VM_ENTRY_SMM 0x00000400 81#define VM_ENTRY_SMM 0x00000400
76#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 82#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
83#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
77#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 84#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
85#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
78 86
79/* VMCS Encodings */ 87/* VMCS Encodings */
80enum vmcs_field { 88enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
239#define EXIT_REASON_TASK_SWITCH 9 247#define EXIT_REASON_TASK_SWITCH 9
240#define EXIT_REASON_CPUID 10 248#define EXIT_REASON_CPUID 10
241#define EXIT_REASON_HLT 12 249#define EXIT_REASON_HLT 12
250#define EXIT_REASON_INVD 13
242#define EXIT_REASON_INVLPG 14 251#define EXIT_REASON_INVLPG 14
243#define EXIT_REASON_RDPMC 15 252#define EXIT_REASON_RDPMC 15
244#define EXIT_REASON_RDTSC 16 253#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
296#define GUEST_INTR_STATE_SMI 0x00000004 305#define GUEST_INTR_STATE_SMI 0x00000004
297#define GUEST_INTR_STATE_NMI 0x00000008 306#define GUEST_INTR_STATE_NMI 0x00000008
298 307
308/* GUEST_ACTIVITY_STATE flags */
309#define GUEST_ACTIVITY_ACTIVE 0
310#define GUEST_ACTIVITY_HLT 1
311#define GUEST_ACTIVITY_SHUTDOWN 2
312#define GUEST_ACTIVITY_WAIT_SIPI 3
313
299/* 314/*
300 * Exit Qualifications for MOV for Control Register Access 315 * Exit Qualifications for MOV for Control Register Access
301 */ 316 */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
1406 CFI_ENDPROC 1406 CFI_ENDPROC
1407END(general_protection) 1407END(general_protection)
1408 1408
1409#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault)
1411 RING0_EC_FRAME
1412 pushl $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code
1415 CFI_ENDPROC
1416END(async_page_fault)
1417#endif
1418
1409/* 1419/*
1410 * End of kprobes section 1420 * End of kprobes section
1411 */ 1421 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d3b895f375d3..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1329,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
1329#endif 1329#endif
1330errorentry general_protection do_general_protection 1330errorentry general_protection do_general_protection
1331errorentry page_fault do_page_fault 1331errorentry page_fault do_page_fault
1332#ifdef CONFIG_KVM_GUEST
1333errorentry async_page_fault do_async_page_fault
1334#endif
1332#ifdef CONFIG_X86_MCE 1335#ifdef CONFIG_X86_MCE
1333paranoidzeroentry machine_check *machine_check_vector(%rip) 1336paranoidzeroentry machine_check *machine_check_vector(%rip)
1334#endif 1337#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121 /* dummy entry exist -> wake up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * async PF was not yet handled.
209 * Add dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy wait while other cpu
215 * handles async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 125 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 126};
127 127
128static int kvm_register_clock(char *txt) 128int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high, ret; 131 int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
152} 152}
153#endif 153#endif
154 154
155#ifdef CONFIG_SMP
156static void __init kvm_smp_prepare_boot_cpu(void)
157{
158 WARN_ON(kvm_register_clock("primary cpu clock"));
159 native_smp_prepare_boot_cpu();
160}
161#endif
162
163/* 155/*
164 * After the clock is registered, the host will keep writing to the 156 * After the clock is registered, the host will keep writing to the
165 * registered memory location. If the guest happens to shutdown, this memory 157 * registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
206 x86_cpuinit.setup_percpu_clockev = 198 x86_cpuinit.setup_percpu_clockev =
207 kvm_setup_secondary_clock; 199 kvm_setup_secondary_clock;
208#endif 200#endif
209#ifdef CONFIG_SMP
210 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
211#endif
212 machine_ops.shutdown = kvm_shutdown; 201 machine_ops.shutdown = kvm_shutdown;
213#ifdef CONFIG_KEXEC 202#ifdef CONFIG_KEXEC
214 machine_ops.crash_shutdown = kvm_crash_shutdown; 203 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
21 */ 21 */
22 22
23#ifndef __KERNEL__
24#include <stdio.h>
25#include <stdint.h>
26#include <public/xen.h>
27#define DPRINTF(_f, _a ...) printf(_f , ## _a)
28#else
29#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
30#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
31#define DPRINTF(x...) do {} while (0)
32#endif
33#include <linux/module.h> 25#include <linux/module.h>
34#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
35 27
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
418} 410}
419 411
420static inline unsigned long 412static inline unsigned long
421register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 413register_address(struct decode_cache *c, unsigned long reg)
422{ 414{
423 return base + address_mask(c, reg); 415 return address_mask(c, reg);
424} 416}
425 417
426static inline void 418static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
452 return ops->get_cached_segment_base(seg, ctxt->vcpu); 444 return ops->get_cached_segment_base(seg, ctxt->vcpu);
453} 445}
454 446
455static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 447static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
456 struct x86_emulate_ops *ops, 448 struct x86_emulate_ops *ops,
457 struct decode_cache *c) 449 struct decode_cache *c)
458{ 450{
459 if (!c->has_seg_override) 451 if (!c->has_seg_override)
460 return 0; 452 return 0;
461 453
462 return seg_base(ctxt, ops, c->seg_override); 454 return c->seg_override;
463} 455}
464 456
465static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 457static ulong linear(struct x86_emulate_ctxt *ctxt,
466 struct x86_emulate_ops *ops) 458 struct segmented_address addr)
467{ 459{
468 return seg_base(ctxt, ops, VCPU_SREG_ES); 460 struct decode_cache *c = &ctxt->decode;
469} 461 ulong la;
470
471static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
472 struct x86_emulate_ops *ops)
473{
474 return seg_base(ctxt, ops, VCPU_SREG_SS);
475}
476 462
477static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 463 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
478 u32 error, bool valid) 464 if (c->ad_bytes != 8)
479{ 465 la &= (u32)-1;
480 ctxt->exception = vec; 466 return la;
481 ctxt->error_code = error;
482 ctxt->error_code_valid = valid;
483} 467}
484 468
485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 469static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
470 u32 error, bool valid)
486{ 471{
487 emulate_exception(ctxt, GP_VECTOR, err, true); 472 ctxt->exception.vector = vec;
473 ctxt->exception.error_code = error;
474 ctxt->exception.error_code_valid = valid;
475 return X86EMUL_PROPAGATE_FAULT;
488} 476}
489 477
490static void emulate_pf(struct x86_emulate_ctxt *ctxt) 478static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
491{ 479{
492 emulate_exception(ctxt, PF_VECTOR, 0, true); 480 return emulate_exception(ctxt, GP_VECTOR, err, true);
493} 481}
494 482
495static void emulate_ud(struct x86_emulate_ctxt *ctxt) 483static int emulate_ud(struct x86_emulate_ctxt *ctxt)
496{ 484{
497 emulate_exception(ctxt, UD_VECTOR, 0, false); 485 return emulate_exception(ctxt, UD_VECTOR, 0, false);
498} 486}
499 487
500static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 488static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
501{ 489{
502 emulate_exception(ctxt, TS_VECTOR, err, true); 490 return emulate_exception(ctxt, TS_VECTOR, err, true);
503} 491}
504 492
505static int emulate_de(struct x86_emulate_ctxt *ctxt) 493static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{ 494{
507 emulate_exception(ctxt, DE_VECTOR, 0, false); 495 return emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509} 496}
510 497
511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 498static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
520 cur_size = fc->end - fc->start; 507 cur_size = fc->end - fc->start;
521 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 508 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
522 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 509 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
523 size, ctxt->vcpu, NULL); 510 size, ctxt->vcpu, &ctxt->exception);
524 if (rc != X86EMUL_CONTINUE) 511 if (rc != X86EMUL_CONTINUE)
525 return rc; 512 return rc;
526 fc->end += size; 513 fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
564 551
565static int read_descriptor(struct x86_emulate_ctxt *ctxt, 552static int read_descriptor(struct x86_emulate_ctxt *ctxt,
566 struct x86_emulate_ops *ops, 553 struct x86_emulate_ops *ops,
567 ulong addr, 554 struct segmented_address addr,
568 u16 *size, unsigned long *address, int op_bytes) 555 u16 *size, unsigned long *address, int op_bytes)
569{ 556{
570 int rc; 557 int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
572 if (op_bytes == 2) 559 if (op_bytes == 2)
573 op_bytes = 3; 560 op_bytes = 3;
574 *address = 0; 561 *address = 0;
575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); 562 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
563 ctxt->vcpu, &ctxt->exception);
576 if (rc != X86EMUL_CONTINUE) 564 if (rc != X86EMUL_CONTINUE)
577 return rc; 565 return rc;
578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); 566 addr.ea += 2;
567 rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
568 ctxt->vcpu, &ctxt->exception);
579 return rc; 569 return rc;
580} 570}
581 571
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
768 break; 758 break;
769 } 759 }
770 } 760 }
771 op->addr.mem = modrm_ea; 761 op->addr.mem.ea = modrm_ea;
772done: 762done:
773 return rc; 763 return rc;
774} 764}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
783 op->type = OP_MEM; 773 op->type = OP_MEM;
784 switch (c->ad_bytes) { 774 switch (c->ad_bytes) {
785 case 2: 775 case 2:
786 op->addr.mem = insn_fetch(u16, 2, c->eip); 776 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
787 break; 777 break;
788 case 4: 778 case 4:
789 op->addr.mem = insn_fetch(u32, 4, c->eip); 779 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
790 break; 780 break;
791 case 8: 781 case 8:
792 op->addr.mem = insn_fetch(u64, 8, c->eip); 782 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
793 break; 783 break;
794 } 784 }
795done: 785done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
808 else if (c->src.bytes == 4) 798 else if (c->src.bytes == 4)
809 sv = (s32)c->src.val & (s32)mask; 799 sv = (s32)c->src.val & (s32)mask;
810 800
811 c->dst.addr.mem += (sv >> 3); 801 c->dst.addr.mem.ea += (sv >> 3);
812 } 802 }
813 803
814 /* only subword offset */ 804 /* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
821{ 811{
822 int rc; 812 int rc;
823 struct read_cache *mc = &ctxt->decode.mem_read; 813 struct read_cache *mc = &ctxt->decode.mem_read;
824 u32 err;
825 814
826 while (size) { 815 while (size) {
827 int n = min(size, 8u); 816 int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
829 if (mc->pos < mc->end) 818 if (mc->pos < mc->end)
830 goto read_cached; 819 goto read_cached;
831 820
832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 821 rc = ops->read_emulated(addr, mc->data + mc->end, n,
833 ctxt->vcpu); 822 &ctxt->exception, ctxt->vcpu);
834 if (rc == X86EMUL_PROPAGATE_FAULT)
835 emulate_pf(ctxt);
836 if (rc != X86EMUL_CONTINUE) 823 if (rc != X86EMUL_CONTINUE)
837 return rc; 824 return rc;
838 mc->end += n; 825 mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
907 struct desc_ptr dt; 894 struct desc_ptr dt;
908 u16 index = selector >> 3; 895 u16 index = selector >> 3;
909 int ret; 896 int ret;
910 u32 err;
911 ulong addr; 897 ulong addr;
912 898
913 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 899 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
914 900
915 if (dt.size < index * 8 + 7) { 901 if (dt.size < index * 8 + 7)
916 emulate_gp(ctxt, selector & 0xfffc); 902 return emulate_gp(ctxt, selector & 0xfffc);
917 return X86EMUL_PROPAGATE_FAULT;
918 }
919 addr = dt.address + index * 8; 903 addr = dt.address + index * 8;
920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 904 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
921 if (ret == X86EMUL_PROPAGATE_FAULT) 905 &ctxt->exception);
922 emulate_pf(ctxt);
923 906
924 return ret; 907 return ret;
925} 908}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
931{ 914{
932 struct desc_ptr dt; 915 struct desc_ptr dt;
933 u16 index = selector >> 3; 916 u16 index = selector >> 3;
934 u32 err;
935 ulong addr; 917 ulong addr;
936 int ret; 918 int ret;
937 919
938 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 920 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
939 921
940 if (dt.size < index * 8 + 7) { 922 if (dt.size < index * 8 + 7)
941 emulate_gp(ctxt, selector & 0xfffc); 923 return emulate_gp(ctxt, selector & 0xfffc);
942 return X86EMUL_PROPAGATE_FAULT;
943 }
944 924
945 addr = dt.address + index * 8; 925 addr = dt.address + index * 8;
946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 926 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
947 if (ret == X86EMUL_PROPAGATE_FAULT) 927 &ctxt->exception);
948 emulate_pf(ctxt);
949 928
950 return ret; 929 return ret;
951} 930}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1092{ 1071{
1093 int rc; 1072 int rc;
1094 struct decode_cache *c = &ctxt->decode; 1073 struct decode_cache *c = &ctxt->decode;
1095 u32 err;
1096 1074
1097 switch (c->dst.type) { 1075 switch (c->dst.type) {
1098 case OP_REG: 1076 case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1101 case OP_MEM: 1079 case OP_MEM:
1102 if (c->lock_prefix) 1080 if (c->lock_prefix)
1103 rc = ops->cmpxchg_emulated( 1081 rc = ops->cmpxchg_emulated(
1104 c->dst.addr.mem, 1082 linear(ctxt, c->dst.addr.mem),
1105 &c->dst.orig_val, 1083 &c->dst.orig_val,
1106 &c->dst.val, 1084 &c->dst.val,
1107 c->dst.bytes, 1085 c->dst.bytes,
1108 &err, 1086 &ctxt->exception,
1109 ctxt->vcpu); 1087 ctxt->vcpu);
1110 else 1088 else
1111 rc = ops->write_emulated( 1089 rc = ops->write_emulated(
1112 c->dst.addr.mem, 1090 linear(ctxt, c->dst.addr.mem),
1113 &c->dst.val, 1091 &c->dst.val,
1114 c->dst.bytes, 1092 c->dst.bytes,
1115 &err, 1093 &ctxt->exception,
1116 ctxt->vcpu); 1094 ctxt->vcpu);
1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1118 emulate_pf(ctxt);
1119 if (rc != X86EMUL_CONTINUE) 1095 if (rc != X86EMUL_CONTINUE)
1120 return rc; 1096 return rc;
1121 break; 1097 break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1137 c->dst.bytes = c->op_bytes; 1113 c->dst.bytes = c->op_bytes;
1138 c->dst.val = c->src.val; 1114 c->dst.val = c->src.val;
1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), 1116 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1141 c->regs[VCPU_REGS_RSP]); 1117 c->dst.addr.mem.seg = VCPU_SREG_SS;
1142} 1118}
1143 1119
1144static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1120static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1147{ 1123{
1148 struct decode_cache *c = &ctxt->decode; 1124 struct decode_cache *c = &ctxt->decode;
1149 int rc; 1125 int rc;
1126 struct segmented_address addr;
1150 1127
1151 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1128 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1152 c->regs[VCPU_REGS_RSP]), 1129 addr.seg = VCPU_SREG_SS;
1153 dest, len); 1130 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
1154 if (rc != X86EMUL_CONTINUE) 1131 if (rc != X86EMUL_CONTINUE)
1155 return rc; 1132 return rc;
1156 1133
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1184 change_mask |= EFLG_IF; 1161 change_mask |= EFLG_IF;
1185 break; 1162 break;
1186 case X86EMUL_MODE_VM86: 1163 case X86EMUL_MODE_VM86:
1187 if (iopl < 3) { 1164 if (iopl < 3)
1188 emulate_gp(ctxt, 0); 1165 return emulate_gp(ctxt, 0);
1189 return X86EMUL_PROPAGATE_FAULT;
1190 }
1191 change_mask |= EFLG_IF; 1166 change_mask |= EFLG_IF;
1192 break; 1167 break;
1193 default: /* real mode */ 1168 default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1198 *(unsigned long *)dest = 1173 *(unsigned long *)dest =
1199 (ctxt->eflags & ~change_mask) | (val & change_mask); 1174 (ctxt->eflags & ~change_mask) | (val & change_mask);
1200 1175
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1204 return rc; 1176 return rc;
1205} 1177}
1206 1178
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1287 gva_t cs_addr; 1259 gva_t cs_addr;
1288 gva_t eip_addr; 1260 gva_t eip_addr;
1289 u16 cs, eip; 1261 u16 cs, eip;
1290 u32 err;
1291 1262
1292 /* TODO: Add limit checks */ 1263 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags; 1264 c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1317 eip_addr = dt.address + (irq << 2); 1288 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2; 1289 cs_addr = dt.address + (irq << 2) + 2;
1319 1290
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); 1291 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
1321 if (rc != X86EMUL_CONTINUE) 1292 if (rc != X86EMUL_CONTINUE)
1322 return rc; 1293 return rc;
1323 1294
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); 1295 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
1325 if (rc != X86EMUL_CONTINUE) 1296 if (rc != X86EMUL_CONTINUE)
1326 return rc; 1297 return rc;
1327 1298
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1370 if (rc != X86EMUL_CONTINUE) 1341 if (rc != X86EMUL_CONTINUE)
1371 return rc; 1342 return rc;
1372 1343
1373 if (temp_eip & ~0xffff) { 1344 if (temp_eip & ~0xffff)
1374 emulate_gp(ctxt, 0); 1345 return emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377 1346
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1347 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379 1348
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1624 1593
1625 /* syscall is not available in real mode */ 1594 /* syscall is not available in real mode */
1626 if (ctxt->mode == X86EMUL_MODE_REAL || 1595 if (ctxt->mode == X86EMUL_MODE_REAL ||
1627 ctxt->mode == X86EMUL_MODE_VM86) { 1596 ctxt->mode == X86EMUL_MODE_VM86)
1628 emulate_ud(ctxt); 1597 return emulate_ud(ctxt);
1629 return X86EMUL_PROPAGATE_FAULT;
1630 }
1631 1598
1632 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1599 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1633 1600
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1678 u16 cs_sel, ss_sel; 1645 u16 cs_sel, ss_sel;
1679 1646
1680 /* inject #GP if in real mode */ 1647 /* inject #GP if in real mode */
1681 if (ctxt->mode == X86EMUL_MODE_REAL) { 1648 if (ctxt->mode == X86EMUL_MODE_REAL)
1682 emulate_gp(ctxt, 0); 1649 return emulate_gp(ctxt, 0);
1683 return X86EMUL_PROPAGATE_FAULT;
1684 }
1685 1650
1686 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1651 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1687 * Therefore, we inject an #UD. 1652 * Therefore, we inject an #UD.
1688 */ 1653 */
1689 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1654 if (ctxt->mode == X86EMUL_MODE_PROT64)
1690 emulate_ud(ctxt); 1655 return emulate_ud(ctxt);
1691 return X86EMUL_PROPAGATE_FAULT;
1692 }
1693 1656
1694 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1657 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1695 1658
1696 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1659 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1697 switch (ctxt->mode) { 1660 switch (ctxt->mode) {
1698 case X86EMUL_MODE_PROT32: 1661 case X86EMUL_MODE_PROT32:
1699 if ((msr_data & 0xfffc) == 0x0) { 1662 if ((msr_data & 0xfffc) == 0x0)
1700 emulate_gp(ctxt, 0); 1663 return emulate_gp(ctxt, 0);
1701 return X86EMUL_PROPAGATE_FAULT;
1702 }
1703 break; 1664 break;
1704 case X86EMUL_MODE_PROT64: 1665 case X86EMUL_MODE_PROT64:
1705 if (msr_data == 0x0) { 1666 if (msr_data == 0x0)
1706 emulate_gp(ctxt, 0); 1667 return emulate_gp(ctxt, 0);
1707 return X86EMUL_PROPAGATE_FAULT;
1708 }
1709 break; 1668 break;
1710 } 1669 }
1711 1670
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1745 1704
1746 /* inject #GP if in real mode or Virtual 8086 mode */ 1705 /* inject #GP if in real mode or Virtual 8086 mode */
1747 if (ctxt->mode == X86EMUL_MODE_REAL || 1706 if (ctxt->mode == X86EMUL_MODE_REAL ||
1748 ctxt->mode == X86EMUL_MODE_VM86) { 1707 ctxt->mode == X86EMUL_MODE_VM86)
1749 emulate_gp(ctxt, 0); 1708 return emulate_gp(ctxt, 0);
1750 return X86EMUL_PROPAGATE_FAULT;
1751 }
1752 1709
1753 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1710 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1754 1711
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1763 switch (usermode) { 1720 switch (usermode) {
1764 case X86EMUL_MODE_PROT32: 1721 case X86EMUL_MODE_PROT32:
1765 cs_sel = (u16)(msr_data + 16); 1722 cs_sel = (u16)(msr_data + 16);
1766 if ((msr_data & 0xfffc) == 0x0) { 1723 if ((msr_data & 0xfffc) == 0x0)
1767 emulate_gp(ctxt, 0); 1724 return emulate_gp(ctxt, 0);
1768 return X86EMUL_PROPAGATE_FAULT;
1769 }
1770 ss_sel = (u16)(msr_data + 24); 1725 ss_sel = (u16)(msr_data + 24);
1771 break; 1726 break;
1772 case X86EMUL_MODE_PROT64: 1727 case X86EMUL_MODE_PROT64:
1773 cs_sel = (u16)(msr_data + 32); 1728 cs_sel = (u16)(msr_data + 32);
1774 if (msr_data == 0x0) { 1729 if (msr_data == 0x0)
1775 emulate_gp(ctxt, 0); 1730 return emulate_gp(ctxt, 0);
1776 return X86EMUL_PROPAGATE_FAULT;
1777 }
1778 ss_sel = cs_sel + 8; 1731 ss_sel = cs_sel + 8;
1779 cs.d = 0; 1732 cs.d = 0;
1780 cs.l = 1; 1733 cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1934{ 1887{
1935 struct tss_segment_16 tss_seg; 1888 struct tss_segment_16 tss_seg;
1936 int ret; 1889 int ret;
1937 u32 err, new_tss_base = get_desc_base(new_desc); 1890 u32 new_tss_base = get_desc_base(new_desc);
1938 1891
1939 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1892 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1940 &err); 1893 &ctxt->exception);
1941 if (ret == X86EMUL_PROPAGATE_FAULT) { 1894 if (ret != X86EMUL_CONTINUE)
1942 /* FIXME: need to provide precise fault address */ 1895 /* FIXME: need to provide precise fault address */
1943 emulate_pf(ctxt);
1944 return ret; 1896 return ret;
1945 }
1946 1897
1947 save_state_to_tss16(ctxt, ops, &tss_seg); 1898 save_state_to_tss16(ctxt, ops, &tss_seg);
1948 1899
1949 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1900 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1950 &err); 1901 &ctxt->exception);
1951 if (ret == X86EMUL_PROPAGATE_FAULT) { 1902 if (ret != X86EMUL_CONTINUE)
1952 /* FIXME: need to provide precise fault address */ 1903 /* FIXME: need to provide precise fault address */
1953 emulate_pf(ctxt);
1954 return ret; 1904 return ret;
1955 }
1956 1905
1957 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 1906 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
1958 &err); 1907 &ctxt->exception);
1959 if (ret == X86EMUL_PROPAGATE_FAULT) { 1908 if (ret != X86EMUL_CONTINUE)
1960 /* FIXME: need to provide precise fault address */ 1909 /* FIXME: need to provide precise fault address */
1961 emulate_pf(ctxt);
1962 return ret; 1910 return ret;
1963 }
1964 1911
1965 if (old_tss_sel != 0xffff) { 1912 if (old_tss_sel != 0xffff) {
1966 tss_seg.prev_task_link = old_tss_sel; 1913 tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1968 ret = ops->write_std(new_tss_base, 1915 ret = ops->write_std(new_tss_base,
1969 &tss_seg.prev_task_link, 1916 &tss_seg.prev_task_link,
1970 sizeof tss_seg.prev_task_link, 1917 sizeof tss_seg.prev_task_link,
1971 ctxt->vcpu, &err); 1918 ctxt->vcpu, &ctxt->exception);
1972 if (ret == X86EMUL_PROPAGATE_FAULT) { 1919 if (ret != X86EMUL_CONTINUE)
1973 /* FIXME: need to provide precise fault address */ 1920 /* FIXME: need to provide precise fault address */
1974 emulate_pf(ctxt);
1975 return ret; 1921 return ret;
1976 }
1977 } 1922 }
1978 1923
1979 return load_state_from_tss16(ctxt, ops, &tss_seg); 1924 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2013 struct decode_cache *c = &ctxt->decode; 1958 struct decode_cache *c = &ctxt->decode;
2014 int ret; 1959 int ret;
2015 1960
2016 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 1961 if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
2017 emulate_gp(ctxt, 0); 1962 return emulate_gp(ctxt, 0);
2018 return X86EMUL_PROPAGATE_FAULT;
2019 }
2020 c->eip = tss->eip; 1963 c->eip = tss->eip;
2021 ctxt->eflags = tss->eflags | 2; 1964 ctxt->eflags = tss->eflags | 2;
2022 c->regs[VCPU_REGS_RAX] = tss->eax; 1965 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2076{ 2019{
2077 struct tss_segment_32 tss_seg; 2020 struct tss_segment_32 tss_seg;
2078 int ret; 2021 int ret;
2079 u32 err, new_tss_base = get_desc_base(new_desc); 2022 u32 new_tss_base = get_desc_base(new_desc);
2080 2023
2081 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2024 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2082 &err); 2025 &ctxt->exception);
2083 if (ret == X86EMUL_PROPAGATE_FAULT) { 2026 if (ret != X86EMUL_CONTINUE)
2084 /* FIXME: need to provide precise fault address */ 2027 /* FIXME: need to provide precise fault address */
2085 emulate_pf(ctxt);
2086 return ret; 2028 return ret;
2087 }
2088 2029
2089 save_state_to_tss32(ctxt, ops, &tss_seg); 2030 save_state_to_tss32(ctxt, ops, &tss_seg);
2090 2031
2091 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2032 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2092 &err); 2033 &ctxt->exception);
2093 if (ret == X86EMUL_PROPAGATE_FAULT) { 2034 if (ret != X86EMUL_CONTINUE)
2094 /* FIXME: need to provide precise fault address */ 2035 /* FIXME: need to provide precise fault address */
2095 emulate_pf(ctxt);
2096 return ret; 2036 return ret;
2097 }
2098 2037
2099 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2038 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2100 &err); 2039 &ctxt->exception);
2101 if (ret == X86EMUL_PROPAGATE_FAULT) { 2040 if (ret != X86EMUL_CONTINUE)
2102 /* FIXME: need to provide precise fault address */ 2041 /* FIXME: need to provide precise fault address */
2103 emulate_pf(ctxt);
2104 return ret; 2042 return ret;
2105 }
2106 2043
2107 if (old_tss_sel != 0xffff) { 2044 if (old_tss_sel != 0xffff) {
2108 tss_seg.prev_task_link = old_tss_sel; 2045 tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2110 ret = ops->write_std(new_tss_base, 2047 ret = ops->write_std(new_tss_base,
2111 &tss_seg.prev_task_link, 2048 &tss_seg.prev_task_link,
2112 sizeof tss_seg.prev_task_link, 2049 sizeof tss_seg.prev_task_link,
2113 ctxt->vcpu, &err); 2050 ctxt->vcpu, &ctxt->exception);
2114 if (ret == X86EMUL_PROPAGATE_FAULT) { 2051 if (ret != X86EMUL_CONTINUE)
2115 /* FIXME: need to provide precise fault address */ 2052 /* FIXME: need to provide precise fault address */
2116 emulate_pf(ctxt);
2117 return ret; 2053 return ret;
2118 }
2119 } 2054 }
2120 2055
2121 return load_state_from_tss32(ctxt, ops, &tss_seg); 2056 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2146 2081
2147 if (reason != TASK_SWITCH_IRET) { 2082 if (reason != TASK_SWITCH_IRET) {
2148 if ((tss_selector & 3) > next_tss_desc.dpl || 2083 if ((tss_selector & 3) > next_tss_desc.dpl ||
2149 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2084 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
2150 emulate_gp(ctxt, 0); 2085 return emulate_gp(ctxt, 0);
2151 return X86EMUL_PROPAGATE_FAULT;
2152 }
2153 } 2086 }
2154 2087
2155 desc_limit = desc_limit_scaled(&next_tss_desc); 2088 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2231 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2164 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2232} 2165}
2233 2166
2234static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2167static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2235 int reg, struct operand *op) 2168 int reg, struct operand *op)
2236{ 2169{
2237 struct decode_cache *c = &ctxt->decode; 2170 struct decode_cache *c = &ctxt->decode;
2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2171 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2239 2172
2240 register_address_increment(c, &c->regs[reg], df * op->bytes); 2173 register_address_increment(c, &c->regs[reg], df * op->bytes);
2241 op->addr.mem = register_address(c, base, c->regs[reg]); 2174 op->addr.mem.ea = register_address(c, c->regs[reg]);
2175 op->addr.mem.seg = seg;
2242} 2176}
2243 2177
2244static int em_push(struct x86_emulate_ctxt *ctxt) 2178static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2369 struct decode_cache *c = &ctxt->decode; 2303 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0; 2304 u64 tsc = 0;
2371 2305
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { 2306 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
2373 emulate_gp(ctxt, 0); 2307 return emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); 2308 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2309 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2310 c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2647 2579
2648 op->type = OP_IMM; 2580 op->type = OP_IMM;
2649 op->bytes = size; 2581 op->bytes = size;
2650 op->addr.mem = c->eip; 2582 op->addr.mem.ea = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */ 2583 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) { 2584 switch (op->bytes) {
2653 case 1: 2585 case 1:
@@ -2678,7 +2610,7 @@ done:
2678} 2610}
2679 2611
2680int 2612int
2681x86_decode_insn(struct x86_emulate_ctxt *ctxt) 2613x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2682{ 2614{
2683 struct x86_emulate_ops *ops = ctxt->ops; 2615 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode; 2616 struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2689 struct operand memop = { .type = OP_NONE }; 2621 struct operand memop = { .type = OP_NONE };
2690 2622
2691 c->eip = ctxt->eip; 2623 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip; 2624 c->fetch.start = c->eip;
2625 c->fetch.end = c->fetch.start + insn_len;
2626 if (insn_len > 0)
2627 memcpy(c->fetch.data, insn, insn_len);
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 2628 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694 2629
2695 switch (mode) { 2630 switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
2803 c->execute = opcode.u.execute; 2738 c->execute = opcode.u.execute;
2804 2739
2805 /* Unrecognised? */ 2740 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) { 2741 if (c->d == 0 || (c->d & Undefined))
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1; 2742 return -1;
2809 }
2810 2743
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8; 2745 c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
2831 if (!c->has_seg_override) 2764 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS); 2765 set_seg_override(c, VCPU_SREG_DS);
2833 2766
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) 2767 memop.addr.mem.seg = seg_override(ctxt, ops, c);
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836 2768
2837 if (memop.type == OP_MEM && c->ad_bytes != 8) 2769 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem; 2770 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
2839 2771
2840 if (memop.type == OP_MEM && c->rip_relative) 2772 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip; 2773 memop.addr.mem.ea += c->eip;
2842 2774
2843 /* 2775 /*
2844 * Decode and fetch the source operand: register, memory 2776 * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
2890 case SrcSI: 2822 case SrcSI:
2891 c->src.type = OP_MEM; 2823 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2824 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem = 2825 c->src.addr.mem.ea =
2894 register_address(c, seg_override_base(ctxt, ops, c), 2826 register_address(c, c->regs[VCPU_REGS_RSI]);
2895 c->regs[VCPU_REGS_RSI]); 2827 c->src.addr.mem.seg = seg_override(ctxt, ops, c),
2896 c->src.val = 0; 2828 c->src.val = 0;
2897 break; 2829 break;
2898 case SrcImmFAddr: 2830 case SrcImmFAddr:
2899 c->src.type = OP_IMM; 2831 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip; 2832 c->src.addr.mem.ea = c->eip;
2901 c->src.bytes = c->op_bytes + 2; 2833 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 2834 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break; 2835 break;
@@ -2944,7 +2876,7 @@ done_prefixes:
2944 break; 2876 break;
2945 case DstImmUByte: 2877 case DstImmUByte:
2946 c->dst.type = OP_IMM; 2878 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip; 2879 c->dst.addr.mem.ea = c->eip;
2948 c->dst.bytes = 1; 2880 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip); 2881 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break; 2882 break;
@@ -2969,9 +2901,9 @@ done_prefixes:
2969 case DstDI: 2901 case DstDI:
2970 c->dst.type = OP_MEM; 2902 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2903 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem = 2904 c->dst.addr.mem.ea =
2973 register_address(c, es_base(ctxt, ops), 2905 register_address(c, c->regs[VCPU_REGS_RDI]);
2974 c->regs[VCPU_REGS_RDI]); 2906 c->dst.addr.mem.seg = VCPU_SREG_ES;
2975 c->dst.val = 0; 2907 c->dst.val = 0;
2976 break; 2908 break;
2977 case ImplicitOps: 2909 case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3020 ctxt->decode.mem_read.pos = 0; 2952 ctxt->decode.mem_read.pos = 0;
3021 2953
3022 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2954 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
3023 emulate_ud(ctxt); 2955 rc = emulate_ud(ctxt);
3024 goto done; 2956 goto done;
3025 } 2957 }
3026 2958
3027 /* LOCK prefix is allowed only with some instructions */ 2959 /* LOCK prefix is allowed only with some instructions */
3028 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2960 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
3029 emulate_ud(ctxt); 2961 rc = emulate_ud(ctxt);
3030 goto done; 2962 goto done;
3031 } 2963 }
3032 2964
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 2965 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt); 2966 rc = emulate_ud(ctxt);
3035 goto done; 2967 goto done;
3036 } 2968 }
3037 2969
3038 /* Privileged instruction can be executed only in CPL=0 */ 2970 /* Privileged instruction can be executed only in CPL=0 */
3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2971 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
3040 emulate_gp(ctxt, 0); 2972 rc = emulate_gp(ctxt, 0);
3041 goto done; 2973 goto done;
3042 } 2974 }
3043 2975
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3050 } 2982 }
3051 2983
3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 2984 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
3053 rc = read_emulated(ctxt, ops, c->src.addr.mem, 2985 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
3054 c->src.valptr, c->src.bytes); 2986 c->src.valptr, c->src.bytes);
3055 if (rc != X86EMUL_CONTINUE) 2987 if (rc != X86EMUL_CONTINUE)
3056 goto done; 2988 goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3058 } 2990 }
3059 2991
3060 if (c->src2.type == OP_MEM) { 2992 if (c->src2.type == OP_MEM) {
3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem, 2993 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
3062 &c->src2.val, c->src2.bytes); 2994 &c->src2.val, c->src2.bytes);
3063 if (rc != X86EMUL_CONTINUE) 2995 if (rc != X86EMUL_CONTINUE)
3064 goto done; 2996 goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3070 3002
3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3003 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3072 /* optimisation - avoid slow emulated read if Mov */ 3004 /* optimisation - avoid slow emulated read if Mov */
3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem, 3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
3074 &c->dst.val, c->dst.bytes); 3006 &c->dst.val, c->dst.bytes);
3075 if (rc != X86EMUL_CONTINUE) 3007 if (rc != X86EMUL_CONTINUE)
3076 goto done; 3008 goto done;
@@ -3215,13 +3147,13 @@ special_insn:
3215 break; 3147 break;
3216 case 0x8c: /* mov r/m, sreg */ 3148 case 0x8c: /* mov r/m, sreg */
3217 if (c->modrm_reg > VCPU_SREG_GS) { 3149 if (c->modrm_reg > VCPU_SREG_GS) {
3218 emulate_ud(ctxt); 3150 rc = emulate_ud(ctxt);
3219 goto done; 3151 goto done;
3220 } 3152 }
3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3153 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
3222 break; 3154 break;
3223 case 0x8d: /* lea r16/r32, m */ 3155 case 0x8d: /* lea r16/r32, m */
3224 c->dst.val = c->src.addr.mem; 3156 c->dst.val = c->src.addr.mem.ea;
3225 break; 3157 break;
3226 case 0x8e: { /* mov seg, r/m16 */ 3158 case 0x8e: { /* mov seg, r/m16 */
3227 uint16_t sel; 3159 uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
3230 3162
3231 if (c->modrm_reg == VCPU_SREG_CS || 3163 if (c->modrm_reg == VCPU_SREG_CS ||
3232 c->modrm_reg > VCPU_SREG_GS) { 3164 c->modrm_reg > VCPU_SREG_GS) {
3233 emulate_ud(ctxt); 3165 rc = emulate_ud(ctxt);
3234 goto done; 3166 goto done;
3235 } 3167 }
3236 3168
@@ -3268,7 +3200,6 @@ special_insn:
3268 break; 3200 break;
3269 case 0xa6 ... 0xa7: /* cmps */ 3201 case 0xa6 ... 0xa7: /* cmps */
3270 c->dst.type = OP_NONE; /* Disable writeback. */ 3202 c->dst.type = OP_NONE; /* Disable writeback. */
3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
3272 goto cmp; 3203 goto cmp;
3273 case 0xa8 ... 0xa9: /* test ax, imm */ 3204 case 0xa8 ... 0xa9: /* test ax, imm */
3274 goto test; 3205 goto test;
@@ -3363,7 +3294,7 @@ special_insn:
3363 do_io_in: 3294 do_io_in:
3364 c->dst.bytes = min(c->dst.bytes, 4u); 3295 c->dst.bytes = min(c->dst.bytes, 4u);
3365 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3296 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3366 emulate_gp(ctxt, 0); 3297 rc = emulate_gp(ctxt, 0);
3367 goto done; 3298 goto done;
3368 } 3299 }
3369 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3300 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
3377 c->src.bytes = min(c->src.bytes, 4u); 3308 c->src.bytes = min(c->src.bytes, 4u);
3378 if (!emulator_io_permited(ctxt, ops, c->dst.val, 3309 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) { 3310 c->src.bytes)) {
3380 emulate_gp(ctxt, 0); 3311 rc = emulate_gp(ctxt, 0);
3381 goto done; 3312 goto done;
3382 } 3313 }
3383 ops->pio_out_emulated(c->src.bytes, c->dst.val, 3314 ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
3402 break; 3333 break;
3403 case 0xfa: /* cli */ 3334 case 0xfa: /* cli */
3404 if (emulator_bad_iopl(ctxt, ops)) { 3335 if (emulator_bad_iopl(ctxt, ops)) {
3405 emulate_gp(ctxt, 0); 3336 rc = emulate_gp(ctxt, 0);
3406 goto done; 3337 goto done;
3407 } else 3338 } else
3408 ctxt->eflags &= ~X86_EFLAGS_IF; 3339 ctxt->eflags &= ~X86_EFLAGS_IF;
3409 break; 3340 break;
3410 case 0xfb: /* sti */ 3341 case 0xfb: /* sti */
3411 if (emulator_bad_iopl(ctxt, ops)) { 3342 if (emulator_bad_iopl(ctxt, ops)) {
3412 emulate_gp(ctxt, 0); 3343 rc = emulate_gp(ctxt, 0);
3413 goto done; 3344 goto done;
3414 } else { 3345 } else {
3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3346 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
3449 c->dst.type = saved_dst_type; 3380 c->dst.type = saved_dst_type;
3450 3381
3451 if ((c->d & SrcMask) == SrcSI) 3382 if ((c->d & SrcMask) == SrcSI)
3452 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 3383 string_addr_inc(ctxt, seg_override(ctxt, ops, c),
3453 VCPU_REGS_RSI, &c->src); 3384 VCPU_REGS_RSI, &c->src);
3454 3385
3455 if ((c->d & DstMask) == DstDI) 3386 if ((c->d & DstMask) == DstDI)
3456 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 3387 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3457 &c->dst); 3388 &c->dst);
3458 3389
3459 if (c->rep_prefix && (c->d & String)) { 3390 if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
3482 ctxt->eip = c->eip; 3413 ctxt->eip = c->eip;
3483 3414
3484done: 3415done:
3416 if (rc == X86EMUL_PROPAGATE_FAULT)
3417 ctxt->have_exception = true;
3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3418 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3486 3419
3487twobyte_insn: 3420twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
3544 break; 3477 break;
3545 case 5: /* not defined */ 3478 case 5: /* not defined */
3546 emulate_ud(ctxt); 3479 emulate_ud(ctxt);
3480 rc = X86EMUL_PROPAGATE_FAULT;
3547 goto done; 3481 goto done;
3548 case 7: /* invlpg*/ 3482 case 7: /* invlpg*/
3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem); 3483 emulate_invlpg(ctxt->vcpu,
3484 linear(ctxt, c->src.addr.mem));
3550 /* Disable writeback. */ 3485 /* Disable writeback. */
3551 c->dst.type = OP_NONE; 3486 c->dst.type = OP_NONE;
3552 break; 3487 break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
3573 case 5 ... 7: 3508 case 5 ... 7:
3574 case 9 ... 15: 3509 case 9 ... 15:
3575 emulate_ud(ctxt); 3510 emulate_ud(ctxt);
3511 rc = X86EMUL_PROPAGATE_FAULT;
3576 goto done; 3512 goto done;
3577 } 3513 }
3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3514 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3517 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3582 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3518 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3583 emulate_ud(ctxt); 3519 emulate_ud(ctxt);
3520 rc = X86EMUL_PROPAGATE_FAULT;
3584 goto done; 3521 goto done;
3585 } 3522 }
3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); 3523 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
3588 case 0x22: /* mov reg, cr */ 3525 case 0x22: /* mov reg, cr */
3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 3526 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3590 emulate_gp(ctxt, 0); 3527 emulate_gp(ctxt, 0);
3528 rc = X86EMUL_PROPAGATE_FAULT;
3591 goto done; 3529 goto done;
3592 } 3530 }
3593 c->dst.type = OP_NONE; 3531 c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
3596 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3534 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3597 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3535 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3598 emulate_ud(ctxt); 3536 emulate_ud(ctxt);
3537 rc = X86EMUL_PROPAGATE_FAULT;
3599 goto done; 3538 goto done;
3600 } 3539 }
3601 3540
@@ -3604,6 +3543,7 @@ twobyte_insn:
3604 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3543 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3605 /* #UD condition is already handled by the code above */ 3544 /* #UD condition is already handled by the code above */
3606 emulate_gp(ctxt, 0); 3545 emulate_gp(ctxt, 0);
3546 rc = X86EMUL_PROPAGATE_FAULT;
3607 goto done; 3547 goto done;
3608 } 3548 }
3609 3549
@@ -3615,6 +3555,7 @@ twobyte_insn:
3615 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3555 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3616 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3556 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3617 emulate_gp(ctxt, 0); 3557 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT;
3618 goto done; 3559 goto done;
3619 } 3560 }
3620 rc = X86EMUL_CONTINUE; 3561 rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
3623 /* rdmsr */ 3564 /* rdmsr */
3624 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3565 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3625 emulate_gp(ctxt, 0); 3566 emulate_gp(ctxt, 0);
3567 rc = X86EMUL_PROPAGATE_FAULT;
3626 goto done; 3568 goto done;
3627 } else { 3569 } else {
3628 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3570 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
3785 goto writeback; 3727 goto writeback;
3786 3728
3787cannot_emulate: 3729cannot_emulate:
3788 DPRINTF("Cannot emulate %02x\n", c->b);
3789 return -1; 3730 return -1;
3790} 3731}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
73 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
74} 74}
75 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
76static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
77{ 84{
78 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
84 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
85} 92}
86 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
87#endif 109#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
277 277
278 if (old_ppr != ppr) { 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr); 279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 } 282 }
282} 283}
283 284
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..9cafbb499813 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
194 196
195static u64 __read_mostly shadow_trap_nonpresent_pte; 197static u64 __read_mostly shadow_trap_nonpresent_pte;
196static u64 __read_mostly shadow_notrap_nonpresent_pte; 198static u64 __read_mostly shadow_notrap_nonpresent_pte;
197static u64 __read_mostly shadow_base_present_pte;
198static u64 __read_mostly shadow_nx_mask; 199static u64 __read_mostly shadow_nx_mask;
199static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 200static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
200static u64 __read_mostly shadow_user_mask; 201static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
213} 214}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 215EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
215 216
216void kvm_mmu_set_base_ptes(u64 base_pte)
217{
218 shadow_base_present_pte = base_pte;
219}
220EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
221
222void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 217void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223 u64 dirty_mask, u64 nx_mask, u64 x_mask) 218 u64 dirty_mask, u64 nx_mask, u64 x_mask)
224{ 219{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
482} 477}
483 478
484/* 479/*
485 * Return the pointer to the largepage write count for a given 480 * Return the pointer to the large page information for a given gfn,
486 * gfn, handling slots that are not large page aligned. 481 * handling slots that are not large page aligned.
487 */ 482 */
488static int *slot_largepage_idx(gfn_t gfn, 483static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
489 struct kvm_memory_slot *slot, 484 struct kvm_memory_slot *slot,
490 int level) 485 int level)
491{ 486{
492 unsigned long idx; 487 unsigned long idx;
493 488
494 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 489 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
495 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 490 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
496 return &slot->lpage_info[level - 2][idx].write_count; 491 return &slot->lpage_info[level - 2][idx];
497} 492}
498 493
499static void account_shadowed(struct kvm *kvm, gfn_t gfn) 494static void account_shadowed(struct kvm *kvm, gfn_t gfn)
500{ 495{
501 struct kvm_memory_slot *slot; 496 struct kvm_memory_slot *slot;
502 int *write_count; 497 struct kvm_lpage_info *linfo;
503 int i; 498 int i;
504 499
505 slot = gfn_to_memslot(kvm, gfn); 500 slot = gfn_to_memslot(kvm, gfn);
506 for (i = PT_DIRECTORY_LEVEL; 501 for (i = PT_DIRECTORY_LEVEL;
507 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 502 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
508 write_count = slot_largepage_idx(gfn, slot, i); 503 linfo = lpage_info_slot(gfn, slot, i);
509 *write_count += 1; 504 linfo->write_count += 1;
510 } 505 }
511} 506}
512 507
513static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 508static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
514{ 509{
515 struct kvm_memory_slot *slot; 510 struct kvm_memory_slot *slot;
516 int *write_count; 511 struct kvm_lpage_info *linfo;
517 int i; 512 int i;
518 513
519 slot = gfn_to_memslot(kvm, gfn); 514 slot = gfn_to_memslot(kvm, gfn);
520 for (i = PT_DIRECTORY_LEVEL; 515 for (i = PT_DIRECTORY_LEVEL;
521 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 516 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
522 write_count = slot_largepage_idx(gfn, slot, i); 517 linfo = lpage_info_slot(gfn, slot, i);
523 *write_count -= 1; 518 linfo->write_count -= 1;
524 WARN_ON(*write_count < 0); 519 WARN_ON(linfo->write_count < 0);
525 } 520 }
526} 521}
527 522
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
530 int level) 525 int level)
531{ 526{
532 struct kvm_memory_slot *slot; 527 struct kvm_memory_slot *slot;
533 int *largepage_idx; 528 struct kvm_lpage_info *linfo;
534 529
535 slot = gfn_to_memslot(kvm, gfn); 530 slot = gfn_to_memslot(kvm, gfn);
536 if (slot) { 531 if (slot) {
537 largepage_idx = slot_largepage_idx(gfn, slot, level); 532 linfo = lpage_info_slot(gfn, slot, level);
538 return *largepage_idx; 533 return linfo->write_count;
539 } 534 }
540 535
541 return 1; 536 return 1;
@@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
590static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 585static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
591{ 586{
592 struct kvm_memory_slot *slot; 587 struct kvm_memory_slot *slot;
593 unsigned long idx; 588 struct kvm_lpage_info *linfo;
594 589
595 slot = gfn_to_memslot(kvm, gfn); 590 slot = gfn_to_memslot(kvm, gfn);
596 if (likely(level == PT_PAGE_TABLE_LEVEL)) 591 if (likely(level == PT_PAGE_TABLE_LEVEL))
597 return &slot->rmap[gfn - slot->base_gfn]; 592 return &slot->rmap[gfn - slot->base_gfn];
598 593
599 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 594 linfo = lpage_info_slot(gfn, slot, level);
600 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
601 595
602 return &slot->lpage_info[level - 2][idx].rmap_pde; 596 return &linfo->rmap_pde;
603} 597}
604 598
605/* 599/*
@@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
887 end = start + (memslot->npages << PAGE_SHIFT); 881 end = start + (memslot->npages << PAGE_SHIFT);
888 if (hva >= start && hva < end) { 882 if (hva >= start && hva < end) {
889 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 883 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
884 gfn_t gfn = memslot->base_gfn + gfn_offset;
890 885
891 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 886 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
892 887
893 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 888 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
894 unsigned long idx; 889 struct kvm_lpage_info *linfo;
895 int sh; 890
896 891 linfo = lpage_info_slot(gfn, memslot,
897 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 892 PT_DIRECTORY_LEVEL + j);
898 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 893 ret |= handler(kvm, &linfo->rmap_pde, data);
899 (memslot->base_gfn >> sh);
900 ret |= handler(kvm,
901 &memslot->lpage_info[j][idx].rmap_pde,
902 data);
903 } 894 }
904 trace_kvm_age_page(hva, memslot, ret); 895 trace_kvm_age_page(hva, memslot, ret);
905 retval |= ret; 896 retval |= ret;
@@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1161} 1152}
1162 1153
1163static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1154static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1164 struct kvm_mmu_page *sp, bool clear_unsync) 1155 struct kvm_mmu_page *sp)
1165{ 1156{
1166 return 1; 1157 return 1;
1167} 1158}
@@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1291 if (clear_unsync) 1282 if (clear_unsync)
1292 kvm_unlink_unsync_page(vcpu->kvm, sp); 1283 kvm_unlink_unsync_page(vcpu->kvm, sp);
1293 1284
1294 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1285 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1295 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1286 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1296 return 1; 1287 return 1;
1297 } 1288 }
@@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1332 continue; 1323 continue;
1333 1324
1334 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1325 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1326 kvm_unlink_unsync_page(vcpu->kvm, s);
1335 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1327 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1336 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1328 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1337 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1329 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1338 continue; 1330 continue;
1339 } 1331 }
1340 kvm_unlink_unsync_page(vcpu->kvm, s);
1341 flush = true; 1332 flush = true;
1342 } 1333 }
1343 1334
@@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1963 unsigned pte_access, int user_fault, 1954 unsigned pte_access, int user_fault,
1964 int write_fault, int dirty, int level, 1955 int write_fault, int dirty, int level,
1965 gfn_t gfn, pfn_t pfn, bool speculative, 1956 gfn_t gfn, pfn_t pfn, bool speculative,
1966 bool can_unsync, bool reset_host_protection) 1957 bool can_unsync, bool host_writable)
1967{ 1958{
1968 u64 spte; 1959 u64 spte, entry = *sptep;
1969 int ret = 0; 1960 int ret = 0;
1970 1961
1971 /* 1962 /*
@@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 * whether the guest actually used the pte (in order to detect 1964 * whether the guest actually used the pte (in order to detect
1974 * demand paging). 1965 * demand paging).
1975 */ 1966 */
1976 spte = shadow_base_present_pte; 1967 spte = PT_PRESENT_MASK;
1977 if (!speculative) 1968 if (!speculative)
1978 spte |= shadow_accessed_mask; 1969 spte |= shadow_accessed_mask;
1979 if (!dirty) 1970 if (!dirty)
@@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1990 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1981 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1991 kvm_is_mmio_pfn(pfn)); 1982 kvm_is_mmio_pfn(pfn));
1992 1983
1993 if (reset_host_protection) 1984 if (host_writable)
1994 spte |= SPTE_HOST_WRITEABLE; 1985 spte |= SPTE_HOST_WRITEABLE;
1986 else
1987 pte_access &= ~ACC_WRITE_MASK;
1995 1988
1996 spte |= (u64)pfn << PAGE_SHIFT; 1989 spte |= (u64)pfn << PAGE_SHIFT;
1997 1990
@@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2036 2029
2037set_pte: 2030set_pte:
2038 update_spte(sptep, spte); 2031 update_spte(sptep, spte);
2032 /*
2033 * If we overwrite a writable spte with a read-only one we
2034 * should flush remote TLBs. Otherwise rmap_write_protect
2035 * will find a read-only spte, even though the writable spte
2036 * might be cached on a CPU's TLB.
2037 */
2038 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2039 kvm_flush_remote_tlbs(vcpu->kvm);
2039done: 2040done:
2040 return ret; 2041 return ret;
2041} 2042}
@@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 int user_fault, int write_fault, int dirty, 2046 int user_fault, int write_fault, int dirty,
2046 int *ptwrite, int level, gfn_t gfn, 2047 int *ptwrite, int level, gfn_t gfn,
2047 pfn_t pfn, bool speculative, 2048 pfn_t pfn, bool speculative,
2048 bool reset_host_protection) 2049 bool host_writable)
2049{ 2050{
2050 int was_rmapped = 0; 2051 int was_rmapped = 0;
2051 int rmap_count; 2052 int rmap_count;
@@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2080 2081
2081 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2082 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2082 dirty, level, gfn, pfn, speculative, true, 2083 dirty, level, gfn, pfn, speculative, true,
2083 reset_host_protection)) { 2084 host_writable)) {
2084 if (write_fault) 2085 if (write_fault)
2085 *ptwrite = 1; 2086 *ptwrite = 1;
2086 kvm_mmu_flush_tlb(vcpu); 2087 kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2211} 2212}
2212 2213
2213static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2214static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2214 int level, gfn_t gfn, pfn_t pfn) 2215 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2216 bool prefault)
2215{ 2217{
2216 struct kvm_shadow_walk_iterator iterator; 2218 struct kvm_shadow_walk_iterator iterator;
2217 struct kvm_mmu_page *sp; 2219 struct kvm_mmu_page *sp;
@@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2220 2222
2221 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2223 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2222 if (iterator.level == level) { 2224 if (iterator.level == level) {
2223 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2225 unsigned pte_access = ACC_ALL;
2226
2227 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2224 0, write, 1, &pt_write, 2228 0, write, 1, &pt_write,
2225 level, gfn, pfn, false, true); 2229 level, gfn, pfn, prefault, map_writable);
2226 direct_pte_prefetch(vcpu, iterator.sptep); 2230 direct_pte_prefetch(vcpu, iterator.sptep);
2227 ++vcpu->stat.pf_fixed; 2231 ++vcpu->stat.pf_fixed;
2228 break; 2232 break;
@@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2277 return 1; 2281 return 1;
2278} 2282}
2279 2283
2280static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2284static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2285 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2286
2287static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2288 bool prefault)
2281{ 2289{
2282 int r; 2290 int r;
2283 int level; 2291 int level;
2284 pfn_t pfn; 2292 pfn_t pfn;
2285 unsigned long mmu_seq; 2293 unsigned long mmu_seq;
2294 bool map_writable;
2286 2295
2287 level = mapping_level(vcpu, gfn); 2296 level = mapping_level(vcpu, gfn);
2288 2297
@@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2297 2306
2298 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2307 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2299 smp_rmb(); 2308 smp_rmb();
2300 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2309
2310 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2311 return 0;
2301 2312
2302 /* mmio */ 2313 /* mmio */
2303 if (is_error_pfn(pfn)) 2314 if (is_error_pfn(pfn))
@@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2307 if (mmu_notifier_retry(vcpu, mmu_seq)) 2318 if (mmu_notifier_retry(vcpu, mmu_seq))
2308 goto out_unlock; 2319 goto out_unlock;
2309 kvm_mmu_free_some_pages(vcpu); 2320 kvm_mmu_free_some_pages(vcpu);
2310 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2321 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2322 prefault);
2311 spin_unlock(&vcpu->kvm->mmu_lock); 2323 spin_unlock(&vcpu->kvm->mmu_lock);
2312 2324
2313 2325
@@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2530 hpa_t root = vcpu->arch.mmu.root_hpa; 2542 hpa_t root = vcpu->arch.mmu.root_hpa;
2531 sp = page_header(root); 2543 sp = page_header(root);
2532 mmu_sync_children(vcpu, sp); 2544 mmu_sync_children(vcpu, sp);
2545 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2533 return; 2546 return;
2534 } 2547 }
2535 for (i = 0; i < 4; ++i) { 2548 for (i = 0; i < 4; ++i) {
@@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2552} 2565}
2553 2566
2554static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2567static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2555 u32 access, u32 *error) 2568 u32 access, struct x86_exception *exception)
2556{ 2569{
2557 if (error) 2570 if (exception)
2558 *error = 0; 2571 exception->error_code = 0;
2559 return vaddr; 2572 return vaddr;
2560} 2573}
2561 2574
2562static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 2575static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2563 u32 access, u32 *error) 2576 u32 access,
2577 struct x86_exception *exception)
2564{ 2578{
2565 if (error) 2579 if (exception)
2566 *error = 0; 2580 exception->error_code = 0;
2567 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2581 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2568} 2582}
2569 2583
2570static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2584static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2571 u32 error_code) 2585 u32 error_code, bool prefault)
2572{ 2586{
2573 gfn_t gfn; 2587 gfn_t gfn;
2574 int r; 2588 int r;
@@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2584 gfn = gva >> PAGE_SHIFT; 2598 gfn = gva >> PAGE_SHIFT;
2585 2599
2586 return nonpaging_map(vcpu, gva & PAGE_MASK, 2600 return nonpaging_map(vcpu, gva & PAGE_MASK,
2587 error_code & PFERR_WRITE_MASK, gfn); 2601 error_code & PFERR_WRITE_MASK, gfn, prefault);
2602}
2603
2604static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2605{
2606 struct kvm_arch_async_pf arch;
2607
2608 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2609 arch.gfn = gfn;
2610 arch.direct_map = vcpu->arch.mmu.direct_map;
2611 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2612
2613 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2588} 2614}
2589 2615
2590static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2616static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2591 u32 error_code) 2617{
2618 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2619 kvm_event_needs_reinjection(vcpu)))
2620 return false;
2621
2622 return kvm_x86_ops->interrupt_allowed(vcpu);
2623}
2624
2625static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2626 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2627{
2628 bool async;
2629
2630 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2631
2632 if (!async)
2633 return false; /* *pfn has correct page already */
2634
2635 put_page(pfn_to_page(*pfn));
2636
2637 if (!prefault && can_do_async_pf(vcpu)) {
2638 trace_kvm_try_async_get_page(gva, gfn);
2639 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2640 trace_kvm_async_pf_doublefault(gva, gfn);
2641 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2642 return true;
2643 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2644 return true;
2645 }
2646
2647 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2648
2649 return false;
2650}
2651
2652static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2653 bool prefault)
2592{ 2654{
2593 pfn_t pfn; 2655 pfn_t pfn;
2594 int r; 2656 int r;
2595 int level; 2657 int level;
2596 gfn_t gfn = gpa >> PAGE_SHIFT; 2658 gfn_t gfn = gpa >> PAGE_SHIFT;
2597 unsigned long mmu_seq; 2659 unsigned long mmu_seq;
2660 int write = error_code & PFERR_WRITE_MASK;
2661 bool map_writable;
2598 2662
2599 ASSERT(vcpu); 2663 ASSERT(vcpu);
2600 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2664 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2609 2673
2610 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2674 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2611 smp_rmb(); 2675 smp_rmb();
2612 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2676
2677 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2678 return 0;
2679
2680 /* mmio */
2613 if (is_error_pfn(pfn)) 2681 if (is_error_pfn(pfn))
2614 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2682 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2615 spin_lock(&vcpu->kvm->mmu_lock); 2683 spin_lock(&vcpu->kvm->mmu_lock);
2616 if (mmu_notifier_retry(vcpu, mmu_seq)) 2684 if (mmu_notifier_retry(vcpu, mmu_seq))
2617 goto out_unlock; 2685 goto out_unlock;
2618 kvm_mmu_free_some_pages(vcpu); 2686 kvm_mmu_free_some_pages(vcpu);
2619 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2687 r = __direct_map(vcpu, gpa, write, map_writable,
2620 level, gfn, pfn); 2688 level, gfn, pfn, prefault);
2621 spin_unlock(&vcpu->kvm->mmu_lock); 2689 spin_unlock(&vcpu->kvm->mmu_lock);
2622 2690
2623 return r; 2691 return r;
@@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2659 2727
2660static void paging_new_cr3(struct kvm_vcpu *vcpu) 2728static void paging_new_cr3(struct kvm_vcpu *vcpu)
2661{ 2729{
2662 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2730 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2663 mmu_free_roots(vcpu); 2731 mmu_free_roots(vcpu);
2664} 2732}
2665 2733
2666static unsigned long get_cr3(struct kvm_vcpu *vcpu) 2734static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2667{ 2735{
2668 return vcpu->arch.cr3; 2736 return kvm_read_cr3(vcpu);
2669} 2737}
2670 2738
2671static void inject_page_fault(struct kvm_vcpu *vcpu) 2739static void inject_page_fault(struct kvm_vcpu *vcpu,
2740 struct x86_exception *fault)
2672{ 2741{
2673 vcpu->arch.mmu.inject_page_fault(vcpu); 2742 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2674} 2743}
2675 2744
2676static void paging_free(struct kvm_vcpu *vcpu) 2745static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2816{ 2885{
2817 struct kvm_mmu *context = vcpu->arch.walk_mmu; 2886 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2818 2887
2888 context->base_role.word = 0;
2819 context->new_cr3 = nonpaging_new_cr3; 2889 context->new_cr3 = nonpaging_new_cr3;
2820 context->page_fault = tdp_page_fault; 2890 context->page_fault = tdp_page_fault;
2821 context->free = nonpaging_free; 2891 context->free = nonpaging_free;
@@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3008 return; 3078 return;
3009 } 3079 }
3010 3080
3011 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3012 return;
3013
3014 ++vcpu->kvm->stat.mmu_pte_updated; 3081 ++vcpu->kvm->stat.mmu_pte_updated;
3015 if (!sp->role.cr4_pae) 3082 if (!sp->role.cr4_pae)
3016 paging32_update_pte(vcpu, sp, spte, new); 3083 paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3264 } 3331 }
3265} 3332}
3266 3333
3267int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3334int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3335 void *insn, int insn_len)
3268{ 3336{
3269 int r; 3337 int r;
3270 enum emulation_result er; 3338 enum emulation_result er;
3271 3339
3272 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3340 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3273 if (r < 0) 3341 if (r < 0)
3274 goto out; 3342 goto out;
3275 3343
@@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3282 if (r) 3350 if (r)
3283 goto out; 3351 goto out;
3284 3352
3285 er = emulate_instruction(vcpu, cr2, error_code, 0); 3353 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3286 3354
3287 switch (er) { 3355 switch (er) {
3288 case EMULATE_DONE: 3356 case EMULATE_DONE:
@@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3377 if (!test_bit(slot, sp->slot_bitmap)) 3445 if (!test_bit(slot, sp->slot_bitmap))
3378 continue; 3446 continue;
3379 3447
3448 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3449 continue;
3450
3380 pt = sp->spt; 3451 pt = sp->spt;
3381 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3452 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3382 /* avoid RMW */ 3453 /* avoid RMW */
3383 if (is_writable_pte(pt[i])) 3454 if (is_writable_pte(pt[i]))
3384 pt[i] &= ~PT_WRITABLE_MASK; 3455 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3385 } 3456 }
3386 kvm_flush_remote_tlbs(kvm); 3457 kvm_flush_remote_tlbs(kvm);
3387} 3458}
@@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void)
3463 kmem_cache_destroy(mmu_page_header_cache); 3534 kmem_cache_destroy(mmu_page_header_cache);
3464} 3535}
3465 3536
3466void kvm_mmu_module_exit(void)
3467{
3468 mmu_destroy_caches();
3469 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3470 unregister_shrinker(&mmu_shrinker);
3471}
3472
3473int kvm_mmu_module_init(void) 3537int kvm_mmu_module_init(void)
3474{ 3538{
3475 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3539 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3566 3630
3567static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3631static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3568{ 3632{
3569 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3633 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3570 return 1; 3634 return 1;
3571} 3635}
3572 3636
@@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3662} 3726}
3663EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3727EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3664 3728
3665#ifdef CONFIG_KVM_MMU_AUDIT
3666#include "mmu_audit.c"
3667#else
3668static void mmu_audit_disable(void) { }
3669#endif
3670
3671void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3729void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3672{ 3730{
3673 ASSERT(vcpu); 3731 ASSERT(vcpu);
@@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3675 destroy_kvm_mmu(vcpu); 3733 destroy_kvm_mmu(vcpu);
3676 free_mmu_pages(vcpu); 3734 free_mmu_pages(vcpu);
3677 mmu_free_memory_caches(vcpu); 3735 mmu_free_memory_caches(vcpu);
3736}
3737
3738#ifdef CONFIG_KVM_MMU_AUDIT
3739#include "mmu_audit.c"
3740#else
3741static void mmu_audit_disable(void) { }
3742#endif
3743
3744void kvm_mmu_module_exit(void)
3745{
3746 mmu_destroy_caches();
3747 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3748 unregister_shrinker(&mmu_shrinker);
3678 mmu_audit_disable(); 3749 mmu_audit_disable();
3679} 3750}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22static int audit_point; 22#define audit_printk(kvm, fmt, args...) \
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \ 23 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args) 24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
27 25
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); 26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29 27
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97 95
98 if (sp->unsync) { 96 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) { 97 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level); 98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
101 return; 100 return;
102 } 101 }
103 102
104 if (*sptep == shadow_notrap_nonpresent_pte) { 103 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp); 104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return; 106 return;
107 } 107 }
108 } 108 }
109 109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { 110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp); 111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
112 return; 113 return;
113 } 114 }
114 115
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
125 126
126 hpa = pfn << PAGE_SHIFT; 127 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn", 129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep); 130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
130} 132}
131 133
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
142 if (!gfn_to_memslot(kvm, gfn)) { 144 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit()) 145 if (!printk_ratelimit())
144 return; 146 return;
145 audit_printk("no memslot for gfn %llx\n", gfn); 147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n", 148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn); 149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack(); 150 dump_stack();
149 return; 151 return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
153 if (!*rmapp) { 155 if (!*rmapp) {
154 if (!printk_ratelimit()) 156 if (!printk_ratelimit())
155 return; 157 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep); 158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
157 dump_stack(); 160 dump_stack();
158 } 161 }
159} 162}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{ 171{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170 173
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync) 174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp); 175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
173} 177}
174 178
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) 179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
202 spte = rmap_next(kvm, rmapp, NULL); 206 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) { 207 while (spte) {
204 if (is_writable_pte(*spte)) 208 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn " 209 audit_printk(kvm, "shadow page has writable "
206 "%llx role %x\n", sp->gfn, sp->role.word); 210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte); 212 spte = rmap_next(kvm, rmapp, spte);
208 } 213 }
209} 214}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
238 if (!__ratelimit(&ratelimit_state)) 243 if (!__ratelimit(&ratelimit_state))
239 return; 244 return;
240 245
241 audit_point = point; 246 vcpu->kvm->arch.audit_point = point;
242 audit_all_active_sps(vcpu->kvm); 247 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
244} 249}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..53210f1e94c2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
72 unsigned pt_access; 72 unsigned pt_access;
73 unsigned pte_access; 73 unsigned pte_access;
74 gfn_t gfn; 74 gfn_t gfn;
75 u32 error_code; 75 struct x86_exception fault;
76}; 76};
77 77
78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
266 return 1; 266 return 1;
267 267
268error: 268error:
269 walker->error_code = 0; 269 walker->fault.vector = PF_VECTOR;
270 walker->fault.error_code_valid = true;
271 walker->fault.error_code = 0;
270 if (present) 272 if (present)
271 walker->error_code |= PFERR_PRESENT_MASK; 273 walker->fault.error_code |= PFERR_PRESENT_MASK;
272 274
273 walker->error_code |= write_fault | user_fault; 275 walker->fault.error_code |= write_fault | user_fault;
274 276
275 if (fetch_fault && mmu->nx) 277 if (fetch_fault && mmu->nx)
276 walker->error_code |= PFERR_FETCH_MASK; 278 walker->fault.error_code |= PFERR_FETCH_MASK;
277 if (rsvd_fault) 279 if (rsvd_fault)
278 walker->error_code |= PFERR_RSVD_MASK; 280 walker->fault.error_code |= PFERR_RSVD_MASK;
279 281
280 vcpu->arch.fault.address = addr; 282 walker->fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code; 283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
282 284
283 trace_kvm_mmu_walker_error(walker->error_code); 285 trace_kvm_mmu_walker_error(walker->fault.error_code);
284 return 0; 286 return 0;
285} 287}
286 288
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
299 addr, access); 301 addr, access);
300} 302}
301 303
304static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
305 struct kvm_mmu_page *sp, u64 *spte,
306 pt_element_t gpte)
307{
308 u64 nonpresent = shadow_trap_nonpresent_pte;
309
310 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
311 goto no_present;
312
313 if (!is_present_gpte(gpte)) {
314 if (!sp->unsync)
315 nonpresent = shadow_notrap_nonpresent_pte;
316 goto no_present;
317 }
318
319 if (!(gpte & PT_ACCESSED_MASK))
320 goto no_present;
321
322 return false;
323
324no_present:
325 drop_spte(vcpu->kvm, spte, nonpresent);
326 return true;
327}
328
302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
303 u64 *spte, const void *pte) 330 u64 *spte, const void *pte)
304{ 331{
305 pt_element_t gpte; 332 pt_element_t gpte;
306 unsigned pte_access; 333 unsigned pte_access;
307 pfn_t pfn; 334 pfn_t pfn;
308 u64 new_spte;
309 335
310 gpte = *(const pt_element_t *)pte; 336 gpte = *(const pt_element_t *)pte;
311 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 337 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
312 if (!is_present_gpte(gpte)) {
313 if (sp->unsync)
314 new_spte = shadow_trap_nonpresent_pte;
315 else
316 new_spte = shadow_notrap_nonpresent_pte;
317 __set_spte(spte, new_spte);
318 }
319 return; 338 return;
320 } 339
321 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
322 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
323 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 return; 348 return;
330 kvm_get_pfn(pfn); 349 kvm_get_pfn(pfn);
331 /* 350 /*
332 * we call mmu_set_spte() with reset_host_protection = true beacuse that 351 * we call mmu_set_spte() with host_writable = true beacuse that
333 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
334 */ 353 */
335 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep) 383 u64 *sptep)
365{ 384{
366 struct kvm_mmu_page *sp; 385 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes; 386 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte; 387 u64 *spte;
370 int i; 388 int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 413
396 gpte = gptep[i]; 414 gpte = gptep[i];
397 415
398 if (!is_present_gpte(gpte) || 416 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue; 417 continue;
407 418
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 419 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
427static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 438static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
428 struct guest_walker *gw, 439 struct guest_walker *gw,
429 int user_fault, int write_fault, int hlevel, 440 int user_fault, int write_fault, int hlevel,
430 int *ptwrite, pfn_t pfn) 441 int *ptwrite, pfn_t pfn, bool map_writable,
442 bool prefault)
431{ 443{
432 unsigned access = gw->pt_access; 444 unsigned access = gw->pt_access;
433 struct kvm_mmu_page *sp = NULL; 445 struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
501 513
502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 514 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
503 user_fault, write_fault, dirty, ptwrite, it.level, 515 user_fault, write_fault, dirty, ptwrite, it.level,
504 gw->gfn, pfn, false, true); 516 gw->gfn, pfn, prefault, map_writable);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 517 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
506 518
507 return it.sptep; 519 return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
527 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 539 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
528 * a negative value on error. 540 * a negative value on error.
529 */ 541 */
530static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 542static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
531 u32 error_code) 543 bool prefault)
532{ 544{
533 int write_fault = error_code & PFERR_WRITE_MASK; 545 int write_fault = error_code & PFERR_WRITE_MASK;
534 int user_fault = error_code & PFERR_USER_MASK; 546 int user_fault = error_code & PFERR_USER_MASK;
@@ -539,6 +551,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
539 pfn_t pfn; 551 pfn_t pfn;
540 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
541 unsigned long mmu_seq; 553 unsigned long mmu_seq;
554 bool map_writable;
542 555
543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 556 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
544 557
@@ -556,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
556 */ 569 */
557 if (!r) { 570 if (!r) {
558 pgprintk("%s: guest page fault\n", __func__); 571 pgprintk("%s: guest page fault\n", __func__);
559 inject_page_fault(vcpu); 572 if (!prefault) {
560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 573 inject_page_fault(vcpu, &walker.fault);
574 /* reset fork detector */
575 vcpu->arch.last_pt_write_count = 0;
576 }
561 return 0; 577 return 0;
562 } 578 }
563 579
@@ -568,7 +584,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
568 584
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 585 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 586 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 587
588 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
589 &map_writable))
590 return 0;
572 591
573 /* mmio */ 592 /* mmio */
574 if (is_error_pfn(pfn)) 593 if (is_error_pfn(pfn))
@@ -581,7 +600,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 600 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
582 kvm_mmu_free_some_pages(vcpu); 601 kvm_mmu_free_some_pages(vcpu);
583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 602 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
584 level, &write_pt, pfn); 603 level, &write_pt, pfn, map_writable, prefault);
585 (void)sptep; 604 (void)sptep;
586 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 605 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
587 sptep, *sptep, write_pt); 606 sptep, *sptep, write_pt);
@@ -661,7 +680,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
661} 680}
662 681
663static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 682static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
664 u32 *error) 683 struct x86_exception *exception)
665{ 684{
666 struct guest_walker walker; 685 struct guest_walker walker;
667 gpa_t gpa = UNMAPPED_GVA; 686 gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +691,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
672 if (r) { 691 if (r) {
673 gpa = gfn_to_gpa(walker.gfn); 692 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK; 693 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error) 694 } else if (exception)
676 *error = walker.error_code; 695 *exception = walker.fault;
677 696
678 return gpa; 697 return gpa;
679} 698}
680 699
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 700static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error) 701 u32 access,
702 struct x86_exception *exception)
683{ 703{
684 struct guest_walker walker; 704 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA; 705 gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +710,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
690 if (r) { 710 if (r) {
691 gpa = gfn_to_gpa(walker.gfn); 711 gpa = gfn_to_gpa(walker.gfn);
692 gpa |= vaddr & ~PAGE_MASK; 712 gpa |= vaddr & ~PAGE_MASK;
693 } else if (error) 713 } else if (exception)
694 *error = walker.error_code; 714 *exception = walker.fault;
695 715
696 return gpa; 716 return gpa;
697} 717}
@@ -730,12 +750,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
730 * Using the cached information from sp->gfns is safe because: 750 * Using the cached information from sp->gfns is safe because:
731 * - The spte has a reference to the struct page, so the pfn for a given gfn 751 * - The spte has a reference to the struct page, so the pfn for a given gfn
732 * can't change unless all sptes pointing to it are nuked first. 752 * can't change unless all sptes pointing to it are nuked first.
753 *
754 * Note:
755 * We should flush all tlbs if spte is dropped even though guest is
756 * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
757 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
758 * used by guest then tlbs are not flushed, so guest is allowed to access the
759 * freed pages.
760 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
733 */ 761 */
734static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 762static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
735 bool clear_unsync)
736{ 763{
737 int i, offset, nr_present; 764 int i, offset, nr_present;
738 bool reset_host_protection; 765 bool host_writable;
739 gpa_t first_pte_gpa; 766 gpa_t first_pte_gpa;
740 767
741 offset = nr_present = 0; 768 offset = nr_present = 0;
@@ -764,31 +791,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
764 return -EINVAL; 791 return -EINVAL;
765 792
766 gfn = gpte_to_gfn(gpte); 793 gfn = gpte_to_gfn(gpte);
767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
769 || !(gpte & PT_ACCESSED_MASK)) {
770 u64 nonpresent;
771 794
772 if (is_present_gpte(gpte) || !clear_unsync) 795 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
773 nonpresent = shadow_trap_nonpresent_pte; 796 vcpu->kvm->tlbs_dirty++;
774 else 797 continue;
775 nonpresent = shadow_notrap_nonpresent_pte; 798 }
776 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 799
800 if (gfn != sp->gfns[i]) {
801 drop_spte(vcpu->kvm, &sp->spt[i],
802 shadow_trap_nonpresent_pte);
803 vcpu->kvm->tlbs_dirty++;
777 continue; 804 continue;
778 } 805 }
779 806
780 nr_present++; 807 nr_present++;
781 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 808 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
782 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 809 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
783 pte_access &= ~ACC_WRITE_MASK; 810
784 reset_host_protection = 0;
785 } else {
786 reset_host_protection = 1;
787 }
788 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 811 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
789 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 812 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
790 spte_to_pfn(sp->spt[i]), true, false, 813 spte_to_pfn(sp->spt[i]), true, false,
791 reset_host_protection); 814 host_writable);
792 } 815 }
793 816
794 return !nr_present; 817 return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
97 unsigned long vmexit_rax; 102 unsigned long vmexit_rax;
98 103
99 /* cache for intercepts of the guest */ 104 /* cache for intercepts of the guest */
100 u16 intercept_cr_read; 105 u32 intercept_cr;
101 u16 intercept_cr_write; 106 u32 intercept_dr;
102 u16 intercept_dr_read;
103 u16 intercept_dr_write;
104 u32 intercept_exceptions; 107 u32 intercept_exceptions;
105 u64 intercept; 108 u64 intercept;
106 109
@@ -123,7 +126,12 @@ struct vcpu_svm {
123 u64 next_rip; 126 u64 next_rip;
124 127
125 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 128 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
126 u64 host_gs_base; 129 struct {
130 u16 fs;
131 u16 gs;
132 u16 ldt;
133 u64 gs_base;
134 } host;
127 135
128 u32 *msrpm; 136 u32 *msrpm;
129 137
@@ -133,6 +141,7 @@ struct vcpu_svm {
133 141
134 unsigned int3_injected; 142 unsigned int3_injected;
135 unsigned long int3_rip; 143 unsigned long int3_rip;
144 u32 apf_reason;
136}; 145};
137 146
138#define MSR_INVALID 0xffffffffU 147#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
180static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 189static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
181 bool has_error_code, u32 error_code); 190 bool has_error_code, u32 error_code);
182 191
192enum {
193 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
194 pause filter count */
195 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
196 VMCB_ASID, /* ASID */
197 VMCB_INTR, /* int_ctl, int_vector */
198 VMCB_NPT, /* npt_en, nCR3, gPAT */
199 VMCB_CR, /* CR0, CR3, CR4, EFER */
200 VMCB_DR, /* DR6, DR7 */
201 VMCB_DT, /* GDT, IDT */
202 VMCB_SEG, /* CS, DS, SS, ES, CPL */
203 VMCB_CR2, /* CR2 only */
204 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
205 VMCB_DIRTY_MAX,
206};
207
208/* TPR and CR2 are always written before VMRUN */
209#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
210
211static inline void mark_all_dirty(struct vmcb *vmcb)
212{
213 vmcb->control.clean = 0;
214}
215
216static inline void mark_all_clean(struct vmcb *vmcb)
217{
218 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
219 & ~VMCB_ALWAYS_DIRTY_MASK;
220}
221
222static inline void mark_dirty(struct vmcb *vmcb, int bit)
223{
224 vmcb->control.clean &= ~(1 << bit);
225}
226
183static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 227static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
184{ 228{
185 return container_of(vcpu, struct vcpu_svm, vcpu); 229 return container_of(vcpu, struct vcpu_svm, vcpu);
186} 230}
187 231
188static inline bool is_nested(struct vcpu_svm *svm) 232static void recalc_intercepts(struct vcpu_svm *svm)
233{
234 struct vmcb_control_area *c, *h;
235 struct nested_state *g;
236
237 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
238
239 if (!is_guest_mode(&svm->vcpu))
240 return;
241
242 c = &svm->vmcb->control;
243 h = &svm->nested.hsave->control;
244 g = &svm->nested;
245
246 c->intercept_cr = h->intercept_cr | g->intercept_cr;
247 c->intercept_dr = h->intercept_dr | g->intercept_dr;
248 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
249 c->intercept = h->intercept | g->intercept;
250}
251
252static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
253{
254 if (is_guest_mode(&svm->vcpu))
255 return svm->nested.hsave;
256 else
257 return svm->vmcb;
258}
259
260static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
261{
262 struct vmcb *vmcb = get_host_vmcb(svm);
263
264 vmcb->control.intercept_cr |= (1U << bit);
265
266 recalc_intercepts(svm);
267}
268
269static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
270{
271 struct vmcb *vmcb = get_host_vmcb(svm);
272
273 vmcb->control.intercept_cr &= ~(1U << bit);
274
275 recalc_intercepts(svm);
276}
277
278static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
279{
280 struct vmcb *vmcb = get_host_vmcb(svm);
281
282 return vmcb->control.intercept_cr & (1U << bit);
283}
284
285static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
286{
287 struct vmcb *vmcb = get_host_vmcb(svm);
288
289 vmcb->control.intercept_dr |= (1U << bit);
290
291 recalc_intercepts(svm);
292}
293
294static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
295{
296 struct vmcb *vmcb = get_host_vmcb(svm);
297
298 vmcb->control.intercept_dr &= ~(1U << bit);
299
300 recalc_intercepts(svm);
301}
302
303static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
304{
305 struct vmcb *vmcb = get_host_vmcb(svm);
306
307 vmcb->control.intercept_exceptions |= (1U << bit);
308
309 recalc_intercepts(svm);
310}
311
312static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
189{ 313{
190 return svm->nested.vmcb; 314 struct vmcb *vmcb = get_host_vmcb(svm);
315
316 vmcb->control.intercept_exceptions &= ~(1U << bit);
317
318 recalc_intercepts(svm);
319}
320
321static inline void set_intercept(struct vcpu_svm *svm, int bit)
322{
323 struct vmcb *vmcb = get_host_vmcb(svm);
324
325 vmcb->control.intercept |= (1ULL << bit);
326
327 recalc_intercepts(svm);
328}
329
330static inline void clr_intercept(struct vcpu_svm *svm, int bit)
331{
332 struct vmcb *vmcb = get_host_vmcb(svm);
333
334 vmcb->control.intercept &= ~(1ULL << bit);
335
336 recalc_intercepts(svm);
191} 337}
192 338
193static inline void enable_gif(struct vcpu_svm *svm) 339static inline void enable_gif(struct vcpu_svm *svm)
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
264 410
265#define MAX_INST_SIZE 15 411#define MAX_INST_SIZE 15
266 412
267static inline u32 svm_has(u32 feat)
268{
269 return svm_features & feat;
270}
271
272static inline void clgi(void) 413static inline void clgi(void)
273{ 414{
274 asm volatile (__ex(SVM_CLGI)); 415 asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
284 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 425 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
285} 426}
286 427
287static inline void force_new_asid(struct kvm_vcpu *vcpu)
288{
289 to_svm(vcpu)->asid_generation--;
290}
291
292static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
293{
294 force_new_asid(vcpu);
295}
296
297static int get_npt_level(void) 428static int get_npt_level(void)
298{ 429{
299#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
310 efer &= ~EFER_LME; 441 efer &= ~EFER_LME;
311 442
312 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 443 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
444 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
313} 445}
314 446
315static int is_external_interrupt(u32 info) 447static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
347 svm->next_rip = svm->vmcb->control.next_rip; 479 svm->next_rip = svm->vmcb->control.next_rip;
348 480
349 if (!svm->next_rip) { 481 if (!svm->next_rip) {
350 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 482 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
351 EMULATE_DONE) 483 EMULATE_DONE)
352 printk(KERN_DEBUG "%s: NOP\n", __func__); 484 printk(KERN_DEBUG "%s: NOP\n", __func__);
353 return; 485 return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
374 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 506 nested_svm_check_exception(svm, nr, has_error_code, error_code))
375 return; 507 return;
376 508
377 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 509 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
378 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 510 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
379 511
380 /* 512 /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
670 802
671 svm_features = cpuid_edx(SVM_CPUID_FUNC); 803 svm_features = cpuid_edx(SVM_CPUID_FUNC);
672 804
673 if (!svm_has(SVM_FEATURE_NPT)) 805 if (!boot_cpu_has(X86_FEATURE_NPT))
674 npt_enabled = false; 806 npt_enabled = false;
675 807
676 if (npt_enabled && !npt) { 808 if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
725 struct vcpu_svm *svm = to_svm(vcpu); 857 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0; 858 u64 g_tsc_offset = 0;
727 859
728 if (is_nested(svm)) { 860 if (is_guest_mode(vcpu)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset - 861 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset; 862 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset; 863 svm->nested.hsave->control.tsc_offset = offset;
732 } 864 }
733 865
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 866 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
867
868 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
735} 869}
736 870
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 871static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
739 struct vcpu_svm *svm = to_svm(vcpu); 873 struct vcpu_svm *svm = to_svm(vcpu);
740 874
741 svm->vmcb->control.tsc_offset += adjustment; 875 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm)) 876 if (is_guest_mode(vcpu))
743 svm->nested.hsave->control.tsc_offset += adjustment; 877 svm->nested.hsave->control.tsc_offset += adjustment;
878 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
744} 879}
745 880
746static void init_vmcb(struct vcpu_svm *svm) 881static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
749 struct vmcb_save_area *save = &svm->vmcb->save; 884 struct vmcb_save_area *save = &svm->vmcb->save;
750 885
751 svm->vcpu.fpu_active = 1; 886 svm->vcpu.fpu_active = 1;
887 svm->vcpu.arch.hflags = 0;
752 888
753 control->intercept_cr_read = INTERCEPT_CR0_MASK | 889 set_cr_intercept(svm, INTERCEPT_CR0_READ);
754 INTERCEPT_CR3_MASK | 890 set_cr_intercept(svm, INTERCEPT_CR3_READ);
755 INTERCEPT_CR4_MASK; 891 set_cr_intercept(svm, INTERCEPT_CR4_READ);
756 892 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
757 control->intercept_cr_write = INTERCEPT_CR0_MASK | 893 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
758 INTERCEPT_CR3_MASK | 894 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
759 INTERCEPT_CR4_MASK | 895 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
760 INTERCEPT_CR8_MASK; 896
761 897 set_dr_intercept(svm, INTERCEPT_DR0_READ);
762 control->intercept_dr_read = INTERCEPT_DR0_MASK | 898 set_dr_intercept(svm, INTERCEPT_DR1_READ);
763 INTERCEPT_DR1_MASK | 899 set_dr_intercept(svm, INTERCEPT_DR2_READ);
764 INTERCEPT_DR2_MASK | 900 set_dr_intercept(svm, INTERCEPT_DR3_READ);
765 INTERCEPT_DR3_MASK | 901 set_dr_intercept(svm, INTERCEPT_DR4_READ);
766 INTERCEPT_DR4_MASK | 902 set_dr_intercept(svm, INTERCEPT_DR5_READ);
767 INTERCEPT_DR5_MASK | 903 set_dr_intercept(svm, INTERCEPT_DR6_READ);
768 INTERCEPT_DR6_MASK | 904 set_dr_intercept(svm, INTERCEPT_DR7_READ);
769 INTERCEPT_DR7_MASK; 905
770 906 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
771 control->intercept_dr_write = INTERCEPT_DR0_MASK | 907 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
772 INTERCEPT_DR1_MASK | 908 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
773 INTERCEPT_DR2_MASK | 909 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
774 INTERCEPT_DR3_MASK | 910 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
775 INTERCEPT_DR4_MASK | 911 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
776 INTERCEPT_DR5_MASK | 912 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
777 INTERCEPT_DR6_MASK | 913 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
778 INTERCEPT_DR7_MASK; 914
779 915 set_exception_intercept(svm, PF_VECTOR);
780 control->intercept_exceptions = (1 << PF_VECTOR) | 916 set_exception_intercept(svm, UD_VECTOR);
781 (1 << UD_VECTOR) | 917 set_exception_intercept(svm, MC_VECTOR);
782 (1 << MC_VECTOR); 918
783 919 set_intercept(svm, INTERCEPT_INTR);
784 920 set_intercept(svm, INTERCEPT_NMI);
785 control->intercept = (1ULL << INTERCEPT_INTR) | 921 set_intercept(svm, INTERCEPT_SMI);
786 (1ULL << INTERCEPT_NMI) | 922 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
787 (1ULL << INTERCEPT_SMI) | 923 set_intercept(svm, INTERCEPT_CPUID);
788 (1ULL << INTERCEPT_SELECTIVE_CR0) | 924 set_intercept(svm, INTERCEPT_INVD);
789 (1ULL << INTERCEPT_CPUID) | 925 set_intercept(svm, INTERCEPT_HLT);
790 (1ULL << INTERCEPT_INVD) | 926 set_intercept(svm, INTERCEPT_INVLPG);
791 (1ULL << INTERCEPT_HLT) | 927 set_intercept(svm, INTERCEPT_INVLPGA);
792 (1ULL << INTERCEPT_INVLPG) | 928 set_intercept(svm, INTERCEPT_IOIO_PROT);
793 (1ULL << INTERCEPT_INVLPGA) | 929 set_intercept(svm, INTERCEPT_MSR_PROT);
794 (1ULL << INTERCEPT_IOIO_PROT) | 930 set_intercept(svm, INTERCEPT_TASK_SWITCH);
795 (1ULL << INTERCEPT_MSR_PROT) | 931 set_intercept(svm, INTERCEPT_SHUTDOWN);
796 (1ULL << INTERCEPT_TASK_SWITCH) | 932 set_intercept(svm, INTERCEPT_VMRUN);
797 (1ULL << INTERCEPT_SHUTDOWN) | 933 set_intercept(svm, INTERCEPT_VMMCALL);
798 (1ULL << INTERCEPT_VMRUN) | 934 set_intercept(svm, INTERCEPT_VMLOAD);
799 (1ULL << INTERCEPT_VMMCALL) | 935 set_intercept(svm, INTERCEPT_VMSAVE);
800 (1ULL << INTERCEPT_VMLOAD) | 936 set_intercept(svm, INTERCEPT_STGI);
801 (1ULL << INTERCEPT_VMSAVE) | 937 set_intercept(svm, INTERCEPT_CLGI);
802 (1ULL << INTERCEPT_STGI) | 938 set_intercept(svm, INTERCEPT_SKINIT);
803 (1ULL << INTERCEPT_CLGI) | 939 set_intercept(svm, INTERCEPT_WBINVD);
804 (1ULL << INTERCEPT_SKINIT) | 940 set_intercept(svm, INTERCEPT_MONITOR);
805 (1ULL << INTERCEPT_WBINVD) | 941 set_intercept(svm, INTERCEPT_MWAIT);
806 (1ULL << INTERCEPT_MONITOR) | 942 set_intercept(svm, INTERCEPT_XSETBV);
807 (1ULL << INTERCEPT_MWAIT);
808 943
809 control->iopm_base_pa = iopm_base; 944 control->iopm_base_pa = iopm_base;
810 control->msrpm_base_pa = __pa(svm->msrpm); 945 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
855 if (npt_enabled) { 990 if (npt_enabled) {
856 /* Setup VMCB for Nested Paging */ 991 /* Setup VMCB for Nested Paging */
857 control->nested_ctl = 1; 992 control->nested_ctl = 1;
858 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 993 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
859 (1ULL << INTERCEPT_INVLPG)); 994 clr_intercept(svm, INTERCEPT_INVLPG);
860 control->intercept_exceptions &= ~(1 << PF_VECTOR); 995 clr_exception_intercept(svm, PF_VECTOR);
861 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 996 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
862 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 997 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
863 save->g_pat = 0x0007040600070406ULL; 998 save->g_pat = 0x0007040600070406ULL;
864 save->cr3 = 0; 999 save->cr3 = 0;
865 save->cr4 = 0; 1000 save->cr4 = 0;
866 } 1001 }
867 force_new_asid(&svm->vcpu); 1002 svm->asid_generation = 0;
868 1003
869 svm->nested.vmcb = 0; 1004 svm->nested.vmcb = 0;
870 svm->vcpu.arch.hflags = 0; 1005 svm->vcpu.arch.hflags = 0;
871 1006
872 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1007 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
873 control->pause_filter_count = 3000; 1008 control->pause_filter_count = 3000;
874 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1009 set_intercept(svm, INTERCEPT_PAUSE);
875 } 1010 }
876 1011
1012 mark_all_dirty(svm->vmcb);
1013
877 enable_gif(svm); 1014 enable_gif(svm);
878} 1015}
879 1016
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
990 1127
991 if (unlikely(cpu != vcpu->cpu)) { 1128 if (unlikely(cpu != vcpu->cpu)) {
992 svm->asid_generation = 0; 1129 svm->asid_generation = 0;
1130 mark_all_dirty(svm->vmcb);
993 } 1131 }
994 1132
1133#ifdef CONFIG_X86_64
1134 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1135#endif
1136 savesegment(fs, svm->host.fs);
1137 savesegment(gs, svm->host.gs);
1138 svm->host.ldt = kvm_read_ldt();
1139
995 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1140 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
996 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1141 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
997} 1142}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1002 int i; 1147 int i;
1003 1148
1004 ++vcpu->stat.host_state_reload; 1149 ++vcpu->stat.host_state_reload;
1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1155#else
1156 loadsegment(gs, svm->host.gs);
1157#endif
1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1007} 1160}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1021 switch (reg) { 1174 switch (reg) {
1022 case VCPU_EXREG_PDPTR: 1175 case VCPU_EXREG_PDPTR:
1023 BUG_ON(!npt_enabled); 1176 BUG_ON(!npt_enabled);
1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 1177 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1025 break; 1178 break;
1026 default: 1179 default:
1027 BUG(); 1180 BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1030 1183
1031static void svm_set_vintr(struct vcpu_svm *svm) 1184static void svm_set_vintr(struct vcpu_svm *svm)
1032{ 1185{
1033 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1186 set_intercept(svm, INTERCEPT_VINTR);
1034} 1187}
1035 1188
1036static void svm_clear_vintr(struct vcpu_svm *svm) 1189static void svm_clear_vintr(struct vcpu_svm *svm)
1037{ 1190{
1038 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1191 clr_intercept(svm, INTERCEPT_VINTR);
1039} 1192}
1040 1193
1041static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1194static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1150 1303
1151 svm->vmcb->save.idtr.limit = dt->size; 1304 svm->vmcb->save.idtr.limit = dt->size;
1152 svm->vmcb->save.idtr.base = dt->address ; 1305 svm->vmcb->save.idtr.base = dt->address ;
1306 mark_dirty(svm->vmcb, VMCB_DT);
1153} 1307}
1154 1308
1155static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1309static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1166 1320
1167 svm->vmcb->save.gdtr.limit = dt->size; 1321 svm->vmcb->save.gdtr.limit = dt->size;
1168 svm->vmcb->save.gdtr.base = dt->address ; 1322 svm->vmcb->save.gdtr.base = dt->address ;
1323 mark_dirty(svm->vmcb, VMCB_DT);
1169} 1324}
1170 1325
1171static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1326static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1327{
1173} 1328}
1174 1329
1330static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1331{
1332}
1333
1175static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1334static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1176{ 1335{
1177} 1336}
1178 1337
1179static void update_cr0_intercept(struct vcpu_svm *svm) 1338static void update_cr0_intercept(struct vcpu_svm *svm)
1180{ 1339{
1181 struct vmcb *vmcb = svm->vmcb;
1182 ulong gcr0 = svm->vcpu.arch.cr0; 1340 ulong gcr0 = svm->vcpu.arch.cr0;
1183 u64 *hcr0 = &svm->vmcb->save.cr0; 1341 u64 *hcr0 = &svm->vmcb->save.cr0;
1184 1342
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1188 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1346 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1189 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1347 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1190 1348
1349 mark_dirty(svm->vmcb, VMCB_CR);
1191 1350
1192 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1351 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1193 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1352 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1194 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1353 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1195 if (is_nested(svm)) {
1196 struct vmcb *hsave = svm->nested.hsave;
1197
1198 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1199 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1200 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1201 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1202 }
1203 } else { 1354 } else {
1204 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1355 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1205 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1356 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1206 if (is_nested(svm)) {
1207 struct vmcb *hsave = svm->nested.hsave;
1208
1209 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1210 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1211 }
1212 } 1357 }
1213} 1358}
1214 1359
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1216{ 1361{
1217 struct vcpu_svm *svm = to_svm(vcpu); 1362 struct vcpu_svm *svm = to_svm(vcpu);
1218 1363
1219 if (is_nested(svm)) { 1364 if (is_guest_mode(vcpu)) {
1220 /* 1365 /*
1221 * We are here because we run in nested mode, the host kvm 1366 * We are here because we run in nested mode, the host kvm
1222 * intercepts cr0 writes but the l1 hypervisor does not. 1367 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1268 */ 1413 */
1269 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1414 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1270 svm->vmcb->save.cr0 = cr0; 1415 svm->vmcb->save.cr0 = cr0;
1416 mark_dirty(svm->vmcb, VMCB_CR);
1271 update_cr0_intercept(svm); 1417 update_cr0_intercept(svm);
1272} 1418}
1273 1419
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1277 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1423 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1278 1424
1279 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1425 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1280 force_new_asid(vcpu); 1426 svm_flush_tlb(vcpu);
1281 1427
1282 vcpu->arch.cr4 = cr4; 1428 vcpu->arch.cr4 = cr4;
1283 if (!npt_enabled) 1429 if (!npt_enabled)
1284 cr4 |= X86_CR4_PAE; 1430 cr4 |= X86_CR4_PAE;
1285 cr4 |= host_cr4_mce; 1431 cr4 |= host_cr4_mce;
1286 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1432 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1433 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1287} 1434}
1288 1435
1289static void svm_set_segment(struct kvm_vcpu *vcpu, 1436static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1312 = (svm->vmcb->save.cs.attrib 1459 = (svm->vmcb->save.cs.attrib
1313 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1460 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1314 1461
1462 mark_dirty(svm->vmcb, VMCB_SEG);
1315} 1463}
1316 1464
1317static void update_db_intercept(struct kvm_vcpu *vcpu) 1465static void update_db_intercept(struct kvm_vcpu *vcpu)
1318{ 1466{
1319 struct vcpu_svm *svm = to_svm(vcpu); 1467 struct vcpu_svm *svm = to_svm(vcpu);
1320 1468
1321 svm->vmcb->control.intercept_exceptions &= 1469 clr_exception_intercept(svm, DB_VECTOR);
1322 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1470 clr_exception_intercept(svm, BP_VECTOR);
1323 1471
1324 if (svm->nmi_singlestep) 1472 if (svm->nmi_singlestep)
1325 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1473 set_exception_intercept(svm, DB_VECTOR);
1326 1474
1327 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1475 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1328 if (vcpu->guest_debug & 1476 if (vcpu->guest_debug &
1329 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1477 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1330 svm->vmcb->control.intercept_exceptions |= 1478 set_exception_intercept(svm, DB_VECTOR);
1331 1 << DB_VECTOR;
1332 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1479 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1333 svm->vmcb->control.intercept_exceptions |= 1480 set_exception_intercept(svm, BP_VECTOR);
1334 1 << BP_VECTOR;
1335 } else 1481 } else
1336 vcpu->guest_debug = 0; 1482 vcpu->guest_debug = 0;
1337} 1483}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1345 else 1491 else
1346 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1492 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1347 1493
1348 update_db_intercept(vcpu); 1494 mark_dirty(svm->vmcb, VMCB_DR);
1349}
1350
1351static void load_host_msrs(struct kvm_vcpu *vcpu)
1352{
1353#ifdef CONFIG_X86_64
1354 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1355#endif
1356}
1357 1495
1358static void save_host_msrs(struct kvm_vcpu *vcpu) 1496 update_db_intercept(vcpu);
1359{
1360#ifdef CONFIG_X86_64
1361 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1362#endif
1363} 1497}
1364 1498
1365static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1499static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1372 1506
1373 svm->asid_generation = sd->asid_generation; 1507 svm->asid_generation = sd->asid_generation;
1374 svm->vmcb->control.asid = sd->next_asid++; 1508 svm->vmcb->control.asid = sd->next_asid++;
1509
1510 mark_dirty(svm->vmcb, VMCB_ASID);
1375} 1511}
1376 1512
1377static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1513static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1379 struct vcpu_svm *svm = to_svm(vcpu); 1515 struct vcpu_svm *svm = to_svm(vcpu);
1380 1516
1381 svm->vmcb->save.dr7 = value; 1517 svm->vmcb->save.dr7 = value;
1518 mark_dirty(svm->vmcb, VMCB_DR);
1382} 1519}
1383 1520
1384static int pf_interception(struct vcpu_svm *svm) 1521static int pf_interception(struct vcpu_svm *svm)
1385{ 1522{
1386 u64 fault_address; 1523 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1524 u32 error_code;
1525 int r = 1;
1388 1526
1389 fault_address = svm->vmcb->control.exit_info_2; 1527 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1528 default:
1391 1529 error_code = svm->vmcb->control.exit_info_1;
1392 trace_kvm_page_fault(fault_address, error_code); 1530
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1531 trace_kvm_page_fault(fault_address, error_code);
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1532 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1533 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1534 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1535 svm->vmcb->control.insn_bytes,
1536 svm->vmcb->control.insn_len);
1537 break;
1538 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1539 svm->apf_reason = 0;
1540 local_irq_disable();
1541 kvm_async_pf_task_wait(fault_address);
1542 local_irq_enable();
1543 break;
1544 case KVM_PV_REASON_PAGE_READY:
1545 svm->apf_reason = 0;
1546 local_irq_disable();
1547 kvm_async_pf_task_wake(fault_address);
1548 local_irq_enable();
1549 break;
1550 }
1551 return r;
1396} 1552}
1397 1553
1398static int db_interception(struct vcpu_svm *svm) 1554static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
1440{ 1596{
1441 int er; 1597 int er;
1442 1598
1443 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1599 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1444 if (er != EMULATE_DONE) 1600 if (er != EMULATE_DONE)
1445 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1601 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1446 return 1; 1602 return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
1449static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1605static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1450{ 1606{
1451 struct vcpu_svm *svm = to_svm(vcpu); 1607 struct vcpu_svm *svm = to_svm(vcpu);
1452 u32 excp;
1453
1454 if (is_nested(svm)) {
1455 u32 h_excp, n_excp;
1456
1457 h_excp = svm->nested.hsave->control.intercept_exceptions;
1458 n_excp = svm->nested.intercept_exceptions;
1459 h_excp &= ~(1 << NM_VECTOR);
1460 excp = h_excp | n_excp;
1461 } else {
1462 excp = svm->vmcb->control.intercept_exceptions;
1463 excp &= ~(1 << NM_VECTOR);
1464 }
1465 1608
1466 svm->vmcb->control.intercept_exceptions = excp; 1609 clr_exception_intercept(svm, NM_VECTOR);
1467 1610
1468 svm->vcpu.fpu_active = 1; 1611 svm->vcpu.fpu_active = 1;
1469 update_cr0_intercept(svm); 1612 update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
1570 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1713 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1571 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1714 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1572 if (string || in) 1715 if (string || in)
1573 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1716 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1574 1717
1575 port = io_info >> 16; 1718 port = io_info >> 16;
1576 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1719 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1624 struct vcpu_svm *svm = to_svm(vcpu); 1767 struct vcpu_svm *svm = to_svm(vcpu);
1625 1768
1626 svm->vmcb->control.nested_cr3 = root; 1769 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu); 1770 mark_dirty(svm->vmcb, VMCB_NPT);
1771 svm_flush_tlb(vcpu);
1628} 1772}
1629 1773
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) 1774static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1775 struct x86_exception *fault)
1631{ 1776{
1632 struct vcpu_svm *svm = to_svm(vcpu); 1777 struct vcpu_svm *svm = to_svm(vcpu);
1633 1778
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 1779 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0; 1780 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; 1781 svm->vmcb->control.exit_info_1 = fault->error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; 1782 svm->vmcb->control.exit_info_2 = fault->address;
1638 1783
1639 nested_svm_vmexit(svm); 1784 nested_svm_vmexit(svm);
1640} 1785}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1680{ 1825{
1681 int vmexit; 1826 int vmexit;
1682 1827
1683 if (!is_nested(svm)) 1828 if (!is_guest_mode(&svm->vcpu))
1684 return 0; 1829 return 0;
1685 1830
1686 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1831 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1698/* This function returns true if it is save to enable the irq window */ 1843/* This function returns true if it is save to enable the irq window */
1699static inline bool nested_svm_intr(struct vcpu_svm *svm) 1844static inline bool nested_svm_intr(struct vcpu_svm *svm)
1700{ 1845{
1701 if (!is_nested(svm)) 1846 if (!is_guest_mode(&svm->vcpu))
1702 return true; 1847 return true;
1703 1848
1704 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1849 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1737/* This function returns true if it is save to enable the nmi window */ 1882/* This function returns true if it is save to enable the nmi window */
1738static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1883static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1739{ 1884{
1740 if (!is_nested(svm)) 1885 if (!is_guest_mode(&svm->vcpu))
1741 return true; 1886 return true;
1742 1887
1743 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1888 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1836 return NESTED_EXIT_HOST; 1981 return NESTED_EXIT_HOST;
1837 break; 1982 break;
1838 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1983 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1839 /* When we're shadowing, trap PFs */ 1984 /* When we're shadowing, trap PFs, but not async PF */
1840 if (!npt_enabled) 1985 if (!npt_enabled && svm->apf_reason == 0)
1841 return NESTED_EXIT_HOST; 1986 return NESTED_EXIT_HOST;
1842 break; 1987 break;
1843 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 1988 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1865 case SVM_EXIT_IOIO: 2010 case SVM_EXIT_IOIO:
1866 vmexit = nested_svm_intercept_ioio(svm); 2011 vmexit = nested_svm_intercept_ioio(svm);
1867 break; 2012 break;
1868 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2013 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1869 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2014 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1870 if (svm->nested.intercept_cr_read & cr_bits) 2015 if (svm->nested.intercept_cr & bit)
1871 vmexit = NESTED_EXIT_DONE; 2016 vmexit = NESTED_EXIT_DONE;
1872 break; 2017 break;
1873 } 2018 }
1874 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 2019 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1875 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 2020 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1876 if (svm->nested.intercept_cr_write & cr_bits) 2021 if (svm->nested.intercept_dr & bit)
1877 vmexit = NESTED_EXIT_DONE;
1878 break;
1879 }
1880 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1881 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1882 if (svm->nested.intercept_dr_read & dr_bits)
1883 vmexit = NESTED_EXIT_DONE;
1884 break;
1885 }
1886 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1887 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1888 if (svm->nested.intercept_dr_write & dr_bits)
1889 vmexit = NESTED_EXIT_DONE; 2022 vmexit = NESTED_EXIT_DONE;
1890 break; 2023 break;
1891 } 2024 }
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1893 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2026 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1894 if (svm->nested.intercept_exceptions & excp_bits) 2027 if (svm->nested.intercept_exceptions & excp_bits)
1895 vmexit = NESTED_EXIT_DONE; 2028 vmexit = NESTED_EXIT_DONE;
2029 /* async page fault always cause vmexit */
2030 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2031 svm->apf_reason != 0)
2032 vmexit = NESTED_EXIT_DONE;
1896 break; 2033 break;
1897 } 2034 }
1898 case SVM_EXIT_ERR: { 2035 case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1926 struct vmcb_control_area *dst = &dst_vmcb->control; 2063 struct vmcb_control_area *dst = &dst_vmcb->control;
1927 struct vmcb_control_area *from = &from_vmcb->control; 2064 struct vmcb_control_area *from = &from_vmcb->control;
1928 2065
1929 dst->intercept_cr_read = from->intercept_cr_read; 2066 dst->intercept_cr = from->intercept_cr;
1930 dst->intercept_cr_write = from->intercept_cr_write; 2067 dst->intercept_dr = from->intercept_dr;
1931 dst->intercept_dr_read = from->intercept_dr_read;
1932 dst->intercept_dr_write = from->intercept_dr_write;
1933 dst->intercept_exceptions = from->intercept_exceptions; 2068 dst->intercept_exceptions = from->intercept_exceptions;
1934 dst->intercept = from->intercept; 2069 dst->intercept = from->intercept;
1935 dst->iopm_base_pa = from->iopm_base_pa; 2070 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1970 if (!nested_vmcb) 2105 if (!nested_vmcb)
1971 return 1; 2106 return 1;
1972 2107
1973 /* Exit nested SVM mode */ 2108 /* Exit Guest-Mode */
2109 leave_guest_mode(&svm->vcpu);
1974 svm->nested.vmcb = 0; 2110 svm->nested.vmcb = 0;
1975 2111
1976 /* Give the current vmcb to the guest */ 2112 /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1984 nested_vmcb->save.idtr = vmcb->save.idtr; 2120 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer; 2121 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2122 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2123 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1988 nested_vmcb->save.cr2 = vmcb->save.cr2; 2124 nested_vmcb->save.cr2 = vmcb->save.cr2;
1989 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2125 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1990 nested_vmcb->save.rflags = vmcb->save.rflags; 2126 nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2061 svm->vmcb->save.cpl = 0; 2197 svm->vmcb->save.cpl = 0;
2062 svm->vmcb->control.exit_int_info = 0; 2198 svm->vmcb->control.exit_int_info = 0;
2063 2199
2200 mark_all_dirty(svm->vmcb);
2201
2064 nested_svm_unmap(page); 2202 nested_svm_unmap(page);
2065 2203
2066 nested_svm_uninit_mmu_context(&svm->vcpu); 2204 nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2148 nested_vmcb->control.event_inj, 2286 nested_vmcb->control.event_inj,
2149 nested_vmcb->control.nested_ctl); 2287 nested_vmcb->control.nested_ctl);
2150 2288
2151 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2289 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2152 nested_vmcb->control.intercept_cr_write, 2290 nested_vmcb->control.intercept_cr >> 16,
2153 nested_vmcb->control.intercept_exceptions, 2291 nested_vmcb->control.intercept_exceptions,
2154 nested_vmcb->control.intercept); 2292 nested_vmcb->control.intercept);
2155 2293
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2177 if (npt_enabled) 2315 if (npt_enabled)
2178 hsave->save.cr3 = vmcb->save.cr3; 2316 hsave->save.cr3 = vmcb->save.cr3;
2179 else 2317 else
2180 hsave->save.cr3 = svm->vcpu.arch.cr3; 2318 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2181 2319
2182 copy_vmcb_control_area(hsave, vmcb); 2320 copy_vmcb_control_area(hsave, vmcb);
2183 2321
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2229 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2367 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2230 2368
2231 /* cache intercepts */ 2369 /* cache intercepts */
2232 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2370 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2233 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2371 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2234 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2235 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2236 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2372 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2237 svm->nested.intercept = nested_vmcb->control.intercept; 2373 svm->nested.intercept = nested_vmcb->control.intercept;
2238 2374
2239 force_new_asid(&svm->vcpu); 2375 svm_flush_tlb(&svm->vcpu);
2240 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2376 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2241 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2377 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2242 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2378 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2245 2381
2246 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2382 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2247 /* We only want the cr8 intercept bits of the guest */ 2383 /* We only want the cr8 intercept bits of the guest */
2248 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2384 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2249 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2385 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2250 } 2386 }
2251 2387
2252 /* We don't want to see VMMCALLs from a nested guest */ 2388 /* We don't want to see VMMCALLs from a nested guest */
2253 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2389 clr_intercept(svm, INTERCEPT_VMMCALL);
2254
2255 /*
2256 * We don't want a nested guest to be more powerful than the guest, so
2257 * all intercepts are ORed
2258 */
2259 svm->vmcb->control.intercept_cr_read |=
2260 nested_vmcb->control.intercept_cr_read;
2261 svm->vmcb->control.intercept_cr_write |=
2262 nested_vmcb->control.intercept_cr_write;
2263 svm->vmcb->control.intercept_dr_read |=
2264 nested_vmcb->control.intercept_dr_read;
2265 svm->vmcb->control.intercept_dr_write |=
2266 nested_vmcb->control.intercept_dr_write;
2267 svm->vmcb->control.intercept_exceptions |=
2268 nested_vmcb->control.intercept_exceptions;
2269
2270 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2271 2390
2272 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2391 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2273 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2392 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2278 2397
2279 nested_svm_unmap(page); 2398 nested_svm_unmap(page);
2280 2399
2281 /* nested_vmcb is our indicator if nested SVM is activated */ 2400 /* Enter Guest-Mode */
2401 enter_guest_mode(&svm->vcpu);
2402
2403 /*
2404 * Merge guest and host intercepts - must be called with vcpu in
2405 * guest-mode to take affect here
2406 */
2407 recalc_intercepts(svm);
2408
2282 svm->nested.vmcb = vmcb_gpa; 2409 svm->nested.vmcb = vmcb_gpa;
2283 2410
2284 enable_gif(svm); 2411 enable_gif(svm);
2285 2412
2413 mark_all_dirty(svm->vmcb);
2414
2286 return true; 2415 return true;
2287} 2416}
2288 2417
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2400 svm_clear_vintr(svm); 2529 svm_clear_vintr(svm);
2401 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2530 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2402 2531
2532 mark_dirty(svm->vmcb, VMCB_INTR);
2533
2403 return 1; 2534 return 1;
2404} 2535}
2405 2536
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2426 return 1; 2557 return 1;
2427} 2558}
2428 2559
2560static int xsetbv_interception(struct vcpu_svm *svm)
2561{
2562 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2563 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2564
2565 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2566 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2567 skip_emulated_instruction(&svm->vcpu);
2568 }
2569
2570 return 1;
2571}
2572
2429static int invalid_op_interception(struct vcpu_svm *svm) 2573static int invalid_op_interception(struct vcpu_svm *svm)
2430{ 2574{
2431 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2575 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
2507static int iret_interception(struct vcpu_svm *svm) 2651static int iret_interception(struct vcpu_svm *svm)
2508{ 2652{
2509 ++svm->vcpu.stat.nmi_window_exits; 2653 ++svm->vcpu.stat.nmi_window_exits;
2510 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2654 clr_intercept(svm, INTERCEPT_IRET);
2511 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2655 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2512 return 1; 2656 return 1;
2513} 2657}
2514 2658
2515static int invlpg_interception(struct vcpu_svm *svm) 2659static int invlpg_interception(struct vcpu_svm *svm)
2516{ 2660{
2517 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2661 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2662 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2663
2664 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2665 skip_emulated_instruction(&svm->vcpu);
2666 return 1;
2518} 2667}
2519 2668
2520static int emulate_on_interception(struct vcpu_svm *svm) 2669static int emulate_on_interception(struct vcpu_svm *svm)
2521{ 2670{
2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2671 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2672}
2673
2674#define CR_VALID (1ULL << 63)
2675
2676static int cr_interception(struct vcpu_svm *svm)
2677{
2678 int reg, cr;
2679 unsigned long val;
2680 int err;
2681
2682 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2683 return emulate_on_interception(svm);
2684
2685 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2686 return emulate_on_interception(svm);
2687
2688 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2689 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2690
2691 err = 0;
2692 if (cr >= 16) { /* mov to cr */
2693 cr -= 16;
2694 val = kvm_register_read(&svm->vcpu, reg);
2695 switch (cr) {
2696 case 0:
2697 err = kvm_set_cr0(&svm->vcpu, val);
2698 break;
2699 case 3:
2700 err = kvm_set_cr3(&svm->vcpu, val);
2701 break;
2702 case 4:
2703 err = kvm_set_cr4(&svm->vcpu, val);
2704 break;
2705 case 8:
2706 err = kvm_set_cr8(&svm->vcpu, val);
2707 break;
2708 default:
2709 WARN(1, "unhandled write to CR%d", cr);
2710 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2711 return 1;
2712 }
2713 } else { /* mov from cr */
2714 switch (cr) {
2715 case 0:
2716 val = kvm_read_cr0(&svm->vcpu);
2717 break;
2718 case 2:
2719 val = svm->vcpu.arch.cr2;
2720 break;
2721 case 3:
2722 val = kvm_read_cr3(&svm->vcpu);
2723 break;
2724 case 4:
2725 val = kvm_read_cr4(&svm->vcpu);
2726 break;
2727 case 8:
2728 val = kvm_get_cr8(&svm->vcpu);
2729 break;
2730 default:
2731 WARN(1, "unhandled read from CR%d", cr);
2732 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2733 return 1;
2734 }
2735 kvm_register_write(&svm->vcpu, reg, val);
2736 }
2737 kvm_complete_insn_gp(&svm->vcpu, err);
2738
2739 return 1;
2523} 2740}
2524 2741
2525static int cr0_write_interception(struct vcpu_svm *svm) 2742static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2527 struct kvm_vcpu *vcpu = &svm->vcpu; 2744 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r; 2745 int r;
2529 2746
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0); 2747 r = cr_interception(svm);
2531 2748
2532 if (svm->nested.vmexit_rip) { 2749 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); 2750 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
2536 svm->nested.vmexit_rip = 0; 2753 svm->nested.vmexit_rip = 0;
2537 } 2754 }
2538 2755
2539 return r == EMULATE_DONE; 2756 return r;
2757}
2758
2759static int dr_interception(struct vcpu_svm *svm)
2760{
2761 int reg, dr;
2762 unsigned long val;
2763 int err;
2764
2765 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2766 return emulate_on_interception(svm);
2767
2768 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2769 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2770
2771 if (dr >= 16) { /* mov to DRn */
2772 val = kvm_register_read(&svm->vcpu, reg);
2773 kvm_set_dr(&svm->vcpu, dr - 16, val);
2774 } else {
2775 err = kvm_get_dr(&svm->vcpu, dr, &val);
2776 if (!err)
2777 kvm_register_write(&svm->vcpu, reg, val);
2778 }
2779
2780 return 1;
2540} 2781}
2541 2782
2542static int cr8_write_interception(struct vcpu_svm *svm) 2783static int cr8_write_interception(struct vcpu_svm *svm)
2543{ 2784{
2544 struct kvm_run *kvm_run = svm->vcpu.run; 2785 struct kvm_run *kvm_run = svm->vcpu.run;
2786 int r;
2545 2787
2546 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2788 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2547 /* instruction emulation calls kvm_set_cr8() */ 2789 /* instruction emulation calls kvm_set_cr8() */
2548 emulate_instruction(&svm->vcpu, 0, 0, 0); 2790 r = cr_interception(svm);
2549 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2791 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2550 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2792 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2551 return 1; 2793 return r;
2552 } 2794 }
2553 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2795 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2554 return 1; 2796 return r;
2555 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2797 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2556 return 0; 2798 return 0;
2557} 2799}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2562 2804
2563 switch (ecx) { 2805 switch (ecx) {
2564 case MSR_IA32_TSC: { 2806 case MSR_IA32_TSC: {
2565 u64 tsc_offset; 2807 struct vmcb *vmcb = get_host_vmcb(svm);
2566 2808
2567 if (is_nested(svm)) 2809 *data = vmcb->control.tsc_offset + native_read_tsc();
2568 tsc_offset = svm->nested.hsave->control.tsc_offset;
2569 else
2570 tsc_offset = svm->vmcb->control.tsc_offset;
2571
2572 *data = tsc_offset + native_read_tsc();
2573 break; 2810 break;
2574 } 2811 }
2575 case MSR_STAR: 2812 case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2714 svm->vmcb->save.sysenter_esp = data; 2951 svm->vmcb->save.sysenter_esp = data;
2715 break; 2952 break;
2716 case MSR_IA32_DEBUGCTLMSR: 2953 case MSR_IA32_DEBUGCTLMSR:
2717 if (!svm_has(SVM_FEATURE_LBRV)) { 2954 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2718 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2955 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2719 __func__, data); 2956 __func__, data);
2720 break; 2957 break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2723 return 1; 2960 return 1;
2724 2961
2725 svm->vmcb->save.dbgctl = data; 2962 svm->vmcb->save.dbgctl = data;
2963 mark_dirty(svm->vmcb, VMCB_LBR);
2726 if (data & (1ULL<<0)) 2964 if (data & (1ULL<<0))
2727 svm_enable_lbrv(svm); 2965 svm_enable_lbrv(svm);
2728 else 2966 else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3013 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2776 svm_clear_vintr(svm); 3014 svm_clear_vintr(svm);
2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3015 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3016 mark_dirty(svm->vmcb, VMCB_INTR);
2778 /* 3017 /*
2779 * If the user space waits to inject interrupts, exit as soon as 3018 * If the user space waits to inject interrupts, exit as soon as
2780 * possible 3019 * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
2797} 3036}
2798 3037
2799static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3038static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2800 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3039 [SVM_EXIT_READ_CR0] = cr_interception,
2801 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3040 [SVM_EXIT_READ_CR3] = cr_interception,
2802 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3041 [SVM_EXIT_READ_CR4] = cr_interception,
2803 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3042 [SVM_EXIT_READ_CR8] = cr_interception,
2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3043 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3044 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3045 [SVM_EXIT_WRITE_CR3] = cr_interception,
2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3046 [SVM_EXIT_WRITE_CR4] = cr_interception,
2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3047 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2809 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3048 [SVM_EXIT_READ_DR0] = dr_interception,
2810 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3049 [SVM_EXIT_READ_DR1] = dr_interception,
2811 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3050 [SVM_EXIT_READ_DR2] = dr_interception,
2812 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3051 [SVM_EXIT_READ_DR3] = dr_interception,
2813 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3052 [SVM_EXIT_READ_DR4] = dr_interception,
2814 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3053 [SVM_EXIT_READ_DR5] = dr_interception,
2815 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3054 [SVM_EXIT_READ_DR6] = dr_interception,
2816 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3055 [SVM_EXIT_READ_DR7] = dr_interception,
2817 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3056 [SVM_EXIT_WRITE_DR0] = dr_interception,
2818 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3057 [SVM_EXIT_WRITE_DR1] = dr_interception,
2819 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3058 [SVM_EXIT_WRITE_DR2] = dr_interception,
2820 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3059 [SVM_EXIT_WRITE_DR3] = dr_interception,
2821 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3060 [SVM_EXIT_WRITE_DR4] = dr_interception,
2822 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3061 [SVM_EXIT_WRITE_DR5] = dr_interception,
2823 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3062 [SVM_EXIT_WRITE_DR6] = dr_interception,
2824 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3063 [SVM_EXIT_WRITE_DR7] = dr_interception,
2825 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3064 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2826 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3065 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2827 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3066 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2854 [SVM_EXIT_WBINVD] = emulate_on_interception, 3093 [SVM_EXIT_WBINVD] = emulate_on_interception,
2855 [SVM_EXIT_MONITOR] = invalid_op_interception, 3094 [SVM_EXIT_MONITOR] = invalid_op_interception,
2856 [SVM_EXIT_MWAIT] = invalid_op_interception, 3095 [SVM_EXIT_MWAIT] = invalid_op_interception,
3096 [SVM_EXIT_XSETBV] = xsetbv_interception,
2857 [SVM_EXIT_NPF] = pf_interception, 3097 [SVM_EXIT_NPF] = pf_interception,
2858}; 3098};
2859 3099
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2864 struct vmcb_save_area *save = &svm->vmcb->save; 3104 struct vmcb_save_area *save = &svm->vmcb->save;
2865 3105
2866 pr_err("VMCB Control Area:\n"); 3106 pr_err("VMCB Control Area:\n");
2867 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3107 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
2868 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3108 pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
2869 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3109 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
2870 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3110 pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
2871 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3111 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2872 pr_err("intercepts: %016llx\n", control->intercept); 3112 pr_err("intercepts: %016llx\n", control->intercept);
2873 pr_err("pause filter count: %d\n", control->pause_filter_count); 3113 pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
2950 3190
2951} 3191}
2952 3192
3193static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3194{
3195 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3196
3197 *info1 = control->exit_info_1;
3198 *info2 = control->exit_info_2;
3199}
3200
2953static int handle_exit(struct kvm_vcpu *vcpu) 3201static int handle_exit(struct kvm_vcpu *vcpu)
2954{ 3202{
2955 struct vcpu_svm *svm = to_svm(vcpu); 3203 struct vcpu_svm *svm = to_svm(vcpu);
2956 struct kvm_run *kvm_run = vcpu->run; 3204 struct kvm_run *kvm_run = vcpu->run;
2957 u32 exit_code = svm->vmcb->control.exit_code; 3205 u32 exit_code = svm->vmcb->control.exit_code;
2958 3206
2959 trace_kvm_exit(exit_code, vcpu); 3207 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2960 3208
2961 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3209 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2962 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3210 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2963 if (npt_enabled) 3211 if (npt_enabled)
2964 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3212 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2970 return 1; 3218 return 1;
2971 } 3219 }
2972 3220
2973 if (is_nested(svm)) { 3221 if (is_guest_mode(vcpu)) {
2974 int vmexit; 3222 int vmexit;
2975 3223
2976 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3224 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
3033 3281
3034 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3282 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3035 3283
3036 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3037 /* FIXME: handle wraparound of asid_generation */ 3284 /* FIXME: handle wraparound of asid_generation */
3038 if (svm->asid_generation != sd->asid_generation) 3285 if (svm->asid_generation != sd->asid_generation)
3039 new_asid(svm, sd); 3286 new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3045 3292
3046 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3293 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3047 vcpu->arch.hflags |= HF_NMI_MASK; 3294 vcpu->arch.hflags |= HF_NMI_MASK;
3048 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3295 set_intercept(svm, INTERCEPT_IRET);
3049 ++vcpu->stat.nmi_injections; 3296 ++vcpu->stat.nmi_injections;
3050} 3297}
3051 3298
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3058 control->int_ctl &= ~V_INTR_PRIO_MASK; 3305 control->int_ctl &= ~V_INTR_PRIO_MASK;
3059 control->int_ctl |= V_IRQ_MASK | 3306 control->int_ctl |= V_IRQ_MASK |
3060 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3307 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3308 mark_dirty(svm->vmcb, VMCB_INTR);
3061} 3309}
3062 3310
3063static void svm_set_irq(struct kvm_vcpu *vcpu) 3311static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3077{ 3325{
3078 struct vcpu_svm *svm = to_svm(vcpu); 3326 struct vcpu_svm *svm = to_svm(vcpu);
3079 3327
3080 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3328 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3081 return; 3329 return;
3082 3330
3083 if (irr == -1) 3331 if (irr == -1)
3084 return; 3332 return;
3085 3333
3086 if (tpr >= irr) 3334 if (tpr >= irr)
3087 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3335 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3088} 3336}
3089 3337
3090static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3338static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3112 3360
3113 if (masked) { 3361 if (masked) {
3114 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3362 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3115 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3363 set_intercept(svm, INTERCEPT_IRET);
3116 } else { 3364 } else {
3117 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3365 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3118 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3366 clr_intercept(svm, INTERCEPT_IRET);
3119 } 3367 }
3120} 3368}
3121 3369
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3131 3379
3132 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3380 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
3133 3381
3134 if (is_nested(svm)) 3382 if (is_guest_mode(vcpu))
3135 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3383 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3136 3384
3137 return ret; 3385 return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3177 3425
3178static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3426static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3179{ 3427{
3180 force_new_asid(vcpu); 3428 struct vcpu_svm *svm = to_svm(vcpu);
3429
3430 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3431 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3432 else
3433 svm->asid_generation--;
3181} 3434}
3182 3435
3183static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3436static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3188{ 3441{
3189 struct vcpu_svm *svm = to_svm(vcpu); 3442 struct vcpu_svm *svm = to_svm(vcpu);
3190 3443
3191 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3444 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3192 return; 3445 return;
3193 3446
3194 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3447 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3195 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3448 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3196 kvm_set_cr8(vcpu, cr8); 3449 kvm_set_cr8(vcpu, cr8);
3197 } 3450 }
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3202 struct vcpu_svm *svm = to_svm(vcpu); 3455 struct vcpu_svm *svm = to_svm(vcpu);
3203 u64 cr8; 3456 u64 cr8;
3204 3457
3205 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3458 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3206 return; 3459 return;
3207 3460
3208 cr8 = kvm_get_cr8(vcpu); 3461 cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3289static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3542static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3290{ 3543{
3291 struct vcpu_svm *svm = to_svm(vcpu); 3544 struct vcpu_svm *svm = to_svm(vcpu);
3292 u16 fs_selector;
3293 u16 gs_selector;
3294 u16 ldt_selector;
3295 3545
3296 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3546 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3297 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3547 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3308 3558
3309 sync_lapic_to_cr8(vcpu); 3559 sync_lapic_to_cr8(vcpu);
3310 3560
3311 save_host_msrs(vcpu);
3312 savesegment(fs, fs_selector);
3313 savesegment(gs, gs_selector);
3314 ldt_selector = kvm_read_ldt();
3315 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3561 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3316 3562
3317 clgi(); 3563 clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3389#endif 3635#endif
3390 ); 3636 );
3391 3637
3392 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3393 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3394 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3396
3397 load_host_msrs(vcpu);
3398 kvm_load_ldt(ldt_selector);
3399 loadsegment(fs, fs_selector);
3400#ifdef CONFIG_X86_64 3638#ifdef CONFIG_X86_64
3401 load_gs_index(gs_selector); 3639 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3402 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3403#else 3640#else
3404 loadsegment(gs, gs_selector); 3641 loadsegment(fs, svm->host.fs);
3405#endif 3642#endif
3406 3643
3407 reload_tss(vcpu); 3644 reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3410 3647
3411 stgi(); 3648 stgi();
3412 3649
3650 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3651 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3652 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3653 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3654
3413 sync_cr8_to_lapic(vcpu); 3655 sync_cr8_to_lapic(vcpu);
3414 3656
3415 svm->next_rip = 0; 3657 svm->next_rip = 0;
3416 3658
3659 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3660
3661 /* if exit due to PF check for async PF */
3662 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3663 svm->apf_reason = kvm_read_and_reset_pf_reason();
3664
3417 if (npt_enabled) { 3665 if (npt_enabled) {
3418 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3666 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3419 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3667 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3426 if (unlikely(svm->vmcb->control.exit_code == 3674 if (unlikely(svm->vmcb->control.exit_code ==
3427 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3675 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3428 svm_handle_mce(svm); 3676 svm_handle_mce(svm);
3677
3678 mark_all_clean(svm->vmcb);
3429} 3679}
3430 3680
3431#undef R 3681#undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3435 struct vcpu_svm *svm = to_svm(vcpu); 3685 struct vcpu_svm *svm = to_svm(vcpu);
3436 3686
3437 svm->vmcb->save.cr3 = root; 3687 svm->vmcb->save.cr3 = root;
3438 force_new_asid(vcpu); 3688 mark_dirty(svm->vmcb, VMCB_CR);
3689 svm_flush_tlb(vcpu);
3439} 3690}
3440 3691
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3692static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3443 struct vcpu_svm *svm = to_svm(vcpu); 3694 struct vcpu_svm *svm = to_svm(vcpu);
3444 3695
3445 svm->vmcb->control.nested_cr3 = root; 3696 svm->vmcb->control.nested_cr3 = root;
3697 mark_dirty(svm->vmcb, VMCB_NPT);
3446 3698
3447 /* Also sync guest cr3 here in case we live migrate */ 3699 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3; 3700 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3701 mark_dirty(svm->vmcb, VMCB_CR);
3449 3702
3450 force_new_asid(vcpu); 3703 svm_flush_tlb(vcpu);
3451} 3704}
3452 3705
3453static int is_disabled(void) 3706static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3747static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3495{ 3748{
3496 switch (func) { 3749 switch (func) {
3497 case 0x00000001:
3498 /* Mask out xsave bit as long as it is not supported by SVM */
3499 entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
3500 break;
3501 case 0x80000001: 3750 case 0x80000001:
3502 if (nested) 3751 if (nested)
3503 entry->ecx |= (1 << 2); /* Set SVM bit */ 3752 entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3511 additional features */ 3760 additional features */
3512 3761
3513 /* Support next_rip if host supports it */ 3762 /* Support next_rip if host supports it */
3514 if (svm_has(SVM_FEATURE_NRIP)) 3763 if (boot_cpu_has(X86_FEATURE_NRIPS))
3515 entry->edx |= SVM_FEATURE_NRIP; 3764 entry->edx |= SVM_FEATURE_NRIP;
3516 3765
3517 /* Support NPT for the guest if enabled */ 3766 /* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3571 { SVM_EXIT_WBINVD, "wbinvd" }, 3820 { SVM_EXIT_WBINVD, "wbinvd" },
3572 { SVM_EXIT_MONITOR, "monitor" }, 3821 { SVM_EXIT_MONITOR, "monitor" },
3573 { SVM_EXIT_MWAIT, "mwait" }, 3822 { SVM_EXIT_MWAIT, "mwait" },
3823 { SVM_EXIT_XSETBV, "xsetbv" },
3574 { SVM_EXIT_NPF, "npf" }, 3824 { SVM_EXIT_NPF, "npf" },
3575 { -1, NULL } 3825 { -1, NULL }
3576}; 3826};
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3594{ 3844{
3595 struct vcpu_svm *svm = to_svm(vcpu); 3845 struct vcpu_svm *svm = to_svm(vcpu);
3596 3846
3597 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3847 set_exception_intercept(svm, NM_VECTOR);
3598 if (is_nested(svm))
3599 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3600 update_cr0_intercept(svm); 3848 update_cr0_intercept(svm);
3601} 3849}
3602 3850
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3627 .get_cpl = svm_get_cpl, 3875 .get_cpl = svm_get_cpl,
3628 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 3876 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3629 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 3877 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
3878 .decache_cr3 = svm_decache_cr3,
3630 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 3879 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3631 .set_cr0 = svm_set_cr0, 3880 .set_cr0 = svm_set_cr0,
3632 .set_cr3 = svm_set_cr3, 3881 .set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3667 .get_tdp_level = get_npt_level, 3916 .get_tdp_level = get_npt_level,
3668 .get_mt_mask = svm_get_mt_mask, 3917 .get_mt_mask = svm_get_mt_mask,
3669 3918
3919 .get_exit_info = svm_get_exit_info,
3670 .exit_reasons_str = svm_exit_reasons_str, 3920 .exit_reasons_str = svm_exit_reasons_str,
3921
3671 .get_lpage_level = svm_get_lpage_level, 3922 .get_lpage_level = svm_get_lpage_level,
3672 3923
3673 .cpuid_update = svm_cpuid_update, 3924 .cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9515c5..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
177static u64 construct_eptp(unsigned long root_hpa); 180static u64 construct_eptp(unsigned long root_hpa);
178static void kvm_cpu_vmxon(u64 addr); 181static void kvm_cpu_vmxon(u64 addr);
179static void kvm_cpu_vmxoff(void); 182static void kvm_cpu_vmxoff(void);
183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
180 184
181static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
182static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
188static unsigned long *vmx_msr_bitmap_legacy; 192static unsigned long *vmx_msr_bitmap_legacy;
189static unsigned long *vmx_msr_bitmap_longmode; 193static unsigned long *vmx_msr_bitmap_longmode;
190 194
195static bool cpu_has_load_ia32_efer;
196
191static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 197static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
192static DEFINE_SPINLOCK(vmx_vpid_lock); 198static DEFINE_SPINLOCK(vmx_vpid_lock);
193 199
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
472 u8 error; 478 u8 error;
473 479
474 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 480 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
475 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 481 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
476 : "cc", "memory"); 482 : "cc", "memory");
477 if (error) 483 if (error)
478 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 484 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
485 u8 error; 491 u8 error;
486 492
487 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 493 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
488 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 494 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
489 : "cc", "memory"); 495 : "cc", "memory");
490 if (error) 496 if (error)
491 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 497 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
565 571
566static unsigned long vmcs_readl(unsigned long field) 572static unsigned long vmcs_readl(unsigned long field)
567{ 573{
568 unsigned long value; 574 unsigned long value = 0;
569 575
570 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 576 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
571 : "=a"(value) : "d"(field) : "cc"); 577 : "+a"(value) : "d"(field) : "cc");
572 return value; 578 return value;
573} 579}
574 580
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
661 unsigned i; 667 unsigned i;
662 struct msr_autoload *m = &vmx->msr_autoload; 668 struct msr_autoload *m = &vmx->msr_autoload;
663 669
670 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
671 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
672 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
673 return;
674 }
675
664 for (i = 0; i < m->nr; ++i) 676 for (i = 0; i < m->nr; ++i)
665 if (m->guest[i].index == msr) 677 if (m->guest[i].index == msr)
666 break; 678 break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
680 unsigned i; 692 unsigned i;
681 struct msr_autoload *m = &vmx->msr_autoload; 693 struct msr_autoload *m = &vmx->msr_autoload;
682 694
695 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
696 vmcs_write64(GUEST_IA32_EFER, guest_val);
697 vmcs_write64(HOST_IA32_EFER, host_val);
698 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
699 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
700 return;
701 }
702
683 for (i = 0; i < m->nr; ++i) 703 for (i = 0; i < m->nr; ++i)
684 if (m->guest[i].index == msr) 704 if (m->guest[i].index == msr)
685 break; 705 break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1009 vmx_set_interrupt_shadow(vcpu, 0); 1029 vmx_set_interrupt_shadow(vcpu, 0);
1010} 1030}
1011 1031
1032static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1033{
1034 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1035 * explicitly skip the instruction because if the HLT state is set, then
1036 * the instruction is already executing and RIP has already been
1037 * advanced. */
1038 if (!yield_on_hlt &&
1039 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1040 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1041}
1042
1012static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1043static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1013 bool has_error_code, u32 error_code, 1044 bool has_error_code, u32 error_code,
1014 bool reinject) 1045 bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1035 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1066 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1036 1067
1037 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1068 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1069 vmx_clear_hlt(vcpu);
1038} 1070}
1039 1071
1040static bool vmx_rdtscp_supported(void) 1072static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
1305 && tboot_enabled()) 1337 && tboot_enabled())
1306 return 1; 1338 return 1;
1307 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1339 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1308 && !tboot_enabled()) 1340 && !tboot_enabled()) {
1341 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342 " activate TXT before enabling KVM\n");
1309 return 1; 1343 return 1;
1344 }
1310 } 1345 }
1311 1346
1312 return 0; 1347 return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1400 return 0; 1435 return 0;
1401} 1436}
1402 1437
1438static __init bool allow_1_setting(u32 msr, u32 ctl)
1439{
1440 u32 vmx_msr_low, vmx_msr_high;
1441
1442 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1443 return vmx_msr_high & ctl;
1444}
1445
1403static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1446static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1404{ 1447{
1405 u32 vmx_msr_low, vmx_msr_high; 1448 u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1416 &_pin_based_exec_control) < 0) 1459 &_pin_based_exec_control) < 0)
1417 return -EIO; 1460 return -EIO;
1418 1461
1419 min = CPU_BASED_HLT_EXITING | 1462 min =
1420#ifdef CONFIG_X86_64 1463#ifdef CONFIG_X86_64
1421 CPU_BASED_CR8_LOAD_EXITING | 1464 CPU_BASED_CR8_LOAD_EXITING |
1422 CPU_BASED_CR8_STORE_EXITING | 1465 CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1429 CPU_BASED_MWAIT_EXITING | 1472 CPU_BASED_MWAIT_EXITING |
1430 CPU_BASED_MONITOR_EXITING | 1473 CPU_BASED_MONITOR_EXITING |
1431 CPU_BASED_INVLPG_EXITING; 1474 CPU_BASED_INVLPG_EXITING;
1475
1476 if (yield_on_hlt)
1477 min |= CPU_BASED_HLT_EXITING;
1478
1432 opt = CPU_BASED_TPR_SHADOW | 1479 opt = CPU_BASED_TPR_SHADOW |
1433 CPU_BASED_USE_MSR_BITMAPS | 1480 CPU_BASED_USE_MSR_BITMAPS |
1434 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1481 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1510 vmcs_conf->vmexit_ctrl = _vmexit_control; 1557 vmcs_conf->vmexit_ctrl = _vmexit_control;
1511 vmcs_conf->vmentry_ctrl = _vmentry_control; 1558 vmcs_conf->vmentry_ctrl = _vmentry_control;
1512 1559
1560 cpu_has_load_ia32_efer =
1561 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1562 VM_ENTRY_LOAD_IA32_EFER)
1563 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1564 VM_EXIT_LOAD_IA32_EFER);
1565
1513 return 0; 1566 return 0;
1514} 1567}
1515 1568
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1683 save->limit = vmcs_read32(sf->limit); 1736 save->limit = vmcs_read32(sf->limit);
1684 save->ar = vmcs_read32(sf->ar_bytes); 1737 save->ar = vmcs_read32(sf->ar_bytes);
1685 vmcs_write16(sf->selector, save->base >> 4); 1738 vmcs_write16(sf->selector, save->base >> 4);
1686 vmcs_write32(sf->base, save->base & 0xfffff); 1739 vmcs_write32(sf->base, save->base & 0xffff0);
1687 vmcs_write32(sf->limit, 0xffff); 1740 vmcs_write32(sf->limit, 0xffff);
1688 vmcs_write32(sf->ar_bytes, 0xf3); 1741 vmcs_write32(sf->ar_bytes, 0xf3);
1742 if (save->base & 0xf)
1743 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1744 " aligned when entering protected mode (seg=%d)",
1745 seg);
1689} 1746}
1690 1747
1691static void enter_rmode(struct kvm_vcpu *vcpu) 1748static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1814 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1871 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1815} 1872}
1816 1873
1874static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
1875{
1876 if (enable_ept && is_paging(vcpu))
1877 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1878 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1879}
1880
1817static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1881static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1818{ 1882{
1819 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 1883 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1857 unsigned long cr0, 1921 unsigned long cr0,
1858 struct kvm_vcpu *vcpu) 1922 struct kvm_vcpu *vcpu)
1859{ 1923{
1924 vmx_decache_cr3(vcpu);
1860 if (!(cr0 & X86_CR0_PG)) { 1925 if (!(cr0 & X86_CR0_PG)) {
1861 /* From paging/starting to nonpaging */ 1926 /* From paging/starting to nonpaging */
1862 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1927 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1937 if (enable_ept) { 2002 if (enable_ept) {
1938 eptp = construct_eptp(cr3); 2003 eptp = construct_eptp(cr3);
1939 vmcs_write64(EPT_POINTER, eptp); 2004 vmcs_write64(EPT_POINTER, eptp);
1940 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2005 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1941 vcpu->kvm->arch.ept_identity_map_addr; 2006 vcpu->kvm->arch.ept_identity_map_addr;
1942 ept_load_pdptrs(vcpu); 2007 ept_load_pdptrs(vcpu);
1943 } 2008 }
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2725 vmcs_writel(GUEST_IDTR_BASE, 0); 2790 vmcs_writel(GUEST_IDTR_BASE, 0);
2726 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2791 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2727 2792
2728 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2793 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2729 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2794 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2730 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2795 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2731 2796
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2787 return; 2852 return;
2788 } 2853 }
2789 2854
2855 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
2856 enable_irq_window(vcpu);
2857 return;
2858 }
2790 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2859 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2791 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 2860 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2792 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2861 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2814 } else 2883 } else
2815 intr |= INTR_TYPE_EXT_INTR; 2884 intr |= INTR_TYPE_EXT_INTR;
2816 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 2885 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2886 vmx_clear_hlt(vcpu);
2817} 2887}
2818 2888
2819static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2889static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2841 } 2911 }
2842 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2912 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2843 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2913 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2914 vmx_clear_hlt(vcpu);
2844} 2915}
2845 2916
2846static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 2917static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2849 return 0; 2920 return 0;
2850 2921
2851 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2922 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2852 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 2923 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
2924 | GUEST_INTR_STATE_NMI));
2853} 2925}
2854 2926
2855static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 2927static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2910 * Cause the #SS fault with 0 error code in VM86 mode. 2982 * Cause the #SS fault with 0 error code in VM86 mode.
2911 */ 2983 */
2912 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2984 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2913 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 2985 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2914 return 1; 2986 return 1;
2915 /* 2987 /*
2916 * Forward all other exceptions that are valid in real mode. 2988 * Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3007 } 3079 }
3008 3080
3009 if (is_invalid_opcode(intr_info)) { 3081 if (is_invalid_opcode(intr_info)) {
3010 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3082 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3011 if (er != EMULATE_DONE) 3083 if (er != EMULATE_DONE)
3012 kvm_queue_exception(vcpu, UD_VECTOR); 3084 kvm_queue_exception(vcpu, UD_VECTOR);
3013 return 1; 3085 return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3026 3098
3027 if (kvm_event_needs_reinjection(vcpu)) 3099 if (kvm_event_needs_reinjection(vcpu))
3028 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3100 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3029 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3101 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3030 } 3102 }
3031 3103
3032 if (vmx->rmode.vm86_active && 3104 if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3098 ++vcpu->stat.io_exits; 3170 ++vcpu->stat.io_exits;
3099 3171
3100 if (string || in) 3172 if (string || in)
3101 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3173 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3102 3174
3103 port = exit_qualification >> 16; 3175 port = exit_qualification >> 16;
3104 size = (exit_qualification & 7) + 1; 3176 size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3118 hypercall[2] = 0xc1; 3190 hypercall[2] = 0xc1;
3119} 3191}
3120 3192
3121static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3122{
3123 if (err)
3124 kvm_inject_gp(vcpu, 0);
3125 else
3126 skip_emulated_instruction(vcpu);
3127}
3128
3129static int handle_cr(struct kvm_vcpu *vcpu) 3193static int handle_cr(struct kvm_vcpu *vcpu)
3130{ 3194{
3131 unsigned long exit_qualification, val; 3195 unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3143 switch (cr) { 3207 switch (cr) {
3144 case 0: 3208 case 0:
3145 err = kvm_set_cr0(vcpu, val); 3209 err = kvm_set_cr0(vcpu, val);
3146 complete_insn_gp(vcpu, err); 3210 kvm_complete_insn_gp(vcpu, err);
3147 return 1; 3211 return 1;
3148 case 3: 3212 case 3:
3149 err = kvm_set_cr3(vcpu, val); 3213 err = kvm_set_cr3(vcpu, val);
3150 complete_insn_gp(vcpu, err); 3214 kvm_complete_insn_gp(vcpu, err);
3151 return 1; 3215 return 1;
3152 case 4: 3216 case 4:
3153 err = kvm_set_cr4(vcpu, val); 3217 err = kvm_set_cr4(vcpu, val);
3154 complete_insn_gp(vcpu, err); 3218 kvm_complete_insn_gp(vcpu, err);
3155 return 1; 3219 return 1;
3156 case 8: { 3220 case 8: {
3157 u8 cr8_prev = kvm_get_cr8(vcpu); 3221 u8 cr8_prev = kvm_get_cr8(vcpu);
3158 u8 cr8 = kvm_register_read(vcpu, reg); 3222 u8 cr8 = kvm_register_read(vcpu, reg);
3159 kvm_set_cr8(vcpu, cr8); 3223 err = kvm_set_cr8(vcpu, cr8);
3160 skip_emulated_instruction(vcpu); 3224 kvm_complete_insn_gp(vcpu, err);
3161 if (irqchip_in_kernel(vcpu->kvm)) 3225 if (irqchip_in_kernel(vcpu->kvm))
3162 return 1; 3226 return 1;
3163 if (cr8_prev <= cr8) 3227 if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3176 case 1: /*mov from cr*/ 3240 case 1: /*mov from cr*/
3177 switch (cr) { 3241 switch (cr) {
3178 case 3: 3242 case 3:
3179 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3243 val = kvm_read_cr3(vcpu);
3180 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3244 kvm_register_write(vcpu, reg, val);
3245 trace_kvm_cr_read(cr, val);
3181 skip_emulated_instruction(vcpu); 3246 skip_emulated_instruction(vcpu);
3182 return 1; 3247 return 1;
3183 case 8: 3248 case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3349 return 1; 3414 return 1;
3350} 3415}
3351 3416
3417static int handle_invd(struct kvm_vcpu *vcpu)
3418{
3419 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3420}
3421
3352static int handle_invlpg(struct kvm_vcpu *vcpu) 3422static int handle_invlpg(struct kvm_vcpu *vcpu)
3353{ 3423{
3354 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3424 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3377 3447
3378static int handle_apic_access(struct kvm_vcpu *vcpu) 3448static int handle_apic_access(struct kvm_vcpu *vcpu)
3379{ 3449{
3380 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3450 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3381} 3451}
3382 3452
3383static int handle_task_switch(struct kvm_vcpu *vcpu) 3453static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3476 3546
3477 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3547 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3478 trace_kvm_page_fault(gpa, exit_qualification); 3548 trace_kvm_page_fault(gpa, exit_qualification);
3479 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3549 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3480} 3550}
3481 3551
3482static u64 ept_rsvd_mask(u64 spte, int level) 3552static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3592 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) 3662 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3593 return handle_interrupt_window(&vmx->vcpu); 3663 return handle_interrupt_window(&vmx->vcpu);
3594 3664
3595 err = emulate_instruction(vcpu, 0, 0, 0); 3665 err = emulate_instruction(vcpu, 0);
3596 3666
3597 if (err == EMULATE_DO_MMIO) { 3667 if (err == EMULATE_DO_MMIO) {
3598 ret = 0; 3668 ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3649 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3719 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3650 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3720 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3651 [EXIT_REASON_HLT] = handle_halt, 3721 [EXIT_REASON_HLT] = handle_halt,
3722 [EXIT_REASON_INVD] = handle_invd,
3652 [EXIT_REASON_INVLPG] = handle_invlpg, 3723 [EXIT_REASON_INVLPG] = handle_invlpg,
3653 [EXIT_REASON_VMCALL] = handle_vmcall, 3724 [EXIT_REASON_VMCALL] = handle_vmcall,
3654 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3725 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3676static const int kvm_vmx_max_exit_handlers = 3747static const int kvm_vmx_max_exit_handlers =
3677 ARRAY_SIZE(kvm_vmx_exit_handlers); 3748 ARRAY_SIZE(kvm_vmx_exit_handlers);
3678 3749
3750static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3751{
3752 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3753 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3754}
3755
3679/* 3756/*
3680 * The guest has exited. See if we can fix it or if we need userspace 3757 * The guest has exited. See if we can fix it or if we need userspace
3681 * assistance. 3758 * assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3686 u32 exit_reason = vmx->exit_reason; 3763 u32 exit_reason = vmx->exit_reason;
3687 u32 vectoring_info = vmx->idt_vectoring_info; 3764 u32 vectoring_info = vmx->idt_vectoring_info;
3688 3765
3689 trace_kvm_exit(exit_reason, vcpu); 3766 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3690 3767
3691 /* If guest state is invalid, start emulating */ 3768 /* If guest state is invalid, start emulating */
3692 if (vmx->emulation_required && emulate_invalid_guest_state) 3769 if (vmx->emulation_required && emulate_invalid_guest_state)
3693 return handle_invalid_guest_state(vcpu); 3770 return handle_invalid_guest_state(vcpu);
3694 3771
3695 /* Access CR3 don't cause VMExit in paging mode, so we need
3696 * to sync with guest real CR3. */
3697 if (enable_ept && is_paging(vcpu))
3698 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3699
3700 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3772 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3701 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3773 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3702 vcpu->run->fail_entry.hardware_entry_failure_reason 3774 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4013 ); 4085 );
4014 4086
4015 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4087 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4016 | (1 << VCPU_EXREG_PDPTR)); 4088 | (1 << VCPU_EXREG_PDPTR)
4089 | (1 << VCPU_EXREG_CR3));
4017 vcpu->arch.regs_dirty = 0; 4090 vcpu->arch.regs_dirty = 0;
4018 4091
4019 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4092 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4280 .get_cpl = vmx_get_cpl, 4353 .get_cpl = vmx_get_cpl,
4281 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4354 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4282 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4355 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4356 .decache_cr3 = vmx_decache_cr3,
4283 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4357 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4284 .set_cr0 = vmx_set_cr0, 4358 .set_cr0 = vmx_set_cr0,
4285 .set_cr3 = vmx_set_cr3, 4359 .set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4320 .get_tdp_level = get_ept_level, 4394 .get_tdp_level = get_ept_level,
4321 .get_mt_mask = vmx_get_mt_mask, 4395 .get_mt_mask = vmx_get_mt_mask,
4322 4396
4397 .get_exit_info = vmx_get_exit_info,
4323 .exit_reasons_str = vmx_exit_reasons_str, 4398 .exit_reasons_str = vmx_exit_reasons_str,
4399
4324 .get_lpage_level = vmx_get_lpage_level, 4400 .get_lpage_level = vmx_get_lpage_level,
4325 4401
4326 .cpuid_update = vmx_cpuid_update, 4402 .cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
4396 4472
4397 if (enable_ept) { 4473 if (enable_ept) {
4398 bypass_guest_pf = 0; 4474 bypass_guest_pf = 0;
4399 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4400 VMX_EPT_WRITABLE_MASK);
4401 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4475 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4402 VMX_EPT_EXECUTABLE_MASK); 4476 VMX_EPT_EXECUTABLE_MASK);
4403 kvm_enable_tdp(); 4477 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46a368cb651e..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
160{
161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
164}
165
158static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
159{ 167{
160 unsigned slot; 168 unsigned slot;
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
326} 334}
327EXPORT_SYMBOL_GPL(kvm_requeue_exception); 335EXPORT_SYMBOL_GPL(kvm_requeue_exception);
328 336
329void kvm_inject_page_fault(struct kvm_vcpu *vcpu) 337void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
330{ 338{
331 unsigned error_code = vcpu->arch.fault.error_code; 339 if (err)
340 kvm_inject_gp(vcpu, 0);
341 else
342 kvm_x86_ops->skip_emulated_instruction(vcpu);
343}
344EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
332 345
346void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347{
333 ++vcpu->stat.pf_guest; 348 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = vcpu->arch.fault.address; 349 vcpu->arch.cr2 = fault->address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
336} 351}
337 352
338void kvm_propagate_fault(struct kvm_vcpu *vcpu) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
339{ 354{
340 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) 355 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
341 vcpu->arch.nested_mmu.inject_page_fault(vcpu); 356 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
342 else 357 else
343 vcpu->arch.mmu.inject_page_fault(vcpu); 358 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
344
345 vcpu->arch.fault.nested = false;
346} 359}
347 360
348void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
460 (unsigned long *)&vcpu->arch.regs_avail)) 473 (unsigned long *)&vcpu->arch.regs_avail))
461 return true; 474 return true;
462 475
463 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
464 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
465 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
466 PFERR_USER_MASK | PFERR_WRITE_MASK); 479 PFERR_USER_MASK | PFERR_WRITE_MASK);
467 if (r < 0) 480 if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
506 } else 519 } else
507#endif 520#endif
508 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
509 vcpu->arch.cr3)) 522 kvm_read_cr3(vcpu)))
510 return 1; 523 return 1;
511 } 524 }
512 525
513 kvm_x86_ops->set_cr0(vcpu, cr0); 526 kvm_x86_ops->set_cr0(vcpu, cr0);
514 527
528 if ((cr0 ^ old_cr0) & X86_CR0_PG)
529 kvm_clear_async_pf_completion_queue(vcpu);
530
515 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
516 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
517 return 0; 533 return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
595 return 1; 611 return 1;
596 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
597 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
598 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
599 return 1; 616 return 1;
600 617
601 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
615 632
616int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
617{ 634{
618 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
619 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
620 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
621 return 0; 638 return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
650 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
651 return 1; 668 return 1;
652 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
653 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
654 return 0; 672 return 0;
655} 673}
656EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
657 675
658int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
659{ 677{
660 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
661 return 1; 679 return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
665 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
666 return 0; 684 return 0;
667} 685}
668
669void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
670{
671 if (__kvm_set_cr8(vcpu, cr8))
672 kvm_inject_gp(vcpu, 0);
673}
674EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
675 687
676unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
775 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
776 */ 788 */
777 789
778#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
779static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
780 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
781 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
782 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
783 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
784 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
785 MSR_STAR, 797 MSR_STAR,
786#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
830 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
831 843
832 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
833 kvm_mmu_reset_context(vcpu);
834 845
835 /* Update reserved bits */ 846 /* Update reserved bits */
836 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1418 return 0; 1429 return 0;
1419} 1430}
1420 1431
1432static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1433{
1434 gpa_t gpa = data & ~0x3f;
1435
1436 /* Bits 2:5 are resrved, Should be zero */
1437 if (data & 0x3c)
1438 return 1;
1439
1440 vcpu->arch.apf.msr_val = data;
1441
1442 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1443 kvm_clear_async_pf_completion_queue(vcpu);
1444 kvm_async_pf_hash_reset(vcpu);
1445 return 0;
1446 }
1447
1448 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1449 return 1;
1450
1451 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1452 kvm_async_pf_wakeup_all(vcpu);
1453 return 0;
1454}
1455
1421int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1422{ 1457{
1423 switch (msr) { 1458 switch (msr) {
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1499 } 1534 }
1500 break; 1535 break;
1501 } 1536 }
1537 case MSR_KVM_ASYNC_PF_EN:
1538 if (kvm_pv_enable_async_pf(vcpu, data))
1539 return 1;
1540 break;
1502 case MSR_IA32_MCG_CTL: 1541 case MSR_IA32_MCG_CTL:
1503 case MSR_IA32_MCG_STATUS: 1542 case MSR_IA32_MCG_STATUS:
1504 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1543 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1775 case MSR_KVM_SYSTEM_TIME_NEW: 1814 case MSR_KVM_SYSTEM_TIME_NEW:
1776 data = vcpu->arch.time; 1815 data = vcpu->arch.time;
1777 break; 1816 break;
1817 case MSR_KVM_ASYNC_PF_EN:
1818 data = vcpu->arch.apf.msr_val;
1819 break;
1778 case MSR_IA32_P5_MC_ADDR: 1820 case MSR_IA32_P5_MC_ADDR:
1779 case MSR_IA32_P5_MC_TYPE: 1821 case MSR_IA32_P5_MC_TYPE:
1780 case MSR_IA32_MCG_CAP: 1822 case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1904 case KVM_CAP_NOP_IO_DELAY: 1946 case KVM_CAP_NOP_IO_DELAY:
1905 case KVM_CAP_MP_STATE: 1947 case KVM_CAP_MP_STATE:
1906 case KVM_CAP_SYNC_MMU: 1948 case KVM_CAP_SYNC_MMU:
1949 case KVM_CAP_USER_NMI:
1907 case KVM_CAP_REINJECT_CONTROL: 1950 case KVM_CAP_REINJECT_CONTROL:
1908 case KVM_CAP_IRQ_INJECT_STATUS: 1951 case KVM_CAP_IRQ_INJECT_STATUS:
1909 case KVM_CAP_ASSIGN_DEV_IRQ: 1952 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1922 case KVM_CAP_DEBUGREGS: 1965 case KVM_CAP_DEBUGREGS:
1923 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1924 case KVM_CAP_XSAVE: 1967 case KVM_CAP_XSAVE:
1968 case KVM_CAP_ASYNC_PF:
1925 r = 1; 1969 r = 1;
1926 break; 1970 break;
1927 case KVM_CAP_COALESCED_MMIO: 1971 case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
2185 return r; 2229 return r;
2186} 2230}
2187 2231
2232static void cpuid_mask(u32 *word, int wordnum)
2233{
2234 *word &= boot_cpu_data.x86_capability[wordnum];
2235}
2236
2188static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2237static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2189 u32 index) 2238 u32 index)
2190{ 2239{
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2259 break; 2308 break;
2260 case 1: 2309 case 1:
2261 entry->edx &= kvm_supported_word0_x86_features; 2310 entry->edx &= kvm_supported_word0_x86_features;
2311 cpuid_mask(&entry->edx, 0);
2262 entry->ecx &= kvm_supported_word4_x86_features; 2312 entry->ecx &= kvm_supported_word4_x86_features;
2313 cpuid_mask(&entry->ecx, 4);
2263 /* we support x2apic emulation even if host does not support 2314 /* we support x2apic emulation even if host does not support
2264 * it since we emulate x2apic in software */ 2315 * it since we emulate x2apic in software */
2265 entry->ecx |= F(X2APIC); 2316 entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2350 break; 2401 break;
2351 case 0x80000001: 2402 case 0x80000001:
2352 entry->edx &= kvm_supported_word1_x86_features; 2403 entry->edx &= kvm_supported_word1_x86_features;
2404 cpuid_mask(&entry->edx, 1);
2353 entry->ecx &= kvm_supported_word6_x86_features; 2405 entry->ecx &= kvm_supported_word6_x86_features;
2406 cpuid_mask(&entry->ecx, 6);
2354 break; 2407 break;
2355 } 2408 }
2356 2409
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3169 struct kvm_memslots *slots, *old_slots; 3222 struct kvm_memslots *slots, *old_slots;
3170 unsigned long *dirty_bitmap; 3223 unsigned long *dirty_bitmap;
3171 3224
3172 r = -ENOMEM; 3225 dirty_bitmap = memslot->dirty_bitmap_head;
3173 dirty_bitmap = vmalloc(n); 3226 if (memslot->dirty_bitmap == dirty_bitmap)
3174 if (!dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long);
3175 goto out;
3176 memset(dirty_bitmap, 0, n); 3228 memset(dirty_bitmap, 0, n);
3177 3229
3178 r = -ENOMEM; 3230 r = -ENOMEM;
3179 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3180 if (!slots) { 3232 if (!slots)
3181 vfree(dirty_bitmap);
3182 goto out; 3233 goto out;
3183 }
3184 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3185 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3236 slots->generation++;
3186 3237
3187 old_slots = kvm->memslots; 3238 old_slots = kvm->memslots;
3188 rcu_assign_pointer(kvm->memslots, slots); 3239 rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3195 spin_unlock(&kvm->mmu_lock); 3246 spin_unlock(&kvm->mmu_lock);
3196 3247
3197 r = -EFAULT; 3248 r = -EFAULT;
3198 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3199 vfree(dirty_bitmap);
3200 goto out; 3250 goto out;
3201 }
3202 vfree(dirty_bitmap);
3203 } else { 3251 } else {
3204 r = -EFAULT; 3252 r = -EFAULT;
3205 if (clear_user(log->dirty_bitmap, n)) 3253 if (clear_user(log->dirty_bitmap, n))
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3266 if (vpic) { 3314 if (vpic) {
3267 r = kvm_ioapic_init(kvm); 3315 r = kvm_ioapic_init(kvm);
3268 if (r) { 3316 if (r) {
3317 mutex_lock(&kvm->slots_lock);
3269 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3270 &vpic->dev); 3319 &vpic->dev);
3320 mutex_unlock(&kvm->slots_lock);
3271 kfree(vpic); 3321 kfree(vpic);
3272 goto create_irqchip_unlock; 3322 goto create_irqchip_unlock;
3273 } 3323 }
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3278 smp_wmb(); 3328 smp_wmb();
3279 r = kvm_setup_default_irq_routing(kvm); 3329 r = kvm_setup_default_irq_routing(kvm);
3280 if (r) { 3330 if (r) {
3331 mutex_lock(&kvm->slots_lock);
3281 mutex_lock(&kvm->irq_lock); 3332 mutex_lock(&kvm->irq_lock);
3282 kvm_ioapic_destroy(kvm); 3333 kvm_ioapic_destroy(kvm);
3283 kvm_destroy_pic(kvm); 3334 kvm_destroy_pic(kvm);
3284 mutex_unlock(&kvm->irq_lock); 3335 mutex_unlock(&kvm->irq_lock);
3336 mutex_unlock(&kvm->slots_lock);
3285 } 3337 }
3286 create_irqchip_unlock: 3338 create_irqchip_unlock:
3287 mutex_unlock(&kvm->lock); 3339 mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3609static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3558{ 3610{
3559 gpa_t t_gpa; 3611 gpa_t t_gpa;
3560 u32 error; 3612 struct x86_exception exception;
3561 3613
3562 BUG_ON(!mmu_is_nested(vcpu)); 3614 BUG_ON(!mmu_is_nested(vcpu));
3563 3615
3564 /* NPT walks are always user-walks */ 3616 /* NPT walks are always user-walks */
3565 access |= PFERR_USER_MASK; 3617 access |= PFERR_USER_MASK;
3566 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3567 if (t_gpa == UNMAPPED_GVA)
3568 vcpu->arch.fault.nested = true;
3569 3619
3570 return t_gpa; 3620 return t_gpa;
3571} 3621}
3572 3622
3573gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3623gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3624 struct x86_exception *exception)
3574{ 3625{
3575 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3576 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3577} 3628}
3578 3629
3579 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3631 struct x86_exception *exception)
3580{ 3632{
3581 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3582 access |= PFERR_FETCH_MASK; 3634 access |= PFERR_FETCH_MASK;
3583 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3584} 3636}
3585 3637
3586gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3638gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3639 struct x86_exception *exception)
3587{ 3640{
3588 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3589 access |= PFERR_WRITE_MASK; 3642 access |= PFERR_WRITE_MASK;
3590 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3643 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3591} 3644}
3592 3645
3593/* uses this to access any guest's mapped memory without checking CPL */ 3646/* uses this to access any guest's mapped memory without checking CPL */
3594gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3647gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3648 struct x86_exception *exception)
3595{ 3649{
3596 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); 3650 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3597} 3651}
3598 3652
3599static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3653static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3600 struct kvm_vcpu *vcpu, u32 access, 3654 struct kvm_vcpu *vcpu, u32 access,
3601 u32 *error) 3655 struct x86_exception *exception)
3602{ 3656{
3603 void *data = val; 3657 void *data = val;
3604 int r = X86EMUL_CONTINUE; 3658 int r = X86EMUL_CONTINUE;
3605 3659
3606 while (bytes) { 3660 while (bytes) {
3607 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3661 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3608 error); 3662 exception);
3609 unsigned offset = addr & (PAGE_SIZE-1); 3663 unsigned offset = addr & (PAGE_SIZE-1);
3610 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3664 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3611 int ret; 3665 int ret;
3612 3666
3613 if (gpa == UNMAPPED_GVA) { 3667 if (gpa == UNMAPPED_GVA)
3614 r = X86EMUL_PROPAGATE_FAULT; 3668 return X86EMUL_PROPAGATE_FAULT;
3615 goto out;
3616 }
3617 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3669 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3618 if (ret < 0) { 3670 if (ret < 0) {
3619 r = X86EMUL_IO_NEEDED; 3671 r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
3630 3682
3631/* used for instruction fetching */ 3683/* used for instruction fetching */
3632static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3684static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3633 struct kvm_vcpu *vcpu, u32 *error) 3685 struct kvm_vcpu *vcpu,
3686 struct x86_exception *exception)
3634{ 3687{
3635 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3636 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3689 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3637 access | PFERR_FETCH_MASK, error); 3690 access | PFERR_FETCH_MASK,
3691 exception);
3638} 3692}
3639 3693
3640static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3694static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3641 struct kvm_vcpu *vcpu, u32 *error) 3695 struct kvm_vcpu *vcpu,
3696 struct x86_exception *exception)
3642{ 3697{
3643 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3698 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3644 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3645 error); 3700 exception);
3646} 3701}
3647 3702
3648static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3703static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3649 struct kvm_vcpu *vcpu, u32 *error) 3704 struct kvm_vcpu *vcpu,
3705 struct x86_exception *exception)
3650{ 3706{
3651 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3652} 3708}
3653 3709
3654static int kvm_write_guest_virt_system(gva_t addr, void *val, 3710static int kvm_write_guest_virt_system(gva_t addr, void *val,
3655 unsigned int bytes, 3711 unsigned int bytes,
3656 struct kvm_vcpu *vcpu, 3712 struct kvm_vcpu *vcpu,
3657 u32 *error) 3713 struct x86_exception *exception)
3658{ 3714{
3659 void *data = val; 3715 void *data = val;
3660 int r = X86EMUL_CONTINUE; 3716 int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3662 while (bytes) { 3718 while (bytes) {
3663 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3664 PFERR_WRITE_MASK, 3720 PFERR_WRITE_MASK,
3665 error); 3721 exception);
3666 unsigned offset = addr & (PAGE_SIZE-1); 3722 unsigned offset = addr & (PAGE_SIZE-1);
3667 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3668 int ret; 3724 int ret;
3669 3725
3670 if (gpa == UNMAPPED_GVA) { 3726 if (gpa == UNMAPPED_GVA)
3671 r = X86EMUL_PROPAGATE_FAULT; 3727 return X86EMUL_PROPAGATE_FAULT;
3672 goto out;
3673 }
3674 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3675 if (ret < 0) { 3729 if (ret < 0) {
3676 r = X86EMUL_IO_NEEDED; 3730 r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
3688static int emulator_read_emulated(unsigned long addr, 3742static int emulator_read_emulated(unsigned long addr,
3689 void *val, 3743 void *val,
3690 unsigned int bytes, 3744 unsigned int bytes,
3691 unsigned int *error_code, 3745 struct x86_exception *exception,
3692 struct kvm_vcpu *vcpu) 3746 struct kvm_vcpu *vcpu)
3693{ 3747{
3694 gpa_t gpa; 3748 gpa_t gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
3701 return X86EMUL_CONTINUE; 3755 return X86EMUL_CONTINUE;
3702 } 3756 }
3703 3757
3704 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3705 3759
3706 if (gpa == UNMAPPED_GVA) 3760 if (gpa == UNMAPPED_GVA)
3707 return X86EMUL_PROPAGATE_FAULT; 3761 return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
3710 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3711 goto mmio; 3765 goto mmio;
3712 3766
3713 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
3714 == X86EMUL_CONTINUE) 3768 == X86EMUL_CONTINUE)
3715 return X86EMUL_CONTINUE; 3769 return X86EMUL_CONTINUE;
3716 3770
3717mmio: 3771mmio:
@@ -3735,7 +3789,7 @@ mmio:
3735} 3789}
3736 3790
3737int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3791int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3738 const void *val, int bytes) 3792 const void *val, int bytes)
3739{ 3793{
3740 int ret; 3794 int ret;
3741 3795
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3749static int emulator_write_emulated_onepage(unsigned long addr, 3803static int emulator_write_emulated_onepage(unsigned long addr,
3750 const void *val, 3804 const void *val,
3751 unsigned int bytes, 3805 unsigned int bytes,
3752 unsigned int *error_code, 3806 struct x86_exception *exception,
3753 struct kvm_vcpu *vcpu) 3807 struct kvm_vcpu *vcpu)
3754{ 3808{
3755 gpa_t gpa; 3809 gpa_t gpa;
3756 3810
3757 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3758 3812
3759 if (gpa == UNMAPPED_GVA) 3813 if (gpa == UNMAPPED_GVA)
3760 return X86EMUL_PROPAGATE_FAULT; 3814 return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
3787int emulator_write_emulated(unsigned long addr, 3841int emulator_write_emulated(unsigned long addr,
3788 const void *val, 3842 const void *val,
3789 unsigned int bytes, 3843 unsigned int bytes,
3790 unsigned int *error_code, 3844 struct x86_exception *exception,
3791 struct kvm_vcpu *vcpu) 3845 struct kvm_vcpu *vcpu)
3792{ 3846{
3793 /* Crossing a page boundary? */ 3847 /* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
3795 int rc, now; 3849 int rc, now;
3796 3850
3797 now = -addr & ~PAGE_MASK; 3851 now = -addr & ~PAGE_MASK;
3798 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3799 vcpu); 3853 vcpu);
3800 if (rc != X86EMUL_CONTINUE) 3854 if (rc != X86EMUL_CONTINUE)
3801 return rc; 3855 return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
3803 val += now; 3857 val += now;
3804 bytes -= now; 3858 bytes -= now;
3805 } 3859 }
3806 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3807 vcpu); 3861 vcpu);
3808} 3862}
3809 3863
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3821 const void *old, 3875 const void *old,
3822 const void *new, 3876 const void *new,
3823 unsigned int bytes, 3877 unsigned int bytes,
3824 unsigned int *error_code, 3878 struct x86_exception *exception,
3825 struct kvm_vcpu *vcpu) 3879 struct kvm_vcpu *vcpu)
3826{ 3880{
3827 gpa_t gpa; 3881 gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3879emul_write: 3933emul_write:
3880 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3881 3935
3882 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu);
3883} 3937}
3884 3938
3885static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3904 if (vcpu->arch.pio.count) 3958 if (vcpu->arch.pio.count)
3905 goto data_avail; 3959 goto data_avail;
3906 3960
3907 trace_kvm_pio(0, port, size, 1); 3961 trace_kvm_pio(0, port, size, count);
3908 3962
3909 vcpu->arch.pio.port = port; 3963 vcpu->arch.pio.port = port;
3910 vcpu->arch.pio.in = 1; 3964 vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3932 const void *val, unsigned int count, 3986 const void *val, unsigned int count,
3933 struct kvm_vcpu *vcpu) 3987 struct kvm_vcpu *vcpu)
3934{ 3988{
3935 trace_kvm_pio(1, port, size, 1); 3989 trace_kvm_pio(1, port, size, count);
3936 3990
3937 vcpu->arch.pio.port = port; 3991 vcpu->arch.pio.port = port;
3938 vcpu->arch.pio.in = 0; 3992 vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3973 return X86EMUL_CONTINUE; 4027 return X86EMUL_CONTINUE;
3974 4028
3975 if (kvm_x86_ops->has_wbinvd_exit()) { 4029 if (kvm_x86_ops->has_wbinvd_exit()) {
3976 preempt_disable(); 4030 int cpu = get_cpu();
4031
4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3977 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3978 wbinvd_ipi, NULL, 1); 4034 wbinvd_ipi, NULL, 1);
3979 preempt_enable(); 4035 put_cpu();
3980 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3981 } 4037 } else
3982 wbinvd(); 4038 wbinvd();
3983 return X86EMUL_CONTINUE; 4039 return X86EMUL_CONTINUE;
3984} 4040}
3985EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4041EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4019 value = vcpu->arch.cr2; 4075 value = vcpu->arch.cr2;
4020 break; 4076 break;
4021 case 3: 4077 case 3:
4022 value = vcpu->arch.cr3; 4078 value = kvm_read_cr3(vcpu);
4023 break; 4079 break;
4024 case 4: 4080 case 4:
4025 value = kvm_read_cr4(vcpu); 4081 value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4053 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4054 break; 4110 break;
4055 case 8: 4111 case 8:
4056 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4112 res = kvm_set_cr8(vcpu, val);
4057 break; 4113 break;
4058 default: 4114 default:
4059 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4206static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4262static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4207{ 4263{
4208 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4209 if (ctxt->exception == PF_VECTOR) 4265 if (ctxt->exception.vector == PF_VECTOR)
4210 kvm_propagate_fault(vcpu); 4266 kvm_propagate_fault(vcpu, &ctxt->exception);
4211 else if (ctxt->error_code_valid) 4267 else if (ctxt->exception.error_code_valid)
4212 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4269 ctxt->exception.error_code);
4213 else 4270 else
4214 kvm_queue_exception(vcpu, ctxt->exception); 4271 kvm_queue_exception(vcpu, ctxt->exception.vector);
4215} 4272}
4216 4273
4217static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4274static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4267 4324
4268static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4325static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4269{ 4326{
4327 int r = EMULATE_DONE;
4328
4270 ++vcpu->stat.insn_emulation_fail; 4329 ++vcpu->stat.insn_emulation_fail;
4271 trace_kvm_emulate_insn_failed(vcpu); 4330 trace_kvm_emulate_insn_failed(vcpu);
4272 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4331 if (!is_guest_mode(vcpu)) {
4273 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4274 vcpu->run->internal.ndata = 0; 4333 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4334 vcpu->run->internal.ndata = 0;
4335 r = EMULATE_FAIL;
4336 }
4275 kvm_queue_exception(vcpu, UD_VECTOR); 4337 kvm_queue_exception(vcpu, UD_VECTOR);
4276 return EMULATE_FAIL; 4338
4339 return r;
4277} 4340}
4278 4341
4279static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4342static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4302 return false; 4365 return false;
4303} 4366}
4304 4367
4305int emulate_instruction(struct kvm_vcpu *vcpu, 4368int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4306 unsigned long cr2, 4369 unsigned long cr2,
4307 u16 error_code, 4370 int emulation_type,
4308 int emulation_type) 4371 void *insn,
4372 int insn_len)
4309{ 4373{
4310 int r; 4374 int r;
4311 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4375 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4323 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4387 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4324 init_emulate_ctxt(vcpu); 4388 init_emulate_ctxt(vcpu);
4325 vcpu->arch.emulate_ctxt.interruptibility = 0; 4389 vcpu->arch.emulate_ctxt.interruptibility = 0;
4326 vcpu->arch.emulate_ctxt.exception = -1; 4390 vcpu->arch.emulate_ctxt.have_exception = false;
4327 vcpu->arch.emulate_ctxt.perm_ok = false; 4391 vcpu->arch.emulate_ctxt.perm_ok = false;
4328 4392
4329 r = x86_decode_insn(&vcpu->arch.emulate_ctxt); 4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4330 if (r == X86EMUL_PROPAGATE_FAULT) 4394 if (r == X86EMUL_PROPAGATE_FAULT)
4331 goto done; 4395 goto done;
4332 4396
@@ -4389,7 +4453,7 @@ restart:
4389 } 4453 }
4390 4454
4391done: 4455done:
4392 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4456 if (vcpu->arch.emulate_ctxt.have_exception) {
4393 inject_emulated_exception(vcpu); 4457 inject_emulated_exception(vcpu);
4394 r = EMULATE_DONE; 4458 r = EMULATE_DONE;
4395 } else if (vcpu->arch.pio.count) { 4459 } else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
4413 4477
4414 return r; 4478 return r;
4415} 4479}
4416EXPORT_SYMBOL_GPL(emulate_instruction); 4480EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4417 4481
4418int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4482int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4419{ 4483{
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
4653 4717
4654 kvm_x86_ops = ops; 4718 kvm_x86_ops = ops;
4655 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4656 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4657 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4658 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4659 4722
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5116 vcpu->fpu_active = 0; 5179 vcpu->fpu_active = 0;
5117 kvm_x86_ops->fpu_deactivate(vcpu); 5180 kvm_x86_ops->fpu_deactivate(vcpu);
5118 } 5181 }
5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5183 /* Page is swapped out. Do synthetic halt */
5184 vcpu->arch.apf.halted = true;
5185 r = 1;
5186 goto out;
5187 }
5119 } 5188 }
5120 5189
5121 r = kvm_mmu_reload(vcpu); 5190 r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5244 5313
5245 r = 1; 5314 r = 1;
5246 while (r > 0) { 5315 while (r > 0) {
5247 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5317 !vcpu->arch.apf.halted)
5248 r = vcpu_enter_guest(vcpu); 5318 r = vcpu_enter_guest(vcpu);
5249 else { 5319 else {
5250 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5257 vcpu->arch.mp_state = 5327 vcpu->arch.mp_state =
5258 KVM_MP_STATE_RUNNABLE; 5328 KVM_MP_STATE_RUNNABLE;
5259 case KVM_MP_STATE_RUNNABLE: 5329 case KVM_MP_STATE_RUNNABLE:
5330 vcpu->arch.apf.halted = false;
5260 break; 5331 break;
5261 case KVM_MP_STATE_SIPI_RECEIVED: 5332 case KVM_MP_STATE_SIPI_RECEIVED:
5262 default: 5333 default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5278 vcpu->run->exit_reason = KVM_EXIT_INTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR;
5279 ++vcpu->stat.request_irq_exits; 5350 ++vcpu->stat.request_irq_exits;
5280 } 5351 }
5352
5353 kvm_check_async_pf_completion(vcpu);
5354
5281 if (signal_pending(current)) { 5355 if (signal_pending(current)) {
5282 r = -EINTR; 5356 r = -EINTR;
5283 vcpu->run->exit_reason = KVM_EXIT_INTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5302 int r; 5376 int r;
5303 sigset_t sigsaved; 5377 sigset_t sigsaved;
5304 5378
5379 if (!tsk_used_math(current) && init_fpu(current))
5380 return -ENOMEM;
5381
5305 if (vcpu->sigset_active) 5382 if (vcpu->sigset_active)
5306 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5307 5384
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5313 } 5390 }
5314 5391
5315 /* re-sync apic's tpr */ 5392 /* re-sync apic's tpr */
5316 if (!irqchip_in_kernel(vcpu->kvm)) 5393 if (!irqchip_in_kernel(vcpu->kvm)) {
5317 kvm_set_cr8(vcpu, kvm_run->cr8); 5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5395 r = -EINVAL;
5396 goto out;
5397 }
5398 }
5318 5399
5319 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
5320 if (vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5323 vcpu->mmio_needed = 0; 5404 vcpu->mmio_needed = 0;
5324 } 5405 }
5325 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5326 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5327 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5328 if (r != EMULATE_DONE) { 5409 if (r != EMULATE_DONE) {
5329 r = 0; 5410 r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5436 5517
5437 sregs->cr0 = kvm_read_cr0(vcpu); 5518 sregs->cr0 = kvm_read_cr0(vcpu);
5438 sregs->cr2 = vcpu->arch.cr2; 5519 sregs->cr2 = vcpu->arch.cr2;
5439 sregs->cr3 = vcpu->arch.cr3; 5520 sregs->cr3 = kvm_read_cr3(vcpu);
5440 sregs->cr4 = kvm_read_cr4(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu);
5441 sregs->cr8 = kvm_get_cr8(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu);
5442 sregs->efer = vcpu->arch.efer; 5523 sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5504 kvm_x86_ops->set_gdt(vcpu, &dt); 5585 kvm_x86_ops->set_gdt(vcpu, &dt);
5505 5586
5506 vcpu->arch.cr2 = sregs->cr2; 5587 vcpu->arch.cr2 = sregs->cr2;
5507 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5508 vcpu->arch.cr3 = sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3;
5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5509 5591
5510 kvm_set_cr8(vcpu, sregs->cr8); 5592 kvm_set_cr8(vcpu, sregs->cr8);
5511 5593
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 if (sregs->cr4 & X86_CR4_OSXSAVE) 5604 if (sregs->cr4 & X86_CR4_OSXSAVE)
5523 update_cpuid(vcpu); 5605 update_cpuid(vcpu);
5524 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5525 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5526 mmu_reset_needed = 1; 5608 mmu_reset_needed = 1;
5527 } 5609 }
5528 5610
@@ -5773,6 +5855,8 @@ free_vcpu:
5773 5855
5774void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5856void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5775{ 5857{
5858 vcpu->arch.apf.msr_val = 0;
5859
5776 vcpu_load(vcpu); 5860 vcpu_load(vcpu);
5777 kvm_mmu_unload(vcpu); 5861 kvm_mmu_unload(vcpu);
5778 vcpu_put(vcpu); 5862 vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5792 vcpu->arch.dr7 = DR7_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1;
5793 5877
5794 kvm_make_request(KVM_REQ_EVENT, vcpu); 5878 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0;
5880
5881 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false;
5795 5884
5796 return kvm_x86_ops->vcpu_reset(vcpu); 5885 return kvm_x86_ops->vcpu_reset(vcpu);
5797} 5886}
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5881 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5882 goto fail_free_mce_banks; 5971 goto fail_free_mce_banks;
5883 5972
5973 kvm_async_pf_hash_reset(vcpu);
5974
5884 return 0; 5975 return 0;
5885fail_free_mce_banks: 5976fail_free_mce_banks:
5886 kfree(vcpu->arch.mce_banks); 5977 kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5906 free_page((unsigned long)vcpu->arch.pio_data); 5997 free_page((unsigned long)vcpu->arch.pio_data);
5907} 5998}
5908 5999
5909struct kvm *kvm_arch_create_vm(void) 6000int kvm_arch_init_vm(struct kvm *kvm)
5910{ 6001{
5911 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5912
5913 if (!kvm)
5914 return ERR_PTR(-ENOMEM);
5915
5916 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5917 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5918 6004
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
5921 6007
5922 spin_lock_init(&kvm->arch.tsc_write_lock); 6008 spin_lock_init(&kvm->arch.tsc_write_lock);
5923 6009
5924 return kvm; 6010 return 0;
5925} 6011}
5926 6012
5927static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6013static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5939 /* 6025 /*
5940 * Unpin any mmu pages first. 6026 * Unpin any mmu pages first.
5941 */ 6027 */
5942 kvm_for_each_vcpu(i, vcpu, kvm) 6028 kvm_for_each_vcpu(i, vcpu, kvm) {
6029 kvm_clear_async_pf_completion_queue(vcpu);
5943 kvm_unload_vcpu_mmu(vcpu); 6030 kvm_unload_vcpu_mmu(vcpu);
6031 }
5944 kvm_for_each_vcpu(i, vcpu, kvm) 6032 kvm_for_each_vcpu(i, vcpu, kvm)
5945 kvm_arch_vcpu_free(vcpu); 6033 kvm_arch_vcpu_free(vcpu);
5946 6034
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5964 kfree(kvm->arch.vpic); 6052 kfree(kvm->arch.vpic);
5965 kfree(kvm->arch.vioapic); 6053 kfree(kvm->arch.vioapic);
5966 kvm_free_vcpus(kvm); 6054 kvm_free_vcpus(kvm);
5967 kvm_free_physmem(kvm);
5968 if (kvm->arch.apic_access_page) 6055 if (kvm->arch.apic_access_page)
5969 put_page(kvm->arch.apic_access_page); 6056 put_page(kvm->arch.apic_access_page);
5970 if (kvm->arch.ept_identity_pagetable) 6057 if (kvm->arch.ept_identity_pagetable)
5971 put_page(kvm->arch.ept_identity_pagetable); 6058 put_page(kvm->arch.ept_identity_pagetable);
5972 cleanup_srcu_struct(&kvm->srcu);
5973 kfree(kvm);
5974} 6059}
5975 6060
5976int kvm_arch_prepare_memory_region(struct kvm *kvm, 6061int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6051 6136
6052int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6137int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6053{ 6138{
6054 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6139 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6140 !vcpu->arch.apf.halted)
6141 || !list_empty_careful(&vcpu->async_pf.done)
6055 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6142 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6056 || vcpu->arch.nmi_pending || 6143 || vcpu->arch.nmi_pending ||
6057 (kvm_arch_interrupt_allowed(vcpu) && 6144 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6110} 6197}
6111EXPORT_SYMBOL_GPL(kvm_set_rflags); 6198EXPORT_SYMBOL_GPL(kvm_set_rflags);
6112 6199
6200void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6201{
6202 int r;
6203
6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6205 is_error_page(work->page))
6206 return;
6207
6208 r = kvm_mmu_reload(vcpu);
6209 if (unlikely(r))
6210 return;
6211
6212 if (!vcpu->arch.mmu.direct_map &&
6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6214 return;
6215
6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6217}
6218
6219static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6220{
6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6222}
6223
6224static inline u32 kvm_async_pf_next_probe(u32 key)
6225{
6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6227}
6228
6229static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6230{
6231 u32 key = kvm_async_pf_hash_fn(gfn);
6232
6233 while (vcpu->arch.apf.gfns[key] != ~0)
6234 key = kvm_async_pf_next_probe(key);
6235
6236 vcpu->arch.apf.gfns[key] = gfn;
6237}
6238
6239static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6240{
6241 int i;
6242 u32 key = kvm_async_pf_hash_fn(gfn);
6243
6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6245 (vcpu->arch.apf.gfns[key] != gfn &&
6246 vcpu->arch.apf.gfns[key] != ~0); i++)
6247 key = kvm_async_pf_next_probe(key);
6248
6249 return key;
6250}
6251
6252bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6253{
6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6255}
6256
6257static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6258{
6259 u32 i, j, k;
6260
6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6262 while (true) {
6263 vcpu->arch.apf.gfns[i] = ~0;
6264 do {
6265 j = kvm_async_pf_next_probe(j);
6266 if (vcpu->arch.apf.gfns[j] == ~0)
6267 return;
6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6269 /*
6270 * k lies cyclically in ]i,j]
6271 * | i.k.j |
6272 * |....j i.k.| or |.k..j i...|
6273 */
6274 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6276 i = j;
6277 }
6278}
6279
6280static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6281{
6282
6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6284 sizeof(val));
6285}
6286
6287void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6288 struct kvm_async_pf *work)
6289{
6290 struct x86_exception fault;
6291
6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6294
6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6296 (vcpu->arch.apf.send_user_only &&
6297 kvm_x86_ops->get_cpl(vcpu) == 0))
6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6300 fault.vector = PF_VECTOR;
6301 fault.error_code_valid = true;
6302 fault.error_code = 0;
6303 fault.nested_page_fault = false;
6304 fault.address = work->arch.token;
6305 kvm_inject_page_fault(vcpu, &fault);
6306 }
6307}
6308
6309void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6310 struct kvm_async_pf *work)
6311{
6312 struct x86_exception fault;
6313
6314 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6315 if (is_error_page(work->page))
6316 work->arch.token = ~0; /* broadcast wakeup */
6317 else
6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6319
6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6322 fault.vector = PF_VECTOR;
6323 fault.error_code_valid = true;
6324 fault.error_code = 0;
6325 fault.nested_page_fault = false;
6326 fault.address = work->arch.token;
6327 kvm_inject_page_fault(vcpu, &fault);
6328 }
6329 vcpu->arch.apf.halted = false;
6330}
6331
6332bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6333{
6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6335 return true;
6336 else
6337 return !kvm_event_needs_reinjection(vcpu) &&
6338 kvm_x86_ops->interrupt_allowed(vcpu);
6339}
6340
6113EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6341EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6342EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6343EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 919ae53adc5c..ea2dc1a2e13d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo {
540#endif 540#endif
541#define KVM_CAP_PPC_GET_PVINFO 57 541#define KVM_CAP_PPC_GET_PVINFO 57
542#define KVM_CAP_PPC_IRQ_LEVEL 58 542#define KVM_CAP_PPC_IRQ_LEVEL 58
543#define KVM_CAP_ASYNC_PF 59
543 544
544#ifdef KVM_CAP_IRQ_ROUTING 545#ifdef KVM_CAP_IRQ_ROUTING
545 546
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a0557422715e..b5021db21858 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/preempt.h> 17#include <linux/preempt.h>
18#include <linux/msi.h> 18#include <linux/msi.h>
19#include <linux/slab.h>
20#include <linux/rcupdate.h>
19#include <asm/signal.h> 21#include <asm/signal.h>
20 22
21#include <linux/kvm.h> 23#include <linux/kvm.h>
@@ -40,6 +42,7 @@
40#define KVM_REQ_KICK 9 42#define KVM_REQ_KICK 9
41#define KVM_REQ_DEACTIVATE_FPU 10 43#define KVM_REQ_DEACTIVATE_FPU 10
42#define KVM_REQ_EVENT 11 44#define KVM_REQ_EVENT 11
45#define KVM_REQ_APF_HALT 12
43 46
44#define KVM_USERSPACE_IRQ_SOURCE_ID 0 47#define KVM_USERSPACE_IRQ_SOURCE_ID 0
45 48
@@ -74,6 +77,27 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
74int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 77int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
75 struct kvm_io_device *dev); 78 struct kvm_io_device *dev);
76 79
80#ifdef CONFIG_KVM_ASYNC_PF
81struct kvm_async_pf {
82 struct work_struct work;
83 struct list_head link;
84 struct list_head queue;
85 struct kvm_vcpu *vcpu;
86 struct mm_struct *mm;
87 gva_t gva;
88 unsigned long addr;
89 struct kvm_arch_async_pf arch;
90 struct page *page;
91 bool done;
92};
93
94void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
95void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
96int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
97 struct kvm_arch_async_pf *arch);
98int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
99#endif
100
77struct kvm_vcpu { 101struct kvm_vcpu {
78 struct kvm *kvm; 102 struct kvm *kvm;
79#ifdef CONFIG_PREEMPT_NOTIFIERS 103#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -104,6 +128,15 @@ struct kvm_vcpu {
104 gpa_t mmio_phys_addr; 128 gpa_t mmio_phys_addr;
105#endif 129#endif
106 130
131#ifdef CONFIG_KVM_ASYNC_PF
132 struct {
133 u32 queued;
134 struct list_head queue;
135 struct list_head done;
136 spinlock_t lock;
137 } async_pf;
138#endif
139
107 struct kvm_vcpu_arch arch; 140 struct kvm_vcpu_arch arch;
108}; 141};
109 142
@@ -113,16 +146,19 @@ struct kvm_vcpu {
113 */ 146 */
114#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) 147#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
115 148
149struct kvm_lpage_info {
150 unsigned long rmap_pde;
151 int write_count;
152};
153
116struct kvm_memory_slot { 154struct kvm_memory_slot {
117 gfn_t base_gfn; 155 gfn_t base_gfn;
118 unsigned long npages; 156 unsigned long npages;
119 unsigned long flags; 157 unsigned long flags;
120 unsigned long *rmap; 158 unsigned long *rmap;
121 unsigned long *dirty_bitmap; 159 unsigned long *dirty_bitmap;
122 struct { 160 unsigned long *dirty_bitmap_head;
123 unsigned long rmap_pde; 161 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
124 int write_count;
125 } *lpage_info[KVM_NR_PAGE_SIZES - 1];
126 unsigned long userspace_addr; 162 unsigned long userspace_addr;
127 int user_alloc; 163 int user_alloc;
128 int id; 164 int id;
@@ -169,6 +205,7 @@ struct kvm_irq_routing_table {};
169 205
170struct kvm_memslots { 206struct kvm_memslots {
171 int nmemslots; 207 int nmemslots;
208 u64 generation;
172 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + 209 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
173 KVM_PRIVATE_MEM_SLOTS]; 210 KVM_PRIVATE_MEM_SLOTS];
174}; 211};
@@ -206,6 +243,10 @@ struct kvm {
206 243
207 struct mutex irq_lock; 244 struct mutex irq_lock;
208#ifdef CONFIG_HAVE_KVM_IRQCHIP 245#ifdef CONFIG_HAVE_KVM_IRQCHIP
246 /*
247 * Update side is protected by irq_lock and,
248 * if configured, irqfds.lock.
249 */
209 struct kvm_irq_routing_table __rcu *irq_routing; 250 struct kvm_irq_routing_table __rcu *irq_routing;
210 struct hlist_head mask_notifier_list; 251 struct hlist_head mask_notifier_list;
211 struct hlist_head irq_ack_notifier_list; 252 struct hlist_head irq_ack_notifier_list;
@@ -216,6 +257,7 @@ struct kvm {
216 unsigned long mmu_notifier_seq; 257 unsigned long mmu_notifier_seq;
217 long mmu_notifier_count; 258 long mmu_notifier_count;
218#endif 259#endif
260 long tlbs_dirty;
219}; 261};
220 262
221/* The guest did something we don't support. */ 263/* The guest did something we don't support. */
@@ -302,7 +344,11 @@ void kvm_set_page_accessed(struct page *page);
302 344
303pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); 345pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
304pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 346pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
347pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
348 bool write_fault, bool *writable);
305pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 349pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
350pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
351 bool *writable);
306pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 352pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
307 struct kvm_memory_slot *slot, gfn_t gfn); 353 struct kvm_memory_slot *slot, gfn_t gfn);
308int memslot_id(struct kvm *kvm, gfn_t gfn); 354int memslot_id(struct kvm *kvm, gfn_t gfn);
@@ -321,18 +367,25 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
321 int offset, int len); 367 int offset, int len);
322int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 368int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
323 unsigned long len); 369 unsigned long len);
370int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
371 void *data, unsigned long len);
372int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
373 gpa_t gpa);
324int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); 374int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
325int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); 375int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
326struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); 376struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
327int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); 377int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
328unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); 378unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
329void mark_page_dirty(struct kvm *kvm, gfn_t gfn); 379void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
380void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
381 gfn_t gfn);
330 382
331void kvm_vcpu_block(struct kvm_vcpu *vcpu); 383void kvm_vcpu_block(struct kvm_vcpu *vcpu);
332void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); 384void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
333void kvm_resched(struct kvm_vcpu *vcpu); 385void kvm_resched(struct kvm_vcpu *vcpu);
334void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 386void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
335void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 387void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
388
336void kvm_flush_remote_tlbs(struct kvm *kvm); 389void kvm_flush_remote_tlbs(struct kvm *kvm);
337void kvm_reload_remote_mmus(struct kvm *kvm); 390void kvm_reload_remote_mmus(struct kvm *kvm);
338 391
@@ -398,7 +451,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
398 451
399void kvm_free_physmem(struct kvm *kvm); 452void kvm_free_physmem(struct kvm *kvm);
400 453
401struct kvm *kvm_arch_create_vm(void); 454#ifndef __KVM_HAVE_ARCH_VM_ALLOC
455static inline struct kvm *kvm_arch_alloc_vm(void)
456{
457 return kzalloc(sizeof(struct kvm), GFP_KERNEL);
458}
459
460static inline void kvm_arch_free_vm(struct kvm *kvm)
461{
462 kfree(kvm);
463}
464#endif
465
466int kvm_arch_init_vm(struct kvm *kvm);
402void kvm_arch_destroy_vm(struct kvm *kvm); 467void kvm_arch_destroy_vm(struct kvm *kvm);
403void kvm_free_all_assigned_devices(struct kvm *kvm); 468void kvm_free_all_assigned_devices(struct kvm *kvm);
404void kvm_arch_sync_events(struct kvm *kvm); 469void kvm_arch_sync_events(struct kvm *kvm);
@@ -414,16 +479,8 @@ struct kvm_irq_ack_notifier {
414 void (*irq_acked)(struct kvm_irq_ack_notifier *kian); 479 void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
415}; 480};
416 481
417#define KVM_ASSIGNED_MSIX_PENDING 0x1
418struct kvm_guest_msix_entry {
419 u32 vector;
420 u16 entry;
421 u16 flags;
422};
423
424struct kvm_assigned_dev_kernel { 482struct kvm_assigned_dev_kernel {
425 struct kvm_irq_ack_notifier ack_notifier; 483 struct kvm_irq_ack_notifier ack_notifier;
426 struct work_struct interrupt_work;
427 struct list_head list; 484 struct list_head list;
428 int assigned_dev_id; 485 int assigned_dev_id;
429 int host_segnr; 486 int host_segnr;
@@ -434,13 +491,14 @@ struct kvm_assigned_dev_kernel {
434 bool host_irq_disabled; 491 bool host_irq_disabled;
435 struct msix_entry *host_msix_entries; 492 struct msix_entry *host_msix_entries;
436 int guest_irq; 493 int guest_irq;
437 struct kvm_guest_msix_entry *guest_msix_entries; 494 struct msix_entry *guest_msix_entries;
438 unsigned long irq_requested_type; 495 unsigned long irq_requested_type;
439 int irq_source_id; 496 int irq_source_id;
440 int flags; 497 int flags;
441 struct pci_dev *dev; 498 struct pci_dev *dev;
442 struct kvm *kvm; 499 struct kvm *kvm;
443 spinlock_t assigned_dev_lock; 500 spinlock_t intx_lock;
501 char irq_name[32];
444}; 502};
445 503
446struct kvm_irq_mask_notifier { 504struct kvm_irq_mask_notifier {
@@ -462,6 +520,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
462 unsigned long *deliver_bitmask); 520 unsigned long *deliver_bitmask);
463#endif 521#endif
464int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); 522int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
523int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
524 int irq_source_id, int level);
465void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 525void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
466void kvm_register_irq_ack_notifier(struct kvm *kvm, 526void kvm_register_irq_ack_notifier(struct kvm *kvm,
467 struct kvm_irq_ack_notifier *kian); 527 struct kvm_irq_ack_notifier *kian);
@@ -603,17 +663,28 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
603void kvm_eventfd_init(struct kvm *kvm); 663void kvm_eventfd_init(struct kvm *kvm);
604int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); 664int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
605void kvm_irqfd_release(struct kvm *kvm); 665void kvm_irqfd_release(struct kvm *kvm);
666void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
606int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); 667int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
607 668
608#else 669#else
609 670
610static inline void kvm_eventfd_init(struct kvm *kvm) {} 671static inline void kvm_eventfd_init(struct kvm *kvm) {}
672
611static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) 673static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
612{ 674{
613 return -EINVAL; 675 return -EINVAL;
614} 676}
615 677
616static inline void kvm_irqfd_release(struct kvm *kvm) {} 678static inline void kvm_irqfd_release(struct kvm *kvm) {}
679
680#ifdef CONFIG_HAVE_KVM_IRQCHIP
681static inline void kvm_irq_routing_update(struct kvm *kvm,
682 struct kvm_irq_routing_table *irq_rt)
683{
684 rcu_assign_pointer(kvm->irq_routing, irq_rt);
685}
686#endif
687
617static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 688static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
618{ 689{
619 return -ENOSYS; 690 return -ENOSYS;
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 7ac0d4eee430..fa7cc7244cbd 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -67,4 +67,11 @@ struct kvm_lapic_irq {
67 u32 dest_id; 67 u32 dest_id;
68}; 68};
69 69
70struct gfn_to_hva_cache {
71 u64 generation;
72 gpa_t gpa;
73 unsigned long hva;
74 struct kvm_memory_slot *memslot;
75};
76
70#endif /* __KVM_TYPES_H__ */ 77#endif /* __KVM_TYPES_H__ */
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 6dd3a51ab1cb..46e3cd8e197a 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -6,6 +6,36 @@
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8 8
9#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
10
11#define kvm_trace_exit_reason \
12 ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \
13 ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \
14 ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
15 ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
16 ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
17
18TRACE_EVENT(kvm_userspace_exit,
19 TP_PROTO(__u32 reason, int errno),
20 TP_ARGS(reason, errno),
21
22 TP_STRUCT__entry(
23 __field( __u32, reason )
24 __field( int, errno )
25 ),
26
27 TP_fast_assign(
28 __entry->reason = reason;
29 __entry->errno = errno;
30 ),
31
32 TP_printk("reason %s (%d)",
33 __entry->errno < 0 ?
34 (__entry->errno == -EINTR ? "restart" : "error") :
35 __print_symbolic(__entry->reason, kvm_trace_exit_reason),
36 __entry->errno < 0 ? -__entry->errno : __entry->reason)
37);
38
9#if defined(__KVM_HAVE_IOAPIC) 39#if defined(__KVM_HAVE_IOAPIC)
10TRACE_EVENT(kvm_set_irq, 40TRACE_EVENT(kvm_set_irq,
11 TP_PROTO(unsigned int gsi, int level, int irq_source_id), 41 TP_PROTO(unsigned int gsi, int level, int irq_source_id),
@@ -185,6 +215,97 @@ TRACE_EVENT(kvm_age_page,
185 __entry->referenced ? "YOUNG" : "OLD") 215 __entry->referenced ? "YOUNG" : "OLD")
186); 216);
187 217
218#ifdef CONFIG_KVM_ASYNC_PF
219DECLARE_EVENT_CLASS(kvm_async_get_page_class,
220
221 TP_PROTO(u64 gva, u64 gfn),
222
223 TP_ARGS(gva, gfn),
224
225 TP_STRUCT__entry(
226 __field(__u64, gva)
227 __field(u64, gfn)
228 ),
229
230 TP_fast_assign(
231 __entry->gva = gva;
232 __entry->gfn = gfn;
233 ),
234
235 TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
236);
237
238DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
239
240 TP_PROTO(u64 gva, u64 gfn),
241
242 TP_ARGS(gva, gfn)
243);
244
245DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
246
247 TP_PROTO(u64 gva, u64 gfn),
248
249 TP_ARGS(gva, gfn)
250);
251
252DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
253
254 TP_PROTO(u64 token, u64 gva),
255
256 TP_ARGS(token, gva),
257
258 TP_STRUCT__entry(
259 __field(__u64, token)
260 __field(__u64, gva)
261 ),
262
263 TP_fast_assign(
264 __entry->token = token;
265 __entry->gva = gva;
266 ),
267
268 TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
269
270);
271
272DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
273
274 TP_PROTO(u64 token, u64 gva),
275
276 TP_ARGS(token, gva)
277);
278
279DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
280
281 TP_PROTO(u64 token, u64 gva),
282
283 TP_ARGS(token, gva)
284);
285
286TRACE_EVENT(
287 kvm_async_pf_completed,
288 TP_PROTO(unsigned long address, struct page *page, u64 gva),
289 TP_ARGS(address, page, gva),
290
291 TP_STRUCT__entry(
292 __field(unsigned long, address)
293 __field(pfn_t, pfn)
294 __field(u64, gva)
295 ),
296
297 TP_fast_assign(
298 __entry->address = address;
299 __entry->pfn = page ? page_to_pfn(page) : 0;
300 __entry->gva = gva;
301 ),
302
303 TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
304 __entry->address, __entry->pfn)
305);
306
307#endif
308
188#endif /* _TRACE_KVM_MAIN_H */ 309#endif /* _TRACE_KVM_MAIN_H */
189 310
190/* This part must be outside protection */ 311/* This part must be outside protection */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7f1178f6b839..f63ccb0a5982 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
15 15
16config KVM_MMIO 16config KVM_MMIO
17 bool 17 bool
18
19config KVM_ASYNC_PF
20 bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7c98928b09d9..ae72ae604c89 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
55 return index; 55 return index;
56} 56}
57 57
58static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 58static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
59{ 59{
60 struct kvm_assigned_dev_kernel *assigned_dev; 60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int i; 61 u32 vector;
62 int index;
62 63
63 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 64 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
64 interrupt_work); 65 spin_lock(&assigned_dev->intx_lock);
66 disable_irq_nosync(irq);
67 assigned_dev->host_irq_disabled = true;
68 spin_unlock(&assigned_dev->intx_lock);
69 }
65 70
66 spin_lock_irq(&assigned_dev->assigned_dev_lock);
67 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 71 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
68 struct kvm_guest_msix_entry *guest_entries = 72 index = find_index_from_host_irq(assigned_dev, irq);
69 assigned_dev->guest_msix_entries; 73 if (index >= 0) {
70 for (i = 0; i < assigned_dev->entries_nr; i++) { 74 vector = assigned_dev->
71 if (!(guest_entries[i].flags & 75 guest_msix_entries[index].vector;
72 KVM_ASSIGNED_MSIX_PENDING))
73 continue;
74 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
75 kvm_set_irq(assigned_dev->kvm, 76 kvm_set_irq(assigned_dev->kvm,
76 assigned_dev->irq_source_id, 77 assigned_dev->irq_source_id, vector, 1);
77 guest_entries[i].vector, 1);
78 } 78 }
79 } else 79 } else
80 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 80 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
81 assigned_dev->guest_irq, 1); 81 assigned_dev->guest_irq, 1);
82 82
83 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
84}
85
86static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
87{
88 unsigned long flags;
89 struct kvm_assigned_dev_kernel *assigned_dev =
90 (struct kvm_assigned_dev_kernel *) dev_id;
91
92 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
93 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
94 int index = find_index_from_host_irq(assigned_dev, irq);
95 if (index < 0)
96 goto out;
97 assigned_dev->guest_msix_entries[index].flags |=
98 KVM_ASSIGNED_MSIX_PENDING;
99 }
100
101 schedule_work(&assigned_dev->interrupt_work);
102
103 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
104 disable_irq_nosync(irq);
105 assigned_dev->host_irq_disabled = true;
106 }
107
108out:
109 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
110 return IRQ_HANDLED; 83 return IRQ_HANDLED;
111} 84}
112 85
@@ -114,7 +87,6 @@ out:
114static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 87static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
115{ 88{
116 struct kvm_assigned_dev_kernel *dev; 89 struct kvm_assigned_dev_kernel *dev;
117 unsigned long flags;
118 90
119 if (kian->gsi == -1) 91 if (kian->gsi == -1)
120 return; 92 return;
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
127 /* The guest irq may be shared so this ack may be 99 /* The guest irq may be shared so this ack may be
128 * from another device. 100 * from another device.
129 */ 101 */
130 spin_lock_irqsave(&dev->assigned_dev_lock, flags); 102 spin_lock(&dev->intx_lock);
131 if (dev->host_irq_disabled) { 103 if (dev->host_irq_disabled) {
132 enable_irq(dev->host_irq); 104 enable_irq(dev->host_irq);
133 dev->host_irq_disabled = false; 105 dev->host_irq_disabled = false;
134 } 106 }
135 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); 107 spin_unlock(&dev->intx_lock);
136} 108}
137 109
138static void deassign_guest_irq(struct kvm *kvm, 110static void deassign_guest_irq(struct kvm *kvm,
@@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm,
141 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 113 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
142 assigned_dev->ack_notifier.gsi = -1; 114 assigned_dev->ack_notifier.gsi = -1;
143 115
116 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
117 assigned_dev->guest_irq, 0);
118
144 if (assigned_dev->irq_source_id != -1) 119 if (assigned_dev->irq_source_id != -1)
145 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 120 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
146 assigned_dev->irq_source_id = -1; 121 assigned_dev->irq_source_id = -1;
@@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm,
152 struct kvm_assigned_dev_kernel *assigned_dev) 127 struct kvm_assigned_dev_kernel *assigned_dev)
153{ 128{
154 /* 129 /*
155 * In kvm_free_device_irq, cancel_work_sync return true if: 130 * We disable irq here to prevent further events.
156 * 1. work is scheduled, and then cancelled.
157 * 2. work callback is executed.
158 *
159 * The first one ensured that the irq is disabled and no more events
160 * would happen. But for the second one, the irq may be enabled (e.g.
161 * for MSI). So we disable irq here to prevent further events.
162 * 131 *
163 * Notice this maybe result in nested disable if the interrupt type is 132 * Notice this maybe result in nested disable if the interrupt type is
164 * INTx, but it's OK for we are going to free it. 133 * INTx, but it's OK for we are going to free it.
165 * 134 *
166 * If this function is a part of VM destroy, please ensure that till 135 * If this function is a part of VM destroy, please ensure that till
167 * now, the kvm state is still legal for probably we also have to wait 136 * now, the kvm state is still legal for probably we also have to wait
168 * interrupt_work done. 137 * on a currently running IRQ handler.
169 */ 138 */
170 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 139 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
171 int i; 140 int i;
172 for (i = 0; i < assigned_dev->entries_nr; i++) 141 for (i = 0; i < assigned_dev->entries_nr; i++)
173 disable_irq_nosync(assigned_dev-> 142 disable_irq(assigned_dev->host_msix_entries[i].vector);
174 host_msix_entries[i].vector);
175
176 cancel_work_sync(&assigned_dev->interrupt_work);
177 143
178 for (i = 0; i < assigned_dev->entries_nr; i++) 144 for (i = 0; i < assigned_dev->entries_nr; i++)
179 free_irq(assigned_dev->host_msix_entries[i].vector, 145 free_irq(assigned_dev->host_msix_entries[i].vector,
@@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm,
185 pci_disable_msix(assigned_dev->dev); 151 pci_disable_msix(assigned_dev->dev);
186 } else { 152 } else {
187 /* Deal with MSI and INTx */ 153 /* Deal with MSI and INTx */
188 disable_irq_nosync(assigned_dev->host_irq); 154 disable_irq(assigned_dev->host_irq);
189 cancel_work_sync(&assigned_dev->interrupt_work);
190 155
191 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 156 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
192 157
@@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
232{ 197{
233 kvm_free_assigned_irq(kvm, assigned_dev); 198 kvm_free_assigned_irq(kvm, assigned_dev);
234 199
235 pci_reset_function(assigned_dev->dev); 200 __pci_reset_function(assigned_dev->dev);
201 pci_restore_state(assigned_dev->dev);
236 202
237 pci_release_regions(assigned_dev->dev); 203 pci_release_regions(assigned_dev->dev);
238 pci_disable_device(assigned_dev->dev); 204 pci_disable_device(assigned_dev->dev);
@@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
265 * on the same interrupt line is not a happy situation: there 231 * on the same interrupt line is not a happy situation: there
266 * are going to be long delays in accepting, acking, etc. 232 * are going to be long delays in accepting, acking, etc.
267 */ 233 */
268 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 234 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
269 0, "kvm_assigned_intx_device", (void *)dev)) 235 IRQF_ONESHOT, dev->irq_name, (void *)dev))
270 return -EIO; 236 return -EIO;
271 return 0; 237 return 0;
272} 238}
@@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
284 } 250 }
285 251
286 dev->host_irq = dev->dev->irq; 252 dev->host_irq = dev->dev->irq;
287 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 253 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
288 "kvm_assigned_msi_device", (void *)dev)) { 254 0, dev->irq_name, (void *)dev)) {
289 pci_disable_msi(dev->dev); 255 pci_disable_msi(dev->dev);
290 return -EIO; 256 return -EIO;
291 } 257 }
@@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
310 return r; 276 return r;
311 277
312 for (i = 0; i < dev->entries_nr; i++) { 278 for (i = 0; i < dev->entries_nr; i++) {
313 r = request_irq(dev->host_msix_entries[i].vector, 279 r = request_threaded_irq(dev->host_msix_entries[i].vector,
314 kvm_assigned_dev_intr, 0, 280 NULL, kvm_assigned_dev_thread,
315 "kvm_assigned_msix_device", 281 0, dev->irq_name, (void *)dev);
316 (void *)dev);
317 if (r) 282 if (r)
318 goto err; 283 goto err;
319 } 284 }
@@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm,
370 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 335 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
371 return r; 336 return r;
372 337
338 snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
339 pci_name(dev->dev));
340
373 switch (host_irq_type) { 341 switch (host_irq_type) {
374 case KVM_DEV_IRQ_HOST_INTX: 342 case KVM_DEV_IRQ_HOST_INTX:
375 r = assigned_device_enable_host_intx(kvm, dev); 343 r = assigned_device_enable_host_intx(kvm, dev);
@@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
547 } 515 }
548 516
549 pci_reset_function(dev); 517 pci_reset_function(dev);
518 pci_save_state(dev);
550 519
551 match->assigned_dev_id = assigned_dev->assigned_dev_id; 520 match->assigned_dev_id = assigned_dev->assigned_dev_id;
552 match->host_segnr = assigned_dev->segnr; 521 match->host_segnr = assigned_dev->segnr;
@@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
554 match->host_devfn = assigned_dev->devfn; 523 match->host_devfn = assigned_dev->devfn;
555 match->flags = assigned_dev->flags; 524 match->flags = assigned_dev->flags;
556 match->dev = dev; 525 match->dev = dev;
557 spin_lock_init(&match->assigned_dev_lock); 526 spin_lock_init(&match->intx_lock);
558 match->irq_source_id = -1; 527 match->irq_source_id = -1;
559 match->kvm = kvm; 528 match->kvm = kvm;
560 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 529 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
561 INIT_WORK(&match->interrupt_work,
562 kvm_assigned_dev_interrupt_work_handler);
563 530
564 list_add(&match->list, &kvm->arch.assigned_dev_head); 531 list_add(&match->list, &kvm->arch.assigned_dev_head);
565 532
@@ -579,6 +546,7 @@ out:
579 mutex_unlock(&kvm->lock); 546 mutex_unlock(&kvm->lock);
580 return r; 547 return r;
581out_list_del: 548out_list_del:
549 pci_restore_state(dev);
582 list_del(&match->list); 550 list_del(&match->list);
583 pci_release_regions(dev); 551 pci_release_regions(dev);
584out_disable: 552out_disable:
@@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
651 r = -ENOMEM; 619 r = -ENOMEM;
652 goto msix_nr_out; 620 goto msix_nr_out;
653 } 621 }
654 adev->guest_msix_entries = kzalloc( 622 adev->guest_msix_entries =
655 sizeof(struct kvm_guest_msix_entry) * 623 kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
656 entry_nr->entry_nr, GFP_KERNEL); 624 GFP_KERNEL);
657 if (!adev->guest_msix_entries) { 625 if (!adev->guest_msix_entries) {
658 kfree(adev->host_msix_entries); 626 kfree(adev->host_msix_entries);
659 r = -ENOMEM; 627 r = -ENOMEM;
@@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
706 unsigned long arg) 674 unsigned long arg)
707{ 675{
708 void __user *argp = (void __user *)arg; 676 void __user *argp = (void __user *)arg;
709 int r = -ENOTTY; 677 int r;
710 678
711 switch (ioctl) { 679 switch (ioctl) {
712 case KVM_ASSIGN_PCI_DEVICE: { 680 case KVM_ASSIGN_PCI_DEVICE: {
@@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
724 r = -EOPNOTSUPP; 692 r = -EOPNOTSUPP;
725 break; 693 break;
726 } 694 }
727#ifdef KVM_CAP_ASSIGN_DEV_IRQ
728 case KVM_ASSIGN_DEV_IRQ: { 695 case KVM_ASSIGN_DEV_IRQ: {
729 struct kvm_assigned_irq assigned_irq; 696 struct kvm_assigned_irq assigned_irq;
730 697
@@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
747 goto out; 714 goto out;
748 break; 715 break;
749 } 716 }
750#endif
751#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
752 case KVM_DEASSIGN_PCI_DEVICE: { 717 case KVM_DEASSIGN_PCI_DEVICE: {
753 struct kvm_assigned_pci_dev assigned_dev; 718 struct kvm_assigned_pci_dev assigned_dev;
754 719
@@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
760 goto out; 725 goto out;
761 break; 726 break;
762 } 727 }
763#endif
764#ifdef KVM_CAP_IRQ_ROUTING 728#ifdef KVM_CAP_IRQ_ROUTING
765 case KVM_SET_GSI_ROUTING: { 729 case KVM_SET_GSI_ROUTING: {
766 struct kvm_irq_routing routing; 730 struct kvm_irq_routing routing;
@@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
813 break; 777 break;
814 } 778 }
815#endif 779#endif
780 default:
781 r = -ENOTTY;
782 break;
816 } 783 }
817out: 784out:
818 return r; 785 return r;
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 000000000000..74268b4c2ee1
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,216 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/slab.h>
25#include <linux/module.h>
26#include <linux/mmu_context.h>
27
28#include "async_pf.h"
29#include <trace/events/kvm.h>
30
31static struct kmem_cache *async_pf_cache;
32
33int kvm_async_pf_init(void)
34{
35 async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
36
37 if (!async_pf_cache)
38 return -ENOMEM;
39
40 return 0;
41}
42
43void kvm_async_pf_deinit(void)
44{
45 if (async_pf_cache)
46 kmem_cache_destroy(async_pf_cache);
47 async_pf_cache = NULL;
48}
49
50void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
51{
52 INIT_LIST_HEAD(&vcpu->async_pf.done);
53 INIT_LIST_HEAD(&vcpu->async_pf.queue);
54 spin_lock_init(&vcpu->async_pf.lock);
55}
56
57static void async_pf_execute(struct work_struct *work)
58{
59 struct page *page = NULL;
60 struct kvm_async_pf *apf =
61 container_of(work, struct kvm_async_pf, work);
62 struct mm_struct *mm = apf->mm;
63 struct kvm_vcpu *vcpu = apf->vcpu;
64 unsigned long addr = apf->addr;
65 gva_t gva = apf->gva;
66
67 might_sleep();
68
69 use_mm(mm);
70 down_read(&mm->mmap_sem);
71 get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
72 up_read(&mm->mmap_sem);
73 unuse_mm(mm);
74
75 spin_lock(&vcpu->async_pf.lock);
76 list_add_tail(&apf->link, &vcpu->async_pf.done);
77 apf->page = page;
78 apf->done = true;
79 spin_unlock(&vcpu->async_pf.lock);
80
81 /*
82 * apf may be freed by kvm_check_async_pf_completion() after
83 * this point
84 */
85
86 trace_kvm_async_pf_completed(addr, page, gva);
87
88 if (waitqueue_active(&vcpu->wq))
89 wake_up_interruptible(&vcpu->wq);
90
91 mmdrop(mm);
92 kvm_put_kvm(vcpu->kvm);
93}
94
95void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
96{
97 /* cancel outstanding work queue item */
98 while (!list_empty(&vcpu->async_pf.queue)) {
99 struct kvm_async_pf *work =
100 list_entry(vcpu->async_pf.queue.next,
101 typeof(*work), queue);
102 cancel_work_sync(&work->work);
103 list_del(&work->queue);
104 if (!work->done) /* work was canceled */
105 kmem_cache_free(async_pf_cache, work);
106 }
107
108 spin_lock(&vcpu->async_pf.lock);
109 while (!list_empty(&vcpu->async_pf.done)) {
110 struct kvm_async_pf *work =
111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link);
113 list_del(&work->link);
114 if (work->page)
115 put_page(work->page);
116 kmem_cache_free(async_pf_cache, work);
117 }
118 spin_unlock(&vcpu->async_pf.lock);
119
120 vcpu->async_pf.queued = 0;
121}
122
123void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
124{
125 struct kvm_async_pf *work;
126
127 while (!list_empty_careful(&vcpu->async_pf.done) &&
128 kvm_arch_can_inject_async_page_present(vcpu)) {
129 spin_lock(&vcpu->async_pf.lock);
130 work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
131 link);
132 list_del(&work->link);
133 spin_unlock(&vcpu->async_pf.lock);
134
135 if (work->page)
136 kvm_arch_async_page_ready(vcpu, work);
137 kvm_arch_async_page_present(vcpu, work);
138
139 list_del(&work->queue);
140 vcpu->async_pf.queued--;
141 if (work->page)
142 put_page(work->page);
143 kmem_cache_free(async_pf_cache, work);
144 }
145}
146
147int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
148 struct kvm_arch_async_pf *arch)
149{
150 struct kvm_async_pf *work;
151
152 if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
153 return 0;
154
155 /* setup delayed work */
156
157 /*
158 * do alloc nowait since if we are going to sleep anyway we
159 * may as well sleep faulting in page
160 */
161 work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
162 if (!work)
163 return 0;
164
165 work->page = NULL;
166 work->done = false;
167 work->vcpu = vcpu;
168 work->gva = gva;
169 work->addr = gfn_to_hva(vcpu->kvm, gfn);
170 work->arch = *arch;
171 work->mm = current->mm;
172 atomic_inc(&work->mm->mm_count);
173 kvm_get_kvm(work->vcpu->kvm);
174
175 /* this can't really happen otherwise gfn_to_pfn_async
176 would succeed */
177 if (unlikely(kvm_is_error_hva(work->addr)))
178 goto retry_sync;
179
180 INIT_WORK(&work->work, async_pf_execute);
181 if (!schedule_work(&work->work))
182 goto retry_sync;
183
184 list_add_tail(&work->queue, &vcpu->async_pf.queue);
185 vcpu->async_pf.queued++;
186 kvm_arch_async_page_not_present(vcpu, work);
187 return 1;
188retry_sync:
189 kvm_put_kvm(work->vcpu->kvm);
190 mmdrop(work->mm);
191 kmem_cache_free(async_pf_cache, work);
192 return 0;
193}
194
195int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
196{
197 struct kvm_async_pf *work;
198
199 if (!list_empty_careful(&vcpu->async_pf.done))
200 return 0;
201
202 work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
203 if (!work)
204 return -ENOMEM;
205
206 work->page = bad_page;
207 get_page(bad_page);
208 INIT_LIST_HEAD(&work->queue); /* for list_del to work */
209
210 spin_lock(&vcpu->async_pf.lock);
211 list_add_tail(&work->link, &vcpu->async_pf.done);
212 spin_unlock(&vcpu->async_pf.lock);
213
214 vcpu->async_pf.queued++;
215 return 0;
216}
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 000000000000..e7ef6447cb82
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#ifndef __KVM_ASYNC_PF_H__
24#define __KVM_ASYNC_PF_H__
25
26#ifdef CONFIG_KVM_ASYNC_PF
27int kvm_async_pf_init(void);
28void kvm_async_pf_deinit(void);
29void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
30#else
31#define kvm_async_pf_init() (0)
32#define kvm_async_pf_deinit() do{}while(0)
33#define kvm_async_pf_vcpu_init(C) do{}while(0)
34#endif
35
36#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c1f1e3c62984..2ca4535f4fb7 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -44,14 +44,19 @@
44 */ 44 */
45 45
46struct _irqfd { 46struct _irqfd {
47 struct kvm *kvm; 47 /* Used for MSI fast-path */
48 struct eventfd_ctx *eventfd; 48 struct kvm *kvm;
49 int gsi; 49 wait_queue_t wait;
50 struct list_head list; 50 /* Update side is protected by irqfds.lock */
51 poll_table pt; 51 struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
52 wait_queue_t wait; 52 /* Used for level IRQ fast-path */
53 struct work_struct inject; 53 int gsi;
54 struct work_struct shutdown; 54 struct work_struct inject;
55 /* Used for setup/shutdown */
56 struct eventfd_ctx *eventfd;
57 struct list_head list;
58 poll_table pt;
59 struct work_struct shutdown;
55}; 60};
56 61
57static struct workqueue_struct *irqfd_cleanup_wq; 62static struct workqueue_struct *irqfd_cleanup_wq;
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
125{ 130{
126 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); 131 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
127 unsigned long flags = (unsigned long)key; 132 unsigned long flags = (unsigned long)key;
133 struct kvm_kernel_irq_routing_entry *irq;
134 struct kvm *kvm = irqfd->kvm;
128 135
129 if (flags & POLLIN) 136 if (flags & POLLIN) {
137 rcu_read_lock();
138 irq = rcu_dereference(irqfd->irq_entry);
130 /* An event has been signaled, inject an interrupt */ 139 /* An event has been signaled, inject an interrupt */
131 schedule_work(&irqfd->inject); 140 if (irq)
141 kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
142 else
143 schedule_work(&irqfd->inject);
144 rcu_read_unlock();
145 }
132 146
133 if (flags & POLLHUP) { 147 if (flags & POLLHUP) {
134 /* The eventfd is closing, detach from KVM */ 148 /* The eventfd is closing, detach from KVM */
135 struct kvm *kvm = irqfd->kvm;
136 unsigned long flags; 149 unsigned long flags;
137 150
138 spin_lock_irqsave(&kvm->irqfds.lock, flags); 151 spin_lock_irqsave(&kvm->irqfds.lock, flags);
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
163 add_wait_queue(wqh, &irqfd->wait); 176 add_wait_queue(wqh, &irqfd->wait);
164} 177}
165 178
179/* Must be called under irqfds.lock */
180static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
181 struct kvm_irq_routing_table *irq_rt)
182{
183 struct kvm_kernel_irq_routing_entry *e;
184 struct hlist_node *n;
185
186 if (irqfd->gsi >= irq_rt->nr_rt_entries) {
187 rcu_assign_pointer(irqfd->irq_entry, NULL);
188 return;
189 }
190
191 hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
192 /* Only fast-path MSI. */
193 if (e->type == KVM_IRQ_ROUTING_MSI)
194 rcu_assign_pointer(irqfd->irq_entry, e);
195 else
196 rcu_assign_pointer(irqfd->irq_entry, NULL);
197 }
198}
199
166static int 200static int
167kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) 201kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
168{ 202{
203 struct kvm_irq_routing_table *irq_rt;
169 struct _irqfd *irqfd, *tmp; 204 struct _irqfd *irqfd, *tmp;
170 struct file *file = NULL; 205 struct file *file = NULL;
171 struct eventfd_ctx *eventfd = NULL; 206 struct eventfd_ctx *eventfd = NULL;
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
215 goto fail; 250 goto fail;
216 } 251 }
217 252
253 irq_rt = rcu_dereference_protected(kvm->irq_routing,
254 lockdep_is_held(&kvm->irqfds.lock));
255 irqfd_update(kvm, irqfd, irq_rt);
256
218 events = file->f_op->poll(file, &irqfd->pt); 257 events = file->f_op->poll(file, &irqfd->pt);
219 258
220 list_add_tail(&irqfd->list, &kvm->irqfds.items); 259 list_add_tail(&irqfd->list, &kvm->irqfds.items);
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
271 spin_lock_irq(&kvm->irqfds.lock); 310 spin_lock_irq(&kvm->irqfds.lock);
272 311
273 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 312 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
274 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) 313 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
314 /*
315 * This rcu_assign_pointer is needed for when
316 * another thread calls kvm_irqfd_update before
317 * we flush workqueue below.
318 * It is paired with synchronize_rcu done by caller
319 * of that function.
320 */
321 rcu_assign_pointer(irqfd->irq_entry, NULL);
275 irqfd_deactivate(irqfd); 322 irqfd_deactivate(irqfd);
323 }
276 } 324 }
277 325
278 spin_unlock_irq(&kvm->irqfds.lock); 326 spin_unlock_irq(&kvm->irqfds.lock);
@@ -322,6 +370,25 @@ kvm_irqfd_release(struct kvm *kvm)
322} 370}
323 371
324/* 372/*
373 * Change irq_routing and irqfd.
374 * Caller must invoke synchronize_rcu afterwards.
375 */
376void kvm_irq_routing_update(struct kvm *kvm,
377 struct kvm_irq_routing_table *irq_rt)
378{
379 struct _irqfd *irqfd;
380
381 spin_lock_irq(&kvm->irqfds.lock);
382
383 rcu_assign_pointer(kvm->irq_routing, irq_rt);
384
385 list_for_each_entry(irqfd, &kvm->irqfds.items, list)
386 irqfd_update(kvm, irqfd, irq_rt);
387
388 spin_unlock_irq(&kvm->irqfds.lock);
389}
390
391/*
325 * create a host-wide workqueue for issuing deferred shutdown requests 392 * create a host-wide workqueue for issuing deferred shutdown requests
326 * aggregated from all vm* instances. We need our own isolated single-thread 393 * aggregated from all vm* instances. We need our own isolated single-thread
327 * queue to prevent deadlock against flushing the normal work-queue. 394 * queue to prevent deadlock against flushing the normal work-queue.
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 8edca9141b78..9f614b4e365f 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
114 return r; 114 return r;
115} 115}
116 116
117static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 117int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
118 struct kvm *kvm, int irq_source_id, int level) 118 struct kvm *kvm, int irq_source_id, int level)
119{ 119{
120 struct kvm_lapic_irq irq; 120 struct kvm_lapic_irq irq;
121 121
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
409 409
410 mutex_lock(&kvm->irq_lock); 410 mutex_lock(&kvm->irq_lock);
411 old = kvm->irq_routing; 411 old = kvm->irq_routing;
412 rcu_assign_pointer(kvm->irq_routing, new); 412 kvm_irq_routing_update(kvm, new);
413 mutex_unlock(&kvm->irq_lock); 413 mutex_unlock(&kvm->irq_lock);
414
414 synchronize_rcu(); 415 synchronize_rcu();
415 416
416 new = old; 417 new = old;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5225052aebc1..7f686251f711 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -55,6 +55,7 @@
55#include <asm-generic/bitops/le.h> 55#include <asm-generic/bitops/le.h>
56 56
57#include "coalesced_mmio.h" 57#include "coalesced_mmio.h"
58#include "async_pf.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/kvm.h> 61#include <trace/events/kvm.h>
@@ -89,7 +90,8 @@ static void hardware_disable_all(void);
89 90
90static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 91static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91 92
92static bool kvm_rebooting; 93bool kvm_rebooting;
94EXPORT_SYMBOL_GPL(kvm_rebooting);
93 95
94static bool largepages_enabled = true; 96static bool largepages_enabled = true;
95 97
@@ -167,8 +169,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
167 169
168void kvm_flush_remote_tlbs(struct kvm *kvm) 170void kvm_flush_remote_tlbs(struct kvm *kvm)
169{ 171{
172 int dirty_count = kvm->tlbs_dirty;
173
174 smp_mb();
170 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 175 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
171 ++kvm->stat.remote_tlb_flush; 176 ++kvm->stat.remote_tlb_flush;
177 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
172} 178}
173 179
174void kvm_reload_remote_mmus(struct kvm *kvm) 180void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -186,6 +192,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
186 vcpu->kvm = kvm; 192 vcpu->kvm = kvm;
187 vcpu->vcpu_id = id; 193 vcpu->vcpu_id = id;
188 init_waitqueue_head(&vcpu->wq); 194 init_waitqueue_head(&vcpu->wq);
195 kvm_async_pf_vcpu_init(vcpu);
189 196
190 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 197 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
191 if (!page) { 198 if (!page) {
@@ -247,7 +254,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
247 idx = srcu_read_lock(&kvm->srcu); 254 idx = srcu_read_lock(&kvm->srcu);
248 spin_lock(&kvm->mmu_lock); 255 spin_lock(&kvm->mmu_lock);
249 kvm->mmu_notifier_seq++; 256 kvm->mmu_notifier_seq++;
250 need_tlb_flush = kvm_unmap_hva(kvm, address); 257 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
251 spin_unlock(&kvm->mmu_lock); 258 spin_unlock(&kvm->mmu_lock);
252 srcu_read_unlock(&kvm->srcu, idx); 259 srcu_read_unlock(&kvm->srcu, idx);
253 260
@@ -291,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
291 kvm->mmu_notifier_count++; 298 kvm->mmu_notifier_count++;
292 for (; start < end; start += PAGE_SIZE) 299 for (; start < end; start += PAGE_SIZE)
293 need_tlb_flush |= kvm_unmap_hva(kvm, start); 300 need_tlb_flush |= kvm_unmap_hva(kvm, start);
301 need_tlb_flush |= kvm->tlbs_dirty;
294 spin_unlock(&kvm->mmu_lock); 302 spin_unlock(&kvm->mmu_lock);
295 srcu_read_unlock(&kvm->srcu, idx); 303 srcu_read_unlock(&kvm->srcu, idx);
296 304
@@ -381,11 +389,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
381 389
382static struct kvm *kvm_create_vm(void) 390static struct kvm *kvm_create_vm(void)
383{ 391{
384 int r = 0, i; 392 int r, i;
385 struct kvm *kvm = kvm_arch_create_vm(); 393 struct kvm *kvm = kvm_arch_alloc_vm();
386 394
387 if (IS_ERR(kvm)) 395 if (!kvm)
388 goto out; 396 return ERR_PTR(-ENOMEM);
397
398 r = kvm_arch_init_vm(kvm);
399 if (r)
400 goto out_err_nodisable;
389 401
390 r = hardware_enable_all(); 402 r = hardware_enable_all();
391 if (r) 403 if (r)
@@ -399,23 +411,19 @@ static struct kvm *kvm_create_vm(void)
399 r = -ENOMEM; 411 r = -ENOMEM;
400 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 412 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
401 if (!kvm->memslots) 413 if (!kvm->memslots)
402 goto out_err; 414 goto out_err_nosrcu;
403 if (init_srcu_struct(&kvm->srcu)) 415 if (init_srcu_struct(&kvm->srcu))
404 goto out_err; 416 goto out_err_nosrcu;
405 for (i = 0; i < KVM_NR_BUSES; i++) { 417 for (i = 0; i < KVM_NR_BUSES; i++) {
406 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 418 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
407 GFP_KERNEL); 419 GFP_KERNEL);
408 if (!kvm->buses[i]) { 420 if (!kvm->buses[i])
409 cleanup_srcu_struct(&kvm->srcu);
410 goto out_err; 421 goto out_err;
411 }
412 } 422 }
413 423
414 r = kvm_init_mmu_notifier(kvm); 424 r = kvm_init_mmu_notifier(kvm);
415 if (r) { 425 if (r)
416 cleanup_srcu_struct(&kvm->srcu);
417 goto out_err; 426 goto out_err;
418 }
419 427
420 kvm->mm = current->mm; 428 kvm->mm = current->mm;
421 atomic_inc(&kvm->mm->mm_count); 429 atomic_inc(&kvm->mm->mm_count);
@@ -429,19 +437,35 @@ static struct kvm *kvm_create_vm(void)
429 spin_lock(&kvm_lock); 437 spin_lock(&kvm_lock);
430 list_add(&kvm->vm_list, &vm_list); 438 list_add(&kvm->vm_list, &vm_list);
431 spin_unlock(&kvm_lock); 439 spin_unlock(&kvm_lock);
432out: 440
433 return kvm; 441 return kvm;
434 442
435out_err: 443out_err:
444 cleanup_srcu_struct(&kvm->srcu);
445out_err_nosrcu:
436 hardware_disable_all(); 446 hardware_disable_all();
437out_err_nodisable: 447out_err_nodisable:
438 for (i = 0; i < KVM_NR_BUSES; i++) 448 for (i = 0; i < KVM_NR_BUSES; i++)
439 kfree(kvm->buses[i]); 449 kfree(kvm->buses[i]);
440 kfree(kvm->memslots); 450 kfree(kvm->memslots);
441 kfree(kvm); 451 kvm_arch_free_vm(kvm);
442 return ERR_PTR(r); 452 return ERR_PTR(r);
443} 453}
444 454
455static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
456{
457 if (!memslot->dirty_bitmap)
458 return;
459
460 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
461 vfree(memslot->dirty_bitmap_head);
462 else
463 kfree(memslot->dirty_bitmap_head);
464
465 memslot->dirty_bitmap = NULL;
466 memslot->dirty_bitmap_head = NULL;
467}
468
445/* 469/*
446 * Free any memory in @free but not in @dont. 470 * Free any memory in @free but not in @dont.
447 */ 471 */
@@ -454,7 +478,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
454 vfree(free->rmap); 478 vfree(free->rmap);
455 479
456 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 480 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
457 vfree(free->dirty_bitmap); 481 kvm_destroy_dirty_bitmap(free);
458 482
459 483
460 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 484 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -465,7 +489,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
465 } 489 }
466 490
467 free->npages = 0; 491 free->npages = 0;
468 free->dirty_bitmap = NULL;
469 free->rmap = NULL; 492 free->rmap = NULL;
470} 493}
471 494
@@ -499,6 +522,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
499 kvm_arch_flush_shadow(kvm); 522 kvm_arch_flush_shadow(kvm);
500#endif 523#endif
501 kvm_arch_destroy_vm(kvm); 524 kvm_arch_destroy_vm(kvm);
525 kvm_free_physmem(kvm);
526 cleanup_srcu_struct(&kvm->srcu);
527 kvm_arch_free_vm(kvm);
502 hardware_disable_all(); 528 hardware_disable_all();
503 mmdrop(mm); 529 mmdrop(mm);
504} 530}
@@ -528,6 +554,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
528} 554}
529 555
530/* 556/*
557 * Allocation size is twice as large as the actual dirty bitmap size.
558 * This makes it possible to do double buffering: see x86's
559 * kvm_vm_ioctl_get_dirty_log().
560 */
561static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
562{
563 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
564
565 if (dirty_bytes > PAGE_SIZE)
566 memslot->dirty_bitmap = vzalloc(dirty_bytes);
567 else
568 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
569
570 if (!memslot->dirty_bitmap)
571 return -ENOMEM;
572
573 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
574 return 0;
575}
576
577/*
531 * Allocate some memory and give it an address in the guest physical address 578 * Allocate some memory and give it an address in the guest physical address
532 * space. 579 * space.
533 * 580 *
@@ -604,13 +651,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
604 /* Allocate if a slot is being created */ 651 /* Allocate if a slot is being created */
605#ifndef CONFIG_S390 652#ifndef CONFIG_S390
606 if (npages && !new.rmap) { 653 if (npages && !new.rmap) {
607 new.rmap = vmalloc(npages * sizeof(*new.rmap)); 654 new.rmap = vzalloc(npages * sizeof(*new.rmap));
608 655
609 if (!new.rmap) 656 if (!new.rmap)
610 goto out_free; 657 goto out_free;
611 658
612 memset(new.rmap, 0, npages * sizeof(*new.rmap));
613
614 new.user_alloc = user_alloc; 659 new.user_alloc = user_alloc;
615 new.userspace_addr = mem->userspace_addr; 660 new.userspace_addr = mem->userspace_addr;
616 } 661 }
@@ -633,14 +678,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
633 >> KVM_HPAGE_GFN_SHIFT(level)); 678 >> KVM_HPAGE_GFN_SHIFT(level));
634 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 679 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
635 680
636 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 681 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
637 682
638 if (!new.lpage_info[i]) 683 if (!new.lpage_info[i])
639 goto out_free; 684 goto out_free;
640 685
641 memset(new.lpage_info[i], 0,
642 lpages * sizeof(*new.lpage_info[i]));
643
644 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 686 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
645 new.lpage_info[i][0].write_count = 1; 687 new.lpage_info[i][0].write_count = 1;
646 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 688 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
@@ -661,12 +703,8 @@ skip_lpage:
661 703
662 /* Allocate page dirty bitmap if needed */ 704 /* Allocate page dirty bitmap if needed */
663 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 705 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
664 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); 706 if (kvm_create_dirty_bitmap(&new) < 0)
665
666 new.dirty_bitmap = vmalloc(dirty_bytes);
667 if (!new.dirty_bitmap)
668 goto out_free; 707 goto out_free;
669 memset(new.dirty_bitmap, 0, dirty_bytes);
670 /* destroy any largepage mappings for dirty tracking */ 708 /* destroy any largepage mappings for dirty tracking */
671 if (old.npages) 709 if (old.npages)
672 flush_shadow = 1; 710 flush_shadow = 1;
@@ -685,6 +723,7 @@ skip_lpage:
685 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 723 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
686 if (mem->slot >= slots->nmemslots) 724 if (mem->slot >= slots->nmemslots)
687 slots->nmemslots = mem->slot + 1; 725 slots->nmemslots = mem->slot + 1;
726 slots->generation++;
688 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 727 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
689 728
690 old_memslots = kvm->memslots; 729 old_memslots = kvm->memslots;
@@ -719,6 +758,7 @@ skip_lpage:
719 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 758 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
720 if (mem->slot >= slots->nmemslots) 759 if (mem->slot >= slots->nmemslots)
721 slots->nmemslots = mem->slot + 1; 760 slots->nmemslots = mem->slot + 1;
761 slots->generation++;
722 762
723 /* actual memory is freed via old in kvm_free_physmem_slot below */ 763 /* actual memory is freed via old in kvm_free_physmem_slot below */
724 if (!npages) { 764 if (!npages) {
@@ -849,10 +889,10 @@ int kvm_is_error_hva(unsigned long addr)
849} 889}
850EXPORT_SYMBOL_GPL(kvm_is_error_hva); 890EXPORT_SYMBOL_GPL(kvm_is_error_hva);
851 891
852struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 892static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
893 gfn_t gfn)
853{ 894{
854 int i; 895 int i;
855 struct kvm_memslots *slots = kvm_memslots(kvm);
856 896
857 for (i = 0; i < slots->nmemslots; ++i) { 897 for (i = 0; i < slots->nmemslots; ++i) {
858 struct kvm_memory_slot *memslot = &slots->memslots[i]; 898 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -863,6 +903,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
863 } 903 }
864 return NULL; 904 return NULL;
865} 905}
906
907struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
908{
909 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
910}
866EXPORT_SYMBOL_GPL(gfn_to_memslot); 911EXPORT_SYMBOL_GPL(gfn_to_memslot);
867 912
868int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 913int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -925,12 +970,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
925 return memslot - slots->memslots; 970 return memslot - slots->memslots;
926} 971}
927 972
928static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, 973static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
929 gfn_t *nr_pages) 974 gfn_t *nr_pages)
930{ 975{
931 struct kvm_memory_slot *slot;
932
933 slot = gfn_to_memslot(kvm, gfn);
934 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 976 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
935 return bad_hva(); 977 return bad_hva();
936 978
@@ -942,28 +984,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
942 984
943unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 985unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
944{ 986{
945 return gfn_to_hva_many(kvm, gfn, NULL); 987 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
946} 988}
947EXPORT_SYMBOL_GPL(gfn_to_hva); 989EXPORT_SYMBOL_GPL(gfn_to_hva);
948 990
949static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) 991static pfn_t get_fault_pfn(void)
992{
993 get_page(fault_page);
994 return fault_pfn;
995}
996
997static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
998 bool *async, bool write_fault, bool *writable)
950{ 999{
951 struct page *page[1]; 1000 struct page *page[1];
952 int npages; 1001 int npages = 0;
953 pfn_t pfn; 1002 pfn_t pfn;
954 1003
955 if (atomic) 1004 /* we can do it either atomically or asynchronously, not both */
1005 BUG_ON(atomic && async);
1006
1007 BUG_ON(!write_fault && !writable);
1008
1009 if (writable)
1010 *writable = true;
1011
1012 if (atomic || async)
956 npages = __get_user_pages_fast(addr, 1, 1, page); 1013 npages = __get_user_pages_fast(addr, 1, 1, page);
957 else { 1014
1015 if (unlikely(npages != 1) && !atomic) {
958 might_sleep(); 1016 might_sleep();
959 npages = get_user_pages_fast(addr, 1, 1, page); 1017
1018 if (writable)
1019 *writable = write_fault;
1020
1021 npages = get_user_pages_fast(addr, 1, write_fault, page);
1022
1023 /* map read fault as writable if possible */
1024 if (unlikely(!write_fault) && npages == 1) {
1025 struct page *wpage[1];
1026
1027 npages = __get_user_pages_fast(addr, 1, 1, wpage);
1028 if (npages == 1) {
1029 *writable = true;
1030 put_page(page[0]);
1031 page[0] = wpage[0];
1032 }
1033 npages = 1;
1034 }
960 } 1035 }
961 1036
962 if (unlikely(npages != 1)) { 1037 if (unlikely(npages != 1)) {
963 struct vm_area_struct *vma; 1038 struct vm_area_struct *vma;
964 1039
965 if (atomic) 1040 if (atomic)
966 goto return_fault_page; 1041 return get_fault_pfn();
967 1042
968 down_read(&current->mm->mmap_sem); 1043 down_read(&current->mm->mmap_sem);
969 if (is_hwpoison_address(addr)) { 1044 if (is_hwpoison_address(addr)) {
@@ -972,19 +1047,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
972 return page_to_pfn(hwpoison_page); 1047 return page_to_pfn(hwpoison_page);
973 } 1048 }
974 1049
975 vma = find_vma(current->mm, addr); 1050 vma = find_vma_intersection(current->mm, addr, addr+1);
976 1051
977 if (vma == NULL || addr < vma->vm_start || 1052 if (vma == NULL)
978 !(vma->vm_flags & VM_PFNMAP)) { 1053 pfn = get_fault_pfn();
979 up_read(&current->mm->mmap_sem); 1054 else if ((vma->vm_flags & VM_PFNMAP)) {
980return_fault_page: 1055 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
981 get_page(fault_page); 1056 vma->vm_pgoff;
982 return page_to_pfn(fault_page); 1057 BUG_ON(!kvm_is_mmio_pfn(pfn));
1058 } else {
1059 if (async && (vma->vm_flags & VM_WRITE))
1060 *async = true;
1061 pfn = get_fault_pfn();
983 } 1062 }
984
985 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
986 up_read(&current->mm->mmap_sem); 1063 up_read(&current->mm->mmap_sem);
987 BUG_ON(!kvm_is_mmio_pfn(pfn));
988 } else 1064 } else
989 pfn = page_to_pfn(page[0]); 1065 pfn = page_to_pfn(page[0]);
990 1066
@@ -993,40 +1069,58 @@ return_fault_page:
993 1069
994pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1070pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
995{ 1071{
996 return hva_to_pfn(kvm, addr, true); 1072 return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
997} 1073}
998EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); 1074EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
999 1075
1000static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) 1076static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1077 bool write_fault, bool *writable)
1001{ 1078{
1002 unsigned long addr; 1079 unsigned long addr;
1003 1080
1081 if (async)
1082 *async = false;
1083
1004 addr = gfn_to_hva(kvm, gfn); 1084 addr = gfn_to_hva(kvm, gfn);
1005 if (kvm_is_error_hva(addr)) { 1085 if (kvm_is_error_hva(addr)) {
1006 get_page(bad_page); 1086 get_page(bad_page);
1007 return page_to_pfn(bad_page); 1087 return page_to_pfn(bad_page);
1008 } 1088 }
1009 1089
1010 return hva_to_pfn(kvm, addr, atomic); 1090 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1011} 1091}
1012 1092
1013pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1093pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1014{ 1094{
1015 return __gfn_to_pfn(kvm, gfn, true); 1095 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1016} 1096}
1017EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1097EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1018 1098
1099pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1100 bool write_fault, bool *writable)
1101{
1102 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1103}
1104EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1105
1019pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1106pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1020{ 1107{
1021 return __gfn_to_pfn(kvm, gfn, false); 1108 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1022} 1109}
1023EXPORT_SYMBOL_GPL(gfn_to_pfn); 1110EXPORT_SYMBOL_GPL(gfn_to_pfn);
1024 1111
1112pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1113 bool *writable)
1114{
1115 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1116}
1117EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1118
1025pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1119pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1026 struct kvm_memory_slot *slot, gfn_t gfn) 1120 struct kvm_memory_slot *slot, gfn_t gfn)
1027{ 1121{
1028 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1122 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1029 return hva_to_pfn(kvm, addr, false); 1123 return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1030} 1124}
1031 1125
1032int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1126int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -1035,7 +1129,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1035 unsigned long addr; 1129 unsigned long addr;
1036 gfn_t entry; 1130 gfn_t entry;
1037 1131
1038 addr = gfn_to_hva_many(kvm, gfn, &entry); 1132 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1039 if (kvm_is_error_hva(addr)) 1133 if (kvm_is_error_hva(addr))
1040 return -1; 1134 return -1;
1041 1135
@@ -1219,9 +1313,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1219 return 0; 1313 return 0;
1220} 1314}
1221 1315
1316int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1317 gpa_t gpa)
1318{
1319 struct kvm_memslots *slots = kvm_memslots(kvm);
1320 int offset = offset_in_page(gpa);
1321 gfn_t gfn = gpa >> PAGE_SHIFT;
1322
1323 ghc->gpa = gpa;
1324 ghc->generation = slots->generation;
1325 ghc->memslot = __gfn_to_memslot(slots, gfn);
1326 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1327 if (!kvm_is_error_hva(ghc->hva))
1328 ghc->hva += offset;
1329 else
1330 return -EFAULT;
1331
1332 return 0;
1333}
1334EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1335
1336int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1337 void *data, unsigned long len)
1338{
1339 struct kvm_memslots *slots = kvm_memslots(kvm);
1340 int r;
1341
1342 if (slots->generation != ghc->generation)
1343 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1344
1345 if (kvm_is_error_hva(ghc->hva))
1346 return -EFAULT;
1347
1348 r = copy_to_user((void __user *)ghc->hva, data, len);
1349 if (r)
1350 return -EFAULT;
1351 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1352
1353 return 0;
1354}
1355EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1356
1222int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1357int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1223{ 1358{
1224 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1359 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1360 offset, len);
1225} 1361}
1226EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1362EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1227 1363
@@ -1244,11 +1380,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1244} 1380}
1245EXPORT_SYMBOL_GPL(kvm_clear_guest); 1381EXPORT_SYMBOL_GPL(kvm_clear_guest);
1246 1382
1247void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1383void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1384 gfn_t gfn)
1248{ 1385{
1249 struct kvm_memory_slot *memslot;
1250
1251 memslot = gfn_to_memslot(kvm, gfn);
1252 if (memslot && memslot->dirty_bitmap) { 1386 if (memslot && memslot->dirty_bitmap) {
1253 unsigned long rel_gfn = gfn - memslot->base_gfn; 1387 unsigned long rel_gfn = gfn - memslot->base_gfn;
1254 1388
@@ -1256,6 +1390,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1256 } 1390 }
1257} 1391}
1258 1392
1393void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1394{
1395 struct kvm_memory_slot *memslot;
1396
1397 memslot = gfn_to_memslot(kvm, gfn);
1398 mark_page_dirty_in_slot(kvm, memslot, gfn);
1399}
1400
1259/* 1401/*
1260 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1402 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1261 */ 1403 */
@@ -1457,6 +1599,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
1457 if (arg) 1599 if (arg)
1458 goto out; 1600 goto out;
1459 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1601 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1602 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1460 break; 1603 break;
1461 case KVM_GET_REGS: { 1604 case KVM_GET_REGS: {
1462 struct kvm_regs *kvm_regs; 1605 struct kvm_regs *kvm_regs;
@@ -1824,7 +1967,7 @@ static struct file_operations kvm_vm_fops = {
1824 1967
1825static int kvm_dev_ioctl_create_vm(void) 1968static int kvm_dev_ioctl_create_vm(void)
1826{ 1969{
1827 int fd, r; 1970 int r;
1828 struct kvm *kvm; 1971 struct kvm *kvm;
1829 1972
1830 kvm = kvm_create_vm(); 1973 kvm = kvm_create_vm();
@@ -1837,11 +1980,11 @@ static int kvm_dev_ioctl_create_vm(void)
1837 return r; 1980 return r;
1838 } 1981 }
1839#endif 1982#endif
1840 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 1983 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
1841 if (fd < 0) 1984 if (r < 0)
1842 kvm_put_kvm(kvm); 1985 kvm_put_kvm(kvm);
1843 1986
1844 return fd; 1987 return r;
1845} 1988}
1846 1989
1847static long kvm_dev_ioctl_check_extension_generic(long arg) 1990static long kvm_dev_ioctl_check_extension_generic(long arg)
@@ -1922,7 +2065,7 @@ static struct miscdevice kvm_dev = {
1922 &kvm_chardev_ops, 2065 &kvm_chardev_ops,
1923}; 2066};
1924 2067
1925static void hardware_enable(void *junk) 2068static void hardware_enable_nolock(void *junk)
1926{ 2069{
1927 int cpu = raw_smp_processor_id(); 2070 int cpu = raw_smp_processor_id();
1928 int r; 2071 int r;
@@ -1942,7 +2085,14 @@ static void hardware_enable(void *junk)
1942 } 2085 }
1943} 2086}
1944 2087
1945static void hardware_disable(void *junk) 2088static void hardware_enable(void *junk)
2089{
2090 spin_lock(&kvm_lock);
2091 hardware_enable_nolock(junk);
2092 spin_unlock(&kvm_lock);
2093}
2094
2095static void hardware_disable_nolock(void *junk)
1946{ 2096{
1947 int cpu = raw_smp_processor_id(); 2097 int cpu = raw_smp_processor_id();
1948 2098
@@ -1952,13 +2102,20 @@ static void hardware_disable(void *junk)
1952 kvm_arch_hardware_disable(NULL); 2102 kvm_arch_hardware_disable(NULL);
1953} 2103}
1954 2104
2105static void hardware_disable(void *junk)
2106{
2107 spin_lock(&kvm_lock);
2108 hardware_disable_nolock(junk);
2109 spin_unlock(&kvm_lock);
2110}
2111
1955static void hardware_disable_all_nolock(void) 2112static void hardware_disable_all_nolock(void)
1956{ 2113{
1957 BUG_ON(!kvm_usage_count); 2114 BUG_ON(!kvm_usage_count);
1958 2115
1959 kvm_usage_count--; 2116 kvm_usage_count--;
1960 if (!kvm_usage_count) 2117 if (!kvm_usage_count)
1961 on_each_cpu(hardware_disable, NULL, 1); 2118 on_each_cpu(hardware_disable_nolock, NULL, 1);
1962} 2119}
1963 2120
1964static void hardware_disable_all(void) 2121static void hardware_disable_all(void)
@@ -1977,7 +2134,7 @@ static int hardware_enable_all(void)
1977 kvm_usage_count++; 2134 kvm_usage_count++;
1978 if (kvm_usage_count == 1) { 2135 if (kvm_usage_count == 1) {
1979 atomic_set(&hardware_enable_failed, 0); 2136 atomic_set(&hardware_enable_failed, 0);
1980 on_each_cpu(hardware_enable, NULL, 1); 2137 on_each_cpu(hardware_enable_nolock, NULL, 1);
1981 2138
1982 if (atomic_read(&hardware_enable_failed)) { 2139 if (atomic_read(&hardware_enable_failed)) {
1983 hardware_disable_all_nolock(); 2140 hardware_disable_all_nolock();
@@ -2008,27 +2165,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2008 case CPU_STARTING: 2165 case CPU_STARTING:
2009 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2166 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2010 cpu); 2167 cpu);
2011 spin_lock(&kvm_lock);
2012 hardware_enable(NULL); 2168 hardware_enable(NULL);
2013 spin_unlock(&kvm_lock);
2014 break; 2169 break;
2015 } 2170 }
2016 return NOTIFY_OK; 2171 return NOTIFY_OK;
2017} 2172}
2018 2173
2019 2174
2020asmlinkage void kvm_handle_fault_on_reboot(void) 2175asmlinkage void kvm_spurious_fault(void)
2021{ 2176{
2022 if (kvm_rebooting) {
2023 /* spin while reset goes on */
2024 local_irq_enable();
2025 while (true)
2026 cpu_relax();
2027 }
2028 /* Fault while not rebooting. We want the trace. */ 2177 /* Fault while not rebooting. We want the trace. */
2029 BUG(); 2178 BUG();
2030} 2179}
2031EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 2180EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2032 2181
2033static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2182static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2034 void *v) 2183 void *v)
@@ -2041,7 +2190,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2041 */ 2190 */
2042 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2191 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2043 kvm_rebooting = true; 2192 kvm_rebooting = true;
2044 on_each_cpu(hardware_disable, NULL, 1); 2193 on_each_cpu(hardware_disable_nolock, NULL, 1);
2045 return NOTIFY_OK; 2194 return NOTIFY_OK;
2046} 2195}
2047 2196
@@ -2211,7 +2360,7 @@ static void kvm_exit_debug(void)
2211static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2360static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2212{ 2361{
2213 if (kvm_usage_count) 2362 if (kvm_usage_count)
2214 hardware_disable(NULL); 2363 hardware_disable_nolock(NULL);
2215 return 0; 2364 return 0;
2216} 2365}
2217 2366
@@ -2219,7 +2368,7 @@ static int kvm_resume(struct sys_device *dev)
2219{ 2368{
2220 if (kvm_usage_count) { 2369 if (kvm_usage_count) {
2221 WARN_ON(spin_is_locked(&kvm_lock)); 2370 WARN_ON(spin_is_locked(&kvm_lock));
2222 hardware_enable(NULL); 2371 hardware_enable_nolock(NULL);
2223 } 2372 }
2224 return 0; 2373 return 0;
2225} 2374}
@@ -2336,6 +2485,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2336 goto out_free_5; 2485 goto out_free_5;
2337 } 2486 }
2338 2487
2488 r = kvm_async_pf_init();
2489 if (r)
2490 goto out_free;
2491
2339 kvm_chardev_ops.owner = module; 2492 kvm_chardev_ops.owner = module;
2340 kvm_vm_fops.owner = module; 2493 kvm_vm_fops.owner = module;
2341 kvm_vcpu_fops.owner = module; 2494 kvm_vcpu_fops.owner = module;
@@ -2343,7 +2496,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2343 r = misc_register(&kvm_dev); 2496 r = misc_register(&kvm_dev);
2344 if (r) { 2497 if (r) {
2345 printk(KERN_ERR "kvm: misc device register failed\n"); 2498 printk(KERN_ERR "kvm: misc device register failed\n");
2346 goto out_free; 2499 goto out_unreg;
2347 } 2500 }
2348 2501
2349 kvm_preempt_ops.sched_in = kvm_sched_in; 2502 kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2353,6 +2506,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2353 2506
2354 return 0; 2507 return 0;
2355 2508
2509out_unreg:
2510 kvm_async_pf_deinit();
2356out_free: 2511out_free:
2357 kmem_cache_destroy(kvm_vcpu_cache); 2512 kmem_cache_destroy(kvm_vcpu_cache);
2358out_free_5: 2513out_free_5:
@@ -2385,11 +2540,12 @@ void kvm_exit(void)
2385 kvm_exit_debug(); 2540 kvm_exit_debug();
2386 misc_deregister(&kvm_dev); 2541 misc_deregister(&kvm_dev);
2387 kmem_cache_destroy(kvm_vcpu_cache); 2542 kmem_cache_destroy(kvm_vcpu_cache);
2543 kvm_async_pf_deinit();
2388 sysdev_unregister(&kvm_sysdev); 2544 sysdev_unregister(&kvm_sysdev);
2389 sysdev_class_unregister(&kvm_sysdev_class); 2545 sysdev_class_unregister(&kvm_sysdev_class);
2390 unregister_reboot_notifier(&kvm_reboot_notifier); 2546 unregister_reboot_notifier(&kvm_reboot_notifier);
2391 unregister_cpu_notifier(&kvm_cpu_notifier); 2547 unregister_cpu_notifier(&kvm_cpu_notifier);
2392 on_each_cpu(hardware_disable, NULL, 1); 2548 on_each_cpu(hardware_disable_nolock, NULL, 1);
2393 kvm_arch_hardware_unsetup(); 2549 kvm_arch_hardware_unsetup();
2394 kvm_arch_exit(); 2550 kvm_arch_exit();
2395 free_cpumask_var(cpus_hardware_enabled); 2551 free_cpumask_var(cpus_hardware_enabled);