author     Linus Torvalds <torvalds@linux-foundation.org>   2011-01-13 13:14:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-01-13 13:14:24 -0500
commit     55065bc52795faae549abfb912aacc622dd63876
tree       63683547e41ed459a2a8747eeafb5e969633d54f
parent     008d23e4852d78bb2618f2035f8b2110b6a6b968
parent     e5c301428294cb8925667c9ee39f817c4ab1c2c9
Merge branch 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits)
KVM: Initialize fpu state in preemptible context
KVM: VMX: when entering real mode align segment base to 16 bytes
KVM: MMU: handle 'map_writable' in set_spte() function
KVM: MMU: audit: allow audit more guests at the same time
KVM: Fetch guest cr3 from hardware on demand
KVM: Replace reads of vcpu->arch.cr3 by an accessor
KVM: MMU: only write protect mappings at pagetable level
KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear()
KVM: MMU: Initialize base_role for tdp mmus
KVM: VMX: Optimize atomic EFER load
KVM: VMX: Add definitions for more vm entry/exit control bits
KVM: SVM: copy instruction bytes from VMCB
KVM: SVM: implement enhanced INVLPG intercept
KVM: SVM: enhance mov DR intercept handler
KVM: SVM: enhance MOV CR intercept handler
KVM: SVM: add new SVM feature bit names
KVM: cleanup emulate_instruction
KVM: move complete_insn_gp() into x86.c
KVM: x86: fix CR8 handling
KVM guest: Fix kvm clock initialization when it's configured out
...
43 files changed, 3078 insertions, 1185 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 338c96ea0855..55fe7599bc8e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1705,6 +1705,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1705 | 1705 | ||
1706 | no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver | 1706 | no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver |
1707 | 1707 | ||
1708 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page | ||
1709 | fault handling. | ||
1710 | |||
1708 | nolapic [X86-32,APIC] Do not enable or use the local APIC. | 1711 | nolapic [X86-32,APIC] Do not enable or use the local APIC. |
1709 | 1712 | ||
1710 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. | 1713 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 50713e37c695..ad85797c1cf0 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1085,6 +1085,184 @@ of 4 instructions that make up a hypercall. | |||
1085 | If any additional field gets added to this structure later on, a bit for that | 1085 | If any additional field gets added to this structure later on, a bit for that |
1086 | additional piece of information will be set in the flags bitmap. | 1086 | additional piece of information will be set in the flags bitmap. |
1087 | 1087 | ||
1088 | 4.47 KVM_ASSIGN_PCI_DEVICE | ||
1089 | |||
1090 | Capability: KVM_CAP_DEVICE_ASSIGNMENT | ||
1091 | Architectures: x86 ia64 | ||
1092 | Type: vm ioctl | ||
1093 | Parameters: struct kvm_assigned_pci_dev (in) | ||
1094 | Returns: 0 on success, -1 on error | ||
1095 | |||
1096 | Assigns a host PCI device to the VM. | ||
1097 | |||
1098 | struct kvm_assigned_pci_dev { | ||
1099 | __u32 assigned_dev_id; | ||
1100 | __u32 busnr; | ||
1101 | __u32 devfn; | ||
1102 | __u32 flags; | ||
1103 | __u32 segnr; | ||
1104 | union { | ||
1105 | __u32 reserved[11]; | ||
1106 | }; | ||
1107 | }; | ||
1108 | |||
1109 | The PCI device is specified by the triple segnr, busnr, and devfn. | ||
1110 | Identification in succeeding service requests is done via assigned_dev_id. The | ||
1111 | following flags are specified: | ||
1112 | |||
1113 | /* Depends on KVM_CAP_IOMMU */ | ||
1114 | #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) | ||
1115 | |||
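For orientation, here is a minimal userspace sketch of this call (not part of the patch): vm_fd is assumed to be a VM file descriptor obtained via KVM_CREATE_VM, the PCI address 0000:01:10.0 and the assigned_dev_id value are purely illustrative, and error handling is omitted.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Assign host PCI device 0000:01:10.0 to the VM, IOMMU-protected. */
    static int assign_device(int vm_fd)
    {
            struct kvm_assigned_pci_dev dev;

            memset(&dev, 0, sizeof(dev));
            dev.assigned_dev_id = 1;                /* caller-chosen handle */
            dev.segnr = 0;                          /* PCI segment (domain) */
            dev.busnr = 1;                          /* bus 01 */
            dev.devfn = (0x10 << 3) | 0;            /* slot 0x10, function 0 */
            dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;

            return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
    }

The assigned_dev_id chosen here is the handle that all later assignment ioctls refer back to.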
1116 | 4.48 KVM_DEASSIGN_PCI_DEVICE | ||
1117 | |||
1118 | Capability: KVM_CAP_DEVICE_DEASSIGNMENT | ||
1119 | Architectures: x86 ia64 | ||
1120 | Type: vm ioctl | ||
1121 | Parameters: struct kvm_assigned_pci_dev (in) | ||
1122 | Returns: 0 on success, -1 on error | ||
1123 | |||
1124 | Ends PCI device assignment, releasing all associated resources. | ||
1125 | |||
1126 | See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is | ||
1127 | used in kvm_assigned_pci_dev to identify the device. | ||
1128 | |||
1129 | 4.49 KVM_ASSIGN_DEV_IRQ | ||
1130 | |||
1131 | Capability: KVM_CAP_ASSIGN_DEV_IRQ | ||
1132 | Architectures: x86 ia64 | ||
1133 | Type: vm ioctl | ||
1134 | Parameters: struct kvm_assigned_irq (in) | ||
1135 | Returns: 0 on success, -1 on error | ||
1136 | |||
1137 | Assigns an IRQ to a passed-through device. | ||
1138 | |||
1139 | struct kvm_assigned_irq { | ||
1140 | __u32 assigned_dev_id; | ||
1141 | __u32 host_irq; | ||
1142 | __u32 guest_irq; | ||
1143 | __u32 flags; | ||
1144 | union { | ||
1145 | struct { | ||
1146 | __u32 addr_lo; | ||
1147 | __u32 addr_hi; | ||
1148 | __u32 data; | ||
1149 | } guest_msi; | ||
1150 | __u32 reserved[12]; | ||
1151 | }; | ||
1152 | }; | ||
1153 | |||
1154 | The following flags are defined: | ||
1155 | |||
1156 | #define KVM_DEV_IRQ_HOST_INTX (1 << 0) | ||
1157 | #define KVM_DEV_IRQ_HOST_MSI (1 << 1) | ||
1158 | #define KVM_DEV_IRQ_HOST_MSIX (1 << 2) | ||
1159 | |||
1160 | #define KVM_DEV_IRQ_GUEST_INTX (1 << 8) | ||
1161 | #define KVM_DEV_IRQ_GUEST_MSI (1 << 9) | ||
1162 | #define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) | ||
1163 | |||
1164 | It is not valid to specify multiple types per host or guest IRQ. However, the | ||
1165 | IRQ type of host and guest can differ or can even be null. | ||
1166 | |||
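A companion sketch for this ioctl, again not part of the patch: it routes the legacy INTx line of the device assigned above (assigned_dev_id 1) to a guest IRQ; the host and guest IRQ numbers are illustrative.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Route the device's host INTx line (host IRQ 16) to guest IRQ 10. */
    static int assign_intx(int vm_fd)
    {
            struct kvm_assigned_irq irq;

            memset(&irq, 0, sizeof(irq));
            irq.assigned_dev_id = 1;        /* id used at KVM_ASSIGN_PCI_DEVICE time */
            irq.host_irq  = 16;
            irq.guest_irq = 10;
            irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;

            return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
    }

An MSI setup would instead use KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI and fill in the guest_msi address/data fields; only one host type and one guest type may be given per call.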
1167 | 4.50 KVM_DEASSIGN_DEV_IRQ | ||
1168 | |||
1169 | Capability: KVM_CAP_ASSIGN_DEV_IRQ | ||
1170 | Architectures: x86 ia64 | ||
1171 | Type: vm ioctl | ||
1172 | Parameters: struct kvm_assigned_irq (in) | ||
1173 | Returns: 0 on success, -1 on error | ||
1174 | |||
1175 | Ends an IRQ assignment to a passed-through device. | ||
1176 | |||
1177 | See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified | ||
1178 | by assigned_dev_id, flags must correspond to the IRQ type specified on | ||
1179 | KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. | ||
1180 | |||
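Continuing the sketches above (same headers and illustrative ids), teardown runs in the reverse order: first the IRQ, with flags matching the earlier KVM_ASSIGN_DEV_IRQ call, then the device itself, for which only assigned_dev_id is needed.

    /* Undo the INTx routing, then release the assigned device. */
    static void deassign_device(int vm_fd)
    {
            struct kvm_assigned_irq irq;
            struct kvm_assigned_pci_dev dev;

            memset(&irq, 0, sizeof(irq));
            irq.assigned_dev_id = 1;
            irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;
            ioctl(vm_fd, KVM_DEASSIGN_DEV_IRQ, &irq);

            memset(&dev, 0, sizeof(dev));
            dev.assigned_dev_id = 1;
            ioctl(vm_fd, KVM_DEASSIGN_PCI_DEVICE, &dev);
    }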
1181 | 4.51 KVM_SET_GSI_ROUTING | ||
1182 | |||
1183 | Capability: KVM_CAP_IRQ_ROUTING | ||
1184 | Architectures: x86 ia64 | ||
1185 | Type: vm ioctl | ||
1186 | Parameters: struct kvm_irq_routing (in) | ||
1187 | Returns: 0 on success, -1 on error | ||
1188 | |||
1189 | Sets the GSI routing table entries, overwriting any previously set entries. | ||
1190 | |||
1191 | struct kvm_irq_routing { | ||
1192 | __u32 nr; | ||
1193 | __u32 flags; | ||
1194 | struct kvm_irq_routing_entry entries[0]; | ||
1195 | }; | ||
1196 | |||
1197 | No flags are specified so far, the corresponding field must be set to zero. | ||
1198 | |||
1199 | struct kvm_irq_routing_entry { | ||
1200 | __u32 gsi; | ||
1201 | __u32 type; | ||
1202 | __u32 flags; | ||
1203 | __u32 pad; | ||
1204 | union { | ||
1205 | struct kvm_irq_routing_irqchip irqchip; | ||
1206 | struct kvm_irq_routing_msi msi; | ||
1207 | __u32 pad[8]; | ||
1208 | } u; | ||
1209 | }; | ||
1210 | |||
1211 | /* gsi routing entry types */ | ||
1212 | #define KVM_IRQ_ROUTING_IRQCHIP 1 | ||
1213 | #define KVM_IRQ_ROUTING_MSI 2 | ||
1214 | |||
1215 | No flags are specified so far, the corresponding field must be set to zero. | ||
1216 | |||
1217 | struct kvm_irq_routing_irqchip { | ||
1218 | __u32 irqchip; | ||
1219 | __u32 pin; | ||
1220 | }; | ||
1221 | |||
1222 | struct kvm_irq_routing_msi { | ||
1223 | __u32 address_lo; | ||
1224 | __u32 address_hi; | ||
1225 | __u32 data; | ||
1226 | __u32 pad; | ||
1227 | }; | ||
1228 | |||
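A hedged userspace sketch of building such a table (not part of the patch): it installs one IOAPIC pin route and one MSI route, assuming KVM_IRQCHIP_IOAPIC from the same linux/kvm.h header and purely illustrative GSI numbers and MSI address/data. Because the ioctl overwrites any previously set entries, the complete set of routes must be passed every time.

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Replace the GSI routing table with one IOAPIC route and one MSI route. */
    static int set_routing(int vm_fd)
    {
            struct kvm_irq_routing *table;
            int ret;

            table = calloc(1, sizeof(*table) + 2 * sizeof(table->entries[0]));
            table->nr = 2;

            table->entries[0].gsi = 10;
            table->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
            table->entries[0].u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
            table->entries[0].u.irqchip.pin = 10;

            table->entries[1].gsi = 24;
            table->entries[1].type = KVM_IRQ_ROUTING_MSI;
            table->entries[1].u.msi.address_lo = 0xfee00000;  /* illustrative MSI address */
            table->entries[1].u.msi.address_hi = 0;
            table->entries[1].u.msi.data = 0x4041;            /* illustrative MSI data */

            ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
            free(table);
            return ret;
    }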
1229 | 4.52 KVM_ASSIGN_SET_MSIX_NR | ||
1230 | |||
1231 | Capability: KVM_CAP_DEVICE_MSIX | ||
1232 | Architectures: x86 ia64 | ||
1233 | Type: vm ioctl | ||
1234 | Parameters: struct kvm_assigned_msix_nr (in) | ||
1235 | Returns: 0 on success, -1 on error | ||
1236 | |||
1237 | Set the number of MSI-X interrupts for an assigned device. This service can | ||
1238 | only be called once in the lifetime of an assigned device. | ||
1239 | |||
1240 | struct kvm_assigned_msix_nr { | ||
1241 | __u32 assigned_dev_id; | ||
1242 | __u16 entry_nr; | ||
1243 | __u16 padding; | ||
1244 | }; | ||
1245 | |||
1246 | #define KVM_MAX_MSIX_PER_DEV 256 | ||
1247 | |||
1248 | 4.53 KVM_ASSIGN_SET_MSIX_ENTRY | ||
1249 | |||
1250 | Capability: KVM_CAP_DEVICE_MSIX | ||
1251 | Architectures: x86 ia64 | ||
1252 | Type: vm ioctl | ||
1253 | Parameters: struct kvm_assigned_msix_entry (in) | ||
1254 | Returns: 0 on success, -1 on error | ||
1255 | |||
1256 | Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting | ||
1257 | the GSI vector to zero means disabling the interrupt. | ||
1258 | |||
1259 | struct kvm_assigned_msix_entry { | ||
1260 | __u32 assigned_dev_id; | ||
1261 | __u32 gsi; | ||
1262 | __u16 entry; /* The index of entry in the MSI-X table */ | ||
1263 | __u16 padding[3]; | ||
1264 | }; | ||
1265 | |||
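Pulling 4.52 and 4.53 together, a sketch of the MSI-X path (not part of the patch; ids, counts and the GSI are illustrative): the vector count is declared once per assigned device, then each MSI-X table entry is wired to a GSI that was previously routed as an MSI via KVM_SET_GSI_ROUTING.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Reserve 3 MSI-X vectors for the assigned device, then wire entry 0. */
    static int setup_msix(int vm_fd)
    {
            struct kvm_assigned_msix_nr nr;
            struct kvm_assigned_msix_entry entry;

            memset(&nr, 0, sizeof(nr));
            nr.assigned_dev_id = 1;
            nr.entry_nr = 3;                /* may only be set once per device */
            if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr) < 0)
                    return -1;

            memset(&entry, 0, sizeof(entry));
            entry.assigned_dev_id = 1;
            entry.entry = 0;                /* index into the device's MSI-X table */
            entry.gsi = 24;                 /* previously routed MSI GSI (illustrative) */
            return ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &entry);
    }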
1088 | 5. The kvm_run structure | 1266 | 5. The kvm_run structure |
1089 | 1267 | ||
1090 | Application code obtains a pointer to the kvm_run structure by | 1268 | Application code obtains a pointer to the kvm_run structure by |
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
index 14a12ea92b7f..882068538c9c 100644
--- a/Documentation/kvm/cpuid.txt
+++ b/Documentation/kvm/cpuid.txt
@@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP || 2 || deprecated. | |||
36 | KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs | 36 | KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs |
37 | || || 0x4b564d00 and 0x4b564d01 | 37 | || || 0x4b564d00 and 0x4b564d01 |
38 | ------------------------------------------------------------------------------ | 38 | ------------------------------------------------------------------------------ |
39 | KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by | ||
40 | || || writing to msr 0x4b564d02 | ||
41 | ------------------------------------------------------------------------------ | ||
39 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side | 42 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side |
40 | || || per-cpu warps are expected in | 43 | || || per-cpu warps are expected in |
41 | || || kvmclock. | 44 | || || kvmclock. |
diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
index 8ddcfe84c09a..d079aed27e03 100644
--- a/Documentation/kvm/msr.txt
+++ b/Documentation/kvm/msr.txt
@@ -3,7 +3,6 @@ Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 | |||
3 | ===================================================== | 3 | ===================================================== |
4 | 4 | ||
5 | KVM makes use of some custom MSRs to service some requests. | 5 | KVM makes use of some custom MSRs to service some requests. |
6 | At present, this facility is only used by kvmclock. | ||
7 | 6 | ||
8 | Custom MSRs have a range reserved for them, that goes from | 7 | Custom MSRs have a range reserved for them, that goes from |
9 | 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, | 8 | 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, |
@@ -151,3 +150,38 @@ MSR_KVM_SYSTEM_TIME: 0x12 | |||
151 | return PRESENT; | 150 | return PRESENT; |
152 | } else | 151 | } else |
153 | return NON_PRESENT; | 152 | return NON_PRESENT; |
153 | |||
154 | MSR_KVM_ASYNC_PF_EN: 0x4b564d02 | ||
155 | data: Bits 63-6 hold 64-byte aligned physical address of a | ||
156 | 64 byte memory area which must be in guest RAM and must be | ||
157 | zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 | ||
158 | when asynchronous page faults are enabled on the vcpu, 0 when | ||
159 | disabled. Bit 1 is 1 if asynchronous page faults can be injected | ||
160 | when vcpu is in cpl == 0. | ||
161 | |||
162 | The first 4 bytes of the 64 byte memory location will be written | ||
163 | by the hypervisor at the time of asynchronous page fault (APF) | ||
164 | injection to indicate the type of asynchronous page fault. A value | ||
165 | of 1 means that the page referred to by the page fault is not | ||
166 | present. A value of 2 means that the page is now available. Disabling | ||
167 | interrupts inhibits APFs. The guest must not enable interrupts | ||
168 | before the reason is read, or it may be overwritten by another | ||
169 | APF. Since APF uses the same exception vector as a regular page | ||
170 | fault, the guest must reset the reason to 0 before it does | ||
171 | anything that can generate a normal page fault. If the APF | ||
172 | reason is 0 during a page fault, it is a regular page | ||
173 | fault. | ||
174 | |||
175 | During delivery of a type 1 APF, cr2 contains a token that will | ||
176 | be used to notify the guest when the missing page becomes | ||
177 | available. When the page becomes available, a type 2 APF is sent with | ||
178 | cr2 set to the token associated with the page. There is a special | ||
179 | kind of token, 0xffffffff, which tells the vcpu that it should wake | ||
180 | up all processes waiting for APFs and that no individual type 2 APFs | ||
181 | will be sent. | ||
182 | |||
183 | If APF is disabled while there are outstanding APFs, they will | ||
184 | not be delivered. | ||
185 | |||
186 | Currently a type 2 APF will always be delivered on the same vcpu as | ||
187 | the type 1 was, but the guest should not rely on that. | ||
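To make the MSR protocol concrete, here is a guest-kernel sketch of enabling async PF on a CPU. It is loosely modeled on the guest support code this series adds to arch/x86/kernel/kvm.c (which declares an equivalent per-cpu apf_reason area), not a verbatim copy; the function name is illustrative and the constants come from asm/kvm_para.h as extended by this series.

    #include <linux/percpu.h>
    #include <asm/kvm_para.h>
    #include <asm/msr.h>

    /* Per-cpu, zeroed, 64-byte aligned area the host writes the APF reason into. */
    static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

    static void enable_async_pf(void)
    {
            u64 pa;

            if (!(kvm_arch_para_features() & (1 << KVM_FEATURE_ASYNC_PF)))
                    return;

            /* Bits 63-6 carry the area's physical address, bit 0 enables APF. */
            pa = __pa(&__get_cpu_var(apf_reason));
            wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
    }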
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2f229e5de498..2689ee54a1c9 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu); | |||
590 | int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); | 590 | int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); |
591 | void kvm_sal_emul(struct kvm_vcpu *vcpu); | 591 | void kvm_sal_emul(struct kvm_vcpu *vcpu); |
592 | 592 | ||
593 | #define __KVM_HAVE_ARCH_VM_ALLOC 1 | ||
594 | struct kvm *kvm_arch_alloc_vm(void); | ||
595 | void kvm_arch_free_vm(struct kvm *kvm); | ||
596 | |||
593 | #endif /* __ASSEMBLY__*/ | 597 | #endif /* __ASSEMBLY__*/ |
594 | 598 | ||
595 | #endif | 599 | #endif |
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f56a6316e134..70d224d4264c 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -749,7 +749,7 @@ out: | |||
749 | return r; | 749 | return r; |
750 | } | 750 | } |
751 | 751 | ||
752 | static struct kvm *kvm_alloc_kvm(void) | 752 | struct kvm *kvm_arch_alloc_vm(void) |
753 | { | 753 | { |
754 | 754 | ||
755 | struct kvm *kvm; | 755 | struct kvm *kvm; |
@@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void) | |||
760 | vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); | 760 | vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); |
761 | 761 | ||
762 | if (!vm_base) | 762 | if (!vm_base) |
763 | return ERR_PTR(-ENOMEM); | 763 | return NULL; |
764 | 764 | ||
765 | memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); | 765 | memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); |
766 | kvm = (struct kvm *)(vm_base + | 766 | kvm = (struct kvm *)(vm_base + |
@@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm) | |||
806 | #define GUEST_PHYSICAL_RR4 0x2739 | 806 | #define GUEST_PHYSICAL_RR4 0x2739 |
807 | #define VMM_INIT_RR 0x1660 | 807 | #define VMM_INIT_RR 0x1660 |
808 | 808 | ||
809 | static void kvm_init_vm(struct kvm *kvm) | 809 | int kvm_arch_init_vm(struct kvm *kvm) |
810 | { | 810 | { |
811 | BUG_ON(!kvm); | 811 | BUG_ON(!kvm); |
812 | 812 | ||
813 | kvm->arch.is_sn2 = ia64_platform_is("sn2"); | ||
814 | |||
813 | kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; | 815 | kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; |
814 | kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; | 816 | kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; |
815 | kvm->arch.vmm_init_rr = VMM_INIT_RR; | 817 | kvm->arch.vmm_init_rr = VMM_INIT_RR; |
@@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm) | |||
823 | 825 | ||
824 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 826 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
825 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); | 827 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); |
826 | } | ||
827 | |||
828 | struct kvm *kvm_arch_create_vm(void) | ||
829 | { | ||
830 | struct kvm *kvm = kvm_alloc_kvm(); | ||
831 | |||
832 | if (IS_ERR(kvm)) | ||
833 | return ERR_PTR(-ENOMEM); | ||
834 | |||
835 | kvm->arch.is_sn2 = ia64_platform_is("sn2"); | ||
836 | |||
837 | kvm_init_vm(kvm); | ||
838 | |||
839 | return kvm; | ||
840 | 828 | ||
829 | return 0; | ||
841 | } | 830 | } |
842 | 831 | ||
843 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, | 832 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, |
@@ -962,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
962 | goto out; | 951 | goto out; |
963 | r = kvm_setup_default_irq_routing(kvm); | 952 | r = kvm_setup_default_irq_routing(kvm); |
964 | if (r) { | 953 | if (r) { |
954 | mutex_lock(&kvm->slots_lock); | ||
965 | kvm_ioapic_destroy(kvm); | 955 | kvm_ioapic_destroy(kvm); |
956 | mutex_unlock(&kvm->slots_lock); | ||
966 | goto out; | 957 | goto out; |
967 | } | 958 | } |
968 | break; | 959 | break; |
@@ -1357,7 +1348,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | |||
1357 | return -EINVAL; | 1348 | return -EINVAL; |
1358 | } | 1349 | } |
1359 | 1350 | ||
1360 | static void free_kvm(struct kvm *kvm) | 1351 | void kvm_arch_free_vm(struct kvm *kvm) |
1361 | { | 1352 | { |
1362 | unsigned long vm_base = kvm->arch.vm_base; | 1353 | unsigned long vm_base = kvm->arch.vm_base; |
1363 | 1354 | ||
@@ -1399,9 +1390,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
1399 | #endif | 1390 | #endif |
1400 | kfree(kvm->arch.vioapic); | 1391 | kfree(kvm->arch.vioapic); |
1401 | kvm_release_vm_pages(kvm); | 1392 | kvm_release_vm_pages(kvm); |
1402 | kvm_free_physmem(kvm); | ||
1403 | cleanup_srcu_struct(&kvm->srcu); | ||
1404 | free_kvm(kvm); | ||
1405 | } | 1393 | } |
1406 | 1394 | ||
1407 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 1395 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index e316847c08c0..badc983031b3 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) | |||
1307 | int err = -ENOMEM; | 1307 | int err = -ENOMEM; |
1308 | unsigned long p; | 1308 | unsigned long p; |
1309 | 1309 | ||
1310 | vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s)); | 1310 | vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); |
1311 | if (!vcpu_book3s) | 1311 | if (!vcpu_book3s) |
1312 | goto out; | 1312 | goto out; |
1313 | 1313 | ||
1314 | memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s)); | ||
1315 | |||
1316 | vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) | 1314 | vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) |
1317 | kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); | 1315 | kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); |
1318 | if (!vcpu_book3s->shadow_vcpu) | 1316 | if (!vcpu_book3s->shadow_vcpu) |
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 38f756f25053..99758460efde 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn) | |||
145 | *(int *)rtn = kvmppc_core_check_processor_compat(); | 145 | *(int *)rtn = kvmppc_core_check_processor_compat(); |
146 | } | 146 | } |
147 | 147 | ||
148 | struct kvm *kvm_arch_create_vm(void) | 148 | int kvm_arch_init_vm(struct kvm *kvm) |
149 | { | 149 | { |
150 | struct kvm *kvm; | 150 | return 0; |
151 | |||
152 | kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
153 | if (!kvm) | ||
154 | return ERR_PTR(-ENOMEM); | ||
155 | |||
156 | return kvm; | ||
157 | } | 151 | } |
158 | 152 | ||
159 | static void kvmppc_free_vcpus(struct kvm *kvm) | 153 | void kvm_arch_destroy_vm(struct kvm *kvm) |
160 | { | 154 | { |
161 | unsigned int i; | 155 | unsigned int i; |
162 | struct kvm_vcpu *vcpu; | 156 | struct kvm_vcpu *vcpu; |
@@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm) | |||
176 | { | 170 | { |
177 | } | 171 | } |
178 | 172 | ||
179 | void kvm_arch_destroy_vm(struct kvm *kvm) | ||
180 | { | ||
181 | kvmppc_free_vcpus(kvm); | ||
182 | kvm_free_physmem(kvm); | ||
183 | cleanup_srcu_struct(&kvm->srcu); | ||
184 | kfree(kvm); | ||
185 | } | ||
186 | |||
187 | int kvm_dev_ioctl_check_extension(long ext) | 173 | int kvm_dev_ioctl_check_extension(long ext) |
188 | { | 174 | { |
189 | int r; | 175 | int r; |
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 985d825494f1..bade533ba288 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
164 | return r; | 164 | return r; |
165 | } | 165 | } |
166 | 166 | ||
167 | struct kvm *kvm_arch_create_vm(void) | 167 | int kvm_arch_init_vm(struct kvm *kvm) |
168 | { | 168 | { |
169 | struct kvm *kvm; | ||
170 | int rc; | 169 | int rc; |
171 | char debug_name[16]; | 170 | char debug_name[16]; |
172 | 171 | ||
173 | rc = s390_enable_sie(); | 172 | rc = s390_enable_sie(); |
174 | if (rc) | 173 | if (rc) |
175 | goto out_nokvm; | 174 | goto out_err; |
176 | |||
177 | rc = -ENOMEM; | ||
178 | kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
179 | if (!kvm) | ||
180 | goto out_nokvm; | ||
181 | 175 | ||
182 | kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); | 176 | kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); |
183 | if (!kvm->arch.sca) | 177 | if (!kvm->arch.sca) |
184 | goto out_nosca; | 178 | goto out_err; |
185 | 179 | ||
186 | sprintf(debug_name, "kvm-%u", current->pid); | 180 | sprintf(debug_name, "kvm-%u", current->pid); |
187 | 181 | ||
@@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void) | |||
195 | debug_register_view(kvm->arch.dbf, &debug_sprintf_view); | 189 | debug_register_view(kvm->arch.dbf, &debug_sprintf_view); |
196 | VM_EVENT(kvm, 3, "%s", "vm created"); | 190 | VM_EVENT(kvm, 3, "%s", "vm created"); |
197 | 191 | ||
198 | return kvm; | 192 | return 0; |
199 | out_nodbf: | 193 | out_nodbf: |
200 | free_page((unsigned long)(kvm->arch.sca)); | 194 | free_page((unsigned long)(kvm->arch.sca)); |
201 | out_nosca: | 195 | out_err: |
202 | kfree(kvm); | 196 | return rc; |
203 | out_nokvm: | ||
204 | return ERR_PTR(rc); | ||
205 | } | 197 | } |
206 | 198 | ||
207 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 199 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
@@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm) | |||
240 | void kvm_arch_destroy_vm(struct kvm *kvm) | 232 | void kvm_arch_destroy_vm(struct kvm *kvm) |
241 | { | 233 | { |
242 | kvm_free_vcpus(kvm); | 234 | kvm_free_vcpus(kvm); |
243 | kvm_free_physmem(kvm); | ||
244 | free_page((unsigned long)(kvm->arch.sca)); | 235 | free_page((unsigned long)(kvm->arch.sca)); |
245 | debug_unregister(kvm->arch.dbf); | 236 | debug_unregister(kvm->arch.dbf); |
246 | cleanup_srcu_struct(&kvm->srcu); | ||
247 | kfree(kvm); | ||
248 | } | 237 | } |
249 | 238 | ||
250 | /* Section: vcpu related */ | 239 | /* Section: vcpu related */ |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@ | |||
15 | 15 | ||
16 | struct x86_emulate_ctxt; | 16 | struct x86_emulate_ctxt; |
17 | 17 | ||
18 | struct x86_exception { | ||
19 | u8 vector; | ||
20 | bool error_code_valid; | ||
21 | u16 error_code; | ||
22 | bool nested_page_fault; | ||
23 | u64 address; /* cr2 or nested page fault gpa */ | ||
24 | }; | ||
25 | |||
18 | /* | 26 | /* |
19 | * x86_emulate_ops: | 27 | * x86_emulate_ops: |
20 | * | 28 | * |
@@ -64,7 +72,8 @@ struct x86_emulate_ops { | |||
64 | * @bytes: [IN ] Number of bytes to read from memory. | 72 | * @bytes: [IN ] Number of bytes to read from memory. |
65 | */ | 73 | */ |
66 | int (*read_std)(unsigned long addr, void *val, | 74 | int (*read_std)(unsigned long addr, void *val, |
67 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 75 | unsigned int bytes, struct kvm_vcpu *vcpu, |
76 | struct x86_exception *fault); | ||
68 | 77 | ||
69 | /* | 78 | /* |
70 | * write_std: Write bytes of standard (non-emulated/special) memory. | 79 | * write_std: Write bytes of standard (non-emulated/special) memory. |
@@ -74,7 +83,8 @@ struct x86_emulate_ops { | |||
74 | * @bytes: [IN ] Number of bytes to write to memory. | 83 | * @bytes: [IN ] Number of bytes to write to memory. |
75 | */ | 84 | */ |
76 | int (*write_std)(unsigned long addr, void *val, | 85 | int (*write_std)(unsigned long addr, void *val, |
77 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 86 | unsigned int bytes, struct kvm_vcpu *vcpu, |
87 | struct x86_exception *fault); | ||
78 | /* | 88 | /* |
79 | * fetch: Read bytes of standard (non-emulated/special) memory. | 89 | * fetch: Read bytes of standard (non-emulated/special) memory. |
80 | * Used for instruction fetch. | 90 | * Used for instruction fetch. |
@@ -83,7 +93,8 @@ struct x86_emulate_ops { | |||
83 | * @bytes: [IN ] Number of bytes to read from memory. | 93 | * @bytes: [IN ] Number of bytes to read from memory. |
84 | */ | 94 | */ |
85 | int (*fetch)(unsigned long addr, void *val, | 95 | int (*fetch)(unsigned long addr, void *val, |
86 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 96 | unsigned int bytes, struct kvm_vcpu *vcpu, |
97 | struct x86_exception *fault); | ||
87 | 98 | ||
88 | /* | 99 | /* |
89 | * read_emulated: Read bytes from emulated/special memory area. | 100 | * read_emulated: Read bytes from emulated/special memory area. |
@@ -94,7 +105,7 @@ struct x86_emulate_ops { | |||
94 | int (*read_emulated)(unsigned long addr, | 105 | int (*read_emulated)(unsigned long addr, |
95 | void *val, | 106 | void *val, |
96 | unsigned int bytes, | 107 | unsigned int bytes, |
97 | unsigned int *error, | 108 | struct x86_exception *fault, |
98 | struct kvm_vcpu *vcpu); | 109 | struct kvm_vcpu *vcpu); |
99 | 110 | ||
100 | /* | 111 | /* |
@@ -107,7 +118,7 @@ struct x86_emulate_ops { | |||
107 | int (*write_emulated)(unsigned long addr, | 118 | int (*write_emulated)(unsigned long addr, |
108 | const void *val, | 119 | const void *val, |
109 | unsigned int bytes, | 120 | unsigned int bytes, |
110 | unsigned int *error, | 121 | struct x86_exception *fault, |
111 | struct kvm_vcpu *vcpu); | 122 | struct kvm_vcpu *vcpu); |
112 | 123 | ||
113 | /* | 124 | /* |
@@ -122,7 +133,7 @@ struct x86_emulate_ops { | |||
122 | const void *old, | 133 | const void *old, |
123 | const void *new, | 134 | const void *new, |
124 | unsigned int bytes, | 135 | unsigned int bytes, |
125 | unsigned int *error, | 136 | struct x86_exception *fault, |
126 | struct kvm_vcpu *vcpu); | 137 | struct kvm_vcpu *vcpu); |
127 | 138 | ||
128 | int (*pio_in_emulated)(int size, unsigned short port, void *val, | 139 | int (*pio_in_emulated)(int size, unsigned short port, void *val, |
@@ -159,7 +170,10 @@ struct operand { | |||
159 | }; | 170 | }; |
160 | union { | 171 | union { |
161 | unsigned long *reg; | 172 | unsigned long *reg; |
162 | unsigned long mem; | 173 | struct segmented_address { |
174 | ulong ea; | ||
175 | unsigned seg; | ||
176 | } mem; | ||
163 | } addr; | 177 | } addr; |
164 | union { | 178 | union { |
165 | unsigned long val; | 179 | unsigned long val; |
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt { | |||
226 | 240 | ||
227 | bool perm_ok; /* do not check permissions if true */ | 241 | bool perm_ok; /* do not check permissions if true */ |
228 | 242 | ||
229 | int exception; /* exception that happens during emulation or -1 */ | 243 | bool have_exception; |
230 | u32 error_code; /* error code for exception */ | 244 | struct x86_exception exception; |
231 | bool error_code_valid; | ||
232 | 245 | ||
233 | /* decode cache */ | 246 | /* decode cache */ |
234 | struct decode_cache decode; | 247 | struct decode_cache decode; |
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt { | |||
252 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 265 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
253 | #endif | 266 | #endif |
254 | 267 | ||
255 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt); | 268 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); |
256 | #define EMULATION_FAILED -1 | 269 | #define EMULATION_FAILED -1 |
257 | #define EMULATION_OK 0 | 270 | #define EMULATION_OK 0 |
258 | #define EMULATION_RESTART 1 | 271 | #define EMULATION_RESTART 1 |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..aa75f21a9fba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@ | |||
83 | #define KVM_NR_FIXED_MTRR_REGION 88 | 83 | #define KVM_NR_FIXED_MTRR_REGION 88 |
84 | #define KVM_NR_VAR_MTRR 8 | 84 | #define KVM_NR_VAR_MTRR 8 |
85 | 85 | ||
86 | #define ASYNC_PF_PER_VCPU 64 | ||
87 | |||
86 | extern spinlock_t kvm_lock; | 88 | extern spinlock_t kvm_lock; |
87 | extern struct list_head vm_list; | 89 | extern struct list_head vm_list; |
88 | 90 | ||
89 | struct kvm_vcpu; | 91 | struct kvm_vcpu; |
90 | struct kvm; | 92 | struct kvm; |
93 | struct kvm_async_pf; | ||
91 | 94 | ||
92 | enum kvm_reg { | 95 | enum kvm_reg { |
93 | VCPU_REGS_RAX = 0, | 96 | VCPU_REGS_RAX = 0, |
@@ -114,6 +117,7 @@ enum kvm_reg { | |||
114 | 117 | ||
115 | enum kvm_reg_ex { | 118 | enum kvm_reg_ex { |
116 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, | 119 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, |
120 | VCPU_EXREG_CR3, | ||
117 | }; | 121 | }; |
118 | 122 | ||
119 | enum { | 123 | enum { |
@@ -238,16 +242,18 @@ struct kvm_mmu { | |||
238 | void (*new_cr3)(struct kvm_vcpu *vcpu); | 242 | void (*new_cr3)(struct kvm_vcpu *vcpu); |
239 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); | 243 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); |
240 | unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); | 244 | unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); |
241 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | 245 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, |
242 | void (*inject_page_fault)(struct kvm_vcpu *vcpu); | 246 | bool prefault); |
247 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
248 | struct x86_exception *fault); | ||
243 | void (*free)(struct kvm_vcpu *vcpu); | 249 | void (*free)(struct kvm_vcpu *vcpu); |
244 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, | 250 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, |
245 | u32 *error); | 251 | struct x86_exception *exception); |
246 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); | 252 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); |
247 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | 253 | void (*prefetch_page)(struct kvm_vcpu *vcpu, |
248 | struct kvm_mmu_page *page); | 254 | struct kvm_mmu_page *page); |
249 | int (*sync_page)(struct kvm_vcpu *vcpu, | 255 | int (*sync_page)(struct kvm_vcpu *vcpu, |
250 | struct kvm_mmu_page *sp, bool clear_unsync); | 256 | struct kvm_mmu_page *sp); |
251 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); | 257 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); |
252 | hpa_t root_hpa; | 258 | hpa_t root_hpa; |
253 | int root_level; | 259 | int root_level; |
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch { | |||
315 | */ | 321 | */ |
316 | struct kvm_mmu *walk_mmu; | 322 | struct kvm_mmu *walk_mmu; |
317 | 323 | ||
318 | /* | ||
319 | * This struct is filled with the necessary information to propagate a | ||
320 | * page fault into the guest | ||
321 | */ | ||
322 | struct { | ||
323 | u64 address; | ||
324 | unsigned error_code; | ||
325 | bool nested; | ||
326 | } fault; | ||
327 | |||
328 | /* only needed in kvm_pv_mmu_op() path, but it's hot so | 324 | /* only needed in kvm_pv_mmu_op() path, but it's hot so |
329 | * put it here to avoid allocation */ | 325 | * put it here to avoid allocation */ |
330 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; | 326 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; |
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch { | |||
412 | u64 hv_vapic; | 408 | u64 hv_vapic; |
413 | 409 | ||
414 | cpumask_var_t wbinvd_dirty_mask; | 410 | cpumask_var_t wbinvd_dirty_mask; |
411 | |||
412 | struct { | ||
413 | bool halted; | ||
414 | gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; | ||
415 | struct gfn_to_hva_cache data; | ||
416 | u64 msr_val; | ||
417 | u32 id; | ||
418 | bool send_user_only; | ||
419 | } apf; | ||
415 | }; | 420 | }; |
416 | 421 | ||
417 | struct kvm_arch { | 422 | struct kvm_arch { |
@@ -456,6 +461,10 @@ struct kvm_arch { | |||
456 | /* fields used by HYPER-V emulation */ | 461 | /* fields used by HYPER-V emulation */ |
457 | u64 hv_guest_os_id; | 462 | u64 hv_guest_os_id; |
458 | u64 hv_hypercall; | 463 | u64 hv_hypercall; |
464 | |||
465 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
466 | int audit_point; | ||
467 | #endif | ||
459 | }; | 468 | }; |
460 | 469 | ||
461 | struct kvm_vm_stat { | 470 | struct kvm_vm_stat { |
@@ -529,6 +538,7 @@ struct kvm_x86_ops { | |||
529 | struct kvm_segment *var, int seg); | 538 | struct kvm_segment *var, int seg); |
530 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); | 539 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); |
531 | void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); | 540 | void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); |
541 | void (*decache_cr3)(struct kvm_vcpu *vcpu); | ||
532 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); | 542 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); |
533 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | 543 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); |
534 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | 544 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); |
@@ -582,9 +592,17 @@ struct kvm_x86_ops { | |||
582 | 592 | ||
583 | void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); | 593 | void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); |
584 | 594 | ||
595 | void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); | ||
585 | const struct trace_print_flags *exit_reasons_str; | 596 | const struct trace_print_flags *exit_reasons_str; |
586 | }; | 597 | }; |
587 | 598 | ||
599 | struct kvm_arch_async_pf { | ||
600 | u32 token; | ||
601 | gfn_t gfn; | ||
602 | unsigned long cr3; | ||
603 | bool direct_map; | ||
604 | }; | ||
605 | |||
588 | extern struct kvm_x86_ops *kvm_x86_ops; | 606 | extern struct kvm_x86_ops *kvm_x86_ops; |
589 | 607 | ||
590 | int kvm_mmu_module_init(void); | 608 | int kvm_mmu_module_init(void); |
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | |||
594 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | 612 | int kvm_mmu_create(struct kvm_vcpu *vcpu); |
595 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | 613 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); |
596 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | 614 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); |
597 | void kvm_mmu_set_base_ptes(u64 base_pte); | ||
598 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 615 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
599 | u64 dirty_mask, u64 nx_mask, u64 x_mask); | 616 | u64 dirty_mask, u64 nx_mask, u64 x_mask); |
600 | 617 | ||
@@ -623,8 +640,15 @@ enum emulation_result { | |||
623 | #define EMULTYPE_NO_DECODE (1 << 0) | 640 | #define EMULTYPE_NO_DECODE (1 << 0) |
624 | #define EMULTYPE_TRAP_UD (1 << 1) | 641 | #define EMULTYPE_TRAP_UD (1 << 1) |
625 | #define EMULTYPE_SKIP (1 << 2) | 642 | #define EMULTYPE_SKIP (1 << 2) |
626 | int emulate_instruction(struct kvm_vcpu *vcpu, | 643 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, |
627 | unsigned long cr2, u16 error_code, int emulation_type); | 644 | int emulation_type, void *insn, int insn_len); |
645 | |||
646 | static inline int emulate_instruction(struct kvm_vcpu *vcpu, | ||
647 | int emulation_type) | ||
648 | { | ||
649 | return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); | ||
650 | } | ||
651 | |||
628 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 652 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
629 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 653 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
630 | 654 | ||
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
650 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | 674 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); |
651 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | 675 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); |
652 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 676 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
653 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); | 677 | int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); |
654 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); | 678 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); |
655 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); | 679 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); |
656 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); | 680 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); |
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | |||
668 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 692 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
669 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); | 693 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
670 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 694 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
671 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu); | 695 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
672 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 696 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
673 | gfn_t gfn, void *data, int offset, int len, | 697 | gfn_t gfn, void *data, int offset, int len, |
674 | u32 access); | 698 | u32 access); |
675 | void kvm_propagate_fault(struct kvm_vcpu *vcpu); | 699 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
676 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | 700 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); |
677 | 701 | ||
678 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 702 | int kvm_pic_set_irq(void *opaque, int irq, int level); |
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | |||
690 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 714 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
691 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 715 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
692 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); | 716 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); |
693 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 717 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
694 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 718 | struct x86_exception *exception); |
695 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 719 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
696 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 720 | struct x86_exception *exception); |
721 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, | ||
722 | struct x86_exception *exception); | ||
723 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, | ||
724 | struct x86_exception *exception); | ||
697 | 725 | ||
698 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); | 726 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); |
699 | 727 | ||
700 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); | 728 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); |
701 | 729 | ||
702 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); | 730 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, |
731 | void *insn, int insn_len); | ||
703 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); | 732 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); |
704 | 733 | ||
705 | void kvm_enable_tdp(void); | 734 | void kvm_enable_tdp(void); |
@@ -766,20 +795,25 @@ enum { | |||
766 | #define HF_VINTR_MASK (1 << 2) | 795 | #define HF_VINTR_MASK (1 << 2) |
767 | #define HF_NMI_MASK (1 << 3) | 796 | #define HF_NMI_MASK (1 << 3) |
768 | #define HF_IRET_MASK (1 << 4) | 797 | #define HF_IRET_MASK (1 << 4) |
798 | #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ | ||
769 | 799 | ||
770 | /* | 800 | /* |
771 | * Hardware virtualization extension instructions may fault if a | 801 | * Hardware virtualization extension instructions may fault if a |
772 | * reboot turns off virtualization while processes are running. | 802 | * reboot turns off virtualization while processes are running. |
773 | * Trap the fault and ignore the instruction if that happens. | 803 | * Trap the fault and ignore the instruction if that happens. |
774 | */ | 804 | */ |
775 | asmlinkage void kvm_handle_fault_on_reboot(void); | 805 | asmlinkage void kvm_spurious_fault(void); |
806 | extern bool kvm_rebooting; | ||
776 | 807 | ||
777 | #define __kvm_handle_fault_on_reboot(insn) \ | 808 | #define __kvm_handle_fault_on_reboot(insn) \ |
778 | "666: " insn "\n\t" \ | 809 | "666: " insn "\n\t" \ |
810 | "668: \n\t" \ | ||
779 | ".pushsection .fixup, \"ax\" \n" \ | 811 | ".pushsection .fixup, \"ax\" \n" \ |
780 | "667: \n\t" \ | 812 | "667: \n\t" \ |
813 | "cmpb $0, kvm_rebooting \n\t" \ | ||
814 | "jne 668b \n\t" \ | ||
781 | __ASM_SIZE(push) " $666b \n\t" \ | 815 | __ASM_SIZE(push) " $666b \n\t" \ |
782 | "jmp kvm_handle_fault_on_reboot \n\t" \ | 816 | "call kvm_spurious_fault \n\t" \ |
783 | ".popsection \n\t" \ | 817 | ".popsection \n\t" \ |
784 | ".pushsection __ex_table, \"a\" \n\t" \ | 818 | ".pushsection __ex_table, \"a\" \n\t" \ |
785 | _ASM_PTR " 666b, 667b \n\t" \ | 819 | _ASM_PTR " 666b, 667b \n\t" \ |
@@ -799,4 +833,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); | |||
799 | 833 | ||
800 | bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); | 834 | bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); |
801 | 835 | ||
836 | void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, | ||
837 | struct kvm_async_pf *work); | ||
838 | void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | ||
839 | struct kvm_async_pf *work); | ||
840 | void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, | ||
841 | struct kvm_async_pf *work); | ||
842 | bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); | ||
843 | extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); | ||
844 | |||
845 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); | ||
846 | |||
802 | #endif /* _ASM_X86_KVM_HOST_H */ | 847 | #endif /* _ASM_X86_KVM_HOST_H */ |
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@ | |||
20 | * are available. The use of 0x11 and 0x12 is deprecated | 20 | * are available. The use of 0x11 and 0x12 is deprecated |
21 | */ | 21 | */ |
22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
23 | #define KVM_FEATURE_ASYNC_PF 4 | ||
23 | 24 | ||
24 | /* The last 8 bits are used to indicate how to interpret the flags field | 25 | /* The last 8 bits are used to indicate how to interpret the flags field |
25 | * in pvclock structure. If no bits are set, all flags are ignored. | 26 | * in pvclock structure. If no bits are set, all flags are ignored. |
@@ -32,9 +33,13 @@ | |||
32 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ | 33 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ |
33 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 | 34 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 |
34 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 35 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
36 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | ||
35 | 37 | ||
36 | #define KVM_MAX_MMU_OP_BATCH 32 | 38 | #define KVM_MAX_MMU_OP_BATCH 32 |
37 | 39 | ||
40 | #define KVM_ASYNC_PF_ENABLED (1 << 0) | ||
41 | #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) | ||
42 | |||
38 | /* Operations for KVM_HC_MMU_OP */ | 43 | /* Operations for KVM_HC_MMU_OP */ |
39 | #define KVM_MMU_OP_WRITE_PTE 1 | 44 | #define KVM_MMU_OP_WRITE_PTE 1 |
40 | #define KVM_MMU_OP_FLUSH_TLB 2 | 45 | #define KVM_MMU_OP_FLUSH_TLB 2 |
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt { | |||
61 | __u64 pt_phys; | 66 | __u64 pt_phys; |
62 | }; | 67 | }; |
63 | 68 | ||
69 | #define KVM_PV_REASON_PAGE_NOT_PRESENT 1 | ||
70 | #define KVM_PV_REASON_PAGE_READY 2 | ||
71 | |||
72 | struct kvm_vcpu_pv_apf_data { | ||
73 | __u32 reason; | ||
74 | __u8 pad[60]; | ||
75 | __u32 enabled; | ||
76 | }; | ||
77 | |||
64 | #ifdef __KERNEL__ | 78 | #ifdef __KERNEL__ |
65 | #include <asm/processor.h> | 79 | #include <asm/processor.h> |
66 | 80 | ||
67 | extern void kvmclock_init(void); | 81 | extern void kvmclock_init(void); |
82 | extern int kvm_register_clock(char *txt); | ||
68 | 83 | ||
69 | 84 | ||
70 | /* This instruction is vmcall. On non-VT architectures, it will generate a | 85 | /* This instruction is vmcall. On non-VT architectures, it will generate a |
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void) | |||
160 | 175 | ||
161 | #ifdef CONFIG_KVM_GUEST | 176 | #ifdef CONFIG_KVM_GUEST |
162 | void __init kvm_guest_init(void); | 177 | void __init kvm_guest_init(void); |
178 | void kvm_async_pf_task_wait(u32 token); | ||
179 | void kvm_async_pf_task_wake(u32 token); | ||
180 | u32 kvm_read_and_reset_pf_reason(void); | ||
163 | #else | 181 | #else |
164 | #define kvm_guest_init() do { } while (0) | 182 | #define kvm_guest_init() do { } while (0) |
183 | #define kvm_async_pf_task_wait(T) do {} while(0) | ||
184 | #define kvm_async_pf_task_wake(T) do {} while(0) | ||
185 | static inline u32 kvm_read_and_reset_pf_reason(void) | ||
186 | { | ||
187 | return 0; | ||
188 | } | ||
165 | #endif | 189 | #endif |
166 | 190 | ||
167 | #endif /* __KERNEL__ */ | 191 | #endif /* __KERNEL__ */ |
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum { | |||
47 | INTERCEPT_MONITOR, | 47 | INTERCEPT_MONITOR, |
48 | INTERCEPT_MWAIT, | 48 | INTERCEPT_MWAIT, |
49 | INTERCEPT_MWAIT_COND, | 49 | INTERCEPT_MWAIT_COND, |
50 | INTERCEPT_XSETBV, | ||
50 | }; | 51 | }; |
51 | 52 | ||
52 | 53 | ||
53 | struct __attribute__ ((__packed__)) vmcb_control_area { | 54 | struct __attribute__ ((__packed__)) vmcb_control_area { |
54 | u16 intercept_cr_read; | 55 | u32 intercept_cr; |
55 | u16 intercept_cr_write; | 56 | u32 intercept_dr; |
56 | u16 intercept_dr_read; | ||
57 | u16 intercept_dr_write; | ||
58 | u32 intercept_exceptions; | 57 | u32 intercept_exceptions; |
59 | u64 intercept; | 58 | u64 intercept; |
60 | u8 reserved_1[42]; | 59 | u8 reserved_1[42]; |
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area { | |||
81 | u32 event_inj_err; | 80 | u32 event_inj_err; |
82 | u64 nested_cr3; | 81 | u64 nested_cr3; |
83 | u64 lbr_ctl; | 82 | u64 lbr_ctl; |
84 | u64 reserved_5; | 83 | u32 clean; |
84 | u32 reserved_5; | ||
85 | u64 next_rip; | 85 | u64 next_rip; |
86 | u8 reserved_6[816]; | 86 | u8 insn_len; |
87 | u8 insn_bytes[15]; | ||
88 | u8 reserved_6[800]; | ||
87 | }; | 89 | }; |
88 | 90 | ||
89 | 91 | ||
90 | #define TLB_CONTROL_DO_NOTHING 0 | 92 | #define TLB_CONTROL_DO_NOTHING 0 |
91 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 | 93 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 |
94 | #define TLB_CONTROL_FLUSH_ASID 3 | ||
95 | #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 | ||
92 | 96 | ||
93 | #define V_TPR_MASK 0x0f | 97 | #define V_TPR_MASK 0x0f |
94 | 98 | ||
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb { | |||
204 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK | 208 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK |
205 | #define SVM_SELECTOR_CODE_MASK (1 << 3) | 209 | #define SVM_SELECTOR_CODE_MASK (1 << 3) |
206 | 210 | ||
207 | #define INTERCEPT_CR0_MASK 1 | 211 | #define INTERCEPT_CR0_READ 0 |
208 | #define INTERCEPT_CR3_MASK (1 << 3) | 212 | #define INTERCEPT_CR3_READ 3 |
209 | #define INTERCEPT_CR4_MASK (1 << 4) | 213 | #define INTERCEPT_CR4_READ 4 |
210 | #define INTERCEPT_CR8_MASK (1 << 8) | 214 | #define INTERCEPT_CR8_READ 8 |
211 | 215 | #define INTERCEPT_CR0_WRITE (16 + 0) | |
212 | #define INTERCEPT_DR0_MASK 1 | 216 | #define INTERCEPT_CR3_WRITE (16 + 3) |
213 | #define INTERCEPT_DR1_MASK (1 << 1) | 217 | #define INTERCEPT_CR4_WRITE (16 + 4) |
214 | #define INTERCEPT_DR2_MASK (1 << 2) | 218 | #define INTERCEPT_CR8_WRITE (16 + 8) |
215 | #define INTERCEPT_DR3_MASK (1 << 3) | 219 | |
216 | #define INTERCEPT_DR4_MASK (1 << 4) | 220 | #define INTERCEPT_DR0_READ 0 |
217 | #define INTERCEPT_DR5_MASK (1 << 5) | 221 | #define INTERCEPT_DR1_READ 1 |
218 | #define INTERCEPT_DR6_MASK (1 << 6) | 222 | #define INTERCEPT_DR2_READ 2 |
219 | #define INTERCEPT_DR7_MASK (1 << 7) | 223 | #define INTERCEPT_DR3_READ 3 |
224 | #define INTERCEPT_DR4_READ 4 | ||
225 | #define INTERCEPT_DR5_READ 5 | ||
226 | #define INTERCEPT_DR6_READ 6 | ||
227 | #define INTERCEPT_DR7_READ 7 | ||
228 | #define INTERCEPT_DR0_WRITE (16 + 0) | ||
229 | #define INTERCEPT_DR1_WRITE (16 + 1) | ||
230 | #define INTERCEPT_DR2_WRITE (16 + 2) | ||
231 | #define INTERCEPT_DR3_WRITE (16 + 3) | ||
232 | #define INTERCEPT_DR4_WRITE (16 + 4) | ||
233 | #define INTERCEPT_DR5_WRITE (16 + 5) | ||
234 | #define INTERCEPT_DR6_WRITE (16 + 6) | ||
235 | #define INTERCEPT_DR7_WRITE (16 + 7) | ||
220 | 236 | ||
221 | #define SVM_EVTINJ_VEC_MASK 0xff | 237 | #define SVM_EVTINJ_VEC_MASK 0xff |
222 | 238 | ||
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb { | |||
246 | #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 | 262 | #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 |
247 | #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 | 263 | #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 |
248 | 264 | ||
265 | #define SVM_EXITINFO_REG_MASK 0x0F | ||
266 | |||
249 | #define SVM_EXIT_READ_CR0 0x000 | 267 | #define SVM_EXIT_READ_CR0 0x000 |
250 | #define SVM_EXIT_READ_CR3 0x003 | 268 | #define SVM_EXIT_READ_CR3 0x003 |
251 | #define SVM_EXIT_READ_CR4 0x004 | 269 | #define SVM_EXIT_READ_CR4 0x004 |
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
316 | #define SVM_EXIT_MONITOR 0x08a | 334 | #define SVM_EXIT_MONITOR 0x08a |
317 | #define SVM_EXIT_MWAIT 0x08b | 335 | #define SVM_EXIT_MWAIT 0x08b |
318 | #define SVM_EXIT_MWAIT_COND 0x08c | 336 | #define SVM_EXIT_MWAIT_COND 0x08c |
337 | #define SVM_EXIT_XSETBV 0x08d | ||
319 | #define SVM_EXIT_NPF 0x400 | 338 | #define SVM_EXIT_NPF 0x400 |
320 | 339 | ||
321 | #define SVM_EXIT_ERR -1 | 340 | #define SVM_EXIT_ERR -1 |
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void); | |||
30 | asmlinkage void stack_segment(void); | 30 | asmlinkage void stack_segment(void); |
31 | asmlinkage void general_protection(void); | 31 | asmlinkage void general_protection(void); |
32 | asmlinkage void page_fault(void); | 32 | asmlinkage void page_fault(void); |
33 | asmlinkage void async_page_fault(void); | ||
33 | asmlinkage void spurious_interrupt_bug(void); | 34 | asmlinkage void spurious_interrupt_bug(void); |
34 | asmlinkage void coprocessor_error(void); | 35 | asmlinkage void coprocessor_error(void); |
35 | asmlinkage void alignment_check(void); | 36 | asmlinkage void alignment_check(void); |
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@ | |||
66 | #define PIN_BASED_NMI_EXITING 0x00000008 | 66 | #define PIN_BASED_NMI_EXITING 0x00000008 |
67 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 | 67 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 |
68 | 68 | ||
69 | #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 | ||
69 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 | 70 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 |
71 | #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 | ||
70 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | 72 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 |
71 | #define VM_EXIT_SAVE_IA32_PAT 0x00040000 | 73 | #define VM_EXIT_SAVE_IA32_PAT 0x00040000 |
72 | #define VM_EXIT_LOAD_IA32_PAT 0x00080000 | 74 | #define VM_EXIT_LOAD_IA32_PAT 0x00080000 |
75 | #define VM_EXIT_SAVE_IA32_EFER 0x00100000 | ||
76 | #define VM_EXIT_LOAD_IA32_EFER 0x00200000 | ||
77 | #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 | ||
73 | 78 | ||
79 | #define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 | ||
74 | #define VM_ENTRY_IA32E_MODE 0x00000200 | 80 | #define VM_ENTRY_IA32E_MODE 0x00000200 |
75 | #define VM_ENTRY_SMM 0x00000400 | 81 | #define VM_ENTRY_SMM 0x00000400 |
76 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | 82 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 |
83 | #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 | ||
77 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 | 84 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 |
85 | #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 | ||
78 | 86 | ||
79 | /* VMCS Encodings */ | 87 | /* VMCS Encodings */ |
80 | enum vmcs_field { | 88 | enum vmcs_field { |
@@ -239,6 +247,7 @@ enum vmcs_field { | |||
239 | #define EXIT_REASON_TASK_SWITCH 9 | 247 | #define EXIT_REASON_TASK_SWITCH 9 |
240 | #define EXIT_REASON_CPUID 10 | 248 | #define EXIT_REASON_CPUID 10 |
241 | #define EXIT_REASON_HLT 12 | 249 | #define EXIT_REASON_HLT 12 |
250 | #define EXIT_REASON_INVD 13 | ||
242 | #define EXIT_REASON_INVLPG 14 | 251 | #define EXIT_REASON_INVLPG 14 |
243 | #define EXIT_REASON_RDPMC 15 | 252 | #define EXIT_REASON_RDPMC 15 |
244 | #define EXIT_REASON_RDTSC 16 | 253 | #define EXIT_REASON_RDTSC 16 |
@@ -296,6 +305,12 @@ enum vmcs_field { | |||
296 | #define GUEST_INTR_STATE_SMI 0x00000004 | 305 | #define GUEST_INTR_STATE_SMI 0x00000004 |
297 | #define GUEST_INTR_STATE_NMI 0x00000008 | 306 | #define GUEST_INTR_STATE_NMI 0x00000008 |
298 | 307 | ||
308 | /* GUEST_ACTIVITY_STATE flags */ | ||
309 | #define GUEST_ACTIVITY_ACTIVE 0 | ||
310 | #define GUEST_ACTIVITY_HLT 1 | ||
311 | #define GUEST_ACTIVITY_SHUTDOWN 2 | ||
312 | #define GUEST_ACTIVITY_WAIT_SIPI 3 | ||
313 | |||
299 | /* | 314 | /* |
300 | * Exit Qualifications for MOV for Control Register Access | 315 | * Exit Qualifications for MOV for Control Register Access |
301 | */ | 316 | */ |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection) | |||
1406 | CFI_ENDPROC | 1406 | CFI_ENDPROC |
1407 | END(general_protection) | 1407 | END(general_protection) |
1408 | 1408 | ||
1409 | #ifdef CONFIG_KVM_GUEST | ||
1410 | ENTRY(async_page_fault) | ||
1411 | RING0_EC_FRAME | ||
1412 | pushl $do_async_page_fault | ||
1413 | CFI_ADJUST_CFA_OFFSET 4 | ||
1414 | jmp error_code | ||
1415 | CFI_ENDPROC | ||
1416 | END(async_page_fault) | ||
1417 | #endif | ||
1418 | |||
1409 | /* | 1419 | /* |
1410 | * End of kprobes section | 1420 | * End of kprobes section |
1411 | */ | 1421 | */ |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d3b895f375d3..aed1ffbeb0c9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1329,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment | |||
1329 | #endif | 1329 | #endif |
1330 | errorentry general_protection do_general_protection | 1330 | errorentry general_protection do_general_protection |
1331 | errorentry page_fault do_page_fault | 1331 | errorentry page_fault do_page_fault |
1332 | #ifdef CONFIG_KVM_GUEST | ||
1333 | errorentry async_page_fault do_async_page_fault | ||
1334 | #endif | ||
1332 | #ifdef CONFIG_X86_MCE | 1335 | #ifdef CONFIG_X86_MCE |
1333 | paranoidzeroentry machine_check *machine_check_vector(%rip) | 1336 | paranoidzeroentry machine_check *machine_check_vector(%rip) |
1334 | #endif | 1337 | #endif |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 58bb239a2fd7..e60c38cc0eed 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk) | |||
169 | set_stopped_child_used_math(tsk); | 169 | set_stopped_child_used_math(tsk); |
170 | return 0; | 170 | return 0; |
171 | } | 171 | } |
172 | EXPORT_SYMBOL_GPL(init_fpu); | ||
172 | 173 | ||
173 | /* | 174 | /* |
174 | * The xstateregs_active() routine is the same as the fpregs_active() routine, | 175 | * The xstateregs_active() routine is the same as the fpregs_active() routine, |
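With init_fpu() exported, modular code such as KVM can make sure a task's FPU/xstate area is allocated before it is touched. A hypothetical helper along these lines (ensure_task_fpu is illustrative, not from this series) has to run in preemptible context, since init_fpu() may allocate:

#include <linux/sched.h>
#include <asm/i387.h>

/* Illustrative only: ensure @tsk has initialized FPU state. */
static int ensure_task_fpu(struct task_struct *tsk)
{
	if (tsk_used_math(tsk))
		return 0;		/* state already exists */
	return init_fpu(tsk);		/* may sleep: allocates xstate */
}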
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 63b0ec8d3d4a..8dc44662394b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -27,16 +27,37 @@ | |||
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
30 | #include <linux/notifier.h> | ||
31 | #include <linux/reboot.h> | ||
32 | #include <linux/hash.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/kprobes.h> | ||
30 | #include <asm/timer.h> | 36 | #include <asm/timer.h> |
37 | #include <asm/cpu.h> | ||
38 | #include <asm/traps.h> | ||
39 | #include <asm/desc.h> | ||
40 | #include <asm/tlbflush.h> | ||
31 | 41 | ||
32 | #define MMU_QUEUE_SIZE 1024 | 42 | #define MMU_QUEUE_SIZE 1024 |
33 | 43 | ||
44 | static int kvmapf = 1; | ||
45 | |||
46 | static int parse_no_kvmapf(char *arg) | ||
47 | { | ||
48 | kvmapf = 0; | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | early_param("no-kvmapf", parse_no_kvmapf); | ||
53 | |||
34 | struct kvm_para_state { | 54 | struct kvm_para_state { |
35 | u8 mmu_queue[MMU_QUEUE_SIZE]; | 55 | u8 mmu_queue[MMU_QUEUE_SIZE]; |
36 | int mmu_queue_len; | 56 | int mmu_queue_len; |
37 | }; | 57 | }; |
38 | 58 | ||
39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | 59 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); |
60 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | ||
40 | 61 | ||
41 | static struct kvm_para_state *kvm_para_state(void) | 62 | static struct kvm_para_state *kvm_para_state(void) |
42 | { | 63 | { |
@@ -50,6 +71,195 @@ static void kvm_io_delay(void) | |||
50 | { | 71 | { |
51 | } | 72 | } |
52 | 73 | ||
74 | #define KVM_TASK_SLEEP_HASHBITS 8 | ||
75 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) | ||
76 | |||
77 | struct kvm_task_sleep_node { | ||
78 | struct hlist_node link; | ||
79 | wait_queue_head_t wq; | ||
80 | u32 token; | ||
81 | int cpu; | ||
82 | bool halted; | ||
83 | struct mm_struct *mm; | ||
84 | }; | ||
85 | |||
86 | static struct kvm_task_sleep_head { | ||
87 | spinlock_t lock; | ||
88 | struct hlist_head list; | ||
89 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; | ||
90 | |||
91 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, | ||
92 | u32 token) | ||
93 | { | ||
94 | struct hlist_node *p; | ||
95 | |||
96 | hlist_for_each(p, &b->list) { | ||
97 | struct kvm_task_sleep_node *n = | ||
98 | hlist_entry(p, typeof(*n), link); | ||
99 | if (n->token == token) | ||
100 | return n; | ||
101 | } | ||
102 | |||
103 | return NULL; | ||
104 | } | ||
105 | |||
106 | void kvm_async_pf_task_wait(u32 token) | ||
107 | { | ||
108 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | ||
109 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | ||
110 | struct kvm_task_sleep_node n, *e; | ||
111 | DEFINE_WAIT(wait); | ||
112 | int cpu, idle; | ||
113 | |||
114 | cpu = get_cpu(); | ||
115 | idle = idle_cpu(cpu); | ||
116 | put_cpu(); | ||
117 | |||
118 | spin_lock(&b->lock); | ||
119 | e = _find_apf_task(b, token); | ||
120 | if (e) { | ||
121 | /* dummy entry exists -> wake up was delivered ahead of PF */ | ||
122 | hlist_del(&e->link); | ||
123 | kfree(e); | ||
124 | spin_unlock(&b->lock); | ||
125 | return; | ||
126 | } | ||
127 | |||
128 | n.token = token; | ||
129 | n.cpu = smp_processor_id(); | ||
130 | n.mm = current->active_mm; | ||
131 | n.halted = idle || preempt_count() > 1; | ||
132 | atomic_inc(&n.mm->mm_count); | ||
133 | init_waitqueue_head(&n.wq); | ||
134 | hlist_add_head(&n.link, &b->list); | ||
135 | spin_unlock(&b->lock); | ||
136 | |||
137 | for (;;) { | ||
138 | if (!n.halted) | ||
139 | prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); | ||
140 | if (hlist_unhashed(&n.link)) | ||
141 | break; | ||
142 | |||
143 | if (!n.halted) { | ||
144 | local_irq_enable(); | ||
145 | schedule(); | ||
146 | local_irq_disable(); | ||
147 | } else { | ||
148 | /* | ||
149 | * We cannot reschedule. So halt. | ||
150 | */ | ||
151 | native_safe_halt(); | ||
152 | local_irq_disable(); | ||
153 | } | ||
154 | } | ||
155 | if (!n.halted) | ||
156 | finish_wait(&n.wq, &wait); | ||
157 | |||
158 | return; | ||
159 | } | ||
160 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); | ||
161 | |||
162 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) | ||
163 | { | ||
164 | hlist_del_init(&n->link); | ||
165 | if (!n->mm) | ||
166 | return; | ||
167 | mmdrop(n->mm); | ||
168 | if (n->halted) | ||
169 | smp_send_reschedule(n->cpu); | ||
170 | else if (waitqueue_active(&n->wq)) | ||
171 | wake_up(&n->wq); | ||
172 | } | ||
173 | |||
174 | static void apf_task_wake_all(void) | ||
175 | { | ||
176 | int i; | ||
177 | |||
178 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { | ||
179 | struct hlist_node *p, *next; | ||
180 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; | ||
181 | spin_lock(&b->lock); | ||
182 | hlist_for_each_safe(p, next, &b->list) { | ||
183 | struct kvm_task_sleep_node *n = | ||
184 | hlist_entry(p, typeof(*n), link); | ||
185 | if (n->cpu == smp_processor_id()) | ||
186 | apf_task_wake_one(n); | ||
187 | } | ||
188 | spin_unlock(&b->lock); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | void kvm_async_pf_task_wake(u32 token) | ||
193 | { | ||
194 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | ||
195 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | ||
196 | struct kvm_task_sleep_node *n; | ||
197 | |||
198 | if (token == ~0) { | ||
199 | apf_task_wake_all(); | ||
200 | return; | ||
201 | } | ||
202 | |||
203 | again: | ||
204 | spin_lock(&b->lock); | ||
205 | n = _find_apf_task(b, token); | ||
206 | if (!n) { | ||
207 | /* | ||
208 | * async PF was not yet handled. | ||
209 | * Add dummy entry for the token. | ||
210 | */ | ||
211 | n = kmalloc(sizeof(*n), GFP_ATOMIC); | ||
212 | if (!n) { | ||
213 | /* | ||
214 | * Allocation failed! Busy wait while other cpu | ||
215 | * handles async PF. | ||
216 | */ | ||
217 | spin_unlock(&b->lock); | ||
218 | cpu_relax(); | ||
219 | goto again; | ||
220 | } | ||
221 | n->token = token; | ||
222 | n->cpu = smp_processor_id(); | ||
223 | n->mm = NULL; | ||
224 | init_waitqueue_head(&n->wq); | ||
225 | hlist_add_head(&n->link, &b->list); | ||
226 | } else | ||
227 | apf_task_wake_one(n); | ||
228 | spin_unlock(&b->lock); | ||
229 | return; | ||
230 | } | ||
231 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); | ||
232 | |||
233 | u32 kvm_read_and_reset_pf_reason(void) | ||
234 | { | ||
235 | u32 reason = 0; | ||
236 | |||
237 | if (__get_cpu_var(apf_reason).enabled) { | ||
238 | reason = __get_cpu_var(apf_reason).reason; | ||
239 | __get_cpu_var(apf_reason).reason = 0; | ||
240 | } | ||
241 | |||
242 | return reason; | ||
243 | } | ||
244 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); | ||
245 | |||
246 | dotraplinkage void __kprobes | ||
247 | do_async_page_fault(struct pt_regs *regs, unsigned long error_code) | ||
248 | { | ||
249 | switch (kvm_read_and_reset_pf_reason()) { | ||
250 | default: | ||
251 | do_page_fault(regs, error_code); | ||
252 | break; | ||
253 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | ||
254 | /* page is swapped out by the host. */ | ||
255 | kvm_async_pf_task_wait((u32)read_cr2()); | ||
256 | break; | ||
257 | case KVM_PV_REASON_PAGE_READY: | ||
258 | kvm_async_pf_task_wake((u32)read_cr2()); | ||
259 | break; | ||
260 | } | ||
261 | } | ||
262 | |||
53 | static void kvm_mmu_op(void *buffer, unsigned len) | 263 | static void kvm_mmu_op(void *buffer, unsigned len) |
54 | { | 264 | { |
55 | int r; | 265 | int r; |
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void) | |||
231 | #endif | 441 | #endif |
232 | } | 442 | } |
233 | 443 | ||
444 | void __cpuinit kvm_guest_cpu_init(void) | ||
445 | { | ||
446 | if (!kvm_para_available()) | ||
447 | return; | ||
448 | |||
449 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { | ||
450 | u64 pa = __pa(&__get_cpu_var(apf_reason)); | ||
451 | |||
452 | #ifdef CONFIG_PREEMPT | ||
453 | pa |= KVM_ASYNC_PF_SEND_ALWAYS; | ||
454 | #endif | ||
455 | wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); | ||
456 | __get_cpu_var(apf_reason).enabled = 1; | ||
457 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", | ||
458 | smp_processor_id()); | ||
459 | } | ||
460 | } | ||
461 | |||
462 | static void kvm_pv_disable_apf(void *unused) | ||
463 | { | ||
464 | if (!__get_cpu_var(apf_reason).enabled) | ||
465 | return; | ||
466 | |||
467 | wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); | ||
468 | __get_cpu_var(apf_reason).enabled = 0; | ||
469 | |||
470 | printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", | ||
471 | smp_processor_id()); | ||
472 | } | ||
473 | |||
474 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | ||
475 | unsigned long code, void *unused) | ||
476 | { | ||
477 | if (code == SYS_RESTART) | ||
478 | on_each_cpu(kvm_pv_disable_apf, NULL, 1); | ||
479 | return NOTIFY_DONE; | ||
480 | } | ||
481 | |||
482 | static struct notifier_block kvm_pv_reboot_nb = { | ||
483 | .notifier_call = kvm_pv_reboot_notify, | ||
484 | }; | ||
485 | |||
486 | #ifdef CONFIG_SMP | ||
487 | static void __init kvm_smp_prepare_boot_cpu(void) | ||
488 | { | ||
489 | #ifdef CONFIG_KVM_CLOCK | ||
490 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
491 | #endif | ||
492 | kvm_guest_cpu_init(); | ||
493 | native_smp_prepare_boot_cpu(); | ||
494 | } | ||
495 | |||
496 | static void kvm_guest_cpu_online(void *dummy) | ||
497 | { | ||
498 | kvm_guest_cpu_init(); | ||
499 | } | ||
500 | |||
501 | static void kvm_guest_cpu_offline(void *dummy) | ||
502 | { | ||
503 | kvm_pv_disable_apf(NULL); | ||
504 | apf_task_wake_all(); | ||
505 | } | ||
506 | |||
507 | static int __cpuinit kvm_cpu_notify(struct notifier_block *self, | ||
508 | unsigned long action, void *hcpu) | ||
509 | { | ||
510 | int cpu = (unsigned long)hcpu; | ||
511 | switch (action) { | ||
512 | case CPU_ONLINE: | ||
513 | case CPU_DOWN_FAILED: | ||
514 | case CPU_ONLINE_FROZEN: | ||
515 | smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); | ||
516 | break; | ||
517 | case CPU_DOWN_PREPARE: | ||
518 | case CPU_DOWN_PREPARE_FROZEN: | ||
519 | smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); | ||
520 | break; | ||
521 | default: | ||
522 | break; | ||
523 | } | ||
524 | return NOTIFY_OK; | ||
525 | } | ||
526 | |||
527 | static struct notifier_block __cpuinitdata kvm_cpu_notifier = { | ||
528 | .notifier_call = kvm_cpu_notify, | ||
529 | }; | ||
530 | #endif | ||
531 | |||
532 | static void __init kvm_apf_trap_init(void) | ||
533 | { | ||
534 | set_intr_gate(14, &async_page_fault); | ||
535 | } | ||
536 | |||
234 | void __init kvm_guest_init(void) | 537 | void __init kvm_guest_init(void) |
235 | { | 538 | { |
539 | int i; | ||
540 | |||
236 | if (!kvm_para_available()) | 541 | if (!kvm_para_available()) |
237 | return; | 542 | return; |
238 | 543 | ||
239 | paravirt_ops_setup(); | 544 | paravirt_ops_setup(); |
545 | register_reboot_notifier(&kvm_pv_reboot_nb); | ||
546 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) | ||
547 | spin_lock_init(&async_pf_sleepers[i].lock); | ||
548 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) | ||
549 | x86_init.irqs.trap_init = kvm_apf_trap_init; | ||
550 | |||
551 | #ifdef CONFIG_SMP | ||
552 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
553 | register_cpu_notifier(&kvm_cpu_notifier); | ||
554 | #else | ||
555 | kvm_guest_cpu_init(); | ||
556 | #endif | ||
240 | } | 557 | } |
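The sleeper table above is a rendezvous keyed by the async-PF token (here the low 32 bits of CR2): whichever side runs first — the "page not present" fault or the "page ready" wakeup — leaves a node in the same hash bucket for the other side to find, which is why kvm_async_pf_task_wake() allocates a dummy entry when it arrives early. A small sketch of the bucket selection both paths share:

#include <linux/hash.h>

/* Same bucket for wait and wake, per KVM_TASK_SLEEP_HASHBITS == 8. */
static inline u32 apf_bucket(u32 token)
{
	return hash_32(token, 8);
}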
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca43ce31a19c..f98d3eafe07a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = { | |||
125 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 125 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
126 | }; | 126 | }; |
127 | 127 | ||
128 | static int kvm_register_clock(char *txt) | 128 | int kvm_register_clock(char *txt) |
129 | { | 129 | { |
130 | int cpu = smp_processor_id(); | 130 | int cpu = smp_processor_id(); |
131 | int low, high, ret; | 131 | int low, high, ret; |
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) | |||
152 | } | 152 | } |
153 | #endif | 153 | #endif |
154 | 154 | ||
155 | #ifdef CONFIG_SMP | ||
156 | static void __init kvm_smp_prepare_boot_cpu(void) | ||
157 | { | ||
158 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
159 | native_smp_prepare_boot_cpu(); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | /* | 155 | /* |
164 | * After the clock is registered, the host will keep writing to the | 156 | * After the clock is registered, the host will keep writing to the |
165 | * registered memory location. If the guest happens to shutdown, this memory | 157 | * registered memory location. If the guest happens to shutdown, this memory |
@@ -206,9 +198,6 @@ void __init kvmclock_init(void) | |||
206 | x86_cpuinit.setup_percpu_clockev = | 198 | x86_cpuinit.setup_percpu_clockev = |
207 | kvm_setup_secondary_clock; | 199 | kvm_setup_secondary_clock; |
208 | #endif | 200 | #endif |
209 | #ifdef CONFIG_SMP | ||
210 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
211 | #endif | ||
212 | machine_ops.shutdown = kvm_shutdown; | 201 | machine_ops.shutdown = kvm_shutdown; |
213 | #ifdef CONFIG_KEXEC | 202 | #ifdef CONFIG_KEXEC |
214 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 203 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ddc131ff438f..50f63648ce1b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -28,6 +28,7 @@ config KVM | |||
28 | select HAVE_KVM_IRQCHIP | 28 | select HAVE_KVM_IRQCHIP |
29 | select HAVE_KVM_EVENTFD | 29 | select HAVE_KVM_EVENTFD |
30 | select KVM_APIC_ARCHITECTURE | 30 | select KVM_APIC_ARCHITECTURE |
31 | select KVM_ASYNC_PF | ||
31 | select USER_RETURN_NOTIFIER | 32 | select USER_RETURN_NOTIFIER |
32 | select KVM_MMIO | 33 | select KVM_MMIO |
33 | ---help--- | 34 | ---help--- |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31a7035c4bd9..f15501f431c8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 2 | ccflags-y += -Ivirt/kvm -Iarch/x86/kvm |
3 | 3 | ||
4 | CFLAGS_x86.o := -I. | 4 | CFLAGS_x86.o := -I. |
5 | CFLAGS_svm.o := -I. | 5 | CFLAGS_svm.o := -I. |
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | |||
9 | coalesced_mmio.o irq_comm.o eventfd.o \ | 9 | coalesced_mmio.o irq_comm.o eventfd.o \ |
10 | assigned-dev.o) | 10 | assigned-dev.o) |
11 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) | 11 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) |
12 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) | ||
12 | 13 | ||
13 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | 14 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ |
14 | i8254.o timer.o | 15 | i8254.o timer.o |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 38b6e8dafaff..caf966781d25 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -20,16 +20,8 @@ | |||
20 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | 20 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #ifndef __KERNEL__ | ||
24 | #include <stdio.h> | ||
25 | #include <stdint.h> | ||
26 | #include <public/xen.h> | ||
27 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
28 | #else | ||
29 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
30 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
31 | #define DPRINTF(x...) do {} while (0) | ||
32 | #endif | ||
33 | #include <linux/module.h> | 25 | #include <linux/module.h> |
34 | #include <asm/kvm_emulate.h> | 26 | #include <asm/kvm_emulate.h> |
35 | 27 | ||
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg) | |||
418 | } | 410 | } |
419 | 411 | ||
420 | static inline unsigned long | 412 | static inline unsigned long |
421 | register_address(struct decode_cache *c, unsigned long base, unsigned long reg) | 413 | register_address(struct decode_cache *c, unsigned long reg) |
422 | { | 414 | { |
423 | return base + address_mask(c, reg); | 415 | return address_mask(c, reg); |
424 | } | 416 | } |
425 | 417 | ||
426 | static inline void | 418 | static inline void |
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, | |||
452 | return ops->get_cached_segment_base(seg, ctxt->vcpu); | 444 | return ops->get_cached_segment_base(seg, ctxt->vcpu); |
453 | } | 445 | } |
454 | 446 | ||
455 | static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, | 447 | static unsigned seg_override(struct x86_emulate_ctxt *ctxt, |
456 | struct x86_emulate_ops *ops, | 448 | struct x86_emulate_ops *ops, |
457 | struct decode_cache *c) | 449 | struct decode_cache *c) |
458 | { | 450 | { |
459 | if (!c->has_seg_override) | 451 | if (!c->has_seg_override) |
460 | return 0; | 452 | return 0; |
461 | 453 | ||
462 | return seg_base(ctxt, ops, c->seg_override); | 454 | return c->seg_override; |
463 | } | 455 | } |
464 | 456 | ||
465 | static unsigned long es_base(struct x86_emulate_ctxt *ctxt, | 457 | static ulong linear(struct x86_emulate_ctxt *ctxt, |
466 | struct x86_emulate_ops *ops) | 458 | struct segmented_address addr) |
467 | { | 459 | { |
468 | return seg_base(ctxt, ops, VCPU_SREG_ES); | 460 | struct decode_cache *c = &ctxt->decode; |
469 | } | 461 | ulong la; |
470 | |||
471 | static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, | ||
472 | struct x86_emulate_ops *ops) | ||
473 | { | ||
474 | return seg_base(ctxt, ops, VCPU_SREG_SS); | ||
475 | } | ||
476 | 462 | ||
477 | static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | 463 | la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; |
478 | u32 error, bool valid) | 464 | if (c->ad_bytes != 8) |
479 | { | 465 | la &= (u32)-1; |
480 | ctxt->exception = vec; | 466 | return la; |
481 | ctxt->error_code = error; | ||
482 | ctxt->error_code_valid = valid; | ||
483 | } | 467 | } |
484 | 468 | ||
485 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | 469 | static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, |
470 | u32 error, bool valid) | ||
486 | { | 471 | { |
487 | emulate_exception(ctxt, GP_VECTOR, err, true); | 472 | ctxt->exception.vector = vec; |
473 | ctxt->exception.error_code = error; | ||
474 | ctxt->exception.error_code_valid = valid; | ||
475 | return X86EMUL_PROPAGATE_FAULT; | ||
488 | } | 476 | } |
489 | 477 | ||
490 | static void emulate_pf(struct x86_emulate_ctxt *ctxt) | 478 | static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) |
491 | { | 479 | { |
492 | emulate_exception(ctxt, PF_VECTOR, 0, true); | 480 | return emulate_exception(ctxt, GP_VECTOR, err, true); |
493 | } | 481 | } |
494 | 482 | ||
495 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) | 483 | static int emulate_ud(struct x86_emulate_ctxt *ctxt) |
496 | { | 484 | { |
497 | emulate_exception(ctxt, UD_VECTOR, 0, false); | 485 | return emulate_exception(ctxt, UD_VECTOR, 0, false); |
498 | } | 486 | } |
499 | 487 | ||
500 | static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | 488 | static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) |
501 | { | 489 | { |
502 | emulate_exception(ctxt, TS_VECTOR, err, true); | 490 | return emulate_exception(ctxt, TS_VECTOR, err, true); |
503 | } | 491 | } |
504 | 492 | ||
505 | static int emulate_de(struct x86_emulate_ctxt *ctxt) | 493 | static int emulate_de(struct x86_emulate_ctxt *ctxt) |
506 | { | 494 | { |
507 | emulate_exception(ctxt, DE_VECTOR, 0, false); | 495 | return emulate_exception(ctxt, DE_VECTOR, 0, false); |
508 | return X86EMUL_PROPAGATE_FAULT; | ||
509 | } | 496 | } |
510 | 497 | ||
511 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 498 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | |||
520 | cur_size = fc->end - fc->start; | 507 | cur_size = fc->end - fc->start; |
521 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); | 508 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); |
522 | rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, | 509 | rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, |
523 | size, ctxt->vcpu, NULL); | 510 | size, ctxt->vcpu, &ctxt->exception); |
524 | if (rc != X86EMUL_CONTINUE) | 511 | if (rc != X86EMUL_CONTINUE) |
525 | return rc; | 512 | return rc; |
526 | fc->end += size; | 513 | fc->end += size; |
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, | |||
564 | 551 | ||
565 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | 552 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, |
566 | struct x86_emulate_ops *ops, | 553 | struct x86_emulate_ops *ops, |
567 | ulong addr, | 554 | struct segmented_address addr, |
568 | u16 *size, unsigned long *address, int op_bytes) | 555 | u16 *size, unsigned long *address, int op_bytes) |
569 | { | 556 | { |
570 | int rc; | 557 | int rc; |
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, | |||
572 | if (op_bytes == 2) | 559 | if (op_bytes == 2) |
573 | op_bytes = 3; | 560 | op_bytes = 3; |
574 | *address = 0; | 561 | *address = 0; |
575 | rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); | 562 | rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, |
563 | ctxt->vcpu, &ctxt->exception); | ||
576 | if (rc != X86EMUL_CONTINUE) | 564 | if (rc != X86EMUL_CONTINUE) |
577 | return rc; | 565 | return rc; |
578 | rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); | 566 | addr.ea += 2; |
567 | rc = ops->read_std(linear(ctxt, addr), address, op_bytes, | ||
568 | ctxt->vcpu, &ctxt->exception); | ||
579 | return rc; | 569 | return rc; |
580 | } | 570 | } |
581 | 571 | ||
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
768 | break; | 758 | break; |
769 | } | 759 | } |
770 | } | 760 | } |
771 | op->addr.mem = modrm_ea; | 761 | op->addr.mem.ea = modrm_ea; |
772 | done: | 762 | done: |
773 | return rc; | 763 | return rc; |
774 | } | 764 | } |
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, | |||
783 | op->type = OP_MEM; | 773 | op->type = OP_MEM; |
784 | switch (c->ad_bytes) { | 774 | switch (c->ad_bytes) { |
785 | case 2: | 775 | case 2: |
786 | op->addr.mem = insn_fetch(u16, 2, c->eip); | 776 | op->addr.mem.ea = insn_fetch(u16, 2, c->eip); |
787 | break; | 777 | break; |
788 | case 4: | 778 | case 4: |
789 | op->addr.mem = insn_fetch(u32, 4, c->eip); | 779 | op->addr.mem.ea = insn_fetch(u32, 4, c->eip); |
790 | break; | 780 | break; |
791 | case 8: | 781 | case 8: |
792 | op->addr.mem = insn_fetch(u64, 8, c->eip); | 782 | op->addr.mem.ea = insn_fetch(u64, 8, c->eip); |
793 | break; | 783 | break; |
794 | } | 784 | } |
795 | done: | 785 | done: |
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c) | |||
808 | else if (c->src.bytes == 4) | 798 | else if (c->src.bytes == 4) |
809 | sv = (s32)c->src.val & (s32)mask; | 799 | sv = (s32)c->src.val & (s32)mask; |
810 | 800 | ||
811 | c->dst.addr.mem += (sv >> 3); | 801 | c->dst.addr.mem.ea += (sv >> 3); |
812 | } | 802 | } |
813 | 803 | ||
814 | /* only subword offset */ | 804 | /* only subword offset */ |
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
821 | { | 811 | { |
822 | int rc; | 812 | int rc; |
823 | struct read_cache *mc = &ctxt->decode.mem_read; | 813 | struct read_cache *mc = &ctxt->decode.mem_read; |
824 | u32 err; | ||
825 | 814 | ||
826 | while (size) { | 815 | while (size) { |
827 | int n = min(size, 8u); | 816 | int n = min(size, 8u); |
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
829 | if (mc->pos < mc->end) | 818 | if (mc->pos < mc->end) |
830 | goto read_cached; | 819 | goto read_cached; |
831 | 820 | ||
832 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, | 821 | rc = ops->read_emulated(addr, mc->data + mc->end, n, |
833 | ctxt->vcpu); | 822 | &ctxt->exception, ctxt->vcpu); |
834 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
835 | emulate_pf(ctxt); | ||
836 | if (rc != X86EMUL_CONTINUE) | 823 | if (rc != X86EMUL_CONTINUE) |
837 | return rc; | 824 | return rc; |
838 | mc->end += n; | 825 | mc->end += n; |
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
907 | struct desc_ptr dt; | 894 | struct desc_ptr dt; |
908 | u16 index = selector >> 3; | 895 | u16 index = selector >> 3; |
909 | int ret; | 896 | int ret; |
910 | u32 err; | ||
911 | ulong addr; | 897 | ulong addr; |
912 | 898 | ||
913 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 899 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
914 | 900 | ||
915 | if (dt.size < index * 8 + 7) { | 901 | if (dt.size < index * 8 + 7) |
916 | emulate_gp(ctxt, selector & 0xfffc); | 902 | return emulate_gp(ctxt, selector & 0xfffc); |
917 | return X86EMUL_PROPAGATE_FAULT; | ||
918 | } | ||
919 | addr = dt.address + index * 8; | 903 | addr = dt.address + index * 8; |
920 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 904 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, |
921 | if (ret == X86EMUL_PROPAGATE_FAULT) | 905 | &ctxt->exception); |
922 | emulate_pf(ctxt); | ||
923 | 906 | ||
924 | return ret; | 907 | return ret; |
925 | } | 908 | } |
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
931 | { | 914 | { |
932 | struct desc_ptr dt; | 915 | struct desc_ptr dt; |
933 | u16 index = selector >> 3; | 916 | u16 index = selector >> 3; |
934 | u32 err; | ||
935 | ulong addr; | 917 | ulong addr; |
936 | int ret; | 918 | int ret; |
937 | 919 | ||
938 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 920 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
939 | 921 | ||
940 | if (dt.size < index * 8 + 7) { | 922 | if (dt.size < index * 8 + 7) |
941 | emulate_gp(ctxt, selector & 0xfffc); | 923 | return emulate_gp(ctxt, selector & 0xfffc); |
942 | return X86EMUL_PROPAGATE_FAULT; | ||
943 | } | ||
944 | 924 | ||
945 | addr = dt.address + index * 8; | 925 | addr = dt.address + index * 8; |
946 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 926 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, |
947 | if (ret == X86EMUL_PROPAGATE_FAULT) | 927 | &ctxt->exception); |
948 | emulate_pf(ctxt); | ||
949 | 928 | ||
950 | return ret; | 929 | return ret; |
951 | } | 930 | } |
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1092 | { | 1071 | { |
1093 | int rc; | 1072 | int rc; |
1094 | struct decode_cache *c = &ctxt->decode; | 1073 | struct decode_cache *c = &ctxt->decode; |
1095 | u32 err; | ||
1096 | 1074 | ||
1097 | switch (c->dst.type) { | 1075 | switch (c->dst.type) { |
1098 | case OP_REG: | 1076 | case OP_REG: |
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1101 | case OP_MEM: | 1079 | case OP_MEM: |
1102 | if (c->lock_prefix) | 1080 | if (c->lock_prefix) |
1103 | rc = ops->cmpxchg_emulated( | 1081 | rc = ops->cmpxchg_emulated( |
1104 | c->dst.addr.mem, | 1082 | linear(ctxt, c->dst.addr.mem), |
1105 | &c->dst.orig_val, | 1083 | &c->dst.orig_val, |
1106 | &c->dst.val, | 1084 | &c->dst.val, |
1107 | c->dst.bytes, | 1085 | c->dst.bytes, |
1108 | &err, | 1086 | &ctxt->exception, |
1109 | ctxt->vcpu); | 1087 | ctxt->vcpu); |
1110 | else | 1088 | else |
1111 | rc = ops->write_emulated( | 1089 | rc = ops->write_emulated( |
1112 | c->dst.addr.mem, | 1090 | linear(ctxt, c->dst.addr.mem), |
1113 | &c->dst.val, | 1091 | &c->dst.val, |
1114 | c->dst.bytes, | 1092 | c->dst.bytes, |
1115 | &err, | 1093 | &ctxt->exception, |
1116 | ctxt->vcpu); | 1094 | ctxt->vcpu); |
1117 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1118 | emulate_pf(ctxt); | ||
1119 | if (rc != X86EMUL_CONTINUE) | 1095 | if (rc != X86EMUL_CONTINUE) |
1120 | return rc; | 1096 | return rc; |
1121 | break; | 1097 | break; |
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, | |||
1137 | c->dst.bytes = c->op_bytes; | 1113 | c->dst.bytes = c->op_bytes; |
1138 | c->dst.val = c->src.val; | 1114 | c->dst.val = c->src.val; |
1139 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1115 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
1140 | c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), | 1116 | c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1141 | c->regs[VCPU_REGS_RSP]); | 1117 | c->dst.addr.mem.seg = VCPU_SREG_SS; |
1142 | } | 1118 | } |
1143 | 1119 | ||
1144 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1120 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, | |||
1147 | { | 1123 | { |
1148 | struct decode_cache *c = &ctxt->decode; | 1124 | struct decode_cache *c = &ctxt->decode; |
1149 | int rc; | 1125 | int rc; |
1126 | struct segmented_address addr; | ||
1150 | 1127 | ||
1151 | rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), | 1128 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1152 | c->regs[VCPU_REGS_RSP]), | 1129 | addr.seg = VCPU_SREG_SS; |
1153 | dest, len); | 1130 | rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); |
1154 | if (rc != X86EMUL_CONTINUE) | 1131 | if (rc != X86EMUL_CONTINUE) |
1155 | return rc; | 1132 | return rc; |
1156 | 1133 | ||
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1184 | change_mask |= EFLG_IF; | 1161 | change_mask |= EFLG_IF; |
1185 | break; | 1162 | break; |
1186 | case X86EMUL_MODE_VM86: | 1163 | case X86EMUL_MODE_VM86: |
1187 | if (iopl < 3) { | 1164 | if (iopl < 3) |
1188 | emulate_gp(ctxt, 0); | 1165 | return emulate_gp(ctxt, 0); |
1189 | return X86EMUL_PROPAGATE_FAULT; | ||
1190 | } | ||
1191 | change_mask |= EFLG_IF; | 1166 | change_mask |= EFLG_IF; |
1192 | break; | 1167 | break; |
1193 | default: /* real mode */ | 1168 | default: /* real mode */ |
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1198 | *(unsigned long *)dest = | 1173 | *(unsigned long *)dest = |
1199 | (ctxt->eflags & ~change_mask) | (val & change_mask); | 1174 | (ctxt->eflags & ~change_mask) | (val & change_mask); |
1200 | 1175 | ||
1201 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1202 | emulate_pf(ctxt); | ||
1203 | |||
1204 | return rc; | 1176 | return rc; |
1205 | } | 1177 | } |
1206 | 1178 | ||
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1287 | gva_t cs_addr; | 1259 | gva_t cs_addr; |
1288 | gva_t eip_addr; | 1260 | gva_t eip_addr; |
1289 | u16 cs, eip; | 1261 | u16 cs, eip; |
1290 | u32 err; | ||
1291 | 1262 | ||
1292 | /* TODO: Add limit checks */ | 1263 | /* TODO: Add limit checks */ |
1293 | c->src.val = ctxt->eflags; | 1264 | c->src.val = ctxt->eflags; |
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1317 | eip_addr = dt.address + (irq << 2); | 1288 | eip_addr = dt.address + (irq << 2); |
1318 | cs_addr = dt.address + (irq << 2) + 2; | 1289 | cs_addr = dt.address + (irq << 2) + 2; |
1319 | 1290 | ||
1320 | rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); | 1291 | rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); |
1321 | if (rc != X86EMUL_CONTINUE) | 1292 | if (rc != X86EMUL_CONTINUE) |
1322 | return rc; | 1293 | return rc; |
1323 | 1294 | ||
1324 | rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); | 1295 | rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); |
1325 | if (rc != X86EMUL_CONTINUE) | 1296 | if (rc != X86EMUL_CONTINUE) |
1326 | return rc; | 1297 | return rc; |
1327 | 1298 | ||
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | |||
1370 | if (rc != X86EMUL_CONTINUE) | 1341 | if (rc != X86EMUL_CONTINUE) |
1371 | return rc; | 1342 | return rc; |
1372 | 1343 | ||
1373 | if (temp_eip & ~0xffff) { | 1344 | if (temp_eip & ~0xffff) |
1374 | emulate_gp(ctxt, 0); | 1345 | return emulate_gp(ctxt, 0); |
1375 | return X86EMUL_PROPAGATE_FAULT; | ||
1376 | } | ||
1377 | 1346 | ||
1378 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); | 1347 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); |
1379 | 1348 | ||
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1624 | 1593 | ||
1625 | /* syscall is not available in real mode */ | 1594 | /* syscall is not available in real mode */ |
1626 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1595 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1627 | ctxt->mode == X86EMUL_MODE_VM86) { | 1596 | ctxt->mode == X86EMUL_MODE_VM86) |
1628 | emulate_ud(ctxt); | 1597 | return emulate_ud(ctxt); |
1629 | return X86EMUL_PROPAGATE_FAULT; | ||
1630 | } | ||
1631 | 1598 | ||
1632 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1599 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1633 | 1600 | ||
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1678 | u16 cs_sel, ss_sel; | 1645 | u16 cs_sel, ss_sel; |
1679 | 1646 | ||
1680 | /* inject #GP if in real mode */ | 1647 | /* inject #GP if in real mode */ |
1681 | if (ctxt->mode == X86EMUL_MODE_REAL) { | 1648 | if (ctxt->mode == X86EMUL_MODE_REAL) |
1682 | emulate_gp(ctxt, 0); | 1649 | return emulate_gp(ctxt, 0); |
1683 | return X86EMUL_PROPAGATE_FAULT; | ||
1684 | } | ||
1685 | 1650 | ||
1686 | /* XXX sysenter/sysexit have not been tested in 64bit mode. | 1651 | /* XXX sysenter/sysexit have not been tested in 64bit mode. |
1687 | * Therefore, we inject an #UD. | 1652 | * Therefore, we inject an #UD. |
1688 | */ | 1653 | */ |
1689 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | 1654 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1690 | emulate_ud(ctxt); | 1655 | return emulate_ud(ctxt); |
1691 | return X86EMUL_PROPAGATE_FAULT; | ||
1692 | } | ||
1693 | 1656 | ||
1694 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1657 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1695 | 1658 | ||
1696 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 1659 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); |
1697 | switch (ctxt->mode) { | 1660 | switch (ctxt->mode) { |
1698 | case X86EMUL_MODE_PROT32: | 1661 | case X86EMUL_MODE_PROT32: |
1699 | if ((msr_data & 0xfffc) == 0x0) { | 1662 | if ((msr_data & 0xfffc) == 0x0) |
1700 | emulate_gp(ctxt, 0); | 1663 | return emulate_gp(ctxt, 0); |
1701 | return X86EMUL_PROPAGATE_FAULT; | ||
1702 | } | ||
1703 | break; | 1664 | break; |
1704 | case X86EMUL_MODE_PROT64: | 1665 | case X86EMUL_MODE_PROT64: |
1705 | if (msr_data == 0x0) { | 1666 | if (msr_data == 0x0) |
1706 | emulate_gp(ctxt, 0); | 1667 | return emulate_gp(ctxt, 0); |
1707 | return X86EMUL_PROPAGATE_FAULT; | ||
1708 | } | ||
1709 | break; | 1668 | break; |
1710 | } | 1669 | } |
1711 | 1670 | ||
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1745 | 1704 | ||
1746 | /* inject #GP if in real mode or Virtual 8086 mode */ | 1705 | /* inject #GP if in real mode or Virtual 8086 mode */ |
1747 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1706 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1748 | ctxt->mode == X86EMUL_MODE_VM86) { | 1707 | ctxt->mode == X86EMUL_MODE_VM86) |
1749 | emulate_gp(ctxt, 0); | 1708 | return emulate_gp(ctxt, 0); |
1750 | return X86EMUL_PROPAGATE_FAULT; | ||
1751 | } | ||
1752 | 1709 | ||
1753 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1710 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1754 | 1711 | ||
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1763 | switch (usermode) { | 1720 | switch (usermode) { |
1764 | case X86EMUL_MODE_PROT32: | 1721 | case X86EMUL_MODE_PROT32: |
1765 | cs_sel = (u16)(msr_data + 16); | 1722 | cs_sel = (u16)(msr_data + 16); |
1766 | if ((msr_data & 0xfffc) == 0x0) { | 1723 | if ((msr_data & 0xfffc) == 0x0) |
1767 | emulate_gp(ctxt, 0); | 1724 | return emulate_gp(ctxt, 0); |
1768 | return X86EMUL_PROPAGATE_FAULT; | ||
1769 | } | ||
1770 | ss_sel = (u16)(msr_data + 24); | 1725 | ss_sel = (u16)(msr_data + 24); |
1771 | break; | 1726 | break; |
1772 | case X86EMUL_MODE_PROT64: | 1727 | case X86EMUL_MODE_PROT64: |
1773 | cs_sel = (u16)(msr_data + 32); | 1728 | cs_sel = (u16)(msr_data + 32); |
1774 | if (msr_data == 0x0) { | 1729 | if (msr_data == 0x0) |
1775 | emulate_gp(ctxt, 0); | 1730 | return emulate_gp(ctxt, 0); |
1776 | return X86EMUL_PROPAGATE_FAULT; | ||
1777 | } | ||
1778 | ss_sel = cs_sel + 8; | 1731 | ss_sel = cs_sel + 8; |
1779 | cs.d = 0; | 1732 | cs.d = 0; |
1780 | cs.l = 1; | 1733 | cs.l = 1; |
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
1934 | { | 1887 | { |
1935 | struct tss_segment_16 tss_seg; | 1888 | struct tss_segment_16 tss_seg; |
1936 | int ret; | 1889 | int ret; |
1937 | u32 err, new_tss_base = get_desc_base(new_desc); | 1890 | u32 new_tss_base = get_desc_base(new_desc); |
1938 | 1891 | ||
1939 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1892 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1940 | &err); | 1893 | &ctxt->exception); |
1941 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1894 | if (ret != X86EMUL_CONTINUE) |
1942 | /* FIXME: need to provide precise fault address */ | 1895 | /* FIXME: need to provide precise fault address */ |
1943 | emulate_pf(ctxt); | ||
1944 | return ret; | 1896 | return ret; |
1945 | } | ||
1946 | 1897 | ||
1947 | save_state_to_tss16(ctxt, ops, &tss_seg); | 1898 | save_state_to_tss16(ctxt, ops, &tss_seg); |
1948 | 1899 | ||
1949 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1900 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1950 | &err); | 1901 | &ctxt->exception); |
1951 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1902 | if (ret != X86EMUL_CONTINUE) |
1952 | /* FIXME: need to provide precise fault address */ | 1903 | /* FIXME: need to provide precise fault address */ |
1953 | emulate_pf(ctxt); | ||
1954 | return ret; | 1904 | return ret; |
1955 | } | ||
1956 | 1905 | ||
1957 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1906 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1958 | &err); | 1907 | &ctxt->exception); |
1959 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1908 | if (ret != X86EMUL_CONTINUE) |
1960 | /* FIXME: need to provide precise fault address */ | 1909 | /* FIXME: need to provide precise fault address */ |
1961 | emulate_pf(ctxt); | ||
1962 | return ret; | 1910 | return ret; |
1963 | } | ||
1964 | 1911 | ||
1965 | if (old_tss_sel != 0xffff) { | 1912 | if (old_tss_sel != 0xffff) { |
1966 | tss_seg.prev_task_link = old_tss_sel; | 1913 | tss_seg.prev_task_link = old_tss_sel; |
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
1968 | ret = ops->write_std(new_tss_base, | 1915 | ret = ops->write_std(new_tss_base, |
1969 | &tss_seg.prev_task_link, | 1916 | &tss_seg.prev_task_link, |
1970 | sizeof tss_seg.prev_task_link, | 1917 | sizeof tss_seg.prev_task_link, |
1971 | ctxt->vcpu, &err); | 1918 | ctxt->vcpu, &ctxt->exception); |
1972 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1919 | if (ret != X86EMUL_CONTINUE) |
1973 | /* FIXME: need to provide precise fault address */ | 1920 | /* FIXME: need to provide precise fault address */ |
1974 | emulate_pf(ctxt); | ||
1975 | return ret; | 1921 | return ret; |
1976 | } | ||
1977 | } | 1922 | } |
1978 | 1923 | ||
1979 | return load_state_from_tss16(ctxt, ops, &tss_seg); | 1924 | return load_state_from_tss16(ctxt, ops, &tss_seg); |
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2013 | struct decode_cache *c = &ctxt->decode; | 1958 | struct decode_cache *c = &ctxt->decode; |
2014 | int ret; | 1959 | int ret; |
2015 | 1960 | ||
2016 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { | 1961 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) |
2017 | emulate_gp(ctxt, 0); | 1962 | return emulate_gp(ctxt, 0); |
2018 | return X86EMUL_PROPAGATE_FAULT; | ||
2019 | } | ||
2020 | c->eip = tss->eip; | 1963 | c->eip = tss->eip; |
2021 | ctxt->eflags = tss->eflags | 2; | 1964 | ctxt->eflags = tss->eflags | 2; |
2022 | c->regs[VCPU_REGS_RAX] = tss->eax; | 1965 | c->regs[VCPU_REGS_RAX] = tss->eax; |
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2076 | { | 2019 | { |
2077 | struct tss_segment_32 tss_seg; | 2020 | struct tss_segment_32 tss_seg; |
2078 | int ret; | 2021 | int ret; |
2079 | u32 err, new_tss_base = get_desc_base(new_desc); | 2022 | u32 new_tss_base = get_desc_base(new_desc); |
2080 | 2023 | ||
2081 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2024 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2082 | &err); | 2025 | &ctxt->exception); |
2083 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2026 | if (ret != X86EMUL_CONTINUE) |
2084 | /* FIXME: need to provide precise fault address */ | 2027 | /* FIXME: need to provide precise fault address */ |
2085 | emulate_pf(ctxt); | ||
2086 | return ret; | 2028 | return ret; |
2087 | } | ||
2088 | 2029 | ||
2089 | save_state_to_tss32(ctxt, ops, &tss_seg); | 2030 | save_state_to_tss32(ctxt, ops, &tss_seg); |
2090 | 2031 | ||
2091 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2032 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2092 | &err); | 2033 | &ctxt->exception); |
2093 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2034 | if (ret != X86EMUL_CONTINUE) |
2094 | /* FIXME: need to provide precise fault address */ | 2035 | /* FIXME: need to provide precise fault address */ |
2095 | emulate_pf(ctxt); | ||
2096 | return ret; | 2036 | return ret; |
2097 | } | ||
2098 | 2037 | ||
2099 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2038 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2100 | &err); | 2039 | &ctxt->exception); |
2101 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2040 | if (ret != X86EMUL_CONTINUE) |
2102 | /* FIXME: need to provide precise fault address */ | 2041 | /* FIXME: need to provide precise fault address */ |
2103 | emulate_pf(ctxt); | ||
2104 | return ret; | 2042 | return ret; |
2105 | } | ||
2106 | 2043 | ||
2107 | if (old_tss_sel != 0xffff) { | 2044 | if (old_tss_sel != 0xffff) { |
2108 | tss_seg.prev_task_link = old_tss_sel; | 2045 | tss_seg.prev_task_link = old_tss_sel; |
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2110 | ret = ops->write_std(new_tss_base, | 2047 | ret = ops->write_std(new_tss_base, |
2111 | &tss_seg.prev_task_link, | 2048 | &tss_seg.prev_task_link, |
2112 | sizeof tss_seg.prev_task_link, | 2049 | sizeof tss_seg.prev_task_link, |
2113 | ctxt->vcpu, &err); | 2050 | ctxt->vcpu, &ctxt->exception); |
2114 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2051 | if (ret != X86EMUL_CONTINUE) |
2115 | /* FIXME: need to provide precise fault address */ | 2052 | /* FIXME: need to provide precise fault address */ |
2116 | emulate_pf(ctxt); | ||
2117 | return ret; | 2053 | return ret; |
2118 | } | ||
2119 | } | 2054 | } |
2120 | 2055 | ||
2121 | return load_state_from_tss32(ctxt, ops, &tss_seg); | 2056 | return load_state_from_tss32(ctxt, ops, &tss_seg); |
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2146 | 2081 | ||
2147 | if (reason != TASK_SWITCH_IRET) { | 2082 | if (reason != TASK_SWITCH_IRET) { |
2148 | if ((tss_selector & 3) > next_tss_desc.dpl || | 2083 | if ((tss_selector & 3) > next_tss_desc.dpl || |
2149 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { | 2084 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) |
2150 | emulate_gp(ctxt, 0); | 2085 | return emulate_gp(ctxt, 0); |
2151 | return X86EMUL_PROPAGATE_FAULT; | ||
2152 | } | ||
2153 | } | 2086 | } |
2154 | 2087 | ||
2155 | desc_limit = desc_limit_scaled(&next_tss_desc); | 2088 | desc_limit = desc_limit_scaled(&next_tss_desc); |
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2231 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 2164 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; |
2232 | } | 2165 | } |
2233 | 2166 | ||
2234 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, | 2167 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, |
2235 | int reg, struct operand *op) | 2168 | int reg, struct operand *op) |
2236 | { | 2169 | { |
2237 | struct decode_cache *c = &ctxt->decode; | 2170 | struct decode_cache *c = &ctxt->decode; |
2238 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2171 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2239 | 2172 | ||
2240 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2173 | register_address_increment(c, &c->regs[reg], df * op->bytes); |
2241 | op->addr.mem = register_address(c, base, c->regs[reg]); | 2174 | op->addr.mem.ea = register_address(c, c->regs[reg]); |
2175 | op->addr.mem.seg = seg; | ||
2242 | } | 2176 | } |
2243 | 2177 | ||
2244 | static int em_push(struct x86_emulate_ctxt *ctxt) | 2178 | static int em_push(struct x86_emulate_ctxt *ctxt) |
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2369 | struct decode_cache *c = &ctxt->decode; | 2303 | struct decode_cache *c = &ctxt->decode; |
2370 | u64 tsc = 0; | 2304 | u64 tsc = 0; |
2371 | 2305 | ||
2372 | if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { | 2306 | if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) |
2373 | emulate_gp(ctxt, 0); | 2307 | return emulate_gp(ctxt, 0); |
2374 | return X86EMUL_PROPAGATE_FAULT; | ||
2375 | } | ||
2376 | ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); | 2308 | ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); |
2377 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | 2309 | c->regs[VCPU_REGS_RAX] = (u32)tsc; |
2378 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | 2310 | c->regs[VCPU_REGS_RDX] = tsc >> 32; |
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
2647 | 2579 | ||
2648 | op->type = OP_IMM; | 2580 | op->type = OP_IMM; |
2649 | op->bytes = size; | 2581 | op->bytes = size; |
2650 | op->addr.mem = c->eip; | 2582 | op->addr.mem.ea = c->eip; |
2651 | /* NB. Immediates are sign-extended as necessary. */ | 2583 | /* NB. Immediates are sign-extended as necessary. */ |
2652 | switch (op->bytes) { | 2584 | switch (op->bytes) { |
2653 | case 1: | 2585 | case 1: |
@@ -2678,7 +2610,7 @@ done: | |||
2678 | } | 2610 | } |
2679 | 2611 | ||
2680 | int | 2612 | int |
2681 | x86_decode_insn(struct x86_emulate_ctxt *ctxt) | 2613 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) |
2682 | { | 2614 | { |
2683 | struct x86_emulate_ops *ops = ctxt->ops; | 2615 | struct x86_emulate_ops *ops = ctxt->ops; |
2684 | struct decode_cache *c = &ctxt->decode; | 2616 | struct decode_cache *c = &ctxt->decode; |
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) | |||
2689 | struct operand memop = { .type = OP_NONE }; | 2621 | struct operand memop = { .type = OP_NONE }; |
2690 | 2622 | ||
2691 | c->eip = ctxt->eip; | 2623 | c->eip = ctxt->eip; |
2692 | c->fetch.start = c->fetch.end = c->eip; | 2624 | c->fetch.start = c->eip; |
2625 | c->fetch.end = c->fetch.start + insn_len; | ||
2626 | if (insn_len > 0) | ||
2627 | memcpy(c->fetch.data, insn, insn_len); | ||
2693 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); | 2628 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); |
2694 | 2629 | ||
2695 | switch (mode) { | 2630 | switch (mode) { |
@@ -2803,10 +2738,8 @@ done_prefixes: | |||
2803 | c->execute = opcode.u.execute; | 2738 | c->execute = opcode.u.execute; |
2804 | 2739 | ||
2805 | /* Unrecognised? */ | 2740 | /* Unrecognised? */ |
2806 | if (c->d == 0 || (c->d & Undefined)) { | 2741 | if (c->d == 0 || (c->d & Undefined)) |
2807 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
2808 | return -1; | 2742 | return -1; |
2809 | } | ||
2810 | 2743 | ||
2811 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 2744 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) |
2812 | c->op_bytes = 8; | 2745 | c->op_bytes = 8; |
@@ -2831,14 +2764,13 @@ done_prefixes: | |||
2831 | if (!c->has_seg_override) | 2764 | if (!c->has_seg_override) |
2832 | set_seg_override(c, VCPU_SREG_DS); | 2765 | set_seg_override(c, VCPU_SREG_DS); |
2833 | 2766 | ||
2834 | if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) | 2767 | memop.addr.mem.seg = seg_override(ctxt, ops, c); |
2835 | memop.addr.mem += seg_override_base(ctxt, ops, c); | ||
2836 | 2768 | ||
2837 | if (memop.type == OP_MEM && c->ad_bytes != 8) | 2769 | if (memop.type == OP_MEM && c->ad_bytes != 8) |
2838 | memop.addr.mem = (u32)memop.addr.mem; | 2770 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; |
2839 | 2771 | ||
2840 | if (memop.type == OP_MEM && c->rip_relative) | 2772 | if (memop.type == OP_MEM && c->rip_relative) |
2841 | memop.addr.mem += c->eip; | 2773 | memop.addr.mem.ea += c->eip; |
2842 | 2774 | ||
2843 | /* | 2775 | /* |
2844 | * Decode and fetch the source operand: register, memory | 2776 | * Decode and fetch the source operand: register, memory |
@@ -2890,14 +2822,14 @@ done_prefixes: | |||
2890 | case SrcSI: | 2822 | case SrcSI: |
2891 | c->src.type = OP_MEM; | 2823 | c->src.type = OP_MEM; |
2892 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 2824 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
2893 | c->src.addr.mem = | 2825 | c->src.addr.mem.ea = |
2894 | register_address(c, seg_override_base(ctxt, ops, c), | 2826 | register_address(c, c->regs[VCPU_REGS_RSI]); |
2895 | c->regs[VCPU_REGS_RSI]); | 2827 | c->src.addr.mem.seg = seg_override(ctxt, ops, c), |
2896 | c->src.val = 0; | 2828 | c->src.val = 0; |
2897 | break; | 2829 | break; |
2898 | case SrcImmFAddr: | 2830 | case SrcImmFAddr: |
2899 | c->src.type = OP_IMM; | 2831 | c->src.type = OP_IMM; |
2900 | c->src.addr.mem = c->eip; | 2832 | c->src.addr.mem.ea = c->eip; |
2901 | c->src.bytes = c->op_bytes + 2; | 2833 | c->src.bytes = c->op_bytes + 2; |
2902 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | 2834 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); |
2903 | break; | 2835 | break; |
@@ -2944,7 +2876,7 @@ done_prefixes: | |||
2944 | break; | 2876 | break; |
2945 | case DstImmUByte: | 2877 | case DstImmUByte: |
2946 | c->dst.type = OP_IMM; | 2878 | c->dst.type = OP_IMM; |
2947 | c->dst.addr.mem = c->eip; | 2879 | c->dst.addr.mem.ea = c->eip; |
2948 | c->dst.bytes = 1; | 2880 | c->dst.bytes = 1; |
2949 | c->dst.val = insn_fetch(u8, 1, c->eip); | 2881 | c->dst.val = insn_fetch(u8, 1, c->eip); |
2950 | break; | 2882 | break; |
@@ -2969,9 +2901,9 @@ done_prefixes: | |||
2969 | case DstDI: | 2901 | case DstDI: |
2970 | c->dst.type = OP_MEM; | 2902 | c->dst.type = OP_MEM; |
2971 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 2903 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
2972 | c->dst.addr.mem = | 2904 | c->dst.addr.mem.ea = |
2973 | register_address(c, es_base(ctxt, ops), | 2905 | register_address(c, c->regs[VCPU_REGS_RDI]); |
2974 | c->regs[VCPU_REGS_RDI]); | 2906 | c->dst.addr.mem.seg = VCPU_SREG_ES; |
2975 | c->dst.val = 0; | 2907 | c->dst.val = 0; |
2976 | break; | 2908 | break; |
2977 | case ImplicitOps: | 2909 | case ImplicitOps: |
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3020 | ctxt->decode.mem_read.pos = 0; | 2952 | ctxt->decode.mem_read.pos = 0; |
3021 | 2953 | ||
3022 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 2954 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { |
3023 | emulate_ud(ctxt); | 2955 | rc = emulate_ud(ctxt); |
3024 | goto done; | 2956 | goto done; |
3025 | } | 2957 | } |
3026 | 2958 | ||
3027 | /* LOCK prefix is allowed only with some instructions */ | 2959 | /* LOCK prefix is allowed only with some instructions */ |
3028 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 2960 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { |
3029 | emulate_ud(ctxt); | 2961 | rc = emulate_ud(ctxt); |
3030 | goto done; | 2962 | goto done; |
3031 | } | 2963 | } |
3032 | 2964 | ||
3033 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | 2965 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { |
3034 | emulate_ud(ctxt); | 2966 | rc = emulate_ud(ctxt); |
3035 | goto done; | 2967 | goto done; |
3036 | } | 2968 | } |
3037 | 2969 | ||
3038 | /* Privileged instruction can be executed only in CPL=0 */ | 2970 | /* Privileged instruction can be executed only in CPL=0 */ |
3039 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { | 2971 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { |
3040 | emulate_gp(ctxt, 0); | 2972 | rc = emulate_gp(ctxt, 0); |
3041 | goto done; | 2973 | goto done; |
3042 | } | 2974 | } |
3043 | 2975 | ||
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3050 | } | 2982 | } |
3051 | 2983 | ||
3052 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { | 2984 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { |
3053 | rc = read_emulated(ctxt, ops, c->src.addr.mem, | 2985 | rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), |
3054 | c->src.valptr, c->src.bytes); | 2986 | c->src.valptr, c->src.bytes); |
3055 | if (rc != X86EMUL_CONTINUE) | 2987 | if (rc != X86EMUL_CONTINUE) |
3056 | goto done; | 2988 | goto done; |
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3058 | } | 2990 | } |
3059 | 2991 | ||
3060 | if (c->src2.type == OP_MEM) { | 2992 | if (c->src2.type == OP_MEM) { |
3061 | rc = read_emulated(ctxt, ops, c->src2.addr.mem, | 2993 | rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), |
3062 | &c->src2.val, c->src2.bytes); | 2994 | &c->src2.val, c->src2.bytes); |
3063 | if (rc != X86EMUL_CONTINUE) | 2995 | if (rc != X86EMUL_CONTINUE) |
3064 | goto done; | 2996 | goto done; |
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3070 | 3002 | ||
3071 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3003 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { |
3072 | /* optimisation - avoid slow emulated read if Mov */ | 3004 | /* optimisation - avoid slow emulated read if Mov */ |
3073 | rc = read_emulated(ctxt, ops, c->dst.addr.mem, | 3005 | rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), |
3074 | &c->dst.val, c->dst.bytes); | 3006 | &c->dst.val, c->dst.bytes); |
3075 | if (rc != X86EMUL_CONTINUE) | 3007 | if (rc != X86EMUL_CONTINUE) |
3076 | goto done; | 3008 | goto done; |
@@ -3215,13 +3147,13 @@ special_insn: | |||
3215 | break; | 3147 | break; |
3216 | case 0x8c: /* mov r/m, sreg */ | 3148 | case 0x8c: /* mov r/m, sreg */ |
3217 | if (c->modrm_reg > VCPU_SREG_GS) { | 3149 | if (c->modrm_reg > VCPU_SREG_GS) { |
3218 | emulate_ud(ctxt); | 3150 | rc = emulate_ud(ctxt); |
3219 | goto done; | 3151 | goto done; |
3220 | } | 3152 | } |
3221 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); | 3153 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); |
3222 | break; | 3154 | break; |
3223 | case 0x8d: /* lea r16/r32, m */ | 3155 | case 0x8d: /* lea r16/r32, m */ |
3224 | c->dst.val = c->src.addr.mem; | 3156 | c->dst.val = c->src.addr.mem.ea; |
3225 | break; | 3157 | break; |
3226 | case 0x8e: { /* mov seg, r/m16 */ | 3158 | case 0x8e: { /* mov seg, r/m16 */ |
3227 | uint16_t sel; | 3159 | uint16_t sel; |
@@ -3230,7 +3162,7 @@ special_insn: | |||
3230 | 3162 | ||
3231 | if (c->modrm_reg == VCPU_SREG_CS || | 3163 | if (c->modrm_reg == VCPU_SREG_CS || |
3232 | c->modrm_reg > VCPU_SREG_GS) { | 3164 | c->modrm_reg > VCPU_SREG_GS) { |
3233 | emulate_ud(ctxt); | 3165 | rc = emulate_ud(ctxt); |
3234 | goto done; | 3166 | goto done; |
3235 | } | 3167 | } |
3236 | 3168 | ||
@@ -3268,7 +3200,6 @@ special_insn: | |||
3268 | break; | 3200 | break; |
3269 | case 0xa6 ... 0xa7: /* cmps */ | 3201 | case 0xa6 ... 0xa7: /* cmps */ |
3270 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3202 | c->dst.type = OP_NONE; /* Disable writeback. */ |
3271 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); | ||
3272 | goto cmp; | 3203 | goto cmp; |
3273 | case 0xa8 ... 0xa9: /* test ax, imm */ | 3204 | case 0xa8 ... 0xa9: /* test ax, imm */ |
3274 | goto test; | 3205 | goto test; |
@@ -3363,7 +3294,7 @@ special_insn: | |||
3363 | do_io_in: | 3294 | do_io_in: |
3364 | c->dst.bytes = min(c->dst.bytes, 4u); | 3295 | c->dst.bytes = min(c->dst.bytes, 4u); |
3365 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 3296 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { |
3366 | emulate_gp(ctxt, 0); | 3297 | rc = emulate_gp(ctxt, 0); |
3367 | goto done; | 3298 | goto done; |
3368 | } | 3299 | } |
3369 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 3300 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, |
@@ -3377,7 +3308,7 @@ special_insn: | |||
3377 | c->src.bytes = min(c->src.bytes, 4u); | 3308 | c->src.bytes = min(c->src.bytes, 4u); |
3378 | if (!emulator_io_permited(ctxt, ops, c->dst.val, | 3309 | if (!emulator_io_permited(ctxt, ops, c->dst.val, |
3379 | c->src.bytes)) { | 3310 | c->src.bytes)) { |
3380 | emulate_gp(ctxt, 0); | 3311 | rc = emulate_gp(ctxt, 0); |
3381 | goto done; | 3312 | goto done; |
3382 | } | 3313 | } |
3383 | ops->pio_out_emulated(c->src.bytes, c->dst.val, | 3314 | ops->pio_out_emulated(c->src.bytes, c->dst.val, |
@@ -3402,14 +3333,14 @@ special_insn: | |||
3402 | break; | 3333 | break; |
3403 | case 0xfa: /* cli */ | 3334 | case 0xfa: /* cli */ |
3404 | if (emulator_bad_iopl(ctxt, ops)) { | 3335 | if (emulator_bad_iopl(ctxt, ops)) { |
3405 | emulate_gp(ctxt, 0); | 3336 | rc = emulate_gp(ctxt, 0); |
3406 | goto done; | 3337 | goto done; |
3407 | } else | 3338 | } else |
3408 | ctxt->eflags &= ~X86_EFLAGS_IF; | 3339 | ctxt->eflags &= ~X86_EFLAGS_IF; |
3409 | break; | 3340 | break; |
3410 | case 0xfb: /* sti */ | 3341 | case 0xfb: /* sti */ |
3411 | if (emulator_bad_iopl(ctxt, ops)) { | 3342 | if (emulator_bad_iopl(ctxt, ops)) { |
3412 | emulate_gp(ctxt, 0); | 3343 | rc = emulate_gp(ctxt, 0); |
3413 | goto done; | 3344 | goto done; |
3414 | } else { | 3345 | } else { |
3415 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | 3346 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; |
@@ -3449,11 +3380,11 @@ writeback: | |||
3449 | c->dst.type = saved_dst_type; | 3380 | c->dst.type = saved_dst_type; |
3450 | 3381 | ||
3451 | if ((c->d & SrcMask) == SrcSI) | 3382 | if ((c->d & SrcMask) == SrcSI) |
3452 | string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), | 3383 | string_addr_inc(ctxt, seg_override(ctxt, ops, c), |
3453 | VCPU_REGS_RSI, &c->src); | 3384 | VCPU_REGS_RSI, &c->src); |
3454 | 3385 | ||
3455 | if ((c->d & DstMask) == DstDI) | 3386 | if ((c->d & DstMask) == DstDI) |
3456 | string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, | 3387 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, |
3457 | &c->dst); | 3388 | &c->dst); |
3458 | 3389 | ||
3459 | if (c->rep_prefix && (c->d & String)) { | 3390 | if (c->rep_prefix && (c->d & String)) { |
@@ -3482,6 +3413,8 @@ writeback: | |||
3482 | ctxt->eip = c->eip; | 3413 | ctxt->eip = c->eip; |
3483 | 3414 | ||
3484 | done: | 3415 | done: |
3416 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
3417 | ctxt->have_exception = true; | ||
3485 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 3418 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
3486 | 3419 | ||
3487 | twobyte_insn: | 3420 | twobyte_insn: |
@@ -3544,9 +3477,11 @@ twobyte_insn: | |||
3544 | break; | 3477 | break; |
3545 | case 5: /* not defined */ | 3478 | case 5: /* not defined */ |
3546 | emulate_ud(ctxt); | 3479 | emulate_ud(ctxt); |
3480 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3547 | goto done; | 3481 | goto done; |
3548 | case 7: /* invlpg*/ | 3482 | case 7: /* invlpg*/ |
3549 | emulate_invlpg(ctxt->vcpu, c->src.addr.mem); | 3483 | emulate_invlpg(ctxt->vcpu, |
3484 | linear(ctxt, c->src.addr.mem)); | ||
3550 | /* Disable writeback. */ | 3485 | /* Disable writeback. */ |
3551 | c->dst.type = OP_NONE; | 3486 | c->dst.type = OP_NONE; |
3552 | break; | 3487 | break; |
@@ -3573,6 +3508,7 @@ twobyte_insn: | |||
3573 | case 5 ... 7: | 3508 | case 5 ... 7: |
3574 | case 9 ... 15: | 3509 | case 9 ... 15: |
3575 | emulate_ud(ctxt); | 3510 | emulate_ud(ctxt); |
3511 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3576 | goto done; | 3512 | goto done; |
3577 | } | 3513 | } |
3578 | c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); | 3514 | c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); |
@@ -3581,6 +3517,7 @@ twobyte_insn: | |||
3581 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3517 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3582 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3518 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3583 | emulate_ud(ctxt); | 3519 | emulate_ud(ctxt); |
3520 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3584 | goto done; | 3521 | goto done; |
3585 | } | 3522 | } |
3586 | ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); | 3523 | ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); |
@@ -3588,6 +3525,7 @@ twobyte_insn: | |||
3588 | case 0x22: /* mov reg, cr */ | 3525 | case 0x22: /* mov reg, cr */ |
3589 | if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { | 3526 | if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { |
3590 | emulate_gp(ctxt, 0); | 3527 | emulate_gp(ctxt, 0); |
3528 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3591 | goto done; | 3529 | goto done; |
3592 | } | 3530 | } |
3593 | c->dst.type = OP_NONE; | 3531 | c->dst.type = OP_NONE; |
@@ -3596,6 +3534,7 @@ twobyte_insn: | |||
3596 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3534 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3597 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3535 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3598 | emulate_ud(ctxt); | 3536 | emulate_ud(ctxt); |
3537 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3599 | goto done; | 3538 | goto done; |
3600 | } | 3539 | } |
3601 | 3540 | ||
@@ -3604,6 +3543,7 @@ twobyte_insn: | |||
3604 | ~0ULL : ~0U), ctxt->vcpu) < 0) { | 3543 | ~0ULL : ~0U), ctxt->vcpu) < 0) { |
3605 | /* #UD condition is already handled by the code above */ | 3544 | /* #UD condition is already handled by the code above */ |
3606 | emulate_gp(ctxt, 0); | 3545 | emulate_gp(ctxt, 0); |
3546 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3607 | goto done; | 3547 | goto done; |
3608 | } | 3548 | } |
3609 | 3549 | ||
@@ -3615,6 +3555,7 @@ twobyte_insn: | |||
3615 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 3555 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); |
3616 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { | 3556 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { |
3617 | emulate_gp(ctxt, 0); | 3557 | emulate_gp(ctxt, 0); |
3558 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3618 | goto done; | 3559 | goto done; |
3619 | } | 3560 | } |
3620 | rc = X86EMUL_CONTINUE; | 3561 | rc = X86EMUL_CONTINUE; |
@@ -3623,6 +3564,7 @@ twobyte_insn: | |||
3623 | /* rdmsr */ | 3564 | /* rdmsr */ |
3624 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { | 3565 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { |
3625 | emulate_gp(ctxt, 0); | 3566 | emulate_gp(ctxt, 0); |
3567 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3626 | goto done; | 3568 | goto done; |
3627 | } else { | 3569 | } else { |
3628 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 3570 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
@@ -3785,6 +3727,5 @@ twobyte_insn: | |||
3785 | goto writeback; | 3727 | goto writeback; |
3786 | 3728 | ||
3787 | cannot_emulate: | 3729 | cannot_emulate: |
3788 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
3789 | return -1; | 3730 | return -1; |
3790 | } | 3731 | } |
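Editor's note: the emulate.c hunks above make two recurring changes. Memory operands now carry a (segment, effective address) pair instead of a pre-linearized pointer, and the emulate_ud()/emulate_gp() helpers now produce a fault code that is stored in rc so the done: label can mark ctxt->have_exception. Below is a minimal sketch of the first idea only; the struct layout and the seg_base() lookup are simplified stand-ins for illustration, not the kernel's actual definitions.

/*
 * Simplified illustration (not the kernel's exact types): a memory
 * operand keeps the effective address and the segment separately,
 * and is only turned into a linear address at access time.
 */
#include <stdio.h>

typedef unsigned long ulong;

struct seg_addr {
	ulong ea;       /* effective address within the segment */
	unsigned seg;   /* segment register index, e.g. ES */
};

/* hypothetical segment-base lookup; real code asks the emulator ops */
static ulong seg_base(unsigned seg)
{
	static const ulong bases[] = { 0x0, 0x10000, 0x20000 };
	return bases[seg % 3];
}

static ulong linearize(struct seg_addr a)
{
	return seg_base(a.seg) + a.ea;   /* segment base + offset */
}

int main(void)
{
	struct seg_addr dst = { .ea = 0x1234, .seg = 2 };  /* like dst.addr.mem.{ea,seg} */
	printf("linear = 0x%lx\n", linearize(dst));
	return 0;
}

Keeping the segment separate until the access actually happens lets the emulator apply segment overrides and per-access checks at read/write time rather than baking the base in at decode time.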
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 975bb45329a1..3377d53fcd36 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) | |||
73 | return vcpu->arch.cr4 & mask; | 73 | return vcpu->arch.cr4 & mask; |
74 | } | 74 | } |
75 | 75 | ||
76 | static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) | ||
77 | { | ||
78 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
79 | kvm_x86_ops->decache_cr3(vcpu); | ||
80 | return vcpu->arch.cr3; | ||
81 | } | ||
82 | |||
76 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) | 83 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) |
77 | { | 84 | { |
78 | return kvm_read_cr4_bits(vcpu, ~0UL); | 85 | return kvm_read_cr4_bits(vcpu, ~0UL); |
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | |||
84 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); | 91 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); |
85 | } | 92 | } |
86 | 93 | ||
94 | static inline void enter_guest_mode(struct kvm_vcpu *vcpu) | ||
95 | { | ||
96 | vcpu->arch.hflags |= HF_GUEST_MASK; | ||
97 | } | ||
98 | |||
99 | static inline void leave_guest_mode(struct kvm_vcpu *vcpu) | ||
100 | { | ||
101 | vcpu->arch.hflags &= ~HF_GUEST_MASK; | ||
102 | } | ||
103 | |||
104 | static inline bool is_guest_mode(struct kvm_vcpu *vcpu) | ||
105 | { | ||
106 | return vcpu->arch.hflags & HF_GUEST_MASK; | ||
107 | } | ||
108 | |||
87 | #endif | 109 | #endif |
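Editor's note: the new kvm_read_cr3() above follows a lazy-caching pattern, fetching CR3 from hardware only when its regs_avail bit is clear and otherwise serving the cached vcpu->arch.cr3. The sketch below shows that pattern with a plain bool standing in for the regs_avail bitmap; decache_cr3() and the value it returns are invented for illustration.

/* Minimal sketch of the "decache on demand" pattern, with made-up types. */
#include <stdbool.h>
#include <stdio.h>

struct vcpu {
	unsigned long cr3;
	bool cr3_avail;           /* stands in for the regs_avail bit */
};

/* pretend this reads the value back from the VMCS/VMCB */
static void decache_cr3(struct vcpu *v)
{
	v->cr3 = 0xdeadb000;      /* hypothetical hardware value */
	v->cr3_avail = true;
}

static unsigned long read_cr3(struct vcpu *v)
{
	if (!v->cr3_avail)        /* fetch from hardware only once */
		decache_cr3(v);
	return v->cr3;
}

int main(void)
{
	struct vcpu v = { 0 };
	printf("cr3 = 0x%lx\n", read_cr3(&v));   /* triggers the decache */
	printf("cr3 = 0x%lx\n", read_cr3(&v));   /* served from the cache */
	return 0;
}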
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 413f8973a855..93cf9d0d3653 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) | |||
277 | 277 | ||
278 | if (old_ppr != ppr) { | 278 | if (old_ppr != ppr) { |
279 | apic_set_reg(apic, APIC_PROCPRI, ppr); | 279 | apic_set_reg(apic, APIC_PROCPRI, ppr); |
280 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | 280 | if (ppr < old_ppr) |
281 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
281 | } | 282 | } |
282 | } | 283 | } |
283 | 284 | ||
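Editor's note: the lapic.c change requests a KVM_REQ_EVENT re-check only when the PPR drops, on the reasoning that a pending interrupt can only become deliverable when the processor priority gets lower, never when it rises. The toy model below encodes that assumption with the usual "priority class of the vector must exceed the PPR class" test; the numbers are arbitrary and the function is not the lapic.c code.

/* Toy model of the PPR check (assumed semantics, not the lapic.c code). */
#include <stdbool.h>
#include <stdio.h>

/* an interrupt vector is deliverable when its priority class beats the PPR */
static bool deliverable(unsigned vector, unsigned ppr)
{
	return (vector & 0xf0) > (ppr & 0xf0);
}

int main(void)
{
	unsigned vector = 0x51;                 /* pending vector, class 5 */
	unsigned old_ppr = 0x60, new_ppr = 0x40;

	/* raising the PPR can only block more vectors ... */
	printf("old ppr: %s\n", deliverable(vector, old_ppr) ? "deliver" : "blocked");
	/* ... so only a drop in PPR needs to re-check pending interrupts */
	printf("new ppr: %s\n", deliverable(vector, new_ppr) ? "deliver" : "blocked");
	return 0;
}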
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbb04aee8301..9cafbb499813 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -18,9 +18,11 @@ | |||
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "irq.h" | ||
21 | #include "mmu.h" | 22 | #include "mmu.h" |
22 | #include "x86.h" | 23 | #include "x86.h" |
23 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
25 | #include "x86.h" | ||
24 | 26 | ||
25 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
26 | #include <linux/types.h> | 28 | #include <linux/types.h> |
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages; | |||
194 | 196 | ||
195 | static u64 __read_mostly shadow_trap_nonpresent_pte; | 197 | static u64 __read_mostly shadow_trap_nonpresent_pte; |
196 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | 198 | static u64 __read_mostly shadow_notrap_nonpresent_pte; |
197 | static u64 __read_mostly shadow_base_present_pte; | ||
198 | static u64 __read_mostly shadow_nx_mask; | 199 | static u64 __read_mostly shadow_nx_mask; |
199 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ | 200 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
200 | static u64 __read_mostly shadow_user_mask; | 201 | static u64 __read_mostly shadow_user_mask; |
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | |||
213 | } | 214 | } |
214 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | 215 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); |
215 | 216 | ||
216 | void kvm_mmu_set_base_ptes(u64 base_pte) | ||
217 | { | ||
218 | shadow_base_present_pte = base_pte; | ||
219 | } | ||
220 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | ||
221 | |||
222 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 217 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
223 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 218 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
224 | { | 219 | { |
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | |||
482 | } | 477 | } |
483 | 478 | ||
484 | /* | 479 | /* |
485 | * Return the pointer to the largepage write count for a given | 480 | * Return the pointer to the large page information for a given gfn, |
486 | * gfn, handling slots that are not large page aligned. | 481 | * handling slots that are not large page aligned. |
487 | */ | 482 | */ |
488 | static int *slot_largepage_idx(gfn_t gfn, | 483 | static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, |
489 | struct kvm_memory_slot *slot, | 484 | struct kvm_memory_slot *slot, |
490 | int level) | 485 | int level) |
491 | { | 486 | { |
492 | unsigned long idx; | 487 | unsigned long idx; |
493 | 488 | ||
494 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 489 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
495 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 490 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
496 | return &slot->lpage_info[level - 2][idx].write_count; | 491 | return &slot->lpage_info[level - 2][idx]; |
497 | } | 492 | } |
498 | 493 | ||
499 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 494 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
500 | { | 495 | { |
501 | struct kvm_memory_slot *slot; | 496 | struct kvm_memory_slot *slot; |
502 | int *write_count; | 497 | struct kvm_lpage_info *linfo; |
503 | int i; | 498 | int i; |
504 | 499 | ||
505 | slot = gfn_to_memslot(kvm, gfn); | 500 | slot = gfn_to_memslot(kvm, gfn); |
506 | for (i = PT_DIRECTORY_LEVEL; | 501 | for (i = PT_DIRECTORY_LEVEL; |
507 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 502 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
508 | write_count = slot_largepage_idx(gfn, slot, i); | 503 | linfo = lpage_info_slot(gfn, slot, i); |
509 | *write_count += 1; | 504 | linfo->write_count += 1; |
510 | } | 505 | } |
511 | } | 506 | } |
512 | 507 | ||
513 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 508 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
514 | { | 509 | { |
515 | struct kvm_memory_slot *slot; | 510 | struct kvm_memory_slot *slot; |
516 | int *write_count; | 511 | struct kvm_lpage_info *linfo; |
517 | int i; | 512 | int i; |
518 | 513 | ||
519 | slot = gfn_to_memslot(kvm, gfn); | 514 | slot = gfn_to_memslot(kvm, gfn); |
520 | for (i = PT_DIRECTORY_LEVEL; | 515 | for (i = PT_DIRECTORY_LEVEL; |
521 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 516 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
522 | write_count = slot_largepage_idx(gfn, slot, i); | 517 | linfo = lpage_info_slot(gfn, slot, i); |
523 | *write_count -= 1; | 518 | linfo->write_count -= 1; |
524 | WARN_ON(*write_count < 0); | 519 | WARN_ON(linfo->write_count < 0); |
525 | } | 520 | } |
526 | } | 521 | } |
527 | 522 | ||
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm, | |||
530 | int level) | 525 | int level) |
531 | { | 526 | { |
532 | struct kvm_memory_slot *slot; | 527 | struct kvm_memory_slot *slot; |
533 | int *largepage_idx; | 528 | struct kvm_lpage_info *linfo; |
534 | 529 | ||
535 | slot = gfn_to_memslot(kvm, gfn); | 530 | slot = gfn_to_memslot(kvm, gfn); |
536 | if (slot) { | 531 | if (slot) { |
537 | largepage_idx = slot_largepage_idx(gfn, slot, level); | 532 | linfo = lpage_info_slot(gfn, slot, level); |
538 | return *largepage_idx; | 533 | return linfo->write_count; |
539 | } | 534 | } |
540 | 535 | ||
541 | return 1; | 536 | return 1; |
@@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
590 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 585 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
591 | { | 586 | { |
592 | struct kvm_memory_slot *slot; | 587 | struct kvm_memory_slot *slot; |
593 | unsigned long idx; | 588 | struct kvm_lpage_info *linfo; |
594 | 589 | ||
595 | slot = gfn_to_memslot(kvm, gfn); | 590 | slot = gfn_to_memslot(kvm, gfn); |
596 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 591 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
597 | return &slot->rmap[gfn - slot->base_gfn]; | 592 | return &slot->rmap[gfn - slot->base_gfn]; |
598 | 593 | ||
599 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 594 | linfo = lpage_info_slot(gfn, slot, level); |
600 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | ||
601 | 595 | ||
602 | return &slot->lpage_info[level - 2][idx].rmap_pde; | 596 | return &linfo->rmap_pde; |
603 | } | 597 | } |
604 | 598 | ||
605 | /* | 599 | /* |
@@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
887 | end = start + (memslot->npages << PAGE_SHIFT); | 881 | end = start + (memslot->npages << PAGE_SHIFT); |
888 | if (hva >= start && hva < end) { | 882 | if (hva >= start && hva < end) { |
889 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 883 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
884 | gfn_t gfn = memslot->base_gfn + gfn_offset; | ||
890 | 885 | ||
891 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 886 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); |
892 | 887 | ||
893 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 888 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
894 | unsigned long idx; | 889 | struct kvm_lpage_info *linfo; |
895 | int sh; | 890 | |
896 | 891 | linfo = lpage_info_slot(gfn, memslot, | |
897 | sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); | 892 | PT_DIRECTORY_LEVEL + j); |
898 | idx = ((memslot->base_gfn+gfn_offset) >> sh) - | 893 | ret |= handler(kvm, &linfo->rmap_pde, data); |
899 | (memslot->base_gfn >> sh); | ||
900 | ret |= handler(kvm, | ||
901 | &memslot->lpage_info[j][idx].rmap_pde, | ||
902 | data); | ||
903 | } | 894 | } |
904 | trace_kvm_age_page(hva, memslot, ret); | 895 | trace_kvm_age_page(hva, memslot, ret); |
905 | retval |= ret; | 896 | retval |= ret; |
@@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
1161 | } | 1152 | } |
1162 | 1153 | ||
1163 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1154 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1164 | struct kvm_mmu_page *sp, bool clear_unsync) | 1155 | struct kvm_mmu_page *sp) |
1165 | { | 1156 | { |
1166 | return 1; | 1157 | return 1; |
1167 | } | 1158 | } |
@@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
1291 | if (clear_unsync) | 1282 | if (clear_unsync) |
1292 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1283 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1293 | 1284 | ||
1294 | if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { | 1285 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { |
1295 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); | 1286 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1296 | return 1; | 1287 | return 1; |
1297 | } | 1288 | } |
@@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1332 | continue; | 1323 | continue; |
1333 | 1324 | ||
1334 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | 1325 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); |
1326 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1335 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || | 1327 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || |
1336 | (vcpu->arch.mmu.sync_page(vcpu, s, true))) { | 1328 | (vcpu->arch.mmu.sync_page(vcpu, s))) { |
1337 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); | 1329 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); |
1338 | continue; | 1330 | continue; |
1339 | } | 1331 | } |
1340 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1341 | flush = true; | 1332 | flush = true; |
1342 | } | 1333 | } |
1343 | 1334 | ||
@@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1963 | unsigned pte_access, int user_fault, | 1954 | unsigned pte_access, int user_fault, |
1964 | int write_fault, int dirty, int level, | 1955 | int write_fault, int dirty, int level, |
1965 | gfn_t gfn, pfn_t pfn, bool speculative, | 1956 | gfn_t gfn, pfn_t pfn, bool speculative, |
1966 | bool can_unsync, bool reset_host_protection) | 1957 | bool can_unsync, bool host_writable) |
1967 | { | 1958 | { |
1968 | u64 spte; | 1959 | u64 spte, entry = *sptep; |
1969 | int ret = 0; | 1960 | int ret = 0; |
1970 | 1961 | ||
1971 | /* | 1962 | /* |
@@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1973 | * whether the guest actually used the pte (in order to detect | 1964 | * whether the guest actually used the pte (in order to detect |
1974 | * demand paging). | 1965 | * demand paging). |
1975 | */ | 1966 | */ |
1976 | spte = shadow_base_present_pte; | 1967 | spte = PT_PRESENT_MASK; |
1977 | if (!speculative) | 1968 | if (!speculative) |
1978 | spte |= shadow_accessed_mask; | 1969 | spte |= shadow_accessed_mask; |
1979 | if (!dirty) | 1970 | if (!dirty) |
@@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1990 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 1981 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
1991 | kvm_is_mmio_pfn(pfn)); | 1982 | kvm_is_mmio_pfn(pfn)); |
1992 | 1983 | ||
1993 | if (reset_host_protection) | 1984 | if (host_writable) |
1994 | spte |= SPTE_HOST_WRITEABLE; | 1985 | spte |= SPTE_HOST_WRITEABLE; |
1986 | else | ||
1987 | pte_access &= ~ACC_WRITE_MASK; | ||
1995 | 1988 | ||
1996 | spte |= (u64)pfn << PAGE_SHIFT; | 1989 | spte |= (u64)pfn << PAGE_SHIFT; |
1997 | 1990 | ||
@@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2036 | 2029 | ||
2037 | set_pte: | 2030 | set_pte: |
2038 | update_spte(sptep, spte); | 2031 | update_spte(sptep, spte); |
2032 | /* | ||
2033 | * If we overwrite a writable spte with a read-only one we | ||
2034 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
2035 | * will find a read-only spte, even though the writable spte | ||
2036 | * might be cached on a CPU's TLB. | ||
2037 | */ | ||
2038 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
2039 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
2039 | done: | 2040 | done: |
2040 | return ret; | 2041 | return ret; |
2041 | } | 2042 | } |
@@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2045 | int user_fault, int write_fault, int dirty, | 2046 | int user_fault, int write_fault, int dirty, |
2046 | int *ptwrite, int level, gfn_t gfn, | 2047 | int *ptwrite, int level, gfn_t gfn, |
2047 | pfn_t pfn, bool speculative, | 2048 | pfn_t pfn, bool speculative, |
2048 | bool reset_host_protection) | 2049 | bool host_writable) |
2049 | { | 2050 | { |
2050 | int was_rmapped = 0; | 2051 | int was_rmapped = 0; |
2051 | int rmap_count; | 2052 | int rmap_count; |
@@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2080 | 2081 | ||
2081 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2082 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2082 | dirty, level, gfn, pfn, speculative, true, | 2083 | dirty, level, gfn, pfn, speculative, true, |
2083 | reset_host_protection)) { | 2084 | host_writable)) { |
2084 | if (write_fault) | 2085 | if (write_fault) |
2085 | *ptwrite = 1; | 2086 | *ptwrite = 1; |
2086 | kvm_mmu_flush_tlb(vcpu); | 2087 | kvm_mmu_flush_tlb(vcpu); |
@@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | |||
2211 | } | 2212 | } |
2212 | 2213 | ||
2213 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 2214 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
2214 | int level, gfn_t gfn, pfn_t pfn) | 2215 | int map_writable, int level, gfn_t gfn, pfn_t pfn, |
2216 | bool prefault) | ||
2215 | { | 2217 | { |
2216 | struct kvm_shadow_walk_iterator iterator; | 2218 | struct kvm_shadow_walk_iterator iterator; |
2217 | struct kvm_mmu_page *sp; | 2219 | struct kvm_mmu_page *sp; |
@@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2220 | 2222 | ||
2221 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2223 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
2222 | if (iterator.level == level) { | 2224 | if (iterator.level == level) { |
2223 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 2225 | unsigned pte_access = ACC_ALL; |
2226 | |||
2227 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | ||
2224 | 0, write, 1, &pt_write, | 2228 | 0, write, 1, &pt_write, |
2225 | level, gfn, pfn, false, true); | 2229 | level, gfn, pfn, prefault, map_writable); |
2226 | direct_pte_prefetch(vcpu, iterator.sptep); | 2230 | direct_pte_prefetch(vcpu, iterator.sptep); |
2227 | ++vcpu->stat.pf_fixed; | 2231 | ++vcpu->stat.pf_fixed; |
2228 | break; | 2232 | break; |
@@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | |||
2277 | return 1; | 2281 | return 1; |
2278 | } | 2282 | } |
2279 | 2283 | ||
2280 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 2284 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2285 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | ||
2286 | |||
2287 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | ||
2288 | bool prefault) | ||
2281 | { | 2289 | { |
2282 | int r; | 2290 | int r; |
2283 | int level; | 2291 | int level; |
2284 | pfn_t pfn; | 2292 | pfn_t pfn; |
2285 | unsigned long mmu_seq; | 2293 | unsigned long mmu_seq; |
2294 | bool map_writable; | ||
2286 | 2295 | ||
2287 | level = mapping_level(vcpu, gfn); | 2296 | level = mapping_level(vcpu, gfn); |
2288 | 2297 | ||
@@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
2297 | 2306 | ||
2298 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2307 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2299 | smp_rmb(); | 2308 | smp_rmb(); |
2300 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2309 | |
2310 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | ||
2311 | return 0; | ||
2301 | 2312 | ||
2302 | /* mmio */ | 2313 | /* mmio */ |
2303 | if (is_error_pfn(pfn)) | 2314 | if (is_error_pfn(pfn)) |
@@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
2307 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2318 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2308 | goto out_unlock; | 2319 | goto out_unlock; |
2309 | kvm_mmu_free_some_pages(vcpu); | 2320 | kvm_mmu_free_some_pages(vcpu); |
2310 | r = __direct_map(vcpu, v, write, level, gfn, pfn); | 2321 | r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, |
2322 | prefault); | ||
2311 | spin_unlock(&vcpu->kvm->mmu_lock); | 2323 | spin_unlock(&vcpu->kvm->mmu_lock); |
2312 | 2324 | ||
2313 | 2325 | ||
@@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2530 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2542 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2531 | sp = page_header(root); | 2543 | sp = page_header(root); |
2532 | mmu_sync_children(vcpu, sp); | 2544 | mmu_sync_children(vcpu, sp); |
2545 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2533 | return; | 2546 | return; |
2534 | } | 2547 | } |
2535 | for (i = 0; i < 4; ++i) { | 2548 | for (i = 0; i < 4; ++i) { |
@@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2552 | } | 2565 | } |
2553 | 2566 | ||
2554 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | 2567 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, |
2555 | u32 access, u32 *error) | 2568 | u32 access, struct x86_exception *exception) |
2556 | { | 2569 | { |
2557 | if (error) | 2570 | if (exception) |
2558 | *error = 0; | 2571 | exception->error_code = 0; |
2559 | return vaddr; | 2572 | return vaddr; |
2560 | } | 2573 | } |
2561 | 2574 | ||
2562 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | 2575 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, |
2563 | u32 access, u32 *error) | 2576 | u32 access, |
2577 | struct x86_exception *exception) | ||
2564 | { | 2578 | { |
2565 | if (error) | 2579 | if (exception) |
2566 | *error = 0; | 2580 | exception->error_code = 0; |
2567 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | 2581 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); |
2568 | } | 2582 | } |
2569 | 2583 | ||
2570 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2584 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2571 | u32 error_code) | 2585 | u32 error_code, bool prefault) |
2572 | { | 2586 | { |
2573 | gfn_t gfn; | 2587 | gfn_t gfn; |
2574 | int r; | 2588 | int r; |
@@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2584 | gfn = gva >> PAGE_SHIFT; | 2598 | gfn = gva >> PAGE_SHIFT; |
2585 | 2599 | ||
2586 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 2600 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
2587 | error_code & PFERR_WRITE_MASK, gfn); | 2601 | error_code & PFERR_WRITE_MASK, gfn, prefault); |
2602 | } | ||
2603 | |||
2604 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | ||
2605 | { | ||
2606 | struct kvm_arch_async_pf arch; | ||
2607 | |||
2608 | arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; | ||
2609 | arch.gfn = gfn; | ||
2610 | arch.direct_map = vcpu->arch.mmu.direct_map; | ||
2611 | arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); | ||
2612 | |||
2613 | return kvm_setup_async_pf(vcpu, gva, gfn, &arch); | ||
2588 | } | 2614 | } |
2589 | 2615 | ||
2590 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | 2616 | static bool can_do_async_pf(struct kvm_vcpu *vcpu) |
2591 | u32 error_code) | 2617 | { |
2618 | if (unlikely(!irqchip_in_kernel(vcpu->kvm) || | ||
2619 | kvm_event_needs_reinjection(vcpu))) | ||
2620 | return false; | ||
2621 | |||
2622 | return kvm_x86_ops->interrupt_allowed(vcpu); | ||
2623 | } | ||
2624 | |||
2625 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2626 | gva_t gva, pfn_t *pfn, bool write, bool *writable) | ||
2627 | { | ||
2628 | bool async; | ||
2629 | |||
2630 | *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); | ||
2631 | |||
2632 | if (!async) | ||
2633 | return false; /* *pfn has correct page already */ | ||
2634 | |||
2635 | put_page(pfn_to_page(*pfn)); | ||
2636 | |||
2637 | if (!prefault && can_do_async_pf(vcpu)) { | ||
2638 | trace_kvm_try_async_get_page(gva, gfn); | ||
2639 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | ||
2640 | trace_kvm_async_pf_doublefault(gva, gfn); | ||
2641 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
2642 | return true; | ||
2643 | } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) | ||
2644 | return true; | ||
2645 | } | ||
2646 | |||
2647 | *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); | ||
2648 | |||
2649 | return false; | ||
2650 | } | ||
2651 | |||
2652 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | ||
2653 | bool prefault) | ||
2592 | { | 2654 | { |
2593 | pfn_t pfn; | 2655 | pfn_t pfn; |
2594 | int r; | 2656 | int r; |
2595 | int level; | 2657 | int level; |
2596 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2658 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2597 | unsigned long mmu_seq; | 2659 | unsigned long mmu_seq; |
2660 | int write = error_code & PFERR_WRITE_MASK; | ||
2661 | bool map_writable; | ||
2598 | 2662 | ||
2599 | ASSERT(vcpu); | 2663 | ASSERT(vcpu); |
2600 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 2664 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2609 | 2673 | ||
2610 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2674 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2611 | smp_rmb(); | 2675 | smp_rmb(); |
2612 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2676 | |
2677 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | ||
2678 | return 0; | ||
2679 | |||
2680 | /* mmio */ | ||
2613 | if (is_error_pfn(pfn)) | 2681 | if (is_error_pfn(pfn)) |
2614 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 2682 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
2615 | spin_lock(&vcpu->kvm->mmu_lock); | 2683 | spin_lock(&vcpu->kvm->mmu_lock); |
2616 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2684 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2617 | goto out_unlock; | 2685 | goto out_unlock; |
2618 | kvm_mmu_free_some_pages(vcpu); | 2686 | kvm_mmu_free_some_pages(vcpu); |
2619 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2687 | r = __direct_map(vcpu, gpa, write, map_writable, |
2620 | level, gfn, pfn); | 2688 | level, gfn, pfn, prefault); |
2621 | spin_unlock(&vcpu->kvm->mmu_lock); | 2689 | spin_unlock(&vcpu->kvm->mmu_lock); |
2622 | 2690 | ||
2623 | return r; | 2691 | return r; |
@@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
2659 | 2727 | ||
2660 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 2728 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
2661 | { | 2729 | { |
2662 | pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); | 2730 | pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); |
2663 | mmu_free_roots(vcpu); | 2731 | mmu_free_roots(vcpu); |
2664 | } | 2732 | } |
2665 | 2733 | ||
2666 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) | 2734 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) |
2667 | { | 2735 | { |
2668 | return vcpu->arch.cr3; | 2736 | return kvm_read_cr3(vcpu); |
2669 | } | 2737 | } |
2670 | 2738 | ||
2671 | static void inject_page_fault(struct kvm_vcpu *vcpu) | 2739 | static void inject_page_fault(struct kvm_vcpu *vcpu, |
2740 | struct x86_exception *fault) | ||
2672 | { | 2741 | { |
2673 | vcpu->arch.mmu.inject_page_fault(vcpu); | 2742 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
2674 | } | 2743 | } |
2675 | 2744 | ||
2676 | static void paging_free(struct kvm_vcpu *vcpu) | 2745 | static void paging_free(struct kvm_vcpu *vcpu) |
@@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2816 | { | 2885 | { |
2817 | struct kvm_mmu *context = vcpu->arch.walk_mmu; | 2886 | struct kvm_mmu *context = vcpu->arch.walk_mmu; |
2818 | 2887 | ||
2888 | context->base_role.word = 0; | ||
2819 | context->new_cr3 = nonpaging_new_cr3; | 2889 | context->new_cr3 = nonpaging_new_cr3; |
2820 | context->page_fault = tdp_page_fault; | 2890 | context->page_fault = tdp_page_fault; |
2821 | context->free = nonpaging_free; | 2891 | context->free = nonpaging_free; |
@@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
3008 | return; | 3078 | return; |
3009 | } | 3079 | } |
3010 | 3080 | ||
3011 | if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | ||
3012 | return; | ||
3013 | |||
3014 | ++vcpu->kvm->stat.mmu_pte_updated; | 3081 | ++vcpu->kvm->stat.mmu_pte_updated; |
3015 | if (!sp->role.cr4_pae) | 3082 | if (!sp->role.cr4_pae) |
3016 | paging32_update_pte(vcpu, sp, spte, new); | 3083 | paging32_update_pte(vcpu, sp, spte, new); |
@@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |||
3264 | } | 3331 | } |
3265 | } | 3332 | } |
3266 | 3333 | ||
3267 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 3334 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
3335 | void *insn, int insn_len) | ||
3268 | { | 3336 | { |
3269 | int r; | 3337 | int r; |
3270 | enum emulation_result er; | 3338 | enum emulation_result er; |
3271 | 3339 | ||
3272 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | 3340 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
3273 | if (r < 0) | 3341 | if (r < 0) |
3274 | goto out; | 3342 | goto out; |
3275 | 3343 | ||
@@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
3282 | if (r) | 3350 | if (r) |
3283 | goto out; | 3351 | goto out; |
3284 | 3352 | ||
3285 | er = emulate_instruction(vcpu, cr2, error_code, 0); | 3353 | er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); |
3286 | 3354 | ||
3287 | switch (er) { | 3355 | switch (er) { |
3288 | case EMULATE_DONE: | 3356 | case EMULATE_DONE: |
@@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3377 | if (!test_bit(slot, sp->slot_bitmap)) | 3445 | if (!test_bit(slot, sp->slot_bitmap)) |
3378 | continue; | 3446 | continue; |
3379 | 3447 | ||
3448 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
3449 | continue; | ||
3450 | |||
3380 | pt = sp->spt; | 3451 | pt = sp->spt; |
3381 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 3452 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
3382 | /* avoid RMW */ | 3453 | /* avoid RMW */ |
3383 | if (is_writable_pte(pt[i])) | 3454 | if (is_writable_pte(pt[i])) |
3384 | pt[i] &= ~PT_WRITABLE_MASK; | 3455 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); |
3385 | } | 3456 | } |
3386 | kvm_flush_remote_tlbs(kvm); | 3457 | kvm_flush_remote_tlbs(kvm); |
3387 | } | 3458 | } |
@@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void) | |||
3463 | kmem_cache_destroy(mmu_page_header_cache); | 3534 | kmem_cache_destroy(mmu_page_header_cache); |
3464 | } | 3535 | } |
3465 | 3536 | ||
3466 | void kvm_mmu_module_exit(void) | ||
3467 | { | ||
3468 | mmu_destroy_caches(); | ||
3469 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | ||
3470 | unregister_shrinker(&mmu_shrinker); | ||
3471 | } | ||
3472 | |||
3473 | int kvm_mmu_module_init(void) | 3537 | int kvm_mmu_module_init(void) |
3474 | { | 3538 | { |
3475 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3539 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", |
@@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | |||
3566 | 3630 | ||
3567 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 3631 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
3568 | { | 3632 | { |
3569 | (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); | 3633 | (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); |
3570 | return 1; | 3634 | return 1; |
3571 | } | 3635 | } |
3572 | 3636 | ||
@@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |||
3662 | } | 3726 | } |
3663 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | 3727 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); |
3664 | 3728 | ||
3665 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
3666 | #include "mmu_audit.c" | ||
3667 | #else | ||
3668 | static void mmu_audit_disable(void) { } | ||
3669 | #endif | ||
3670 | |||
3671 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | 3729 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
3672 | { | 3730 | { |
3673 | ASSERT(vcpu); | 3731 | ASSERT(vcpu); |
@@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |||
3675 | destroy_kvm_mmu(vcpu); | 3733 | destroy_kvm_mmu(vcpu); |
3676 | free_mmu_pages(vcpu); | 3734 | free_mmu_pages(vcpu); |
3677 | mmu_free_memory_caches(vcpu); | 3735 | mmu_free_memory_caches(vcpu); |
3736 | } | ||
3737 | |||
3738 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
3739 | #include "mmu_audit.c" | ||
3740 | #else | ||
3741 | static void mmu_audit_disable(void) { } | ||
3742 | #endif | ||
3743 | |||
3744 | void kvm_mmu_module_exit(void) | ||
3745 | { | ||
3746 | mmu_destroy_caches(); | ||
3747 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | ||
3748 | unregister_shrinker(&mmu_shrinker); | ||
3678 | mmu_audit_disable(); | 3749 | mmu_audit_disable(); |
3679 | } | 3750 | } |
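Editor's note: several mmu.c hunks replace the open-coded large-page index arithmetic with lpage_info_slot(), which maps a gfn to an index in the memslot's per-level lpage_info array by shifting both the gfn and the slot base by that level's large-page shift. The sketch below reproduces just the index computation, assuming 9 extra address bits per level (x86-64 with 4 KiB base pages); the helper names are made up.

/*
 * Sketch of the large-page index computation used by lpage_info_slot(),
 * with assumed x86-64 parameters (9 extra address bits per level).
 */
#include <stdio.h>

typedef unsigned long long gfn_t;

static unsigned hpage_gfn_shift(int level)
{
	return (level - 1) * 9;            /* level 2 -> 9 (2M), level 3 -> 18 (1G) */
}

static unsigned long lpage_index(gfn_t gfn, gfn_t slot_base_gfn, int level)
{
	unsigned shift = hpage_gfn_shift(level);
	/* distance, in large pages, from the (possibly unaligned) slot start */
	return (gfn >> shift) - (slot_base_gfn >> shift);
}

int main(void)
{
	gfn_t base = 0x100;                /* slot deliberately not 2M-aligned */
	printf("idx = %lu\n", lpage_index(0x1200, base, 2));
	return 0;
}

Centralizing this in one helper lets account_shadowed(), has_wrprotected_page() and gfn_to_rmap() all share the same struct kvm_lpage_info lookup instead of duplicating the shift arithmetic.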
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ba2bcdde6221..5f6223b8bcf7 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -19,11 +19,9 @@ | |||
19 | 19 | ||
20 | #include <linux/ratelimit.h> | 20 | #include <linux/ratelimit.h> |
21 | 21 | ||
22 | static int audit_point; | 22 | #define audit_printk(kvm, fmt, args...) \ |
23 | |||
24 | #define audit_printk(fmt, args...) \ | ||
25 | printk(KERN_ERR "audit: (%s) error: " \ | 23 | printk(KERN_ERR "audit: (%s) error: " \ |
26 | fmt, audit_point_name[audit_point], ##args) | 24 | fmt, audit_point_name[kvm->arch.audit_point], ##args) |
27 | 25 | ||
28 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); | 26 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); |
29 | 27 | ||
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
97 | 95 | ||
98 | if (sp->unsync) { | 96 | if (sp->unsync) { |
99 | if (level != PT_PAGE_TABLE_LEVEL) { | 97 | if (level != PT_PAGE_TABLE_LEVEL) { |
100 | audit_printk("unsync sp: %p level = %d\n", sp, level); | 98 | audit_printk(vcpu->kvm, "unsync sp: %p " |
99 | "level = %d\n", sp, level); | ||
101 | return; | 100 | return; |
102 | } | 101 | } |
103 | 102 | ||
104 | if (*sptep == shadow_notrap_nonpresent_pte) { | 103 | if (*sptep == shadow_notrap_nonpresent_pte) { |
105 | audit_printk("notrap spte in unsync sp: %p\n", sp); | 104 | audit_printk(vcpu->kvm, "notrap spte in unsync " |
105 | "sp: %p\n", sp); | ||
106 | return; | 106 | return; |
107 | } | 107 | } |
108 | } | 108 | } |
109 | 109 | ||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | 110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { |
111 | audit_printk("notrap spte in direct sp: %p\n", sp); | 111 | audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", |
112 | sp); | ||
112 | return; | 113 | return; |
113 | } | 114 | } |
114 | 115 | ||
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
125 | 126 | ||
126 | hpa = pfn << PAGE_SHIFT; | 127 | hpa = pfn << PAGE_SHIFT; |
127 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) | 128 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) |
128 | audit_printk("levels %d pfn %llx hpa %llx ent %llxn", | 129 | audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " |
129 | vcpu->arch.mmu.root_level, pfn, hpa, *sptep); | 130 | "ent %llxn", vcpu->arch.mmu.root_level, pfn, |
131 | hpa, *sptep); | ||
130 | } | 132 | } |
131 | 133 | ||
132 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | 134 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) |
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
142 | if (!gfn_to_memslot(kvm, gfn)) { | 144 | if (!gfn_to_memslot(kvm, gfn)) { |
143 | if (!printk_ratelimit()) | 145 | if (!printk_ratelimit()) |
144 | return; | 146 | return; |
145 | audit_printk("no memslot for gfn %llx\n", gfn); | 147 | audit_printk(kvm, "no memslot for gfn %llx\n", gfn); |
146 | audit_printk("index %ld of sp (gfn=%llx)\n", | 148 | audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", |
147 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); | 149 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); |
148 | dump_stack(); | 150 | dump_stack(); |
149 | return; | 151 | return; |
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
153 | if (!*rmapp) { | 155 | if (!*rmapp) { |
154 | if (!printk_ratelimit()) | 156 | if (!printk_ratelimit()) |
155 | return; | 157 | return; |
156 | audit_printk("no rmap for writable spte %llx\n", *sptep); | 158 | audit_printk(kvm, "no rmap for writable spte %llx\n", |
159 | *sptep); | ||
157 | dump_stack(); | 160 | dump_stack(); |
158 | } | 161 | } |
159 | } | 162 | } |
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
168 | { | 171 | { |
169 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 172 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
170 | 173 | ||
171 | if (audit_point == AUDIT_POST_SYNC && sp->unsync) | 174 | if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) |
172 | audit_printk("meet unsync sp(%p) after sync root.\n", sp); | 175 | audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " |
176 | "root.\n", sp); | ||
173 | } | 177 | } |
174 | 178 | ||
175 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) | 179 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) |
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
202 | spte = rmap_next(kvm, rmapp, NULL); | 206 | spte = rmap_next(kvm, rmapp, NULL); |
203 | while (spte) { | 207 | while (spte) { |
204 | if (is_writable_pte(*spte)) | 208 | if (is_writable_pte(*spte)) |
205 | audit_printk("shadow page has writable mappings: gfn " | 209 | audit_printk(kvm, "shadow page has writable " |
206 | "%llx role %x\n", sp->gfn, sp->role.word); | 210 | "mappings: gfn %llx role %x\n", |
211 | sp->gfn, sp->role.word); | ||
207 | spte = rmap_next(kvm, rmapp, spte); | 212 | spte = rmap_next(kvm, rmapp, spte); |
208 | } | 213 | } |
209 | } | 214 | } |
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | |||
238 | if (!__ratelimit(&ratelimit_state)) | 243 | if (!__ratelimit(&ratelimit_state)) |
239 | return; | 244 | return; |
240 | 245 | ||
241 | audit_point = point; | 246 | vcpu->kvm->arch.audit_point = point; |
242 | audit_all_active_sps(vcpu->kvm); | 247 | audit_all_active_sps(vcpu->kvm); |
243 | audit_vcpu_spte(vcpu); | 248 | audit_vcpu_spte(vcpu); |
244 | } | 249 | } |
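Editor's note: the mmu_audit.c hunks drop the file-static audit_point in favour of kvm->arch.audit_point, so audit_printk() tags its output with the audit point of the VM being walked and several guests can be audited concurrently. A small illustration of the pattern follows; struct vm, point_name[] and the values printed are invented stand-ins, not the kernel's definitions.

/* Illustration of the per-VM audit-point pattern (made-up types/names). */
#include <stdio.h>

struct vm {
	const char *name;
	int audit_point;              /* stands in for kvm->arch.audit_point */
};

static const char *point_name[] = { "pre-pf", "post-pf", "pre-sync", "post-sync" };

/* the context comes from the vm argument, not from a file-static variable */
#define audit_printk(vm, fmt, ...) \
	printf("audit: (%s) error: " fmt, point_name[(vm)->audit_point], ##__VA_ARGS__)

int main(void)
{
	struct vm a = { "guest-a", 1 }, b = { "guest-b", 3 };

	/* two guests can be audited at the same time without clobbering shared state */
	audit_printk(&a, "bad spte %llx\n", 0x123ULL);
	audit_printk(&b, "bad spte %llx\n", 0x456ULL);
	return 0;
}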
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cd7a833a3b52..53210f1e94c2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -72,7 +72,7 @@ struct guest_walker { | |||
72 | unsigned pt_access; | 72 | unsigned pt_access; |
73 | unsigned pte_access; | 73 | unsigned pte_access; |
74 | gfn_t gfn; | 74 | gfn_t gfn; |
75 | u32 error_code; | 75 | struct x86_exception fault; |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) | 78 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) |
@@ -266,21 +266,23 @@ walk: | |||
266 | return 1; | 266 | return 1; |
267 | 267 | ||
268 | error: | 268 | error: |
269 | walker->error_code = 0; | 269 | walker->fault.vector = PF_VECTOR; |
270 | walker->fault.error_code_valid = true; | ||
271 | walker->fault.error_code = 0; | ||
270 | if (present) | 272 | if (present) |
271 | walker->error_code |= PFERR_PRESENT_MASK; | 273 | walker->fault.error_code |= PFERR_PRESENT_MASK; |
272 | 274 | ||
273 | walker->error_code |= write_fault | user_fault; | 275 | walker->fault.error_code |= write_fault | user_fault; |
274 | 276 | ||
275 | if (fetch_fault && mmu->nx) | 277 | if (fetch_fault && mmu->nx) |
276 | walker->error_code |= PFERR_FETCH_MASK; | 278 | walker->fault.error_code |= PFERR_FETCH_MASK; |
277 | if (rsvd_fault) | 279 | if (rsvd_fault) |
278 | walker->error_code |= PFERR_RSVD_MASK; | 280 | walker->fault.error_code |= PFERR_RSVD_MASK; |
279 | 281 | ||
280 | vcpu->arch.fault.address = addr; | 282 | walker->fault.address = addr; |
281 | vcpu->arch.fault.error_code = walker->error_code; | 283 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; |
282 | 284 | ||
283 | trace_kvm_mmu_walker_error(walker->error_code); | 285 | trace_kvm_mmu_walker_error(walker->fault.error_code); |
284 | return 0; | 286 | return 0; |
285 | } | 287 | } |
286 | 288 | ||
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, | |||
299 | addr, access); | 301 | addr, access); |
300 | } | 302 | } |
301 | 303 | ||
304 | static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | ||
305 | struct kvm_mmu_page *sp, u64 *spte, | ||
306 | pt_element_t gpte) | ||
307 | { | ||
308 | u64 nonpresent = shadow_trap_nonpresent_pte; | ||
309 | |||
310 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
311 | goto no_present; | ||
312 | |||
313 | if (!is_present_gpte(gpte)) { | ||
314 | if (!sp->unsync) | ||
315 | nonpresent = shadow_notrap_nonpresent_pte; | ||
316 | goto no_present; | ||
317 | } | ||
318 | |||
319 | if (!(gpte & PT_ACCESSED_MASK)) | ||
320 | goto no_present; | ||
321 | |||
322 | return false; | ||
323 | |||
324 | no_present: | ||
325 | drop_spte(vcpu->kvm, spte, nonpresent); | ||
326 | return true; | ||
327 | } | ||
328 | |||
302 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 329 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
303 | u64 *spte, const void *pte) | 330 | u64 *spte, const void *pte) |
304 | { | 331 | { |
305 | pt_element_t gpte; | 332 | pt_element_t gpte; |
306 | unsigned pte_access; | 333 | unsigned pte_access; |
307 | pfn_t pfn; | 334 | pfn_t pfn; |
308 | u64 new_spte; | ||
309 | 335 | ||
310 | gpte = *(const pt_element_t *)pte; | 336 | gpte = *(const pt_element_t *)pte; |
311 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 337 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
312 | if (!is_present_gpte(gpte)) { | ||
313 | if (sp->unsync) | ||
314 | new_spte = shadow_trap_nonpresent_pte; | ||
315 | else | ||
316 | new_spte = shadow_notrap_nonpresent_pte; | ||
317 | __set_spte(spte, new_spte); | ||
318 | } | ||
319 | return; | 338 | return; |
320 | } | 339 | |
321 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 340 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
322 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 341 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
323 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | 342 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) |
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
329 | return; | 348 | return; |
330 | kvm_get_pfn(pfn); | 349 | kvm_get_pfn(pfn); |
331 | /* | 350 | /* |
332 | * we call mmu_set_spte() with reset_host_protection = true because that | 351 | * we call mmu_set_spte() with host_writable = true because that |
333 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 352 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
334 | */ | 353 | */ |
335 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 354 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
364 | u64 *sptep) | 383 | u64 *sptep) |
365 | { | 384 | { |
366 | struct kvm_mmu_page *sp; | 385 | struct kvm_mmu_page *sp; |
367 | struct kvm_mmu *mmu = &vcpu->arch.mmu; | ||
368 | pt_element_t *gptep = gw->prefetch_ptes; | 386 | pt_element_t *gptep = gw->prefetch_ptes; |
369 | u64 *spte; | 387 | u64 *spte; |
370 | int i; | 388 | int i; |
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
395 | 413 | ||
396 | gpte = gptep[i]; | 414 | gpte = gptep[i]; |
397 | 415 | ||
398 | if (!is_present_gpte(gpte) || | 416 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
399 | is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { | ||
400 | if (!sp->unsync) | ||
401 | __set_spte(spte, shadow_notrap_nonpresent_pte); | ||
402 | continue; | ||
403 | } | ||
404 | |||
405 | if (!(gpte & PT_ACCESSED_MASK)) | ||
406 | continue; | 417 | continue; |
407 | 418 | ||
408 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 419 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
427 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 438 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
428 | struct guest_walker *gw, | 439 | struct guest_walker *gw, |
429 | int user_fault, int write_fault, int hlevel, | 440 | int user_fault, int write_fault, int hlevel, |
430 | int *ptwrite, pfn_t pfn) | 441 | int *ptwrite, pfn_t pfn, bool map_writable, |
442 | bool prefault) | ||
431 | { | 443 | { |
432 | unsigned access = gw->pt_access; | 444 | unsigned access = gw->pt_access; |
433 | struct kvm_mmu_page *sp = NULL; | 445 | struct kvm_mmu_page *sp = NULL; |
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
501 | 513 | ||
502 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 514 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, |
503 | user_fault, write_fault, dirty, ptwrite, it.level, | 515 | user_fault, write_fault, dirty, ptwrite, it.level, |
504 | gw->gfn, pfn, false, true); | 516 | gw->gfn, pfn, prefault, map_writable); |
505 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 517 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
506 | 518 | ||
507 | return it.sptep; | 519 | return it.sptep; |
@@ -527,8 +539,8 @@ out_gpte_changed: | |||
527 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | 539 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or |
528 | * a negative value on error. | 540 | * a negative value on error. |
529 | */ | 541 | */ |
530 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | 542 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, |
531 | u32 error_code) | 543 | bool prefault) |
532 | { | 544 | { |
533 | int write_fault = error_code & PFERR_WRITE_MASK; | 545 | int write_fault = error_code & PFERR_WRITE_MASK; |
534 | int user_fault = error_code & PFERR_USER_MASK; | 546 | int user_fault = error_code & PFERR_USER_MASK; |
@@ -539,6 +551,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
539 | pfn_t pfn; | 551 | pfn_t pfn; |
540 | int level = PT_PAGE_TABLE_LEVEL; | 552 | int level = PT_PAGE_TABLE_LEVEL; |
541 | unsigned long mmu_seq; | 553 | unsigned long mmu_seq; |
554 | bool map_writable; | ||
542 | 555 | ||
543 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 556 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
544 | 557 | ||
@@ -556,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
556 | */ | 569 | */ |
557 | if (!r) { | 570 | if (!r) { |
558 | pgprintk("%s: guest page fault\n", __func__); | 571 | pgprintk("%s: guest page fault\n", __func__); |
559 | inject_page_fault(vcpu); | 572 | if (!prefault) { |
560 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 573 | inject_page_fault(vcpu, &walker.fault); |
574 | /* reset fork detector */ | ||
575 | vcpu->arch.last_pt_write_count = 0; | ||
576 | } | ||
561 | return 0; | 577 | return 0; |
562 | } | 578 | } |
563 | 579 | ||
@@ -568,7 +584,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
568 | 584 | ||
569 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 585 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
570 | smp_rmb(); | 586 | smp_rmb(); |
571 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 587 | |
588 | if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, | ||
589 | &map_writable)) | ||
590 | return 0; | ||
572 | 591 | ||
573 | /* mmio */ | 592 | /* mmio */ |
574 | if (is_error_pfn(pfn)) | 593 | if (is_error_pfn(pfn)) |
@@ -581,7 +600,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
581 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | 600 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); |
582 | kvm_mmu_free_some_pages(vcpu); | 601 | kvm_mmu_free_some_pages(vcpu); |
583 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 602 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
584 | level, &write_pt, pfn); | 603 | level, &write_pt, pfn, map_writable, prefault); |
585 | (void)sptep; | 604 | (void)sptep; |
586 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 605 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
587 | sptep, *sptep, write_pt); | 606 | sptep, *sptep, write_pt); |
@@ -661,7 +680,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
661 | } | 680 | } |
662 | 681 | ||
663 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | 682 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, |
664 | u32 *error) | 683 | struct x86_exception *exception) |
665 | { | 684 | { |
666 | struct guest_walker walker; | 685 | struct guest_walker walker; |
667 | gpa_t gpa = UNMAPPED_GVA; | 686 | gpa_t gpa = UNMAPPED_GVA; |
@@ -672,14 +691,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | |||
672 | if (r) { | 691 | if (r) { |
673 | gpa = gfn_to_gpa(walker.gfn); | 692 | gpa = gfn_to_gpa(walker.gfn); |
674 | gpa |= vaddr & ~PAGE_MASK; | 693 | gpa |= vaddr & ~PAGE_MASK; |
675 | } else if (error) | 694 | } else if (exception) |
676 | *error = walker.error_code; | 695 | *exception = walker.fault; |
677 | 696 | ||
678 | return gpa; | 697 | return gpa; |
679 | } | 698 | } |
680 | 699 | ||
681 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | 700 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, |
682 | u32 access, u32 *error) | 701 | u32 access, |
702 | struct x86_exception *exception) | ||
683 | { | 703 | { |
684 | struct guest_walker walker; | 704 | struct guest_walker walker; |
685 | gpa_t gpa = UNMAPPED_GVA; | 705 | gpa_t gpa = UNMAPPED_GVA; |
@@ -690,8 +710,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
690 | if (r) { | 710 | if (r) { |
691 | gpa = gfn_to_gpa(walker.gfn); | 711 | gpa = gfn_to_gpa(walker.gfn); |
692 | gpa |= vaddr & ~PAGE_MASK; | 712 | gpa |= vaddr & ~PAGE_MASK; |
693 | } else if (error) | 713 | } else if (exception) |
694 | *error = walker.error_code; | 714 | *exception = walker.fault; |
695 | 715 | ||
696 | return gpa; | 716 | return gpa; |
697 | } | 717 | } |
@@ -730,12 +750,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
730 | * Using the cached information from sp->gfns is safe because: | 750 | * Using the cached information from sp->gfns is safe because: |
731 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 751 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
732 | * can't change unless all sptes pointing to it are nuked first. | 752 | * can't change unless all sptes pointing to it are nuked first. |
753 | * | ||
754 | * Note: | ||
755 | * We should flush all TLBs if a spte is dropped, even though the guest is | ||
756 | * responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and | ||
757 | * kvm_mmu_notifier_invalidate_range_start see that the mapped page is no | ||
758 | * longer used by the guest and skip the TLB flush, which would let the | ||
759 | * guest keep accessing the freed pages. | ||
760 | * We increase kvm->tlbs_dirty to delay the TLB flush in this case. | ||
733 | */ | 761 | */ |
734 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 762 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
735 | bool clear_unsync) | ||
736 | { | 763 | { |
737 | int i, offset, nr_present; | 764 | int i, offset, nr_present; |
738 | bool reset_host_protection; | 765 | bool host_writable; |
739 | gpa_t first_pte_gpa; | 766 | gpa_t first_pte_gpa; |
740 | 767 | ||
741 | offset = nr_present = 0; | 768 | offset = nr_present = 0; |
@@ -764,31 +791,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
764 | return -EINVAL; | 791 | return -EINVAL; |
765 | 792 | ||
766 | gfn = gpte_to_gfn(gpte); | 793 | gfn = gpte_to_gfn(gpte); |
767 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) | ||
768 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) | ||
769 | || !(gpte & PT_ACCESSED_MASK)) { | ||
770 | u64 nonpresent; | ||
771 | 794 | ||
772 | if (is_present_gpte(gpte) || !clear_unsync) | 795 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
773 | nonpresent = shadow_trap_nonpresent_pte; | 796 | vcpu->kvm->tlbs_dirty++; |
774 | else | 797 | continue; |
775 | nonpresent = shadow_notrap_nonpresent_pte; | 798 | } |
776 | drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); | 799 | |
800 | if (gfn != sp->gfns[i]) { | ||
801 | drop_spte(vcpu->kvm, &sp->spt[i], | ||
802 | shadow_trap_nonpresent_pte); | ||
803 | vcpu->kvm->tlbs_dirty++; | ||
777 | continue; | 804 | continue; |
778 | } | 805 | } |
779 | 806 | ||
780 | nr_present++; | 807 | nr_present++; |
781 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 808 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
782 | if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { | 809 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
783 | pte_access &= ~ACC_WRITE_MASK; | 810 | |
784 | reset_host_protection = 0; | ||
785 | } else { | ||
786 | reset_host_protection = 1; | ||
787 | } | ||
788 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 811 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
789 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, | 812 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, |
790 | spte_to_pfn(sp->spt[i]), true, false, | 813 | spte_to_pfn(sp->spt[i]), true, false, |
791 | reset_host_protection); | 814 | host_writable); |
792 | } | 815 | } |
793 | 816 | ||
794 | return !nr_present; | 817 | return !nr_present; |
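The note added to FNAME(sync_page) above describes a delayed-flush scheme: rather than flushing guest TLBs every time a stale spte is dropped during sync, the drop is recorded in kvm->tlbs_dirty and a single flush is issued later. The standalone sketch below (my own names and types, not the kernel's) shows only that bookkeeping pattern, under the assumption of one deferred flush point:

#include <stdio.h>

/* Hypothetical stand-in for struct kvm: only the delayed-flush counter. */
struct vm {
        unsigned long tlbs_dirty;       /* sptes dropped since the last flush */
};

/* Called when a shadow PTE turns out to be stale during sync: instead of
 * flushing the guest TLBs right away, just record that a flush is owed. */
static void drop_stale_spte(struct vm *vm)
{
        vm->tlbs_dirty++;
}

/* Called where a flush is cheap (before re-entering the guest, or from an
 * mmu notifier): flush only if something was actually dropped. */
static void flush_if_dirty(struct vm *vm)
{
        if (vm->tlbs_dirty) {
                printf("flushing guest TLBs (%lu dropped sptes)\n",
                       vm->tlbs_dirty);
                vm->tlbs_dirty = 0;
        }
}

int main(void)
{
        struct vm vm = { .tlbs_dirty = 0 };

        drop_stale_spte(&vm);   /* sync_page drops a changed gpte's spte */
        drop_stale_spte(&vm);   /* ... and another one                   */
        flush_if_dirty(&vm);    /* one flush covers both dropped entries */
        flush_if_dirty(&vm);    /* nothing to do, counter already zero   */
        return 0;
}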
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b81a9b7c2ca4..25bd1bc5aad2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -31,6 +31,7 @@ | |||
31 | 31 | ||
32 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
33 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
34 | #include <asm/kvm_para.h> | ||
34 | 35 | ||
35 | #include <asm/virtext.h> | 36 | #include <asm/virtext.h> |
36 | #include "trace.h" | 37 | #include "trace.h" |
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL"); | |||
50 | #define SVM_FEATURE_LBRV (1 << 1) | 51 | #define SVM_FEATURE_LBRV (1 << 1) |
51 | #define SVM_FEATURE_SVML (1 << 2) | 52 | #define SVM_FEATURE_SVML (1 << 2) |
52 | #define SVM_FEATURE_NRIP (1 << 3) | 53 | #define SVM_FEATURE_NRIP (1 << 3) |
54 | #define SVM_FEATURE_TSC_RATE (1 << 4) | ||
55 | #define SVM_FEATURE_VMCB_CLEAN (1 << 5) | ||
56 | #define SVM_FEATURE_FLUSH_ASID (1 << 6) | ||
57 | #define SVM_FEATURE_DECODE_ASSIST (1 << 7) | ||
53 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) | 58 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) |
54 | 59 | ||
55 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ | 60 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ |
@@ -97,10 +102,8 @@ struct nested_state { | |||
97 | unsigned long vmexit_rax; | 102 | unsigned long vmexit_rax; |
98 | 103 | ||
99 | /* cache for intercepts of the guest */ | 104 | /* cache for intercepts of the guest */ |
100 | u16 intercept_cr_read; | 105 | u32 intercept_cr; |
101 | u16 intercept_cr_write; | 106 | u32 intercept_dr; |
102 | u16 intercept_dr_read; | ||
103 | u16 intercept_dr_write; | ||
104 | u32 intercept_exceptions; | 107 | u32 intercept_exceptions; |
105 | u64 intercept; | 108 | u64 intercept; |
106 | 109 | ||
@@ -123,7 +126,12 @@ struct vcpu_svm { | |||
123 | u64 next_rip; | 126 | u64 next_rip; |
124 | 127 | ||
125 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | 128 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; |
126 | u64 host_gs_base; | 129 | struct { |
130 | u16 fs; | ||
131 | u16 gs; | ||
132 | u16 ldt; | ||
133 | u64 gs_base; | ||
134 | } host; | ||
127 | 135 | ||
128 | u32 *msrpm; | 136 | u32 *msrpm; |
129 | 137 | ||
@@ -133,6 +141,7 @@ struct vcpu_svm { | |||
133 | 141 | ||
134 | unsigned int3_injected; | 142 | unsigned int3_injected; |
135 | unsigned long int3_rip; | 143 | unsigned long int3_rip; |
144 | u32 apf_reason; | ||
136 | }; | 145 | }; |
137 | 146 | ||
138 | #define MSR_INVALID 0xffffffffU | 147 | #define MSR_INVALID 0xffffffffU |
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm); | |||
180 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 189 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
181 | bool has_error_code, u32 error_code); | 190 | bool has_error_code, u32 error_code); |
182 | 191 | ||
192 | enum { | ||
193 | VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, | ||
194 | pause filter count */ | ||
195 | VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ | ||
196 | VMCB_ASID, /* ASID */ | ||
197 | VMCB_INTR, /* int_ctl, int_vector */ | ||
198 | VMCB_NPT, /* npt_en, nCR3, gPAT */ | ||
199 | VMCB_CR, /* CR0, CR3, CR4, EFER */ | ||
200 | VMCB_DR, /* DR6, DR7 */ | ||
201 | VMCB_DT, /* GDT, IDT */ | ||
202 | VMCB_SEG, /* CS, DS, SS, ES, CPL */ | ||
203 | VMCB_CR2, /* CR2 only */ | ||
204 | VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ | ||
205 | VMCB_DIRTY_MAX, | ||
206 | }; | ||
207 | |||
208 | /* TPR and CR2 are always written before VMRUN */ | ||
209 | #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) | ||
210 | |||
211 | static inline void mark_all_dirty(struct vmcb *vmcb) | ||
212 | { | ||
213 | vmcb->control.clean = 0; | ||
214 | } | ||
215 | |||
216 | static inline void mark_all_clean(struct vmcb *vmcb) | ||
217 | { | ||
218 | vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) | ||
219 | & ~VMCB_ALWAYS_DIRTY_MASK; | ||
220 | } | ||
221 | |||
222 | static inline void mark_dirty(struct vmcb *vmcb, int bit) | ||
223 | { | ||
224 | vmcb->control.clean &= ~(1 << bit); | ||
225 | } | ||
226 | |||
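The enum and the mark_*() helpers above are the software side of the VMCB clean-bits feature (SVM_FEATURE_VMCB_CLEAN, defined earlier in this patch): each bit of control.clean covers one group of VMCB fields, and a set bit lets the CPU skip reloading that group on the next VMRUN, so every write to a tracked field must clear the corresponding bit. A minimal standalone sketch of the same bitmask bookkeeping follows; it borrows a few of the patch's bit names, but the struct and everything else are simplified stand-ins, not the kernel's types:

#include <stdio.h>

/* Field groups tracked by the clean bits (a subset, mirroring the patch). */
enum { VMCB_INTERCEPTS, VMCB_INTR, VMCB_CR, VMCB_DR, VMCB_CR2, VMCB_DIRTY_MAX };

/* Interrupt state and CR2 are rewritten before every VMRUN, so their clean
 * bits must never survive into the next run. */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))

struct vmcb_ctl {
        unsigned int clean;
};

static void mark_all_dirty(struct vmcb_ctl *c)
{
        c->clean = 0;                   /* force a full reload */
}

static void mark_all_clean(struct vmcb_ctl *c)
{
        c->clean = ((1U << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK;
}

static void mark_dirty(struct vmcb_ctl *c, int bit)
{
        c->clean &= ~(1U << bit);       /* this group changed, reload it */
}

int main(void)
{
        struct vmcb_ctl ctl;

        mark_all_dirty(&ctl);           /* fresh VMCB                        */
        mark_all_clean(&ctl);           /* after VMRUN: everything is clean  */
        mark_dirty(&ctl, VMCB_CR);      /* guest wrote CR0: invalidate group */
        printf("clean mask after CR write: 0x%x\n", ctl.clean);
        return 0;
}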
183 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | 227 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) |
184 | { | 228 | { |
185 | return container_of(vcpu, struct vcpu_svm, vcpu); | 229 | return container_of(vcpu, struct vcpu_svm, vcpu); |
186 | } | 230 | } |
187 | 231 | ||
188 | static inline bool is_nested(struct vcpu_svm *svm) | 232 | static void recalc_intercepts(struct vcpu_svm *svm) |
233 | { | ||
234 | struct vmcb_control_area *c, *h; | ||
235 | struct nested_state *g; | ||
236 | |||
237 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
238 | |||
239 | if (!is_guest_mode(&svm->vcpu)) | ||
240 | return; | ||
241 | |||
242 | c = &svm->vmcb->control; | ||
243 | h = &svm->nested.hsave->control; | ||
244 | g = &svm->nested; | ||
245 | |||
246 | c->intercept_cr = h->intercept_cr | g->intercept_cr; | ||
247 | c->intercept_dr = h->intercept_dr | g->intercept_dr; | ||
248 | c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; | ||
249 | c->intercept = h->intercept | g->intercept; | ||
250 | } | ||
251 | |||
252 | static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) | ||
253 | { | ||
254 | if (is_guest_mode(&svm->vcpu)) | ||
255 | return svm->nested.hsave; | ||
256 | else | ||
257 | return svm->vmcb; | ||
258 | } | ||
259 | |||
260 | static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) | ||
261 | { | ||
262 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
263 | |||
264 | vmcb->control.intercept_cr |= (1U << bit); | ||
265 | |||
266 | recalc_intercepts(svm); | ||
267 | } | ||
268 | |||
269 | static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) | ||
270 | { | ||
271 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
272 | |||
273 | vmcb->control.intercept_cr &= ~(1U << bit); | ||
274 | |||
275 | recalc_intercepts(svm); | ||
276 | } | ||
277 | |||
278 | static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) | ||
279 | { | ||
280 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
281 | |||
282 | return vmcb->control.intercept_cr & (1U << bit); | ||
283 | } | ||
284 | |||
285 | static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) | ||
286 | { | ||
287 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
288 | |||
289 | vmcb->control.intercept_dr |= (1U << bit); | ||
290 | |||
291 | recalc_intercepts(svm); | ||
292 | } | ||
293 | |||
294 | static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) | ||
295 | { | ||
296 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
297 | |||
298 | vmcb->control.intercept_dr &= ~(1U << bit); | ||
299 | |||
300 | recalc_intercepts(svm); | ||
301 | } | ||
302 | |||
303 | static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) | ||
304 | { | ||
305 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
306 | |||
307 | vmcb->control.intercept_exceptions |= (1U << bit); | ||
308 | |||
309 | recalc_intercepts(svm); | ||
310 | } | ||
311 | |||
312 | static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) | ||
189 | { | 313 | { |
190 | return svm->nested.vmcb; | 314 | struct vmcb *vmcb = get_host_vmcb(svm); |
315 | |||
316 | vmcb->control.intercept_exceptions &= ~(1U << bit); | ||
317 | |||
318 | recalc_intercepts(svm); | ||
319 | } | ||
320 | |||
321 | static inline void set_intercept(struct vcpu_svm *svm, int bit) | ||
322 | { | ||
323 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
324 | |||
325 | vmcb->control.intercept |= (1ULL << bit); | ||
326 | |||
327 | recalc_intercepts(svm); | ||
328 | } | ||
329 | |||
330 | static inline void clr_intercept(struct vcpu_svm *svm, int bit) | ||
331 | { | ||
332 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
333 | |||
334 | vmcb->control.intercept &= ~(1ULL << bit); | ||
335 | |||
336 | recalc_intercepts(svm); | ||
191 | } | 337 | } |
192 | 338 | ||
193 | static inline void enable_gif(struct vcpu_svm *svm) | 339 | static inline void enable_gif(struct vcpu_svm *svm) |
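The intercept helpers above always edit the host-owned VMCB returned by get_host_vmcb() and then call recalc_intercepts(), which ORs the host's masks with the cached nested-guest masks into the VMCB that actually runs, so an L2 guest can never end up with fewer intercepts than the host requires. Below is a sketch of that merge on plain 32-bit masks, with hypothetical names of my own rather than the kernel's structures:

#include <stdbool.h>
#include <stdio.h>

struct masks {
        unsigned int cr, dr, exceptions;
};

/* Active intercepts = what the host wants OR what the L1 hypervisor
 * programmed for its nested (L2) guest. */
static void recalc(struct masks *active, const struct masks *host,
                   const struct masks *nested, bool guest_mode)
{
        if (!guest_mode) {
                *active = *host;        /* no L2: host masks apply directly */
                return;
        }
        active->cr         = host->cr         | nested->cr;
        active->dr         = host->dr         | nested->dr;
        active->exceptions = host->exceptions | nested->exceptions;
}

int main(void)
{
        struct masks host   = { .cr = 1U << 0, .exceptions = 1U << 14 }; /* CR0, #PF */
        struct masks nested = { .cr = 1U << 8 };                         /* CR8      */
        struct masks active;

        recalc(&active, &host, &nested, true);
        printf("merged cr intercepts: 0x%x\n", active.cr);      /* 0x101 */
        return 0;
}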
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr) | |||
264 | 410 | ||
265 | #define MAX_INST_SIZE 15 | 411 | #define MAX_INST_SIZE 15 |
266 | 412 | ||
267 | static inline u32 svm_has(u32 feat) | ||
268 | { | ||
269 | return svm_features & feat; | ||
270 | } | ||
271 | |||
272 | static inline void clgi(void) | 413 | static inline void clgi(void) |
273 | { | 414 | { |
274 | asm volatile (__ex(SVM_CLGI)); | 415 | asm volatile (__ex(SVM_CLGI)); |
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid) | |||
284 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); | 425 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); |
285 | } | 426 | } |
286 | 427 | ||
287 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | ||
288 | { | ||
289 | to_svm(vcpu)->asid_generation--; | ||
290 | } | ||
291 | |||
292 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
293 | { | ||
294 | force_new_asid(vcpu); | ||
295 | } | ||
296 | |||
297 | static int get_npt_level(void) | 428 | static int get_npt_level(void) |
298 | { | 429 | { |
299 | #ifdef CONFIG_X86_64 | 430 | #ifdef CONFIG_X86_64 |
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
310 | efer &= ~EFER_LME; | 441 | efer &= ~EFER_LME; |
311 | 442 | ||
312 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; | 443 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; |
444 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
313 | } | 445 | } |
314 | 446 | ||
315 | static int is_external_interrupt(u32 info) | 447 | static int is_external_interrupt(u32 info) |
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
347 | svm->next_rip = svm->vmcb->control.next_rip; | 479 | svm->next_rip = svm->vmcb->control.next_rip; |
348 | 480 | ||
349 | if (!svm->next_rip) { | 481 | if (!svm->next_rip) { |
350 | if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != | 482 | if (emulate_instruction(vcpu, EMULTYPE_SKIP) != |
351 | EMULATE_DONE) | 483 | EMULATE_DONE) |
352 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 484 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
353 | return; | 485 | return; |
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
374 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) | 506 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) |
375 | return; | 507 | return; |
376 | 508 | ||
377 | if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { | 509 | if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { |
378 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); | 510 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); |
379 | 511 | ||
380 | /* | 512 | /* |
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void) | |||
670 | 802 | ||
671 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | 803 | svm_features = cpuid_edx(SVM_CPUID_FUNC); |
672 | 804 | ||
673 | if (!svm_has(SVM_FEATURE_NPT)) | 805 | if (!boot_cpu_has(X86_FEATURE_NPT)) |
674 | npt_enabled = false; | 806 | npt_enabled = false; |
675 | 807 | ||
676 | if (npt_enabled && !npt) { | 808 | if (npt_enabled && !npt) { |
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
725 | struct vcpu_svm *svm = to_svm(vcpu); | 857 | struct vcpu_svm *svm = to_svm(vcpu); |
726 | u64 g_tsc_offset = 0; | 858 | u64 g_tsc_offset = 0; |
727 | 859 | ||
728 | if (is_nested(svm)) { | 860 | if (is_guest_mode(vcpu)) { |
729 | g_tsc_offset = svm->vmcb->control.tsc_offset - | 861 | g_tsc_offset = svm->vmcb->control.tsc_offset - |
730 | svm->nested.hsave->control.tsc_offset; | 862 | svm->nested.hsave->control.tsc_offset; |
731 | svm->nested.hsave->control.tsc_offset = offset; | 863 | svm->nested.hsave->control.tsc_offset = offset; |
732 | } | 864 | } |
733 | 865 | ||
734 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | 866 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; |
867 | |||
868 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
735 | } | 869 | } |
736 | 870 | ||
737 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | 871 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) |
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | |||
739 | struct vcpu_svm *svm = to_svm(vcpu); | 873 | struct vcpu_svm *svm = to_svm(vcpu); |
740 | 874 | ||
741 | svm->vmcb->control.tsc_offset += adjustment; | 875 | svm->vmcb->control.tsc_offset += adjustment; |
742 | if (is_nested(svm)) | 876 | if (is_guest_mode(vcpu)) |
743 | svm->nested.hsave->control.tsc_offset += adjustment; | 877 | svm->nested.hsave->control.tsc_offset += adjustment; |
878 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
744 | } | 879 | } |
745 | 880 | ||
746 | static void init_vmcb(struct vcpu_svm *svm) | 881 | static void init_vmcb(struct vcpu_svm *svm) |
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
749 | struct vmcb_save_area *save = &svm->vmcb->save; | 884 | struct vmcb_save_area *save = &svm->vmcb->save; |
750 | 885 | ||
751 | svm->vcpu.fpu_active = 1; | 886 | svm->vcpu.fpu_active = 1; |
887 | svm->vcpu.arch.hflags = 0; | ||
752 | 888 | ||
753 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 889 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
754 | INTERCEPT_CR3_MASK | | 890 | set_cr_intercept(svm, INTERCEPT_CR3_READ); |
755 | INTERCEPT_CR4_MASK; | 891 | set_cr_intercept(svm, INTERCEPT_CR4_READ); |
756 | 892 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); | |
757 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 893 | set_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
758 | INTERCEPT_CR3_MASK | | 894 | set_cr_intercept(svm, INTERCEPT_CR4_WRITE); |
759 | INTERCEPT_CR4_MASK | | 895 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
760 | INTERCEPT_CR8_MASK; | 896 | |
761 | 897 | set_dr_intercept(svm, INTERCEPT_DR0_READ); | |
762 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | 898 | set_dr_intercept(svm, INTERCEPT_DR1_READ); |
763 | INTERCEPT_DR1_MASK | | 899 | set_dr_intercept(svm, INTERCEPT_DR2_READ); |
764 | INTERCEPT_DR2_MASK | | 900 | set_dr_intercept(svm, INTERCEPT_DR3_READ); |
765 | INTERCEPT_DR3_MASK | | 901 | set_dr_intercept(svm, INTERCEPT_DR4_READ); |
766 | INTERCEPT_DR4_MASK | | 902 | set_dr_intercept(svm, INTERCEPT_DR5_READ); |
767 | INTERCEPT_DR5_MASK | | 903 | set_dr_intercept(svm, INTERCEPT_DR6_READ); |
768 | INTERCEPT_DR6_MASK | | 904 | set_dr_intercept(svm, INTERCEPT_DR7_READ); |
769 | INTERCEPT_DR7_MASK; | 905 | |
770 | 906 | set_dr_intercept(svm, INTERCEPT_DR0_WRITE); | |
771 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | 907 | set_dr_intercept(svm, INTERCEPT_DR1_WRITE); |
772 | INTERCEPT_DR1_MASK | | 908 | set_dr_intercept(svm, INTERCEPT_DR2_WRITE); |
773 | INTERCEPT_DR2_MASK | | 909 | set_dr_intercept(svm, INTERCEPT_DR3_WRITE); |
774 | INTERCEPT_DR3_MASK | | 910 | set_dr_intercept(svm, INTERCEPT_DR4_WRITE); |
775 | INTERCEPT_DR4_MASK | | 911 | set_dr_intercept(svm, INTERCEPT_DR5_WRITE); |
776 | INTERCEPT_DR5_MASK | | 912 | set_dr_intercept(svm, INTERCEPT_DR6_WRITE); |
777 | INTERCEPT_DR6_MASK | | 913 | set_dr_intercept(svm, INTERCEPT_DR7_WRITE); |
778 | INTERCEPT_DR7_MASK; | 914 | |
779 | 915 | set_exception_intercept(svm, PF_VECTOR); | |
780 | control->intercept_exceptions = (1 << PF_VECTOR) | | 916 | set_exception_intercept(svm, UD_VECTOR); |
781 | (1 << UD_VECTOR) | | 917 | set_exception_intercept(svm, MC_VECTOR); |
782 | (1 << MC_VECTOR); | 918 | |
783 | 919 | set_intercept(svm, INTERCEPT_INTR); | |
784 | 920 | set_intercept(svm, INTERCEPT_NMI); | |
785 | control->intercept = (1ULL << INTERCEPT_INTR) | | 921 | set_intercept(svm, INTERCEPT_SMI); |
786 | (1ULL << INTERCEPT_NMI) | | 922 | set_intercept(svm, INTERCEPT_SELECTIVE_CR0); |
787 | (1ULL << INTERCEPT_SMI) | | 923 | set_intercept(svm, INTERCEPT_CPUID); |
788 | (1ULL << INTERCEPT_SELECTIVE_CR0) | | 924 | set_intercept(svm, INTERCEPT_INVD); |
789 | (1ULL << INTERCEPT_CPUID) | | 925 | set_intercept(svm, INTERCEPT_HLT); |
790 | (1ULL << INTERCEPT_INVD) | | 926 | set_intercept(svm, INTERCEPT_INVLPG); |
791 | (1ULL << INTERCEPT_HLT) | | 927 | set_intercept(svm, INTERCEPT_INVLPGA); |
792 | (1ULL << INTERCEPT_INVLPG) | | 928 | set_intercept(svm, INTERCEPT_IOIO_PROT); |
793 | (1ULL << INTERCEPT_INVLPGA) | | 929 | set_intercept(svm, INTERCEPT_MSR_PROT); |
794 | (1ULL << INTERCEPT_IOIO_PROT) | | 930 | set_intercept(svm, INTERCEPT_TASK_SWITCH); |
795 | (1ULL << INTERCEPT_MSR_PROT) | | 931 | set_intercept(svm, INTERCEPT_SHUTDOWN); |
796 | (1ULL << INTERCEPT_TASK_SWITCH) | | 932 | set_intercept(svm, INTERCEPT_VMRUN); |
797 | (1ULL << INTERCEPT_SHUTDOWN) | | 933 | set_intercept(svm, INTERCEPT_VMMCALL); |
798 | (1ULL << INTERCEPT_VMRUN) | | 934 | set_intercept(svm, INTERCEPT_VMLOAD); |
799 | (1ULL << INTERCEPT_VMMCALL) | | 935 | set_intercept(svm, INTERCEPT_VMSAVE); |
800 | (1ULL << INTERCEPT_VMLOAD) | | 936 | set_intercept(svm, INTERCEPT_STGI); |
801 | (1ULL << INTERCEPT_VMSAVE) | | 937 | set_intercept(svm, INTERCEPT_CLGI); |
802 | (1ULL << INTERCEPT_STGI) | | 938 | set_intercept(svm, INTERCEPT_SKINIT); |
803 | (1ULL << INTERCEPT_CLGI) | | 939 | set_intercept(svm, INTERCEPT_WBINVD); |
804 | (1ULL << INTERCEPT_SKINIT) | | 940 | set_intercept(svm, INTERCEPT_MONITOR); |
805 | (1ULL << INTERCEPT_WBINVD) | | 941 | set_intercept(svm, INTERCEPT_MWAIT); |
806 | (1ULL << INTERCEPT_MONITOR) | | 942 | set_intercept(svm, INTERCEPT_XSETBV); |
807 | (1ULL << INTERCEPT_MWAIT); | ||
808 | 943 | ||
809 | control->iopm_base_pa = iopm_base; | 944 | control->iopm_base_pa = iopm_base; |
810 | control->msrpm_base_pa = __pa(svm->msrpm); | 945 | control->msrpm_base_pa = __pa(svm->msrpm); |
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
855 | if (npt_enabled) { | 990 | if (npt_enabled) { |
856 | /* Setup VMCB for Nested Paging */ | 991 | /* Setup VMCB for Nested Paging */ |
857 | control->nested_ctl = 1; | 992 | control->nested_ctl = 1; |
858 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | | 993 | clr_intercept(svm, INTERCEPT_TASK_SWITCH); |
859 | (1ULL << INTERCEPT_INVLPG)); | 994 | clr_intercept(svm, INTERCEPT_INVLPG); |
860 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 995 | clr_exception_intercept(svm, PF_VECTOR); |
861 | control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; | 996 | clr_cr_intercept(svm, INTERCEPT_CR3_READ); |
862 | control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; | 997 | clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
863 | save->g_pat = 0x0007040600070406ULL; | 998 | save->g_pat = 0x0007040600070406ULL; |
864 | save->cr3 = 0; | 999 | save->cr3 = 0; |
865 | save->cr4 = 0; | 1000 | save->cr4 = 0; |
866 | } | 1001 | } |
867 | force_new_asid(&svm->vcpu); | 1002 | svm->asid_generation = 0; |
868 | 1003 | ||
869 | svm->nested.vmcb = 0; | 1004 | svm->nested.vmcb = 0; |
870 | svm->vcpu.arch.hflags = 0; | 1005 | svm->vcpu.arch.hflags = 0; |
871 | 1006 | ||
872 | if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { | 1007 | if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { |
873 | control->pause_filter_count = 3000; | 1008 | control->pause_filter_count = 3000; |
874 | control->intercept |= (1ULL << INTERCEPT_PAUSE); | 1009 | set_intercept(svm, INTERCEPT_PAUSE); |
875 | } | 1010 | } |
876 | 1011 | ||
1012 | mark_all_dirty(svm->vmcb); | ||
1013 | |||
877 | enable_gif(svm); | 1014 | enable_gif(svm); |
878 | } | 1015 | } |
879 | 1016 | ||
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
990 | 1127 | ||
991 | if (unlikely(cpu != vcpu->cpu)) { | 1128 | if (unlikely(cpu != vcpu->cpu)) { |
992 | svm->asid_generation = 0; | 1129 | svm->asid_generation = 0; |
1130 | mark_all_dirty(svm->vmcb); | ||
993 | } | 1131 | } |
994 | 1132 | ||
1133 | #ifdef CONFIG_X86_64 | ||
1134 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); | ||
1135 | #endif | ||
1136 | savesegment(fs, svm->host.fs); | ||
1137 | savesegment(gs, svm->host.gs); | ||
1138 | svm->host.ldt = kvm_read_ldt(); | ||
1139 | |||
995 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1140 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
996 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1141 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
997 | } | 1142 | } |
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
1002 | int i; | 1147 | int i; |
1003 | 1148 | ||
1004 | ++vcpu->stat.host_state_reload; | 1149 | ++vcpu->stat.host_state_reload; |
1150 | kvm_load_ldt(svm->host.ldt); | ||
1151 | #ifdef CONFIG_X86_64 | ||
1152 | loadsegment(fs, svm->host.fs); | ||
1153 | load_gs_index(svm->host.gs); | ||
1154 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
1155 | #else | ||
1156 | loadsegment(gs, svm->host.gs); | ||
1157 | #endif | ||
1005 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1158 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
1006 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1159 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
1007 | } | 1160 | } |
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1021 | switch (reg) { | 1174 | switch (reg) { |
1022 | case VCPU_EXREG_PDPTR: | 1175 | case VCPU_EXREG_PDPTR: |
1023 | BUG_ON(!npt_enabled); | 1176 | BUG_ON(!npt_enabled); |
1024 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); | 1177 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
1025 | break; | 1178 | break; |
1026 | default: | 1179 | default: |
1027 | BUG(); | 1180 | BUG(); |
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1030 | 1183 | ||
1031 | static void svm_set_vintr(struct vcpu_svm *svm) | 1184 | static void svm_set_vintr(struct vcpu_svm *svm) |
1032 | { | 1185 | { |
1033 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; | 1186 | set_intercept(svm, INTERCEPT_VINTR); |
1034 | } | 1187 | } |
1035 | 1188 | ||
1036 | static void svm_clear_vintr(struct vcpu_svm *svm) | 1189 | static void svm_clear_vintr(struct vcpu_svm *svm) |
1037 | { | 1190 | { |
1038 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); | 1191 | clr_intercept(svm, INTERCEPT_VINTR); |
1039 | } | 1192 | } |
1040 | 1193 | ||
1041 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | 1194 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) |
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1150 | 1303 | ||
1151 | svm->vmcb->save.idtr.limit = dt->size; | 1304 | svm->vmcb->save.idtr.limit = dt->size; |
1152 | svm->vmcb->save.idtr.base = dt->address ; | 1305 | svm->vmcb->save.idtr.base = dt->address ; |
1306 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1153 | } | 1307 | } |
1154 | 1308 | ||
1155 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | 1309 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1166 | 1320 | ||
1167 | svm->vmcb->save.gdtr.limit = dt->size; | 1321 | svm->vmcb->save.gdtr.limit = dt->size; |
1168 | svm->vmcb->save.gdtr.base = dt->address ; | 1322 | svm->vmcb->save.gdtr.base = dt->address ; |
1323 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1169 | } | 1324 | } |
1170 | 1325 | ||
1171 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | 1326 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) |
1172 | { | 1327 | { |
1173 | } | 1328 | } |
1174 | 1329 | ||
1330 | static void svm_decache_cr3(struct kvm_vcpu *vcpu) | ||
1331 | { | ||
1332 | } | ||
1333 | |||
1175 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1334 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1176 | { | 1335 | { |
1177 | } | 1336 | } |
1178 | 1337 | ||
1179 | static void update_cr0_intercept(struct vcpu_svm *svm) | 1338 | static void update_cr0_intercept(struct vcpu_svm *svm) |
1180 | { | 1339 | { |
1181 | struct vmcb *vmcb = svm->vmcb; | ||
1182 | ulong gcr0 = svm->vcpu.arch.cr0; | 1340 | ulong gcr0 = svm->vcpu.arch.cr0; |
1183 | u64 *hcr0 = &svm->vmcb->save.cr0; | 1341 | u64 *hcr0 = &svm->vmcb->save.cr0; |
1184 | 1342 | ||
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm) | |||
1188 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | 1346 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) |
1189 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); | 1347 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); |
1190 | 1348 | ||
1349 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1191 | 1350 | ||
1192 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { | 1351 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { |
1193 | vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | 1352 | clr_cr_intercept(svm, INTERCEPT_CR0_READ); |
1194 | vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | 1353 | clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1195 | if (is_nested(svm)) { | ||
1196 | struct vmcb *hsave = svm->nested.hsave; | ||
1197 | |||
1198 | hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | ||
1199 | hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | ||
1200 | vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; | ||
1201 | vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; | ||
1202 | } | ||
1203 | } else { | 1354 | } else { |
1204 | svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | 1355 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
1205 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | 1356 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1206 | if (is_nested(svm)) { | ||
1207 | struct vmcb *hsave = svm->nested.hsave; | ||
1208 | |||
1209 | hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | ||
1210 | hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | ||
1211 | } | ||
1212 | } | 1357 | } |
1213 | } | 1358 | } |
1214 | 1359 | ||
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1216 | { | 1361 | { |
1217 | struct vcpu_svm *svm = to_svm(vcpu); | 1362 | struct vcpu_svm *svm = to_svm(vcpu); |
1218 | 1363 | ||
1219 | if (is_nested(svm)) { | 1364 | if (is_guest_mode(vcpu)) { |
1220 | /* | 1365 | /* |
1221 | * We are here because we run in nested mode, the host kvm | 1366 | * We are here because we run in nested mode, the host kvm |
1222 | * intercepts cr0 writes but the l1 hypervisor does not. | 1367 | * intercepts cr0 writes but the l1 hypervisor does not. |
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1268 | */ | 1413 | */ |
1269 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | 1414 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
1270 | svm->vmcb->save.cr0 = cr0; | 1415 | svm->vmcb->save.cr0 = cr0; |
1416 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1271 | update_cr0_intercept(svm); | 1417 | update_cr0_intercept(svm); |
1272 | } | 1418 | } |
1273 | 1419 | ||
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1277 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | 1423 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; |
1278 | 1424 | ||
1279 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | 1425 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1280 | force_new_asid(vcpu); | 1426 | svm_flush_tlb(vcpu); |
1281 | 1427 | ||
1282 | vcpu->arch.cr4 = cr4; | 1428 | vcpu->arch.cr4 = cr4; |
1283 | if (!npt_enabled) | 1429 | if (!npt_enabled) |
1284 | cr4 |= X86_CR4_PAE; | 1430 | cr4 |= X86_CR4_PAE; |
1285 | cr4 |= host_cr4_mce; | 1431 | cr4 |= host_cr4_mce; |
1286 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | 1432 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
1433 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
1287 | } | 1434 | } |
1288 | 1435 | ||
1289 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 1436 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
1312 | = (svm->vmcb->save.cs.attrib | 1459 | = (svm->vmcb->save.cs.attrib |
1313 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | 1460 | >> SVM_SELECTOR_DPL_SHIFT) & 3; |
1314 | 1461 | ||
1462 | mark_dirty(svm->vmcb, VMCB_SEG); | ||
1315 | } | 1463 | } |
1316 | 1464 | ||
1317 | static void update_db_intercept(struct kvm_vcpu *vcpu) | 1465 | static void update_db_intercept(struct kvm_vcpu *vcpu) |
1318 | { | 1466 | { |
1319 | struct vcpu_svm *svm = to_svm(vcpu); | 1467 | struct vcpu_svm *svm = to_svm(vcpu); |
1320 | 1468 | ||
1321 | svm->vmcb->control.intercept_exceptions &= | 1469 | clr_exception_intercept(svm, DB_VECTOR); |
1322 | ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); | 1470 | clr_exception_intercept(svm, BP_VECTOR); |
1323 | 1471 | ||
1324 | if (svm->nmi_singlestep) | 1472 | if (svm->nmi_singlestep) |
1325 | svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); | 1473 | set_exception_intercept(svm, DB_VECTOR); |
1326 | 1474 | ||
1327 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 1475 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
1328 | if (vcpu->guest_debug & | 1476 | if (vcpu->guest_debug & |
1329 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | 1477 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) |
1330 | svm->vmcb->control.intercept_exceptions |= | 1478 | set_exception_intercept(svm, DB_VECTOR); |
1331 | 1 << DB_VECTOR; | ||
1332 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 1479 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
1333 | svm->vmcb->control.intercept_exceptions |= | 1480 | set_exception_intercept(svm, BP_VECTOR); |
1334 | 1 << BP_VECTOR; | ||
1335 | } else | 1481 | } else |
1336 | vcpu->guest_debug = 0; | 1482 | vcpu->guest_debug = 0; |
1337 | } | 1483 | } |
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
1345 | else | 1491 | else |
1346 | svm->vmcb->save.dr7 = vcpu->arch.dr7; | 1492 | svm->vmcb->save.dr7 = vcpu->arch.dr7; |
1347 | 1493 | ||
1348 | update_db_intercept(vcpu); | 1494 | mark_dirty(svm->vmcb, VMCB_DR); |
1349 | } | ||
1350 | |||
1351 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
1352 | { | ||
1353 | #ifdef CONFIG_X86_64 | ||
1354 | wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1355 | #endif | ||
1356 | } | ||
1357 | 1495 | ||
1358 | static void save_host_msrs(struct kvm_vcpu *vcpu) | 1496 | update_db_intercept(vcpu); |
1359 | { | ||
1360 | #ifdef CONFIG_X86_64 | ||
1361 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1362 | #endif | ||
1363 | } | 1497 | } |
1364 | 1498 | ||
1365 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | 1499 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) |
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | |||
1372 | 1506 | ||
1373 | svm->asid_generation = sd->asid_generation; | 1507 | svm->asid_generation = sd->asid_generation; |
1374 | svm->vmcb->control.asid = sd->next_asid++; | 1508 | svm->vmcb->control.asid = sd->next_asid++; |
1509 | |||
1510 | mark_dirty(svm->vmcb, VMCB_ASID); | ||
1375 | } | 1511 | } |
1376 | 1512 | ||
1377 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | 1513 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) |
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | |||
1379 | struct vcpu_svm *svm = to_svm(vcpu); | 1515 | struct vcpu_svm *svm = to_svm(vcpu); |
1380 | 1516 | ||
1381 | svm->vmcb->save.dr7 = value; | 1517 | svm->vmcb->save.dr7 = value; |
1518 | mark_dirty(svm->vmcb, VMCB_DR); | ||
1382 | } | 1519 | } |
1383 | 1520 | ||
1384 | static int pf_interception(struct vcpu_svm *svm) | 1521 | static int pf_interception(struct vcpu_svm *svm) |
1385 | { | 1522 | { |
1386 | u64 fault_address; | 1523 | u64 fault_address = svm->vmcb->control.exit_info_2; |
1387 | u32 error_code; | 1524 | u32 error_code; |
1525 | int r = 1; | ||
1388 | 1526 | ||
1389 | fault_address = svm->vmcb->control.exit_info_2; | 1527 | switch (svm->apf_reason) { |
1390 | error_code = svm->vmcb->control.exit_info_1; | 1528 | default: |
1391 | 1529 | error_code = svm->vmcb->control.exit_info_1; | |
1392 | trace_kvm_page_fault(fault_address, error_code); | 1530 | |
1393 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) | 1531 | trace_kvm_page_fault(fault_address, error_code); |
1394 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1532 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) |
1395 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1533 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1534 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, | ||
1535 | svm->vmcb->control.insn_bytes, | ||
1536 | svm->vmcb->control.insn_len); | ||
1537 | break; | ||
1538 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | ||
1539 | svm->apf_reason = 0; | ||
1540 | local_irq_disable(); | ||
1541 | kvm_async_pf_task_wait(fault_address); | ||
1542 | local_irq_enable(); | ||
1543 | break; | ||
1544 | case KVM_PV_REASON_PAGE_READY: | ||
1545 | svm->apf_reason = 0; | ||
1546 | local_irq_disable(); | ||
1547 | kvm_async_pf_task_wake(fault_address); | ||
1548 | local_irq_enable(); | ||
1549 | break; | ||
1550 | } | ||
1551 | return r; | ||
1396 | } | 1552 | } |
1397 | 1553 | ||
1398 | static int db_interception(struct vcpu_svm *svm) | 1554 | static int db_interception(struct vcpu_svm *svm) |
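pf_interception() above now dispatches on svm->apf_reason: the default case is an ordinary guest #PF handed to the MMU, while the two paravirtual reasons park the faulting guest task or wake it once the host has the page ready. A rough, hedged sketch of that three-way dispatch, with made-up reason codes and stub actions in place of the real async-PF machinery:

#include <stdio.h>

/* Hypothetical reason codes; the real values come from the KVM
 * async page fault paravirtual ABI. */
enum { APF_NONE = 0, APF_PAGE_NOT_PRESENT = 1, APF_PAGE_READY = 2 };

static void handle_page_fault(unsigned long addr, unsigned int *reason)
{
        switch (*reason) {
        default:
                /* Ordinary #PF: hand the address to the MMU / emulator. */
                printf("regular fault at %#lx -> MMU\n", addr);
                break;
        case APF_PAGE_NOT_PRESENT:
                *reason = 0;
                /* Host is still paging the data in: put the task to sleep. */
                printf("async PF pending at %#lx -> wait\n", addr);
                break;
        case APF_PAGE_READY:
                *reason = 0;
                /* Host finished: wake whoever was waiting on this token. */
                printf("async PF completed for %#lx -> wake\n", addr);
                break;
        }
}

int main(void)
{
        unsigned int reason = APF_PAGE_NOT_PRESENT;

        handle_page_fault(0x1000, &reason);
        reason = APF_PAGE_READY;
        handle_page_fault(0x1000, &reason);
        reason = APF_NONE;
        handle_page_fault(0x2000, &reason);
        return 0;
}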
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1440 | { | 1596 | { |
1441 | int er; | 1597 | int er; |
1442 | 1598 | ||
1443 | er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); | 1599 | er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); |
1444 | if (er != EMULATE_DONE) | 1600 | if (er != EMULATE_DONE) |
1445 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 1601 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
1446 | return 1; | 1602 | return 1; |
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1449 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) | 1605 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) |
1450 | { | 1606 | { |
1451 | struct vcpu_svm *svm = to_svm(vcpu); | 1607 | struct vcpu_svm *svm = to_svm(vcpu); |
1452 | u32 excp; | ||
1453 | |||
1454 | if (is_nested(svm)) { | ||
1455 | u32 h_excp, n_excp; | ||
1456 | |||
1457 | h_excp = svm->nested.hsave->control.intercept_exceptions; | ||
1458 | n_excp = svm->nested.intercept_exceptions; | ||
1459 | h_excp &= ~(1 << NM_VECTOR); | ||
1460 | excp = h_excp | n_excp; | ||
1461 | } else { | ||
1462 | excp = svm->vmcb->control.intercept_exceptions; | ||
1463 | excp &= ~(1 << NM_VECTOR); | ||
1464 | } | ||
1465 | 1608 | ||
1466 | svm->vmcb->control.intercept_exceptions = excp; | 1609 | clr_exception_intercept(svm, NM_VECTOR); |
1467 | 1610 | ||
1468 | svm->vcpu.fpu_active = 1; | 1611 | svm->vcpu.fpu_active = 1; |
1469 | update_cr0_intercept(svm); | 1612 | update_cr0_intercept(svm); |
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm) | |||
1570 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 1713 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
1571 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | 1714 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; |
1572 | if (string || in) | 1715 | if (string || in) |
1573 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 1716 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
1574 | 1717 | ||
1575 | port = io_info >> 16; | 1718 | port = io_info >> 16; |
1576 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | 1719 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; |
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, | |||
1624 | struct vcpu_svm *svm = to_svm(vcpu); | 1767 | struct vcpu_svm *svm = to_svm(vcpu); |
1625 | 1768 | ||
1626 | svm->vmcb->control.nested_cr3 = root; | 1769 | svm->vmcb->control.nested_cr3 = root; |
1627 | force_new_asid(vcpu); | 1770 | mark_dirty(svm->vmcb, VMCB_NPT); |
1771 | svm_flush_tlb(vcpu); | ||
1628 | } | 1772 | } |
1629 | 1773 | ||
1630 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) | 1774 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, |
1775 | struct x86_exception *fault) | ||
1631 | { | 1776 | { |
1632 | struct vcpu_svm *svm = to_svm(vcpu); | 1777 | struct vcpu_svm *svm = to_svm(vcpu); |
1633 | 1778 | ||
1634 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; | 1779 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; |
1635 | svm->vmcb->control.exit_code_hi = 0; | 1780 | svm->vmcb->control.exit_code_hi = 0; |
1636 | svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; | 1781 | svm->vmcb->control.exit_info_1 = fault->error_code; |
1637 | svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; | 1782 | svm->vmcb->control.exit_info_2 = fault->address; |
1638 | 1783 | ||
1639 | nested_svm_vmexit(svm); | 1784 | nested_svm_vmexit(svm); |
1640 | } | 1785 | } |
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1680 | { | 1825 | { |
1681 | int vmexit; | 1826 | int vmexit; |
1682 | 1827 | ||
1683 | if (!is_nested(svm)) | 1828 | if (!is_guest_mode(&svm->vcpu)) |
1684 | return 0; | 1829 | return 0; |
1685 | 1830 | ||
1686 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; | 1831 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; |
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1698 | /* This function returns true if it is safe to enable the irq window */ | 1843 | /* This function returns true if it is safe to enable the irq window */ |
1699 | static inline bool nested_svm_intr(struct vcpu_svm *svm) | 1844 | static inline bool nested_svm_intr(struct vcpu_svm *svm) |
1700 | { | 1845 | { |
1701 | if (!is_nested(svm)) | 1846 | if (!is_guest_mode(&svm->vcpu)) |
1702 | return true; | 1847 | return true; |
1703 | 1848 | ||
1704 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1849 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
1737 | /* This function returns true if it is safe to enable the nmi window */ | 1882 | /* This function returns true if it is safe to enable the nmi window */ |
1738 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) | 1883 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) |
1739 | { | 1884 | { |
1740 | if (!is_nested(svm)) | 1885 | if (!is_guest_mode(&svm->vcpu)) |
1741 | return true; | 1886 | return true; |
1742 | 1887 | ||
1743 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) | 1888 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) |
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) | |||
1836 | return NESTED_EXIT_HOST; | 1981 | return NESTED_EXIT_HOST; |
1837 | break; | 1982 | break; |
1838 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: | 1983 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: |
1839 | /* When we're shadowing, trap PFs */ | 1984 | /* When we're shadowing, trap PFs, but not async PF */ |
1840 | if (!npt_enabled) | 1985 | if (!npt_enabled && svm->apf_reason == 0) |
1841 | return NESTED_EXIT_HOST; | 1986 | return NESTED_EXIT_HOST; |
1842 | break; | 1987 | break; |
1843 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: | 1988 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: |
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1865 | case SVM_EXIT_IOIO: | 2010 | case SVM_EXIT_IOIO: |
1866 | vmexit = nested_svm_intercept_ioio(svm); | 2011 | vmexit = nested_svm_intercept_ioio(svm); |
1867 | break; | 2012 | break; |
1868 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { | 2013 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { |
1869 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); | 2014 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); |
1870 | if (svm->nested.intercept_cr_read & cr_bits) | 2015 | if (svm->nested.intercept_cr & bit) |
1871 | vmexit = NESTED_EXIT_DONE; | 2016 | vmexit = NESTED_EXIT_DONE; |
1872 | break; | 2017 | break; |
1873 | } | 2018 | } |
1874 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { | 2019 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { |
1875 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); | 2020 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); |
1876 | if (svm->nested.intercept_cr_write & cr_bits) | 2021 | if (svm->nested.intercept_dr & bit) |
1877 | vmexit = NESTED_EXIT_DONE; | ||
1878 | break; | ||
1879 | } | ||
1880 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { | ||
1881 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); | ||
1882 | if (svm->nested.intercept_dr_read & dr_bits) | ||
1883 | vmexit = NESTED_EXIT_DONE; | ||
1884 | break; | ||
1885 | } | ||
1886 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { | ||
1887 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); | ||
1888 | if (svm->nested.intercept_dr_write & dr_bits) | ||
1889 | vmexit = NESTED_EXIT_DONE; | 2022 | vmexit = NESTED_EXIT_DONE; |
1890 | break; | 2023 | break; |
1891 | } | 2024 | } |
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1893 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); | 2026 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); |
1894 | if (svm->nested.intercept_exceptions & excp_bits) | 2027 | if (svm->nested.intercept_exceptions & excp_bits) |
1895 | vmexit = NESTED_EXIT_DONE; | 2028 | vmexit = NESTED_EXIT_DONE; |
2029 | /* async page fault always causes a vmexit */ | ||
2030 | else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && | ||
2031 | svm->apf_reason != 0) | ||
2032 | vmexit = NESTED_EXIT_DONE; | ||
1896 | break; | 2033 | break; |
1897 | } | 2034 | } |
1898 | case SVM_EXIT_ERR: { | 2035 | case SVM_EXIT_ERR: { |
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr | |||
1926 | struct vmcb_control_area *dst = &dst_vmcb->control; | 2063 | struct vmcb_control_area *dst = &dst_vmcb->control; |
1927 | struct vmcb_control_area *from = &from_vmcb->control; | 2064 | struct vmcb_control_area *from = &from_vmcb->control; |
1928 | 2065 | ||
1929 | dst->intercept_cr_read = from->intercept_cr_read; | 2066 | dst->intercept_cr = from->intercept_cr; |
1930 | dst->intercept_cr_write = from->intercept_cr_write; | 2067 | dst->intercept_dr = from->intercept_dr; |
1931 | dst->intercept_dr_read = from->intercept_dr_read; | ||
1932 | dst->intercept_dr_write = from->intercept_dr_write; | ||
1933 | dst->intercept_exceptions = from->intercept_exceptions; | 2068 | dst->intercept_exceptions = from->intercept_exceptions; |
1934 | dst->intercept = from->intercept; | 2069 | dst->intercept = from->intercept; |
1935 | dst->iopm_base_pa = from->iopm_base_pa; | 2070 | dst->iopm_base_pa = from->iopm_base_pa; |
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1970 | if (!nested_vmcb) | 2105 | if (!nested_vmcb) |
1971 | return 1; | 2106 | return 1; |
1972 | 2107 | ||
1973 | /* Exit nested SVM mode */ | 2108 | /* Exit Guest-Mode */ |
2109 | leave_guest_mode(&svm->vcpu); | ||
1974 | svm->nested.vmcb = 0; | 2110 | svm->nested.vmcb = 0; |
1975 | 2111 | ||
1976 | /* Give the current vmcb to the guest */ | 2112 | /* Give the current vmcb to the guest */ |
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1984 | nested_vmcb->save.idtr = vmcb->save.idtr; | 2120 | nested_vmcb->save.idtr = vmcb->save.idtr; |
1985 | nested_vmcb->save.efer = svm->vcpu.arch.efer; | 2121 | nested_vmcb->save.efer = svm->vcpu.arch.efer; |
1986 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); | 2122 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); |
1987 | nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; | 2123 | nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); |
1988 | nested_vmcb->save.cr2 = vmcb->save.cr2; | 2124 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
1989 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; | 2125 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; |
1990 | nested_vmcb->save.rflags = vmcb->save.rflags; | 2126 | nested_vmcb->save.rflags = vmcb->save.rflags; |
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
2061 | svm->vmcb->save.cpl = 0; | 2197 | svm->vmcb->save.cpl = 0; |
2062 | svm->vmcb->control.exit_int_info = 0; | 2198 | svm->vmcb->control.exit_int_info = 0; |
2063 | 2199 | ||
2200 | mark_all_dirty(svm->vmcb); | ||
2201 | |||
2064 | nested_svm_unmap(page); | 2202 | nested_svm_unmap(page); |
2065 | 2203 | ||
2066 | nested_svm_uninit_mmu_context(&svm->vcpu); | 2204 | nested_svm_uninit_mmu_context(&svm->vcpu); |
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2148 | nested_vmcb->control.event_inj, | 2286 | nested_vmcb->control.event_inj, |
2149 | nested_vmcb->control.nested_ctl); | 2287 | nested_vmcb->control.nested_ctl); |
2150 | 2288 | ||
2151 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, | 2289 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, |
2152 | nested_vmcb->control.intercept_cr_write, | 2290 | nested_vmcb->control.intercept_cr >> 16, |
2153 | nested_vmcb->control.intercept_exceptions, | 2291 | nested_vmcb->control.intercept_exceptions, |
2154 | nested_vmcb->control.intercept); | 2292 | nested_vmcb->control.intercept); |
2155 | 2293 | ||
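The updated trace_kvm_nested_intercepts() call splits the unified intercept_cr word with "& 0xffff" and ">> 16", reflecting the new layout in which CR read intercepts occupy bits 0-15 and CR write intercepts bits 16-31 (intercept_dr is packed the same way). This is also why the merged exit-code ranges in nested_svm_intercept() can index the mask with a single shift of (exit_code - SVM_EXIT_READ_CR0): the write exit codes sit 16 above the corresponding reads. A small self-contained check of that packing, using helper macros of my own:

#include <assert.h>
#include <stdio.h>

/* Unified layout: CR read intercepts in bits 0..15, writes in bits 16..31. */
#define CR_READ_BIT(n)   (1U << (n))
#define CR_WRITE_BIT(n)  (1U << ((n) + 16))

int main(void)
{
        unsigned int intercept_cr = CR_READ_BIT(0) | CR_READ_BIT(3) |
                                    CR_WRITE_BIT(8);

        unsigned int reads  = intercept_cr & 0xffff; /* old intercept_cr_read  */
        unsigned int writes = intercept_cr >> 16;    /* old intercept_cr_write */

        assert(reads  == 0x9);          /* CR0 + CR3 reads */
        assert(writes == 0x100);        /* CR8 write       */
        printf("reads=0x%x writes=0x%x\n", reads, writes);
        return 0;
}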
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2177 | if (npt_enabled) | 2315 | if (npt_enabled) |
2178 | hsave->save.cr3 = vmcb->save.cr3; | 2316 | hsave->save.cr3 = vmcb->save.cr3; |
2179 | else | 2317 | else |
2180 | hsave->save.cr3 = svm->vcpu.arch.cr3; | 2318 | hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); |
2181 | 2319 | ||
2182 | copy_vmcb_control_area(hsave, vmcb); | 2320 | copy_vmcb_control_area(hsave, vmcb); |
2183 | 2321 | ||
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2229 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; | 2367 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; |
2230 | 2368 | ||
2231 | /* cache intercepts */ | 2369 | /* cache intercepts */ |
2232 | svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; | 2370 | svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; |
2233 | svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; | 2371 | svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; |
2234 | svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; | ||
2235 | svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; | ||
2236 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; | 2372 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; |
2237 | svm->nested.intercept = nested_vmcb->control.intercept; | 2373 | svm->nested.intercept = nested_vmcb->control.intercept; |
2238 | 2374 | ||
2239 | force_new_asid(&svm->vcpu); | 2375 | svm_flush_tlb(&svm->vcpu); |
2240 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; | 2376 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; |
2241 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) | 2377 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) |
2242 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; | 2378 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; |
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2245 | 2381 | ||
2246 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { | 2382 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { |
2247 | /* We only want the cr8 intercept bits of the guest */ | 2383 | /* We only want the cr8 intercept bits of the guest */ |
2248 | svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; | 2384 | clr_cr_intercept(svm, INTERCEPT_CR8_READ); |
2249 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2385 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2250 | } | 2386 | } |
2251 | 2387 | ||
2252 | /* We don't want to see VMMCALLs from a nested guest */ | 2388 | /* We don't want to see VMMCALLs from a nested guest */ |
2253 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); | 2389 | clr_intercept(svm, INTERCEPT_VMMCALL); |
2254 | |||
2255 | /* | ||
2256 | * We don't want a nested guest to be more powerful than the guest, so | ||
2257 | * all intercepts are ORed | ||
2258 | */ | ||
2259 | svm->vmcb->control.intercept_cr_read |= | ||
2260 | nested_vmcb->control.intercept_cr_read; | ||
2261 | svm->vmcb->control.intercept_cr_write |= | ||
2262 | nested_vmcb->control.intercept_cr_write; | ||
2263 | svm->vmcb->control.intercept_dr_read |= | ||
2264 | nested_vmcb->control.intercept_dr_read; | ||
2265 | svm->vmcb->control.intercept_dr_write |= | ||
2266 | nested_vmcb->control.intercept_dr_write; | ||
2267 | svm->vmcb->control.intercept_exceptions |= | ||
2268 | nested_vmcb->control.intercept_exceptions; | ||
2269 | |||
2270 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; | ||
2271 | 2390 | ||
2272 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; | 2391 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; |
2273 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; | 2392 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; |
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2278 | 2397 | ||
2279 | nested_svm_unmap(page); | 2398 | nested_svm_unmap(page); |
2280 | 2399 | ||
2281 | /* nested_vmcb is our indicator if nested SVM is activated */ | 2400 | /* Enter Guest-Mode */ |
2401 | enter_guest_mode(&svm->vcpu); | ||
2402 | |||
2403 | /* | ||
2404 | * Merge guest and host intercepts - must be called with vcpu in | ||
2405 | * guest-mode to take affect here | ||
2406 | */ | ||
2407 | recalc_intercepts(svm); | ||
2408 | |||
2282 | svm->nested.vmcb = vmcb_gpa; | 2409 | svm->nested.vmcb = vmcb_gpa; |
2283 | 2410 | ||
2284 | enable_gif(svm); | 2411 | enable_gif(svm); |
2285 | 2412 | ||
2413 | mark_all_dirty(svm->vmcb); | ||
2414 | |||
2286 | return true; | 2415 | return true; |
2287 | } | 2416 | } |
2288 | 2417 | ||
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm) | |||
2400 | svm_clear_vintr(svm); | 2529 | svm_clear_vintr(svm); |
2401 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2530 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
2402 | 2531 | ||
2532 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2533 | |||
2403 | return 1; | 2534 | return 1; |
2404 | } | 2535 | } |
2405 | 2536 | ||
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm) | |||
2426 | return 1; | 2557 | return 1; |
2427 | } | 2558 | } |
2428 | 2559 | ||
2560 | static int xsetbv_interception(struct vcpu_svm *svm) | ||
2561 | { | ||
2562 | u64 new_bv = kvm_read_edx_eax(&svm->vcpu); | ||
2563 | u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); | ||
2564 | |||
2565 | if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { | ||
2566 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2567 | skip_emulated_instruction(&svm->vcpu); | ||
2568 | } | ||
2569 | |||
2570 | return 1; | ||
2571 | } | ||
2572 | |||
2429 | static int invalid_op_interception(struct vcpu_svm *svm) | 2573 | static int invalid_op_interception(struct vcpu_svm *svm) |
2430 | { | 2574 | { |
2431 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 2575 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
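xsetbv_interception() decodes the register convention of the XSETBV instruction by hand: ECX selects the extended control register, EDX:EAX carry the new value, and the three-byte skip matches the 0F 01 D1 encoding. For orientation, this is roughly what a guest does to trigger SVM_EXIT_XSETBV (an illustrative guest-side sketch, not code from this patch):

/* Guest side, CPL0 with CR4.OSXSAVE set: write extended control register
 * 'index' (0 == XCR0) with 'value'.  These are exactly the registers the
 * handler above reads back via kvm_read_edx_eax()/kvm_register_read(). */
static inline void guest_xsetbv(u32 index, u64 value)
{
	u32 eax = (u32)value;
	u32 edx = (u32)(value >> 32);

	asm volatile("xsetbv" : : "a" (eax), "d" (edx), "c" (index));
}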
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm) | |||
2507 | static int iret_interception(struct vcpu_svm *svm) | 2651 | static int iret_interception(struct vcpu_svm *svm) |
2508 | { | 2652 | { |
2509 | ++svm->vcpu.stat.nmi_window_exits; | 2653 | ++svm->vcpu.stat.nmi_window_exits; |
2510 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 2654 | clr_intercept(svm, INTERCEPT_IRET); |
2511 | svm->vcpu.arch.hflags |= HF_IRET_MASK; | 2655 | svm->vcpu.arch.hflags |= HF_IRET_MASK; |
2512 | return 1; | 2656 | return 1; |
2513 | } | 2657 | } |
2514 | 2658 | ||
2515 | static int invlpg_interception(struct vcpu_svm *svm) | 2659 | static int invlpg_interception(struct vcpu_svm *svm) |
2516 | { | 2660 | { |
2517 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2661 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) |
2662 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; | ||
2663 | |||
2664 | kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); | ||
2665 | skip_emulated_instruction(&svm->vcpu); | ||
2666 | return 1; | ||
2518 | } | 2667 | } |
2519 | 2668 | ||
2520 | static int emulate_on_interception(struct vcpu_svm *svm) | 2669 | static int emulate_on_interception(struct vcpu_svm *svm) |
2521 | { | 2670 | { |
2522 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2671 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; |
2672 | } | ||
2673 | |||
2674 | #define CR_VALID (1ULL << 63) | ||
2675 | |||
2676 | static int cr_interception(struct vcpu_svm *svm) | ||
2677 | { | ||
2678 | int reg, cr; | ||
2679 | unsigned long val; | ||
2680 | int err; | ||
2681 | |||
2682 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2683 | return emulate_on_interception(svm); | ||
2684 | |||
2685 | if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) | ||
2686 | return emulate_on_interception(svm); | ||
2687 | |||
2688 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2689 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; | ||
2690 | |||
2691 | err = 0; | ||
2692 | if (cr >= 16) { /* mov to cr */ | ||
2693 | cr -= 16; | ||
2694 | val = kvm_register_read(&svm->vcpu, reg); | ||
2695 | switch (cr) { | ||
2696 | case 0: | ||
2697 | err = kvm_set_cr0(&svm->vcpu, val); | ||
2698 | break; | ||
2699 | case 3: | ||
2700 | err = kvm_set_cr3(&svm->vcpu, val); | ||
2701 | break; | ||
2702 | case 4: | ||
2703 | err = kvm_set_cr4(&svm->vcpu, val); | ||
2704 | break; | ||
2705 | case 8: | ||
2706 | err = kvm_set_cr8(&svm->vcpu, val); | ||
2707 | break; | ||
2708 | default: | ||
2709 | WARN(1, "unhandled write to CR%d", cr); | ||
2710 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2711 | return 1; | ||
2712 | } | ||
2713 | } else { /* mov from cr */ | ||
2714 | switch (cr) { | ||
2715 | case 0: | ||
2716 | val = kvm_read_cr0(&svm->vcpu); | ||
2717 | break; | ||
2718 | case 2: | ||
2719 | val = svm->vcpu.arch.cr2; | ||
2720 | break; | ||
2721 | case 3: | ||
2722 | val = kvm_read_cr3(&svm->vcpu); | ||
2723 | break; | ||
2724 | case 4: | ||
2725 | val = kvm_read_cr4(&svm->vcpu); | ||
2726 | break; | ||
2727 | case 8: | ||
2728 | val = kvm_get_cr8(&svm->vcpu); | ||
2729 | break; | ||
2730 | default: | ||
2731 | WARN(1, "unhandled read from CR%d", cr); | ||
2732 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2733 | return 1; | ||
2734 | } | ||
2735 | kvm_register_write(&svm->vcpu, reg, val); | ||
2736 | } | ||
2737 | kvm_complete_insn_gp(&svm->vcpu, err); | ||
2738 | |||
2739 | return 1; | ||
2523 | } | 2740 | } |
2524 | 2741 | ||
2525 | static int cr0_write_interception(struct vcpu_svm *svm) | 2742 | static int cr0_write_interception(struct vcpu_svm *svm) |
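cr_interception() depends on the DecodeAssists feature: bit 63 of exit_info_1 (CR_VALID above) says the hardware decoded the MOV-CR instruction, the low bits name the GPR operand, and the CR number falls out of the exit code's distance from SVM_EXIT_READ_CR0. A worked example with illustrative values:

/*
 * Illustrative: a guest "mov %rbx, %cr4" taken with decode assists gives
 *   exit_code   = SVM_EXIT_WRITE_CR4    (= SVM_EXIT_READ_CR0 + 16 + 4)
 *   exit_info_1 = CR_VALID | 3          (bit 63 valid, GPR 3 == RBX)
 * so cr_interception() computes
 *   reg = exit_info_1 & SVM_EXITINFO_REG_MASK   ->  3 (RBX)
 *   cr  = exit_code - SVM_EXIT_READ_CR0         -> 20 (mov-to-cr, CR4)
 * and ends up calling kvm_set_cr4() with the value read from RBX, with
 * kvm_complete_insn_gp() skipping the instruction on success or injecting
 * #GP on failure.  Without decode assists it falls back to full emulation.
 */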
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm) | |||
2527 | struct kvm_vcpu *vcpu = &svm->vcpu; | 2744 | struct kvm_vcpu *vcpu = &svm->vcpu; |
2528 | int r; | 2745 | int r; |
2529 | 2746 | ||
2530 | r = emulate_instruction(&svm->vcpu, 0, 0, 0); | 2747 | r = cr_interception(svm); |
2531 | 2748 | ||
2532 | if (svm->nested.vmexit_rip) { | 2749 | if (svm->nested.vmexit_rip) { |
2533 | kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); | 2750 | kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); |
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm) | |||
2536 | svm->nested.vmexit_rip = 0; | 2753 | svm->nested.vmexit_rip = 0; |
2537 | } | 2754 | } |
2538 | 2755 | ||
2539 | return r == EMULATE_DONE; | 2756 | return r; |
2757 | } | ||
2758 | |||
2759 | static int dr_interception(struct vcpu_svm *svm) | ||
2760 | { | ||
2761 | int reg, dr; | ||
2762 | unsigned long val; | ||
2763 | int err; | ||
2764 | |||
2765 | if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2766 | return emulate_on_interception(svm); | ||
2767 | |||
2768 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2769 | dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; | ||
2770 | |||
2771 | if (dr >= 16) { /* mov to DRn */ | ||
2772 | val = kvm_register_read(&svm->vcpu, reg); | ||
2773 | kvm_set_dr(&svm->vcpu, dr - 16, val); | ||
2774 | } else { | ||
2775 | err = kvm_get_dr(&svm->vcpu, dr, &val); | ||
2776 | if (!err) | ||
2777 | kvm_register_write(&svm->vcpu, reg, val); | ||
2778 | } | ||
2779 | |||
2780 | return 1; | ||
2540 | } | 2781 | } |
2541 | 2782 | ||
2542 | static int cr8_write_interception(struct vcpu_svm *svm) | 2783 | static int cr8_write_interception(struct vcpu_svm *svm) |
2543 | { | 2784 | { |
2544 | struct kvm_run *kvm_run = svm->vcpu.run; | 2785 | struct kvm_run *kvm_run = svm->vcpu.run; |
2786 | int r; | ||
2545 | 2787 | ||
2546 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); | 2788 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); |
2547 | /* instruction emulation calls kvm_set_cr8() */ | 2789 | /* instruction emulation calls kvm_set_cr8() */ |
2548 | emulate_instruction(&svm->vcpu, 0, 0, 0); | 2790 | r = cr_interception(svm); |
2549 | if (irqchip_in_kernel(svm->vcpu.kvm)) { | 2791 | if (irqchip_in_kernel(svm->vcpu.kvm)) { |
2550 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2792 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2551 | return 1; | 2793 | return r; |
2552 | } | 2794 | } |
2553 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) | 2795 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) |
2554 | return 1; | 2796 | return r; |
2555 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 2797 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
2556 | return 0; | 2798 | return 0; |
2557 | } | 2799 | } |
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
2562 | 2804 | ||
2563 | switch (ecx) { | 2805 | switch (ecx) { |
2564 | case MSR_IA32_TSC: { | 2806 | case MSR_IA32_TSC: { |
2565 | u64 tsc_offset; | 2807 | struct vmcb *vmcb = get_host_vmcb(svm); |
2566 | 2808 | ||
2567 | if (is_nested(svm)) | 2809 | *data = vmcb->control.tsc_offset + native_read_tsc(); |
2568 | tsc_offset = svm->nested.hsave->control.tsc_offset; | ||
2569 | else | ||
2570 | tsc_offset = svm->vmcb->control.tsc_offset; | ||
2571 | |||
2572 | *data = tsc_offset + native_read_tsc(); | ||
2573 | break; | 2810 | break; |
2574 | } | 2811 | } |
2575 | case MSR_STAR: | 2812 | case MSR_STAR: |
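The MSR_IA32_TSC read above folds the old is_nested() special case into get_host_vmcb(), which is defined earlier in svm.c. A sketch consistent with the lines removed in this hunk:

static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
	/* In guest-mode the L1 ("host") control state is parked in the
	 * nested hsave area; otherwise the active VMCB is the host VMCB. */
	if (is_guest_mode(&svm->vcpu))
		return svm->nested.hsave;
	else
		return svm->vmcb;
}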
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2714 | svm->vmcb->save.sysenter_esp = data; | 2951 | svm->vmcb->save.sysenter_esp = data; |
2715 | break; | 2952 | break; |
2716 | case MSR_IA32_DEBUGCTLMSR: | 2953 | case MSR_IA32_DEBUGCTLMSR: |
2717 | if (!svm_has(SVM_FEATURE_LBRV)) { | 2954 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { |
2718 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", | 2955 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
2719 | __func__, data); | 2956 | __func__, data); |
2720 | break; | 2957 | break; |
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2723 | return 1; | 2960 | return 1; |
2724 | 2961 | ||
2725 | svm->vmcb->save.dbgctl = data; | 2962 | svm->vmcb->save.dbgctl = data; |
2963 | mark_dirty(svm->vmcb, VMCB_LBR); | ||
2726 | if (data & (1ULL<<0)) | 2964 | if (data & (1ULL<<0)) |
2727 | svm_enable_lbrv(svm); | 2965 | svm_enable_lbrv(svm); |
2728 | else | 2966 | else |
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
2775 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | 3013 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); |
2776 | svm_clear_vintr(svm); | 3014 | svm_clear_vintr(svm); |
2777 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 3015 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
3016 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2778 | /* | 3017 | /* |
2779 | * If the user space waits to inject interrupts, exit as soon as | 3018 | * If the user space waits to inject interrupts, exit as soon as |
2780 | * possible | 3019 | * possible |
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm) | |||
2797 | } | 3036 | } |
2798 | 3037 | ||
2799 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | 3038 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { |
2800 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 3039 | [SVM_EXIT_READ_CR0] = cr_interception, |
2801 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | 3040 | [SVM_EXIT_READ_CR3] = cr_interception, |
2802 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 3041 | [SVM_EXIT_READ_CR4] = cr_interception, |
2803 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | 3042 | [SVM_EXIT_READ_CR8] = cr_interception, |
2804 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, | 3043 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, |
2805 | [SVM_EXIT_WRITE_CR0] = cr0_write_interception, | 3044 | [SVM_EXIT_WRITE_CR0] = cr0_write_interception, |
2806 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 3045 | [SVM_EXIT_WRITE_CR3] = cr_interception, |
2807 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 3046 | [SVM_EXIT_WRITE_CR4] = cr_interception, |
2808 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | 3047 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, |
2809 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | 3048 | [SVM_EXIT_READ_DR0] = dr_interception, |
2810 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | 3049 | [SVM_EXIT_READ_DR1] = dr_interception, |
2811 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | 3050 | [SVM_EXIT_READ_DR2] = dr_interception, |
2812 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | 3051 | [SVM_EXIT_READ_DR3] = dr_interception, |
2813 | [SVM_EXIT_READ_DR4] = emulate_on_interception, | 3052 | [SVM_EXIT_READ_DR4] = dr_interception, |
2814 | [SVM_EXIT_READ_DR5] = emulate_on_interception, | 3053 | [SVM_EXIT_READ_DR5] = dr_interception, |
2815 | [SVM_EXIT_READ_DR6] = emulate_on_interception, | 3054 | [SVM_EXIT_READ_DR6] = dr_interception, |
2816 | [SVM_EXIT_READ_DR7] = emulate_on_interception, | 3055 | [SVM_EXIT_READ_DR7] = dr_interception, |
2817 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | 3056 | [SVM_EXIT_WRITE_DR0] = dr_interception, |
2818 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | 3057 | [SVM_EXIT_WRITE_DR1] = dr_interception, |
2819 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | 3058 | [SVM_EXIT_WRITE_DR2] = dr_interception, |
2820 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | 3059 | [SVM_EXIT_WRITE_DR3] = dr_interception, |
2821 | [SVM_EXIT_WRITE_DR4] = emulate_on_interception, | 3060 | [SVM_EXIT_WRITE_DR4] = dr_interception, |
2822 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | 3061 | [SVM_EXIT_WRITE_DR5] = dr_interception, |
2823 | [SVM_EXIT_WRITE_DR6] = emulate_on_interception, | 3062 | [SVM_EXIT_WRITE_DR6] = dr_interception, |
2824 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | 3063 | [SVM_EXIT_WRITE_DR7] = dr_interception, |
2825 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, | 3064 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, |
2826 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, | 3065 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, |
2827 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | 3066 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, |
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
2854 | [SVM_EXIT_WBINVD] = emulate_on_interception, | 3093 | [SVM_EXIT_WBINVD] = emulate_on_interception, |
2855 | [SVM_EXIT_MONITOR] = invalid_op_interception, | 3094 | [SVM_EXIT_MONITOR] = invalid_op_interception, |
2856 | [SVM_EXIT_MWAIT] = invalid_op_interception, | 3095 | [SVM_EXIT_MWAIT] = invalid_op_interception, |
3096 | [SVM_EXIT_XSETBV] = xsetbv_interception, | ||
2857 | [SVM_EXIT_NPF] = pf_interception, | 3097 | [SVM_EXIT_NPF] = pf_interception, |
2858 | }; | 3098 | }; |
2859 | 3099 | ||
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu) | |||
2864 | struct vmcb_save_area *save = &svm->vmcb->save; | 3104 | struct vmcb_save_area *save = &svm->vmcb->save; |
2865 | 3105 | ||
2866 | pr_err("VMCB Control Area:\n"); | 3106 | pr_err("VMCB Control Area:\n"); |
2867 | pr_err("cr_read: %04x\n", control->intercept_cr_read); | 3107 | pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); |
2868 | pr_err("cr_write: %04x\n", control->intercept_cr_write); | 3108 | pr_err("cr_write: %04x\n", control->intercept_cr >> 16); |
2869 | pr_err("dr_read: %04x\n", control->intercept_dr_read); | 3109 | pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); |
2870 | pr_err("dr_write: %04x\n", control->intercept_dr_write); | 3110 | pr_err("dr_write: %04x\n", control->intercept_dr >> 16); |
2871 | pr_err("exceptions: %08x\n", control->intercept_exceptions); | 3111 | pr_err("exceptions: %08x\n", control->intercept_exceptions); |
2872 | pr_err("intercepts: %016llx\n", control->intercept); | 3112 | pr_err("intercepts: %016llx\n", control->intercept); |
2873 | pr_err("pause filter count: %d\n", control->pause_filter_count); | 3113 | pr_err("pause filter count: %d\n", control->pause_filter_count); |
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu) | |||
2950 | 3190 | ||
2951 | } | 3191 | } |
2952 | 3192 | ||
3193 | static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3194 | { | ||
3195 | struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; | ||
3196 | |||
3197 | *info1 = control->exit_info_1; | ||
3198 | *info2 = control->exit_info_2; | ||
3199 | } | ||
3200 | |||
2953 | static int handle_exit(struct kvm_vcpu *vcpu) | 3201 | static int handle_exit(struct kvm_vcpu *vcpu) |
2954 | { | 3202 | { |
2955 | struct vcpu_svm *svm = to_svm(vcpu); | 3203 | struct vcpu_svm *svm = to_svm(vcpu); |
2956 | struct kvm_run *kvm_run = vcpu->run; | 3204 | struct kvm_run *kvm_run = vcpu->run; |
2957 | u32 exit_code = svm->vmcb->control.exit_code; | 3205 | u32 exit_code = svm->vmcb->control.exit_code; |
2958 | 3206 | ||
2959 | trace_kvm_exit(exit_code, vcpu); | 3207 | trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); |
2960 | 3208 | ||
2961 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) | 3209 | if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) |
2962 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | 3210 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
2963 | if (npt_enabled) | 3211 | if (npt_enabled) |
2964 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | 3212 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2970 | return 1; | 3218 | return 1; |
2971 | } | 3219 | } |
2972 | 3220 | ||
2973 | if (is_nested(svm)) { | 3221 | if (is_guest_mode(vcpu)) { |
2974 | int vmexit; | 3222 | int vmexit; |
2975 | 3223 | ||
2976 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, | 3224 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, |
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
3033 | 3281 | ||
3034 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); | 3282 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); |
3035 | 3283 | ||
3036 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
3037 | /* FIXME: handle wraparound of asid_generation */ | 3284 | /* FIXME: handle wraparound of asid_generation */ |
3038 | if (svm->asid_generation != sd->asid_generation) | 3285 | if (svm->asid_generation != sd->asid_generation) |
3039 | new_asid(svm, sd); | 3286 | new_asid(svm, sd); |
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) | |||
3045 | 3292 | ||
3046 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; | 3293 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; |
3047 | vcpu->arch.hflags |= HF_NMI_MASK; | 3294 | vcpu->arch.hflags |= HF_NMI_MASK; |
3048 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3295 | set_intercept(svm, INTERCEPT_IRET); |
3049 | ++vcpu->stat.nmi_injections; | 3296 | ++vcpu->stat.nmi_injections; |
3050 | } | 3297 | } |
3051 | 3298 | ||
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
3058 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 3305 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
3059 | control->int_ctl |= V_IRQ_MASK | | 3306 | control->int_ctl |= V_IRQ_MASK | |
3060 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 3307 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
3308 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
3061 | } | 3309 | } |
3062 | 3310 | ||
3063 | static void svm_set_irq(struct kvm_vcpu *vcpu) | 3311 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
3077 | { | 3325 | { |
3078 | struct vcpu_svm *svm = to_svm(vcpu); | 3326 | struct vcpu_svm *svm = to_svm(vcpu); |
3079 | 3327 | ||
3080 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3328 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3081 | return; | 3329 | return; |
3082 | 3330 | ||
3083 | if (irr == -1) | 3331 | if (irr == -1) |
3084 | return; | 3332 | return; |
3085 | 3333 | ||
3086 | if (tpr >= irr) | 3334 | if (tpr >= irr) |
3087 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; | 3335 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
3088 | } | 3336 | } |
3089 | 3337 | ||
3090 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) | 3338 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
3112 | 3360 | ||
3113 | if (masked) { | 3361 | if (masked) { |
3114 | svm->vcpu.arch.hflags |= HF_NMI_MASK; | 3362 | svm->vcpu.arch.hflags |= HF_NMI_MASK; |
3115 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3363 | set_intercept(svm, INTERCEPT_IRET); |
3116 | } else { | 3364 | } else { |
3117 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; | 3365 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; |
3118 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 3366 | clr_intercept(svm, INTERCEPT_IRET); |
3119 | } | 3367 | } |
3120 | } | 3368 | } |
3121 | 3369 | ||
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
3131 | 3379 | ||
3132 | ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); | 3380 | ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); |
3133 | 3381 | ||
3134 | if (is_nested(svm)) | 3382 | if (is_guest_mode(vcpu)) |
3135 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); | 3383 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); |
3136 | 3384 | ||
3137 | return ret; | 3385 | return ret; |
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
3177 | 3425 | ||
3178 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | 3426 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) |
3179 | { | 3427 | { |
3180 | force_new_asid(vcpu); | 3428 | struct vcpu_svm *svm = to_svm(vcpu); |
3429 | |||
3430 | if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) | ||
3431 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; | ||
3432 | else | ||
3433 | svm->asid_generation--; | ||
3181 | } | 3434 | } |
3182 | 3435 | ||
3183 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | 3436 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) |
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) | |||
3188 | { | 3441 | { |
3189 | struct vcpu_svm *svm = to_svm(vcpu); | 3442 | struct vcpu_svm *svm = to_svm(vcpu); |
3190 | 3443 | ||
3191 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3444 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3192 | return; | 3445 | return; |
3193 | 3446 | ||
3194 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { | 3447 | if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { |
3195 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; | 3448 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; |
3196 | kvm_set_cr8(vcpu, cr8); | 3449 | kvm_set_cr8(vcpu, cr8); |
3197 | } | 3450 | } |
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
3202 | struct vcpu_svm *svm = to_svm(vcpu); | 3455 | struct vcpu_svm *svm = to_svm(vcpu); |
3203 | u64 cr8; | 3456 | u64 cr8; |
3204 | 3457 | ||
3205 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3458 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3206 | return; | 3459 | return; |
3207 | 3460 | ||
3208 | cr8 = kvm_get_cr8(vcpu); | 3461 | cr8 = kvm_get_cr8(vcpu); |
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) | |||
3289 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) | 3542 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) |
3290 | { | 3543 | { |
3291 | struct vcpu_svm *svm = to_svm(vcpu); | 3544 | struct vcpu_svm *svm = to_svm(vcpu); |
3292 | u16 fs_selector; | ||
3293 | u16 gs_selector; | ||
3294 | u16 ldt_selector; | ||
3295 | 3545 | ||
3296 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3546 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
3297 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3547 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3308 | 3558 | ||
3309 | sync_lapic_to_cr8(vcpu); | 3559 | sync_lapic_to_cr8(vcpu); |
3310 | 3560 | ||
3311 | save_host_msrs(vcpu); | ||
3312 | savesegment(fs, fs_selector); | ||
3313 | savesegment(gs, gs_selector); | ||
3314 | ldt_selector = kvm_read_ldt(); | ||
3315 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | 3561 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
3316 | 3562 | ||
3317 | clgi(); | 3563 | clgi(); |
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3389 | #endif | 3635 | #endif |
3390 | ); | 3636 | ); |
3391 | 3637 | ||
3392 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3393 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3394 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3395 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3396 | |||
3397 | load_host_msrs(vcpu); | ||
3398 | kvm_load_ldt(ldt_selector); | ||
3399 | loadsegment(fs, fs_selector); | ||
3400 | #ifdef CONFIG_X86_64 | 3638 | #ifdef CONFIG_X86_64 |
3401 | load_gs_index(gs_selector); | 3639 | wrmsrl(MSR_GS_BASE, svm->host.gs_base); |
3402 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
3403 | #else | 3640 | #else |
3404 | loadsegment(gs, gs_selector); | 3641 | loadsegment(fs, svm->host.fs); |
3405 | #endif | 3642 | #endif |
3406 | 3643 | ||
3407 | reload_tss(vcpu); | 3644 | reload_tss(vcpu); |
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3410 | 3647 | ||
3411 | stgi(); | 3648 | stgi(); |
3412 | 3649 | ||
3650 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3651 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3652 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3653 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3654 | |||
3413 | sync_cr8_to_lapic(vcpu); | 3655 | sync_cr8_to_lapic(vcpu); |
3414 | 3656 | ||
3415 | svm->next_rip = 0; | 3657 | svm->next_rip = 0; |
3416 | 3658 | ||
3659 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
3660 | |||
3661 | /* if exit due to PF check for async PF */ | ||
3662 | if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
3663 | svm->apf_reason = kvm_read_and_reset_pf_reason(); | ||
3664 | |||
3417 | if (npt_enabled) { | 3665 | if (npt_enabled) { |
3418 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); | 3666 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); |
3419 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); | 3667 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); |
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3426 | if (unlikely(svm->vmcb->control.exit_code == | 3674 | if (unlikely(svm->vmcb->control.exit_code == |
3427 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) | 3675 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) |
3428 | svm_handle_mce(svm); | 3676 | svm_handle_mce(svm); |
3677 | |||
3678 | mark_all_clean(svm->vmcb); | ||
3429 | } | 3679 | } |
3430 | 3680 | ||
3431 | #undef R | 3681 | #undef R |
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3435 | struct vcpu_svm *svm = to_svm(vcpu); | 3685 | struct vcpu_svm *svm = to_svm(vcpu); |
3436 | 3686 | ||
3437 | svm->vmcb->save.cr3 = root; | 3687 | svm->vmcb->save.cr3 = root; |
3438 | force_new_asid(vcpu); | 3688 | mark_dirty(svm->vmcb, VMCB_CR); |
3689 | svm_flush_tlb(vcpu); | ||
3439 | } | 3690 | } |
3440 | 3691 | ||
3441 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 3692 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3443 | struct vcpu_svm *svm = to_svm(vcpu); | 3694 | struct vcpu_svm *svm = to_svm(vcpu); |
3444 | 3695 | ||
3445 | svm->vmcb->control.nested_cr3 = root; | 3696 | svm->vmcb->control.nested_cr3 = root; |
3697 | mark_dirty(svm->vmcb, VMCB_NPT); | ||
3446 | 3698 | ||
3447 | /* Also sync guest cr3 here in case we live migrate */ | 3699 | /* Also sync guest cr3 here in case we live migrate */ |
3448 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | 3700 | svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); |
3701 | mark_dirty(svm->vmcb, VMCB_CR); | ||
3449 | 3702 | ||
3450 | force_new_asid(vcpu); | 3703 | svm_flush_tlb(vcpu); |
3451 | } | 3704 | } |
3452 | 3705 | ||
3453 | static int is_disabled(void) | 3706 | static int is_disabled(void) |
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) | |||
3494 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 3747 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
3495 | { | 3748 | { |
3496 | switch (func) { | 3749 | switch (func) { |
3497 | case 0x00000001: | ||
3498 | /* Mask out xsave bit as long as it is not supported by SVM */ | ||
3499 | entry->ecx &= ~(bit(X86_FEATURE_XSAVE)); | ||
3500 | break; | ||
3501 | case 0x80000001: | 3750 | case 0x80000001: |
3502 | if (nested) | 3751 | if (nested) |
3503 | entry->ecx |= (1 << 2); /* Set SVM bit */ | 3752 | entry->ecx |= (1 << 2); /* Set SVM bit */ |
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | |||
3511 | additional features */ | 3760 | additional features */ |
3512 | 3761 | ||
3513 | /* Support next_rip if host supports it */ | 3762 | /* Support next_rip if host supports it */ |
3514 | if (svm_has(SVM_FEATURE_NRIP)) | 3763 | if (boot_cpu_has(X86_FEATURE_NRIPS)) |
3515 | entry->edx |= SVM_FEATURE_NRIP; | 3764 | entry->edx |= SVM_FEATURE_NRIP; |
3516 | 3765 | ||
3517 | /* Support NPT for the guest if enabled */ | 3766 | /* Support NPT for the guest if enabled */ |
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { | |||
3571 | { SVM_EXIT_WBINVD, "wbinvd" }, | 3820 | { SVM_EXIT_WBINVD, "wbinvd" }, |
3572 | { SVM_EXIT_MONITOR, "monitor" }, | 3821 | { SVM_EXIT_MONITOR, "monitor" }, |
3573 | { SVM_EXIT_MWAIT, "mwait" }, | 3822 | { SVM_EXIT_MWAIT, "mwait" }, |
3823 | { SVM_EXIT_XSETBV, "xsetbv" }, | ||
3574 | { SVM_EXIT_NPF, "npf" }, | 3824 | { SVM_EXIT_NPF, "npf" }, |
3575 | { -1, NULL } | 3825 | { -1, NULL } |
3576 | }; | 3826 | }; |
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
3594 | { | 3844 | { |
3595 | struct vcpu_svm *svm = to_svm(vcpu); | 3845 | struct vcpu_svm *svm = to_svm(vcpu); |
3596 | 3846 | ||
3597 | svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; | 3847 | set_exception_intercept(svm, NM_VECTOR); |
3598 | if (is_nested(svm)) | ||
3599 | svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; | ||
3600 | update_cr0_intercept(svm); | 3848 | update_cr0_intercept(svm); |
3601 | } | 3849 | } |
3602 | 3850 | ||
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3627 | .get_cpl = svm_get_cpl, | 3875 | .get_cpl = svm_get_cpl, |
3628 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | 3876 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, |
3629 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, | 3877 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, |
3878 | .decache_cr3 = svm_decache_cr3, | ||
3630 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | 3879 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, |
3631 | .set_cr0 = svm_set_cr0, | 3880 | .set_cr0 = svm_set_cr0, |
3632 | .set_cr3 = svm_set_cr3, | 3881 | .set_cr3 = svm_set_cr3, |
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3667 | .get_tdp_level = get_npt_level, | 3916 | .get_tdp_level = get_npt_level, |
3668 | .get_mt_mask = svm_get_mt_mask, | 3917 | .get_mt_mask = svm_get_mt_mask, |
3669 | 3918 | ||
3919 | .get_exit_info = svm_get_exit_info, | ||
3670 | .exit_reasons_str = svm_exit_reasons_str, | 3920 | .exit_reasons_str = svm_exit_reasons_str, |
3921 | |||
3671 | .get_lpage_level = svm_get_lpage_level, | 3922 | .get_lpage_level = svm_get_lpage_level, |
3672 | 3923 | ||
3673 | .cpuid_update = svm_cpuid_update, | 3924 | .cpuid_update = svm_cpuid_update, |
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a6544b8e7c0f..1357d7cf4ec8 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic, | |||
178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) | 178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) |
179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) | 179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) |
180 | 180 | ||
181 | #define KVM_ISA_VMX 1 | ||
182 | #define KVM_ISA_SVM 2 | ||
183 | |||
181 | /* | 184 | /* |
182 | * Tracepoint for kvm guest exit: | 185 | * Tracepoint for kvm guest exit: |
183 | */ | 186 | */ |
184 | TRACE_EVENT(kvm_exit, | 187 | TRACE_EVENT(kvm_exit, |
185 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), | 188 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), |
186 | TP_ARGS(exit_reason, vcpu), | 189 | TP_ARGS(exit_reason, vcpu, isa), |
187 | 190 | ||
188 | TP_STRUCT__entry( | 191 | TP_STRUCT__entry( |
189 | __field( unsigned int, exit_reason ) | 192 | __field( unsigned int, exit_reason ) |
190 | __field( unsigned long, guest_rip ) | 193 | __field( unsigned long, guest_rip ) |
194 | __field( u32, isa ) | ||
195 | __field( u64, info1 ) | ||
196 | __field( u64, info2 ) | ||
191 | ), | 197 | ), |
192 | 198 | ||
193 | TP_fast_assign( | 199 | TP_fast_assign( |
194 | __entry->exit_reason = exit_reason; | 200 | __entry->exit_reason = exit_reason; |
195 | __entry->guest_rip = kvm_rip_read(vcpu); | 201 | __entry->guest_rip = kvm_rip_read(vcpu); |
202 | __entry->isa = isa; | ||
203 | kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, | ||
204 | &__entry->info2); | ||
196 | ), | 205 | ), |
197 | 206 | ||
198 | TP_printk("reason %s rip 0x%lx", | 207 | TP_printk("reason %s rip 0x%lx info %llx %llx", |
199 | ftrace_print_symbols_seq(p, __entry->exit_reason, | 208 | ftrace_print_symbols_seq(p, __entry->exit_reason, |
200 | kvm_x86_ops->exit_reasons_str), | 209 | kvm_x86_ops->exit_reasons_str), |
201 | __entry->guest_rip) | 210 | __entry->guest_rip, __entry->info1, __entry->info2) |
202 | ); | 211 | ); |
203 | 212 | ||
204 | /* | 213 | /* |
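With the new isa argument and the get_exit_info() callback, every kvm_exit record now carries the ISA-specific exit qualification instead of just the reason and RIP. The two call sites updated in this diff, plus an illustrative rendering (the numeric values are made up):

trace_kvm_exit(exit_code,   vcpu, KVM_ISA_SVM);	/* svm.c: handle_exit()     */
trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);	/* vmx.c: vmx_handle_exit() */
/*
 * which TP_printk() renders roughly as (illustrative values only):
 *   kvm_exit: reason npf rip 0xffffffff8100a3c2 info 4 0
 * where info1/info2 are exit_info_1/2 on SVM and the exit qualification
 * plus VM_EXIT_INTR_INFO on VMX, as the two get_exit_info() hooks show.
 */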
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81fcbe9515c5..bf89ec2cfb82 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); | |||
69 | static int __read_mostly vmm_exclusive = 1; | 69 | static int __read_mostly vmm_exclusive = 1; |
70 | module_param(vmm_exclusive, bool, S_IRUGO); | 70 | module_param(vmm_exclusive, bool, S_IRUGO); |
71 | 71 | ||
72 | static int __read_mostly yield_on_hlt = 1; | ||
73 | module_param(yield_on_hlt, bool, S_IRUGO); | ||
74 | |||
72 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
73 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
74 | #define KVM_GUEST_CR0_MASK \ | 77 | #define KVM_GUEST_CR0_MASK \ |
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm); | |||
177 | static u64 construct_eptp(unsigned long root_hpa); | 180 | static u64 construct_eptp(unsigned long root_hpa); |
178 | static void kvm_cpu_vmxon(u64 addr); | 181 | static void kvm_cpu_vmxon(u64 addr); |
179 | static void kvm_cpu_vmxoff(void); | 182 | static void kvm_cpu_vmxoff(void); |
183 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
180 | 184 | ||
181 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 185 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
182 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 186 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b; | |||
188 | static unsigned long *vmx_msr_bitmap_legacy; | 192 | static unsigned long *vmx_msr_bitmap_legacy; |
189 | static unsigned long *vmx_msr_bitmap_longmode; | 193 | static unsigned long *vmx_msr_bitmap_longmode; |
190 | 194 | ||
195 | static bool cpu_has_load_ia32_efer; | ||
196 | |||
191 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | 197 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); |
192 | static DEFINE_SPINLOCK(vmx_vpid_lock); | 198 | static DEFINE_SPINLOCK(vmx_vpid_lock); |
193 | 199 | ||
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
472 | u8 error; | 478 | u8 error; |
473 | 479 | ||
474 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" | 480 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" |
475 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 481 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
476 | : "cc", "memory"); | 482 | : "cc", "memory"); |
477 | if (error) | 483 | if (error) |
478 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | 484 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", |
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs) | |||
485 | u8 error; | 491 | u8 error; |
486 | 492 | ||
487 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" | 493 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" |
488 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 494 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
489 | : "cc", "memory"); | 495 | : "cc", "memory"); |
490 | if (error) | 496 | if (error) |
491 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 497 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", |
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
565 | 571 | ||
566 | static unsigned long vmcs_readl(unsigned long field) | 572 | static unsigned long vmcs_readl(unsigned long field) |
567 | { | 573 | { |
568 | unsigned long value; | 574 | unsigned long value = 0; |
569 | 575 | ||
570 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 576 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) |
571 | : "=a"(value) : "d"(field) : "cc"); | 577 | : "+a"(value) : "d"(field) : "cc"); |
572 | return value; | 578 | return value; |
573 | } | 579 | } |
574 | 580 | ||
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) | |||
661 | unsigned i; | 667 | unsigned i; |
662 | struct msr_autoload *m = &vmx->msr_autoload; | 668 | struct msr_autoload *m = &vmx->msr_autoload; |
663 | 669 | ||
670 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
671 | vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
672 | vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
673 | return; | ||
674 | } | ||
675 | |||
664 | for (i = 0; i < m->nr; ++i) | 676 | for (i = 0; i < m->nr; ++i) |
665 | if (m->guest[i].index == msr) | 677 | if (m->guest[i].index == msr) |
666 | break; | 678 | break; |
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | |||
680 | unsigned i; | 692 | unsigned i; |
681 | struct msr_autoload *m = &vmx->msr_autoload; | 693 | struct msr_autoload *m = &vmx->msr_autoload; |
682 | 694 | ||
695 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
696 | vmcs_write64(GUEST_IA32_EFER, guest_val); | ||
697 | vmcs_write64(HOST_IA32_EFER, host_val); | ||
698 | vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
699 | vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
700 | return; | ||
701 | } | ||
702 | |||
683 | for (i = 0; i < m->nr; ++i) | 703 | for (i = 0; i < m->nr; ++i) |
684 | if (m->guest[i].index == msr) | 704 | if (m->guest[i].index == msr) |
685 | break; | 705 | break; |
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
1009 | vmx_set_interrupt_shadow(vcpu, 0); | 1029 | vmx_set_interrupt_shadow(vcpu, 0); |
1010 | } | 1030 | } |
1011 | 1031 | ||
1032 | static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | ||
1033 | { | ||
1034 | /* Ensure that we clear the HLT state in the VMCS. We don't need to | ||
1035 | * explicitly skip the instruction because if the HLT state is set, then | ||
1036 | * the instruction is already executing and RIP has already been | ||
1037 | * advanced. */ | ||
1038 | if (!yield_on_hlt && | ||
1039 | vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) | ||
1040 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
1041 | } | ||
1042 | |||
1012 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1043 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1013 | bool has_error_code, u32 error_code, | 1044 | bool has_error_code, u32 error_code, |
1014 | bool reinject) | 1045 | bool reinject) |
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1035 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | 1066 | intr_info |= INTR_TYPE_HARD_EXCEPTION; |
1036 | 1067 | ||
1037 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 1068 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
1069 | vmx_clear_hlt(vcpu); | ||
1038 | } | 1070 | } |
1039 | 1071 | ||
1040 | static bool vmx_rdtscp_supported(void) | 1072 | static bool vmx_rdtscp_supported(void) |
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void) | |||
1305 | && tboot_enabled()) | 1337 | && tboot_enabled()) |
1306 | return 1; | 1338 | return 1; |
1307 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | 1339 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) |
1308 | && !tboot_enabled()) | 1340 | && !tboot_enabled()) { |
1341 | printk(KERN_WARNING "kvm: disable TXT in the BIOS or " | ||
1342 | " activate TXT before enabling KVM\n"); | ||
1309 | return 1; | 1343 | return 1; |
1344 | } | ||
1310 | } | 1345 | } |
1311 | 1346 | ||
1312 | return 0; | 1347 | return 0; |
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | |||
1400 | return 0; | 1435 | return 0; |
1401 | } | 1436 | } |
1402 | 1437 | ||
1438 | static __init bool allow_1_setting(u32 msr, u32 ctl) | ||
1439 | { | ||
1440 | u32 vmx_msr_low, vmx_msr_high; | ||
1441 | |||
1442 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
1443 | return vmx_msr_high & ctl; | ||
1444 | } | ||
1445 | |||
1403 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | 1446 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) |
1404 | { | 1447 | { |
1405 | u32 vmx_msr_low, vmx_msr_high; | 1448 | u32 vmx_msr_low, vmx_msr_high; |
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1416 | &_pin_based_exec_control) < 0) | 1459 | &_pin_based_exec_control) < 0) |
1417 | return -EIO; | 1460 | return -EIO; |
1418 | 1461 | ||
1419 | min = CPU_BASED_HLT_EXITING | | 1462 | min = |
1420 | #ifdef CONFIG_X86_64 | 1463 | #ifdef CONFIG_X86_64 |
1421 | CPU_BASED_CR8_LOAD_EXITING | | 1464 | CPU_BASED_CR8_LOAD_EXITING | |
1422 | CPU_BASED_CR8_STORE_EXITING | | 1465 | CPU_BASED_CR8_STORE_EXITING | |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1429 | CPU_BASED_MWAIT_EXITING | | 1472 | CPU_BASED_MWAIT_EXITING | |
1430 | CPU_BASED_MONITOR_EXITING | | 1473 | CPU_BASED_MONITOR_EXITING | |
1431 | CPU_BASED_INVLPG_EXITING; | 1474 | CPU_BASED_INVLPG_EXITING; |
1475 | |||
1476 | if (yield_on_hlt) | ||
1477 | min |= CPU_BASED_HLT_EXITING; | ||
1478 | |||
1432 | opt = CPU_BASED_TPR_SHADOW | | 1479 | opt = CPU_BASED_TPR_SHADOW | |
1433 | CPU_BASED_USE_MSR_BITMAPS | | 1480 | CPU_BASED_USE_MSR_BITMAPS | |
1434 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1481 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1510 | vmcs_conf->vmexit_ctrl = _vmexit_control; | 1557 | vmcs_conf->vmexit_ctrl = _vmexit_control; |
1511 | vmcs_conf->vmentry_ctrl = _vmentry_control; | 1558 | vmcs_conf->vmentry_ctrl = _vmentry_control; |
1512 | 1559 | ||
1560 | cpu_has_load_ia32_efer = | ||
1561 | allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, | ||
1562 | VM_ENTRY_LOAD_IA32_EFER) | ||
1563 | && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, | ||
1564 | VM_EXIT_LOAD_IA32_EFER); | ||
1565 | |||
1513 | return 0; | 1566 | return 0; |
1514 | } | 1567 | } |
1515 | 1568 | ||
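allow_1_setting() inspects the high 32 bits of a VMX capability MSR, which advertise the "allowed-1" settings (the low half lists the bits that must be 1). cpu_has_load_ia32_efer is therefore true only when both the VM-entry and VM-exit "load IA32_EFER" controls may be enabled, which is what lets the atomic-switch-MSR hunks above bypass the MSR autoload arrays for EFER. A hypothetical further use of the same helper; VM_EXIT_SAVE_IA32_EFER is an architectural exit control, while can_save_efer is a made-up variable for illustration:

/* Hypothetical probe of another optional VM-exit control, same pattern. */
bool can_save_efer = allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
				     VM_EXIT_SAVE_IA32_EFER);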
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1683 | save->limit = vmcs_read32(sf->limit); | 1736 | save->limit = vmcs_read32(sf->limit); |
1684 | save->ar = vmcs_read32(sf->ar_bytes); | 1737 | save->ar = vmcs_read32(sf->ar_bytes); |
1685 | vmcs_write16(sf->selector, save->base >> 4); | 1738 | vmcs_write16(sf->selector, save->base >> 4); |
1686 | vmcs_write32(sf->base, save->base & 0xfffff); | 1739 | vmcs_write32(sf->base, save->base & 0xffff0); |
1687 | vmcs_write32(sf->limit, 0xffff); | 1740 | vmcs_write32(sf->limit, 0xffff); |
1688 | vmcs_write32(sf->ar_bytes, 0xf3); | 1741 | vmcs_write32(sf->ar_bytes, 0xf3); |
1742 | if (save->base & 0xf) | ||
1743 | printk_once(KERN_WARNING "kvm: segment base is not paragraph" | ||
1744 | " aligned when entering protected mode (seg=%d)", | ||
1745 | seg); | ||
1689 | } | 1746 | } |
1690 | 1747 | ||
1691 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1748 | static void enter_rmode(struct kvm_vcpu *vcpu) |
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | |||
1814 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; | 1871 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; |
1815 | } | 1872 | } |
1816 | 1873 | ||
1874 | static void vmx_decache_cr3(struct kvm_vcpu *vcpu) | ||
1875 | { | ||
1876 | if (enable_ept && is_paging(vcpu)) | ||
1877 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
1878 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
1879 | } | ||
1880 | |||
1817 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1881 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1818 | { | 1882 | { |
1819 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; | 1883 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; |
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
1857 | unsigned long cr0, | 1921 | unsigned long cr0, |
1858 | struct kvm_vcpu *vcpu) | 1922 | struct kvm_vcpu *vcpu) |
1859 | { | 1923 | { |
1924 | vmx_decache_cr3(vcpu); | ||
1860 | if (!(cr0 & X86_CR0_PG)) { | 1925 | if (!(cr0 & X86_CR0_PG)) { |
1861 | /* From paging/starting to nonpaging */ | 1926 | /* From paging/starting to nonpaging */ |
1862 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1927 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1937 | if (enable_ept) { | 2002 | if (enable_ept) { |
1938 | eptp = construct_eptp(cr3); | 2003 | eptp = construct_eptp(cr3); |
1939 | vmcs_write64(EPT_POINTER, eptp); | 2004 | vmcs_write64(EPT_POINTER, eptp); |
1940 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : | 2005 | guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : |
1941 | vcpu->kvm->arch.ept_identity_map_addr; | 2006 | vcpu->kvm->arch.ept_identity_map_addr; |
1942 | ept_load_pdptrs(vcpu); | 2007 | ept_load_pdptrs(vcpu); |
1943 | } | 2008 | } |
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2725 | vmcs_writel(GUEST_IDTR_BASE, 0); | 2790 | vmcs_writel(GUEST_IDTR_BASE, 0); |
2726 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | 2791 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); |
2727 | 2792 | ||
2728 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | 2793 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
2729 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | 2794 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); |
2730 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | 2795 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); |
2731 | 2796 | ||
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) | |||
2787 | return; | 2852 | return; |
2788 | } | 2853 | } |
2789 | 2854 | ||
2855 | if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { | ||
2856 | enable_irq_window(vcpu); | ||
2857 | return; | ||
2858 | } | ||
2790 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2859 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2791 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; | 2860 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; |
2792 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 2861 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2814 | } else | 2883 | } else |
2815 | intr |= INTR_TYPE_EXT_INTR; | 2884 | intr |= INTR_TYPE_EXT_INTR; |
2816 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | 2885 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); |
2886 | vmx_clear_hlt(vcpu); | ||
2817 | } | 2887 | } |
2818 | 2888 | ||
2819 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | 2889 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) |
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2841 | } | 2911 | } |
2842 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2912 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2843 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2913 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2914 | vmx_clear_hlt(vcpu); | ||
2844 | } | 2915 | } |
2845 | 2916 | ||
2846 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | 2917 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | |||
2849 | return 0; | 2920 | return 0; |
2850 | 2921 | ||
2851 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 2922 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
2852 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); | 2923 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
2924 | | GUEST_INTR_STATE_NMI)); | ||
2853 | } | 2925 | } |
2854 | 2926 | ||
2855 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | 2927 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) |
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2910 | * Cause the #SS fault with 0 error code in VM86 mode. | 2982 | * Cause the #SS fault with 0 error code in VM86 mode. |
2911 | */ | 2983 | */ |
2912 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 2984 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2913 | if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) | 2985 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) |
2914 | return 1; | 2986 | return 1; |
2915 | /* | 2987 | /* |
2916 | * Forward all other exceptions that are valid in real mode. | 2988 | * Forward all other exceptions that are valid in real mode. |
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3007 | } | 3079 | } |
3008 | 3080 | ||
3009 | if (is_invalid_opcode(intr_info)) { | 3081 | if (is_invalid_opcode(intr_info)) { |
3010 | er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); | 3082 | er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); |
3011 | if (er != EMULATE_DONE) | 3083 | if (er != EMULATE_DONE) |
3012 | kvm_queue_exception(vcpu, UD_VECTOR); | 3084 | kvm_queue_exception(vcpu, UD_VECTOR); |
3013 | return 1; | 3085 | return 1; |
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3026 | 3098 | ||
3027 | if (kvm_event_needs_reinjection(vcpu)) | 3099 | if (kvm_event_needs_reinjection(vcpu)) |
3028 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 3100 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
3029 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 3101 | return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); |
3030 | } | 3102 | } |
3031 | 3103 | ||
3032 | if (vmx->rmode.vm86_active && | 3104 | if (vmx->rmode.vm86_active && |
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu) | |||
3098 | ++vcpu->stat.io_exits; | 3170 | ++vcpu->stat.io_exits; |
3099 | 3171 | ||
3100 | if (string || in) | 3172 | if (string || in) |
3101 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3173 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3102 | 3174 | ||
3103 | port = exit_qualification >> 16; | 3175 | port = exit_qualification >> 16; |
3104 | size = (exit_qualification & 7) + 1; | 3176 | size = (exit_qualification & 7) + 1; |
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3118 | hypercall[2] = 0xc1; | 3190 | hypercall[2] = 0xc1; |
3119 | } | 3191 | } |
3120 | 3192 | ||
3121 | static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) | ||
3122 | { | ||
3123 | if (err) | ||
3124 | kvm_inject_gp(vcpu, 0); | ||
3125 | else | ||
3126 | skip_emulated_instruction(vcpu); | ||
3127 | } | ||
3128 | |||
3129 | static int handle_cr(struct kvm_vcpu *vcpu) | 3193 | static int handle_cr(struct kvm_vcpu *vcpu) |
3130 | { | 3194 | { |
3131 | unsigned long exit_qualification, val; | 3195 | unsigned long exit_qualification, val; |
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3143 | switch (cr) { | 3207 | switch (cr) { |
3144 | case 0: | 3208 | case 0: |
3145 | err = kvm_set_cr0(vcpu, val); | 3209 | err = kvm_set_cr0(vcpu, val); |
3146 | complete_insn_gp(vcpu, err); | 3210 | kvm_complete_insn_gp(vcpu, err); |
3147 | return 1; | 3211 | return 1; |
3148 | case 3: | 3212 | case 3: |
3149 | err = kvm_set_cr3(vcpu, val); | 3213 | err = kvm_set_cr3(vcpu, val); |
3150 | complete_insn_gp(vcpu, err); | 3214 | kvm_complete_insn_gp(vcpu, err); |
3151 | return 1; | 3215 | return 1; |
3152 | case 4: | 3216 | case 4: |
3153 | err = kvm_set_cr4(vcpu, val); | 3217 | err = kvm_set_cr4(vcpu, val); |
3154 | complete_insn_gp(vcpu, err); | 3218 | kvm_complete_insn_gp(vcpu, err); |
3155 | return 1; | 3219 | return 1; |
3156 | case 8: { | 3220 | case 8: { |
3157 | u8 cr8_prev = kvm_get_cr8(vcpu); | 3221 | u8 cr8_prev = kvm_get_cr8(vcpu); |
3158 | u8 cr8 = kvm_register_read(vcpu, reg); | 3222 | u8 cr8 = kvm_register_read(vcpu, reg); |
3159 | kvm_set_cr8(vcpu, cr8); | 3223 | err = kvm_set_cr8(vcpu, cr8); |
3160 | skip_emulated_instruction(vcpu); | 3224 | kvm_complete_insn_gp(vcpu, err); |
3161 | if (irqchip_in_kernel(vcpu->kvm)) | 3225 | if (irqchip_in_kernel(vcpu->kvm)) |
3162 | return 1; | 3226 | return 1; |
3163 | if (cr8_prev <= cr8) | 3227 | if (cr8_prev <= cr8) |
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3176 | case 1: /*mov from cr*/ | 3240 | case 1: /*mov from cr*/ |
3177 | switch (cr) { | 3241 | switch (cr) { |
3178 | case 3: | 3242 | case 3: |
3179 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); | 3243 | val = kvm_read_cr3(vcpu); |
3180 | trace_kvm_cr_read(cr, vcpu->arch.cr3); | 3244 | kvm_register_write(vcpu, reg, val); |
3245 | trace_kvm_cr_read(cr, val); | ||
3181 | skip_emulated_instruction(vcpu); | 3246 | skip_emulated_instruction(vcpu); |
3182 | return 1; | 3247 | return 1; |
3183 | case 8: | 3248 | case 8: |
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu) | |||
3349 | return 1; | 3414 | return 1; |
3350 | } | 3415 | } |
3351 | 3416 | ||
3417 | static int handle_invd(struct kvm_vcpu *vcpu) | ||
3418 | { | ||
3419 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
3420 | } | ||
3421 | |||
3352 | static int handle_invlpg(struct kvm_vcpu *vcpu) | 3422 | static int handle_invlpg(struct kvm_vcpu *vcpu) |
3353 | { | 3423 | { |
3354 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3424 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) | |||
3377 | 3447 | ||
3378 | static int handle_apic_access(struct kvm_vcpu *vcpu) | 3448 | static int handle_apic_access(struct kvm_vcpu *vcpu) |
3379 | { | 3449 | { |
3380 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3450 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3381 | } | 3451 | } |
3382 | 3452 | ||
3383 | static int handle_task_switch(struct kvm_vcpu *vcpu) | 3453 | static int handle_task_switch(struct kvm_vcpu *vcpu) |
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
3476 | 3546 | ||
3477 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 3547 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3478 | trace_kvm_page_fault(gpa, exit_qualification); | 3548 | trace_kvm_page_fault(gpa, exit_qualification); |
3479 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); | 3549 | return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); |
3480 | } | 3550 | } |
3481 | 3551 | ||
3482 | static u64 ept_rsvd_mask(u64 spte, int level) | 3552 | static u64 ept_rsvd_mask(u64 spte, int level) |
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
3592 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | 3662 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) |
3593 | return handle_interrupt_window(&vmx->vcpu); | 3663 | return handle_interrupt_window(&vmx->vcpu); |
3594 | 3664 | ||
3595 | err = emulate_instruction(vcpu, 0, 0, 0); | 3665 | err = emulate_instruction(vcpu, 0); |
3596 | 3666 | ||
3597 | if (err == EMULATE_DO_MMIO) { | 3667 | if (err == EMULATE_DO_MMIO) { |
3598 | ret = 0; | 3668 | ret = 0; |
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3649 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 3719 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
3650 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 3720 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
3651 | [EXIT_REASON_HLT] = handle_halt, | 3721 | [EXIT_REASON_HLT] = handle_halt, |
3722 | [EXIT_REASON_INVD] = handle_invd, | ||
3652 | [EXIT_REASON_INVLPG] = handle_invlpg, | 3723 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3653 | [EXIT_REASON_VMCALL] = handle_vmcall, | 3724 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3654 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 3725 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, |
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3676 | static const int kvm_vmx_max_exit_handlers = | 3747 | static const int kvm_vmx_max_exit_handlers = |
3677 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 3748 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3678 | 3749 | ||
3750 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3751 | { | ||
3752 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | ||
3753 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3754 | } | ||
3755 | |||
3679 | /* | 3756 | /* |
3680 | * The guest has exited. See if we can fix it or if we need userspace | 3757 | * The guest has exited. See if we can fix it or if we need userspace |
3681 | * assistance. | 3758 | * assistance. |
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3686 | u32 exit_reason = vmx->exit_reason; | 3763 | u32 exit_reason = vmx->exit_reason; |
3687 | u32 vectoring_info = vmx->idt_vectoring_info; | 3764 | u32 vectoring_info = vmx->idt_vectoring_info; |
3688 | 3765 | ||
3689 | trace_kvm_exit(exit_reason, vcpu); | 3766 | trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); |
3690 | 3767 | ||
3691 | /* If guest state is invalid, start emulating */ | 3768 | /* If guest state is invalid, start emulating */ |
3692 | if (vmx->emulation_required && emulate_invalid_guest_state) | 3769 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3693 | return handle_invalid_guest_state(vcpu); | 3770 | return handle_invalid_guest_state(vcpu); |
3694 | 3771 | ||
3695 | /* Access CR3 don't cause VMExit in paging mode, so we need | ||
3696 | * to sync with guest real CR3. */ | ||
3697 | if (enable_ept && is_paging(vcpu)) | ||
3698 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
3699 | |||
3700 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 3772 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3701 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3773 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3702 | vcpu->run->fail_entry.hardware_entry_failure_reason | 3774 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4013 | ); | 4085 | ); |
4014 | 4086 | ||
4015 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | 4087 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
4016 | | (1 << VCPU_EXREG_PDPTR)); | 4088 | | (1 << VCPU_EXREG_PDPTR) |
4089 | | (1 << VCPU_EXREG_CR3)); | ||
4017 | vcpu->arch.regs_dirty = 0; | 4090 | vcpu->arch.regs_dirty = 0; |
4018 | 4091 | ||
4019 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 4092 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
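
Dropping the unconditional CR3 sync from vmx_handle_exit() works because VCPU_EXREG_CR3 is now cleared from regs_avail after every VM exit: guest CR3 is fetched from the VMCS only when kvm_read_cr3()/decache_cr3() actually need it. A small standalone model of that "availability bitmap, fetch on demand" caching idea, with the hardware read stubbed out:

#include <stdio.h>
#include <stdint.h>

enum { REG_RIP, REG_RSP, REG_CR3, NR_REGS };

struct vcpu {
        uint64_t regs[NR_REGS];
        uint32_t regs_avail;            /* bit set => regs[] copy is current */
};

/* stand-in for vmcs_readl(GUEST_CR3) */
static uint64_t hw_read_cr3(void)
{
        puts("  (expensive hardware read)");
        return 0x1234000;
}

static uint64_t read_cr3(struct vcpu *v)
{
        if (!(v->regs_avail & (1u << REG_CR3))) {
                v->regs[REG_CR3] = hw_read_cr3();
                v->regs_avail |= 1u << REG_CR3;
        }
        return v->regs[REG_CR3];
}

static void vcpu_exit(struct vcpu *v)
{
        /* invalidate the cached copy; it is refilled only if someone asks */
        v->regs_avail &= ~(1u << REG_CR3);
}

int main(void)
{
        struct vcpu v = { .regs_avail = 0 };
        vcpu_exit(&v);
        printf("cr3=%#lx\n", (unsigned long)read_cr3(&v));      /* hardware read */
        printf("cr3=%#lx\n", (unsigned long)read_cr3(&v));      /* cached */
        return 0;
}
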
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4280 | .get_cpl = vmx_get_cpl, | 4353 | .get_cpl = vmx_get_cpl, |
4281 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 4354 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, |
4282 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, | 4355 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, |
4356 | .decache_cr3 = vmx_decache_cr3, | ||
4283 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | 4357 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, |
4284 | .set_cr0 = vmx_set_cr0, | 4358 | .set_cr0 = vmx_set_cr0, |
4285 | .set_cr3 = vmx_set_cr3, | 4359 | .set_cr3 = vmx_set_cr3, |
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4320 | .get_tdp_level = get_ept_level, | 4394 | .get_tdp_level = get_ept_level, |
4321 | .get_mt_mask = vmx_get_mt_mask, | 4395 | .get_mt_mask = vmx_get_mt_mask, |
4322 | 4396 | ||
4397 | .get_exit_info = vmx_get_exit_info, | ||
4323 | .exit_reasons_str = vmx_exit_reasons_str, | 4398 | .exit_reasons_str = vmx_exit_reasons_str, |
4399 | |||
4324 | .get_lpage_level = vmx_get_lpage_level, | 4400 | .get_lpage_level = vmx_get_lpage_level, |
4325 | 4401 | ||
4326 | .cpuid_update = vmx_cpuid_update, | 4402 | .cpuid_update = vmx_cpuid_update, |
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void) | |||
4396 | 4472 | ||
4397 | if (enable_ept) { | 4473 | if (enable_ept) { |
4398 | bypass_guest_pf = 0; | 4474 | bypass_guest_pf = 0; |
4399 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | ||
4400 | VMX_EPT_WRITABLE_MASK); | ||
4401 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 4475 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4402 | VMX_EPT_EXECUTABLE_MASK); | 4476 | VMX_EPT_EXECUTABLE_MASK); |
4403 | kvm_enable_tdp(); | 4477 | kvm_enable_tdp(); |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46a368cb651e..bcc0efce85bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/perf_event.h> | 44 | #include <linux/perf_event.h> |
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | #include <linux/hash.h> | ||
46 | #include <trace/events/kvm.h> | 47 | #include <trace/events/kvm.h> |
47 | 48 | ||
48 | #define CREATE_TRACE_POINTS | 49 | #define CREATE_TRACE_POINTS |
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
155 | 156 | ||
156 | u64 __read_mostly host_xcr0; | 157 | u64 __read_mostly host_xcr0; |
157 | 158 | ||
159 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) | ||
160 | { | ||
161 | int i; | ||
162 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) | ||
163 | vcpu->arch.apf.gfns[i] = ~0; | ||
164 | } | ||
165 | |||
158 | static void kvm_on_user_return(struct user_return_notifier *urn) | 166 | static void kvm_on_user_return(struct user_return_notifier *urn) |
159 | { | 167 | { |
160 | unsigned slot; | 168 | unsigned slot; |
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) | |||
326 | } | 334 | } |
327 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); | 335 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
328 | 336 | ||
329 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu) | 337 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) |
330 | { | 338 | { |
331 | unsigned error_code = vcpu->arch.fault.error_code; | 339 | if (err) |
340 | kvm_inject_gp(vcpu, 0); | ||
341 | else | ||
342 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
343 | } | ||
344 | EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); | ||
332 | 345 | ||
346 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | ||
347 | { | ||
333 | ++vcpu->stat.pf_guest; | 348 | ++vcpu->stat.pf_guest; |
334 | vcpu->arch.cr2 = vcpu->arch.fault.address; | 349 | vcpu->arch.cr2 = fault->address; |
335 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 350 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
336 | } | 351 | } |
337 | 352 | ||
338 | void kvm_propagate_fault(struct kvm_vcpu *vcpu) | 353 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
339 | { | 354 | { |
340 | if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) | 355 | if (mmu_is_nested(vcpu) && !fault->nested_page_fault) |
341 | vcpu->arch.nested_mmu.inject_page_fault(vcpu); | 356 | vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); |
342 | else | 357 | else |
343 | vcpu->arch.mmu.inject_page_fault(vcpu); | 358 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
344 | |||
345 | vcpu->arch.fault.nested = false; | ||
346 | } | 359 | } |
347 | 360 | ||
348 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) | 361 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
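
Two related changes are visible here: page-fault details now travel in a struct x86_exception (vector, error code and its validity, nested flag, faulting address) instead of a bare error code, and kvm_complete_insn_gp() gives intercept handlers a single call that either injects #GP or skips the emulated instruction. A rough standalone model of that pattern; the struct fields mirror the ones used later in this diff, everything else is illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct x86_exception {
        uint8_t  vector;
        bool     error_code_valid;
        uint16_t error_code;
        bool     nested_page_fault;
        uint64_t address;               /* cr2, or gpa for a nested fault */
};

struct vcpu { uint64_t rip; };

static void inject_gp(struct vcpu *v)          { puts("#GP(0) injected"); }
static void skip_emulated_insn(struct vcpu *v) { v->rip += 3; puts("RIP advanced"); }

/* one call to finish an intercepted instruction: fault or move on */
static void complete_insn_gp(struct vcpu *v, int err)
{
        if (err)
                inject_gp(v);
        else
                skip_emulated_insn(v);
}

/* the whole fault description travels as one object, not a bare error code */
static void inject_page_fault(struct vcpu *v, const struct x86_exception *fault)
{
        printf("#PF at %#llx, error code %#x\n",
               (unsigned long long)fault->address, fault->error_code);
}

int main(void)
{
        struct vcpu v = { .rip = 0x1000 };
        struct x86_exception pf = {
                .vector = 14, .error_code_valid = true,
                .error_code = 0x2, .address = 0xdeadb000,
        };

        complete_insn_gp(&v, 0);        /* e.g. MOV to CR accepted */
        complete_insn_gp(&v, 1);        /* e.g. a reserved bit was set */
        inject_page_fault(&v, &pf);
        return 0;
}
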
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
460 | (unsigned long *)&vcpu->arch.regs_avail)) | 473 | (unsigned long *)&vcpu->arch.regs_avail)) |
461 | return true; | 474 | return true; |
462 | 475 | ||
463 | gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; | 476 | gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; |
464 | offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); | 477 | offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); |
465 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), | 478 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), |
466 | PFERR_USER_MASK | PFERR_WRITE_MASK); | 479 | PFERR_USER_MASK | PFERR_WRITE_MASK); |
467 | if (r < 0) | 480 | if (r < 0) |
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
506 | } else | 519 | } else |
507 | #endif | 520 | #endif |
508 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, | 521 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
509 | vcpu->arch.cr3)) | 522 | kvm_read_cr3(vcpu))) |
510 | return 1; | 523 | return 1; |
511 | } | 524 | } |
512 | 525 | ||
513 | kvm_x86_ops->set_cr0(vcpu, cr0); | 526 | kvm_x86_ops->set_cr0(vcpu, cr0); |
514 | 527 | ||
528 | if ((cr0 ^ old_cr0) & X86_CR0_PG) | ||
529 | kvm_clear_async_pf_completion_queue(vcpu); | ||
530 | |||
515 | if ((cr0 ^ old_cr0) & update_bits) | 531 | if ((cr0 ^ old_cr0) & update_bits) |
516 | kvm_mmu_reset_context(vcpu); | 532 | kvm_mmu_reset_context(vcpu); |
517 | return 0; | 533 | return 0; |
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
595 | return 1; | 611 | return 1; |
596 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) | 612 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
597 | && ((cr4 ^ old_cr4) & pdptr_bits) | 613 | && ((cr4 ^ old_cr4) & pdptr_bits) |
598 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) | 614 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
615 | kvm_read_cr3(vcpu))) | ||
599 | return 1; | 616 | return 1; |
600 | 617 | ||
601 | if (cr4 & X86_CR4_VMXE) | 618 | if (cr4 & X86_CR4_VMXE) |
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
615 | 632 | ||
616 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 633 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
617 | { | 634 | { |
618 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 635 | if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { |
619 | kvm_mmu_sync_roots(vcpu); | 636 | kvm_mmu_sync_roots(vcpu); |
620 | kvm_mmu_flush_tlb(vcpu); | 637 | kvm_mmu_flush_tlb(vcpu); |
621 | return 0; | 638 | return 0; |
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
650 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 667 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
651 | return 1; | 668 | return 1; |
652 | vcpu->arch.cr3 = cr3; | 669 | vcpu->arch.cr3 = cr3; |
670 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
653 | vcpu->arch.mmu.new_cr3(vcpu); | 671 | vcpu->arch.mmu.new_cr3(vcpu); |
654 | return 0; | 672 | return 0; |
655 | } | 673 | } |
656 | EXPORT_SYMBOL_GPL(kvm_set_cr3); | 674 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
657 | 675 | ||
658 | int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | 676 | int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
659 | { | 677 | { |
660 | if (cr8 & CR8_RESERVED_BITS) | 678 | if (cr8 & CR8_RESERVED_BITS) |
661 | return 1; | 679 | return 1; |
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
665 | vcpu->arch.cr8 = cr8; | 683 | vcpu->arch.cr8 = cr8; |
666 | return 0; | 684 | return 0; |
667 | } | 685 | } |
668 | |||
669 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
670 | { | ||
671 | if (__kvm_set_cr8(vcpu, cr8)) | ||
672 | kvm_inject_gp(vcpu, 0); | ||
673 | } | ||
674 | EXPORT_SYMBOL_GPL(kvm_set_cr8); | 686 | EXPORT_SYMBOL_GPL(kvm_set_cr8); |
675 | 687 | ||
676 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | 688 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); | |||
775 | * kvm-specific. Those are put in the beginning of the list. | 787 | * kvm-specific. Those are put in the beginning of the list. |
776 | */ | 788 | */ |
777 | 789 | ||
778 | #define KVM_SAVE_MSRS_BEGIN 7 | 790 | #define KVM_SAVE_MSRS_BEGIN 8 |
779 | static u32 msrs_to_save[] = { | 791 | static u32 msrs_to_save[] = { |
780 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 792 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
781 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 793 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
782 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 794 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
783 | HV_X64_MSR_APIC_ASSIST_PAGE, | 795 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, |
784 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 796 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
785 | MSR_STAR, | 797 | MSR_STAR, |
786 | #ifdef CONFIG_X86_64 | 798 | #ifdef CONFIG_X86_64 |
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
830 | kvm_x86_ops->set_efer(vcpu, efer); | 842 | kvm_x86_ops->set_efer(vcpu, efer); |
831 | 843 | ||
832 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | 844 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; |
833 | kvm_mmu_reset_context(vcpu); | ||
834 | 845 | ||
835 | /* Update reserved bits */ | 846 | /* Update reserved bits */ |
836 | if ((efer ^ old_efer) & EFER_NX) | 847 | if ((efer ^ old_efer) & EFER_NX) |
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1418 | return 0; | 1429 | return 0; |
1419 | } | 1430 | } |
1420 | 1431 | ||
1432 | static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) | ||
1433 | { | ||
1434 | gpa_t gpa = data & ~0x3f; | ||
1435 | |||
1436 | /* Bits 2:5 are reserved, should be zero */ | ||
1437 | if (data & 0x3c) | ||
1438 | return 1; | ||
1439 | |||
1440 | vcpu->arch.apf.msr_val = data; | ||
1441 | |||
1442 | if (!(data & KVM_ASYNC_PF_ENABLED)) { | ||
1443 | kvm_clear_async_pf_completion_queue(vcpu); | ||
1444 | kvm_async_pf_hash_reset(vcpu); | ||
1445 | return 0; | ||
1446 | } | ||
1447 | |||
1448 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) | ||
1449 | return 1; | ||
1450 | |||
1451 | vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); | ||
1452 | kvm_async_pf_wakeup_all(vcpu); | ||
1453 | return 0; | ||
1454 | } | ||
1455 | |||
1421 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1456 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
1422 | { | 1457 | { |
1423 | switch (msr) { | 1458 | switch (msr) { |
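
kvm_pv_enable_async_pf() shows the whole MSR layout in a few lines: bit 0 enables the mechanism, bit 1 asks for notifications even while the guest runs in kernel mode, bits 2:5 must be zero, and the remaining bits hold the 64-byte-aligned guest address of the shared reason word (hence data & ~0x3f). A small sketch of the same validate-and-decode step; the flag macros below are assumptions standing in for KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS, which are defined outside this diff:

#include <stdint.h>
#include <stdio.h>

#define APF_ENABLED      (1ull << 0)    /* assumed bit positions */
#define APF_SEND_ALWAYS  (1ull << 1)

struct apf_cfg { int enabled; int send_user_only; uint64_t gpa; };

/* returns 0 on success, 1 to signal #GP, mirroring the wrmsr handler */
static int parse_async_pf_msr(uint64_t data, struct apf_cfg *cfg)
{
        if (data & 0x3c)                        /* bits 2:5 are reserved */
                return 1;
        cfg->enabled = !!(data & APF_ENABLED);
        cfg->send_user_only = !(data & APF_SEND_ALWAYS);
        cfg->gpa = data & ~0x3full;             /* 64-byte aligned slot */
        return 0;
}

int main(void)
{
        struct apf_cfg c;

        printf("valid: %d\n", parse_async_pf_msr(0x100000 | 3, &c) == 0);
        printf("valid: %d\n", parse_async_pf_msr(0x100000 | 4, &c) == 0);
        return 0;
}
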
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1499 | } | 1534 | } |
1500 | break; | 1535 | break; |
1501 | } | 1536 | } |
1537 | case MSR_KVM_ASYNC_PF_EN: | ||
1538 | if (kvm_pv_enable_async_pf(vcpu, data)) | ||
1539 | return 1; | ||
1540 | break; | ||
1502 | case MSR_IA32_MCG_CTL: | 1541 | case MSR_IA32_MCG_CTL: |
1503 | case MSR_IA32_MCG_STATUS: | 1542 | case MSR_IA32_MCG_STATUS: |
1504 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1543 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1775 | case MSR_KVM_SYSTEM_TIME_NEW: | 1814 | case MSR_KVM_SYSTEM_TIME_NEW: |
1776 | data = vcpu->arch.time; | 1815 | data = vcpu->arch.time; |
1777 | break; | 1816 | break; |
1817 | case MSR_KVM_ASYNC_PF_EN: | ||
1818 | data = vcpu->arch.apf.msr_val; | ||
1819 | break; | ||
1778 | case MSR_IA32_P5_MC_ADDR: | 1820 | case MSR_IA32_P5_MC_ADDR: |
1779 | case MSR_IA32_P5_MC_TYPE: | 1821 | case MSR_IA32_P5_MC_TYPE: |
1780 | case MSR_IA32_MCG_CAP: | 1822 | case MSR_IA32_MCG_CAP: |
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1904 | case KVM_CAP_NOP_IO_DELAY: | 1946 | case KVM_CAP_NOP_IO_DELAY: |
1905 | case KVM_CAP_MP_STATE: | 1947 | case KVM_CAP_MP_STATE: |
1906 | case KVM_CAP_SYNC_MMU: | 1948 | case KVM_CAP_SYNC_MMU: |
1949 | case KVM_CAP_USER_NMI: | ||
1907 | case KVM_CAP_REINJECT_CONTROL: | 1950 | case KVM_CAP_REINJECT_CONTROL: |
1908 | case KVM_CAP_IRQ_INJECT_STATUS: | 1951 | case KVM_CAP_IRQ_INJECT_STATUS: |
1909 | case KVM_CAP_ASSIGN_DEV_IRQ: | 1952 | case KVM_CAP_ASSIGN_DEV_IRQ: |
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1922 | case KVM_CAP_DEBUGREGS: | 1965 | case KVM_CAP_DEBUGREGS: |
1923 | case KVM_CAP_X86_ROBUST_SINGLESTEP: | 1966 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
1924 | case KVM_CAP_XSAVE: | 1967 | case KVM_CAP_XSAVE: |
1968 | case KVM_CAP_ASYNC_PF: | ||
1925 | r = 1; | 1969 | r = 1; |
1926 | break; | 1970 | break; |
1927 | case KVM_CAP_COALESCED_MMIO: | 1971 | case KVM_CAP_COALESCED_MMIO: |
@@ -2185,6 +2229,11 @@ out: | |||
2185 | return r; | 2229 | return r; |
2186 | } | 2230 | } |
2187 | 2231 | ||
2232 | static void cpuid_mask(u32 *word, int wordnum) | ||
2233 | { | ||
2234 | *word &= boot_cpu_data.x86_capability[wordnum]; | ||
2235 | } | ||
2236 | |||
2188 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 2237 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
2189 | u32 index) | 2238 | u32 index) |
2190 | { | 2239 | { |
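
cpuid_mask() is a one-liner but the idea matters: whatever feature word KVM is prepared to emulate gets ANDed with the corresponding capability word the host CPU actually reported, so the guest is never shown a bit the host lacks. Illustrative sketch only; the word indices and values below are made up, and boot_cpu_data is replaced with a plain array:

#include <stdint.h>
#include <stdio.h>

#define NCAPWORDS 10

/* stand-in for boot_cpu_data.x86_capability[] */
static const uint32_t host_capability[NCAPWORDS] = {
        [0] = 0x0f0f0f0f,
        [4] = 0x00ff00ff,
};

static void cpuid_mask(uint32_t *word, int wordnum)
{
        *word &= host_capability[wordnum];
}

int main(void)
{
        /* features KVM could expose in principle */
        uint32_t edx = 0xffffffff, ecx = 0xffffffff;

        cpuid_mask(&edx, 0);            /* clamp to what the host has */
        cpuid_mask(&ecx, 4);
        printf("guest edx=%#x ecx=%#x\n", edx, ecx);
        return 0;
}
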
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2259 | break; | 2308 | break; |
2260 | case 1: | 2309 | case 1: |
2261 | entry->edx &= kvm_supported_word0_x86_features; | 2310 | entry->edx &= kvm_supported_word0_x86_features; |
2311 | cpuid_mask(&entry->edx, 0); | ||
2262 | entry->ecx &= kvm_supported_word4_x86_features; | 2312 | entry->ecx &= kvm_supported_word4_x86_features; |
2313 | cpuid_mask(&entry->ecx, 4); | ||
2263 | /* we support x2apic emulation even if host does not support | 2314 | /* we support x2apic emulation even if host does not support |
2264 | * it since we emulate x2apic in software */ | 2315 | * it since we emulate x2apic in software */ |
2265 | entry->ecx |= F(X2APIC); | 2316 | entry->ecx |= F(X2APIC); |
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2350 | break; | 2401 | break; |
2351 | case 0x80000001: | 2402 | case 0x80000001: |
2352 | entry->edx &= kvm_supported_word1_x86_features; | 2403 | entry->edx &= kvm_supported_word1_x86_features; |
2404 | cpuid_mask(&entry->edx, 1); | ||
2353 | entry->ecx &= kvm_supported_word6_x86_features; | 2405 | entry->ecx &= kvm_supported_word6_x86_features; |
2406 | cpuid_mask(&entry->ecx, 6); | ||
2354 | break; | 2407 | break; |
2355 | } | 2408 | } |
2356 | 2409 | ||
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3169 | struct kvm_memslots *slots, *old_slots; | 3222 | struct kvm_memslots *slots, *old_slots; |
3170 | unsigned long *dirty_bitmap; | 3223 | unsigned long *dirty_bitmap; |
3171 | 3224 | ||
3172 | r = -ENOMEM; | 3225 | dirty_bitmap = memslot->dirty_bitmap_head; |
3173 | dirty_bitmap = vmalloc(n); | 3226 | if (memslot->dirty_bitmap == dirty_bitmap) |
3174 | if (!dirty_bitmap) | 3227 | dirty_bitmap += n / sizeof(long); |
3175 | goto out; | ||
3176 | memset(dirty_bitmap, 0, n); | 3228 | memset(dirty_bitmap, 0, n); |
3177 | 3229 | ||
3178 | r = -ENOMEM; | 3230 | r = -ENOMEM; |
3179 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 3231 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
3180 | if (!slots) { | 3232 | if (!slots) |
3181 | vfree(dirty_bitmap); | ||
3182 | goto out; | 3233 | goto out; |
3183 | } | ||
3184 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 3234 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
3185 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; | 3235 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; |
3236 | slots->generation++; | ||
3186 | 3237 | ||
3187 | old_slots = kvm->memslots; | 3238 | old_slots = kvm->memslots; |
3188 | rcu_assign_pointer(kvm->memslots, slots); | 3239 | rcu_assign_pointer(kvm->memslots, slots); |
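
The dirty-log path no longer vmalloc()s a scratch bitmap on every call; dirty_bitmap_head appears to be a single allocation holding two copies, and the ioctl flips between its halves by offsetting n / sizeof(long), installing the freshly zeroed half while the frozen half is copied out to userspace. A toy version of that flip, with sizes invented:

#include <stdio.h>
#include <string.h>

#define BITMAP_LONGS 4                  /* n / sizeof(long) for one half */

static unsigned long dirty_head[2 * BITMAP_LONGS];      /* both halves */
static unsigned long *active = dirty_head;              /* half being written */

/* zero and install the other half; return the frozen half for copy-out */
static unsigned long *get_dirty_log(void)
{
        unsigned long *frozen = active;
        unsigned long *next = (active == dirty_head)
                              ? dirty_head + BITMAP_LONGS : dirty_head;

        memset(next, 0, BITMAP_LONGS * sizeof(long));
        active = next;
        return frozen;
}

int main(void)
{
        active[0] |= 1;                         /* guest dirtied a page */
        unsigned long *log = get_dirty_log();
        printf("frozen word0=%#lx, new active word0=%#lx\n", log[0], active[0]);
        return 0;
}
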
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3195 | spin_unlock(&kvm->mmu_lock); | 3246 | spin_unlock(&kvm->mmu_lock); |
3196 | 3247 | ||
3197 | r = -EFAULT; | 3248 | r = -EFAULT; |
3198 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { | 3249 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) |
3199 | vfree(dirty_bitmap); | ||
3200 | goto out; | 3250 | goto out; |
3201 | } | ||
3202 | vfree(dirty_bitmap); | ||
3203 | } else { | 3251 | } else { |
3204 | r = -EFAULT; | 3252 | r = -EFAULT; |
3205 | if (clear_user(log->dirty_bitmap, n)) | 3253 | if (clear_user(log->dirty_bitmap, n)) |
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3266 | if (vpic) { | 3314 | if (vpic) { |
3267 | r = kvm_ioapic_init(kvm); | 3315 | r = kvm_ioapic_init(kvm); |
3268 | if (r) { | 3316 | if (r) { |
3317 | mutex_lock(&kvm->slots_lock); | ||
3269 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, | 3318 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, |
3270 | &vpic->dev); | 3319 | &vpic->dev); |
3320 | mutex_unlock(&kvm->slots_lock); | ||
3271 | kfree(vpic); | 3321 | kfree(vpic); |
3272 | goto create_irqchip_unlock; | 3322 | goto create_irqchip_unlock; |
3273 | } | 3323 | } |
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3278 | smp_wmb(); | 3328 | smp_wmb(); |
3279 | r = kvm_setup_default_irq_routing(kvm); | 3329 | r = kvm_setup_default_irq_routing(kvm); |
3280 | if (r) { | 3330 | if (r) { |
3331 | mutex_lock(&kvm->slots_lock); | ||
3281 | mutex_lock(&kvm->irq_lock); | 3332 | mutex_lock(&kvm->irq_lock); |
3282 | kvm_ioapic_destroy(kvm); | 3333 | kvm_ioapic_destroy(kvm); |
3283 | kvm_destroy_pic(kvm); | 3334 | kvm_destroy_pic(kvm); |
3284 | mutex_unlock(&kvm->irq_lock); | 3335 | mutex_unlock(&kvm->irq_lock); |
3336 | mutex_unlock(&kvm->slots_lock); | ||
3285 | } | 3337 | } |
3286 | create_irqchip_unlock: | 3338 | create_irqchip_unlock: |
3287 | mutex_unlock(&kvm->lock); | 3339 | mutex_unlock(&kvm->lock); |
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | |||
3557 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | 3609 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) |
3558 | { | 3610 | { |
3559 | gpa_t t_gpa; | 3611 | gpa_t t_gpa; |
3560 | u32 error; | 3612 | struct x86_exception exception; |
3561 | 3613 | ||
3562 | BUG_ON(!mmu_is_nested(vcpu)); | 3614 | BUG_ON(!mmu_is_nested(vcpu)); |
3563 | 3615 | ||
3564 | /* NPT walks are always user-walks */ | 3616 | /* NPT walks are always user-walks */ |
3565 | access |= PFERR_USER_MASK; | 3617 | access |= PFERR_USER_MASK; |
3566 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); | 3618 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); |
3567 | if (t_gpa == UNMAPPED_GVA) | ||
3568 | vcpu->arch.fault.nested = true; | ||
3569 | 3619 | ||
3570 | return t_gpa; | 3620 | return t_gpa; |
3571 | } | 3621 | } |
3572 | 3622 | ||
3573 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3623 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
3624 | struct x86_exception *exception) | ||
3574 | { | 3625 | { |
3575 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3626 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3576 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3627 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3577 | } | 3628 | } |
3578 | 3629 | ||
3579 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3630 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
3631 | struct x86_exception *exception) | ||
3580 | { | 3632 | { |
3581 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3633 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3582 | access |= PFERR_FETCH_MASK; | 3634 | access |= PFERR_FETCH_MASK; |
3583 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3635 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3584 | } | 3636 | } |
3585 | 3637 | ||
3586 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3638 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, |
3639 | struct x86_exception *exception) | ||
3587 | { | 3640 | { |
3588 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3641 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3589 | access |= PFERR_WRITE_MASK; | 3642 | access |= PFERR_WRITE_MASK; |
3590 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3643 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3591 | } | 3644 | } |
3592 | 3645 | ||
3593 | /* uses this to access any guest's mapped memory without checking CPL */ | 3646 | /* uses this to access any guest's mapped memory without checking CPL */ |
3594 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3647 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, |
3648 | struct x86_exception *exception) | ||
3595 | { | 3649 | { |
3596 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); | 3650 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); |
3597 | } | 3651 | } |
3598 | 3652 | ||
3599 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | 3653 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
3600 | struct kvm_vcpu *vcpu, u32 access, | 3654 | struct kvm_vcpu *vcpu, u32 access, |
3601 | u32 *error) | 3655 | struct x86_exception *exception) |
3602 | { | 3656 | { |
3603 | void *data = val; | 3657 | void *data = val; |
3604 | int r = X86EMUL_CONTINUE; | 3658 | int r = X86EMUL_CONTINUE; |
3605 | 3659 | ||
3606 | while (bytes) { | 3660 | while (bytes) { |
3607 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, | 3661 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, |
3608 | error); | 3662 | exception); |
3609 | unsigned offset = addr & (PAGE_SIZE-1); | 3663 | unsigned offset = addr & (PAGE_SIZE-1); |
3610 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); | 3664 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); |
3611 | int ret; | 3665 | int ret; |
3612 | 3666 | ||
3613 | if (gpa == UNMAPPED_GVA) { | 3667 | if (gpa == UNMAPPED_GVA) |
3614 | r = X86EMUL_PROPAGATE_FAULT; | 3668 | return X86EMUL_PROPAGATE_FAULT; |
3615 | goto out; | ||
3616 | } | ||
3617 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); | 3669 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); |
3618 | if (ret < 0) { | 3670 | if (ret < 0) { |
3619 | r = X86EMUL_IO_NEEDED; | 3671 | r = X86EMUL_IO_NEEDED; |
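
kvm_read_guest_virt_helper() walks the buffer one guest page at a time, translating each page's virtual address separately and, after this change, reporting a failed translation through the struct x86_exception out-parameter and an immediate X86EMUL_PROPAGATE_FAULT return. The per-iteration arithmetic is the part worth seeing in isolation: offset = addr & (PAGE_SIZE - 1), chunk = min(bytes, PAGE_SIZE - offset). A minimal sketch with translation and copying stubbed out:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u
#define UNMAPPED  (~0ull)

/* toy translation: identity-map the first 64 KiB, fault above that */
static uint64_t gva_to_gpa(uint64_t gva)
{
        return gva < 0x10000 ? gva : UNMAPPED;
}

static int read_guest_virt(uint64_t addr, void *val, unsigned int bytes)
{
        unsigned char *data = val;

        while (bytes) {
                uint64_t gpa = gva_to_gpa(addr);
                unsigned int offset = addr & (PAGE_SIZE - 1);
                unsigned int toread = bytes < PAGE_SIZE - offset
                                      ? bytes : PAGE_SIZE - offset;

                if (gpa == UNMAPPED)
                        return -1;              /* propagate as a fault */
                memset(data, 0xab, toread);     /* stand-in for kvm_read_guest() */
                bytes -= toread;
                data  += toread;
                addr  += toread;
        }
        return 0;
}

int main(void)
{
        unsigned char buf[8192];

        /* starts 16 bytes before a page boundary: 16 + 4096 + 4080 byte chunks */
        printf("%d\n", read_guest_virt(PAGE_SIZE - 16, buf, sizeof(buf)));
        return 0;
}
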
@@ -3630,31 +3682,35 @@ out: | |||
3630 | 3682 | ||
3631 | /* used for instruction fetching */ | 3683 | /* used for instruction fetching */ |
3632 | static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3684 | static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, |
3633 | struct kvm_vcpu *vcpu, u32 *error) | 3685 | struct kvm_vcpu *vcpu, |
3686 | struct x86_exception *exception) | ||
3634 | { | 3687 | { |
3635 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3688 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3636 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, | 3689 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, |
3637 | access | PFERR_FETCH_MASK, error); | 3690 | access | PFERR_FETCH_MASK, |
3691 | exception); | ||
3638 | } | 3692 | } |
3639 | 3693 | ||
3640 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3694 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, |
3641 | struct kvm_vcpu *vcpu, u32 *error) | 3695 | struct kvm_vcpu *vcpu, |
3696 | struct x86_exception *exception) | ||
3642 | { | 3697 | { |
3643 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3698 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3644 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, | 3699 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
3645 | error); | 3700 | exception); |
3646 | } | 3701 | } |
3647 | 3702 | ||
3648 | static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, | 3703 | static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, |
3649 | struct kvm_vcpu *vcpu, u32 *error) | 3704 | struct kvm_vcpu *vcpu, |
3705 | struct x86_exception *exception) | ||
3650 | { | 3706 | { |
3651 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); | 3707 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); |
3652 | } | 3708 | } |
3653 | 3709 | ||
3654 | static int kvm_write_guest_virt_system(gva_t addr, void *val, | 3710 | static int kvm_write_guest_virt_system(gva_t addr, void *val, |
3655 | unsigned int bytes, | 3711 | unsigned int bytes, |
3656 | struct kvm_vcpu *vcpu, | 3712 | struct kvm_vcpu *vcpu, |
3657 | u32 *error) | 3713 | struct x86_exception *exception) |
3658 | { | 3714 | { |
3659 | void *data = val; | 3715 | void *data = val; |
3660 | int r = X86EMUL_CONTINUE; | 3716 | int r = X86EMUL_CONTINUE; |
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, | |||
3662 | while (bytes) { | 3718 | while (bytes) { |
3663 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, | 3719 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, |
3664 | PFERR_WRITE_MASK, | 3720 | PFERR_WRITE_MASK, |
3665 | error); | 3721 | exception); |
3666 | unsigned offset = addr & (PAGE_SIZE-1); | 3722 | unsigned offset = addr & (PAGE_SIZE-1); |
3667 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); | 3723 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); |
3668 | int ret; | 3724 | int ret; |
3669 | 3725 | ||
3670 | if (gpa == UNMAPPED_GVA) { | 3726 | if (gpa == UNMAPPED_GVA) |
3671 | r = X86EMUL_PROPAGATE_FAULT; | 3727 | return X86EMUL_PROPAGATE_FAULT; |
3672 | goto out; | ||
3673 | } | ||
3674 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); | 3728 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); |
3675 | if (ret < 0) { | 3729 | if (ret < 0) { |
3676 | r = X86EMUL_IO_NEEDED; | 3730 | r = X86EMUL_IO_NEEDED; |
@@ -3688,7 +3742,7 @@ out: | |||
3688 | static int emulator_read_emulated(unsigned long addr, | 3742 | static int emulator_read_emulated(unsigned long addr, |
3689 | void *val, | 3743 | void *val, |
3690 | unsigned int bytes, | 3744 | unsigned int bytes, |
3691 | unsigned int *error_code, | 3745 | struct x86_exception *exception, |
3692 | struct kvm_vcpu *vcpu) | 3746 | struct kvm_vcpu *vcpu) |
3693 | { | 3747 | { |
3694 | gpa_t gpa; | 3748 | gpa_t gpa; |
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr, | |||
3701 | return X86EMUL_CONTINUE; | 3755 | return X86EMUL_CONTINUE; |
3702 | } | 3756 | } |
3703 | 3757 | ||
3704 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); | 3758 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); |
3705 | 3759 | ||
3706 | if (gpa == UNMAPPED_GVA) | 3760 | if (gpa == UNMAPPED_GVA) |
3707 | return X86EMUL_PROPAGATE_FAULT; | 3761 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr, | |||
3710 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 3764 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
3711 | goto mmio; | 3765 | goto mmio; |
3712 | 3766 | ||
3713 | if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) | 3767 | if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) |
3714 | == X86EMUL_CONTINUE) | 3768 | == X86EMUL_CONTINUE) |
3715 | return X86EMUL_CONTINUE; | 3769 | return X86EMUL_CONTINUE; |
3716 | 3770 | ||
3717 | mmio: | 3771 | mmio: |
@@ -3735,7 +3789,7 @@ mmio: | |||
3735 | } | 3789 | } |
3736 | 3790 | ||
3737 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 3791 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
3738 | const void *val, int bytes) | 3792 | const void *val, int bytes) |
3739 | { | 3793 | { |
3740 | int ret; | 3794 | int ret; |
3741 | 3795 | ||
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3749 | static int emulator_write_emulated_onepage(unsigned long addr, | 3803 | static int emulator_write_emulated_onepage(unsigned long addr, |
3750 | const void *val, | 3804 | const void *val, |
3751 | unsigned int bytes, | 3805 | unsigned int bytes, |
3752 | unsigned int *error_code, | 3806 | struct x86_exception *exception, |
3753 | struct kvm_vcpu *vcpu) | 3807 | struct kvm_vcpu *vcpu) |
3754 | { | 3808 | { |
3755 | gpa_t gpa; | 3809 | gpa_t gpa; |
3756 | 3810 | ||
3757 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); | 3811 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); |
3758 | 3812 | ||
3759 | if (gpa == UNMAPPED_GVA) | 3813 | if (gpa == UNMAPPED_GVA) |
3760 | return X86EMUL_PROPAGATE_FAULT; | 3814 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3787,7 +3841,7 @@ mmio: | |||
3787 | int emulator_write_emulated(unsigned long addr, | 3841 | int emulator_write_emulated(unsigned long addr, |
3788 | const void *val, | 3842 | const void *val, |
3789 | unsigned int bytes, | 3843 | unsigned int bytes, |
3790 | unsigned int *error_code, | 3844 | struct x86_exception *exception, |
3791 | struct kvm_vcpu *vcpu) | 3845 | struct kvm_vcpu *vcpu) |
3792 | { | 3846 | { |
3793 | /* Crossing a page boundary? */ | 3847 | /* Crossing a page boundary? */ |
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr, | |||
3795 | int rc, now; | 3849 | int rc, now; |
3796 | 3850 | ||
3797 | now = -addr & ~PAGE_MASK; | 3851 | now = -addr & ~PAGE_MASK; |
3798 | rc = emulator_write_emulated_onepage(addr, val, now, error_code, | 3852 | rc = emulator_write_emulated_onepage(addr, val, now, exception, |
3799 | vcpu); | 3853 | vcpu); |
3800 | if (rc != X86EMUL_CONTINUE) | 3854 | if (rc != X86EMUL_CONTINUE) |
3801 | return rc; | 3855 | return rc; |
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr, | |||
3803 | val += now; | 3857 | val += now; |
3804 | bytes -= now; | 3858 | bytes -= now; |
3805 | } | 3859 | } |
3806 | return emulator_write_emulated_onepage(addr, val, bytes, error_code, | 3860 | return emulator_write_emulated_onepage(addr, val, bytes, exception, |
3807 | vcpu); | 3861 | vcpu); |
3808 | } | 3862 | } |
3809 | 3863 | ||
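
The page-boundary split in emulator_write_emulated() relies on a compact identity: with PAGE_MASK = ~(PAGE_SIZE - 1), the expression -addr & ~PAGE_MASK is the number of bytes from addr up to the next page boundary (zero when addr is already aligned), so the first write covers exactly that many bytes and the remainder lands on the following page. A quick check of the arithmetic:

#include <stdio.h>

#define PAGE_SIZE 4096ul
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addrs[] = { 0x1ff8, 0x2001, 0x2fff };

        for (int i = 0; i < 3; i++) {
                unsigned long addr = addrs[i];
                /* bytes from addr to the next page boundary (0 if aligned) */
                unsigned long now = -addr & ~PAGE_MASK;

                printf("addr=%#lx -> first chunk of %lu bytes\n", addr, now);
        }
        return 0;
}
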
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3821 | const void *old, | 3875 | const void *old, |
3822 | const void *new, | 3876 | const void *new, |
3823 | unsigned int bytes, | 3877 | unsigned int bytes, |
3824 | unsigned int *error_code, | 3878 | struct x86_exception *exception, |
3825 | struct kvm_vcpu *vcpu) | 3879 | struct kvm_vcpu *vcpu) |
3826 | { | 3880 | { |
3827 | gpa_t gpa; | 3881 | gpa_t gpa; |
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3879 | emul_write: | 3933 | emul_write: |
3880 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); | 3934 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); |
3881 | 3935 | ||
3882 | return emulator_write_emulated(addr, new, bytes, error_code, vcpu); | 3936 | return emulator_write_emulated(addr, new, bytes, exception, vcpu); |
3883 | } | 3937 | } |
3884 | 3938 | ||
3885 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | 3939 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, | |||
3904 | if (vcpu->arch.pio.count) | 3958 | if (vcpu->arch.pio.count) |
3905 | goto data_avail; | 3959 | goto data_avail; |
3906 | 3960 | ||
3907 | trace_kvm_pio(0, port, size, 1); | 3961 | trace_kvm_pio(0, port, size, count); |
3908 | 3962 | ||
3909 | vcpu->arch.pio.port = port; | 3963 | vcpu->arch.pio.port = port; |
3910 | vcpu->arch.pio.in = 1; | 3964 | vcpu->arch.pio.in = 1; |
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, | |||
3932 | const void *val, unsigned int count, | 3986 | const void *val, unsigned int count, |
3933 | struct kvm_vcpu *vcpu) | 3987 | struct kvm_vcpu *vcpu) |
3934 | { | 3988 | { |
3935 | trace_kvm_pio(1, port, size, 1); | 3989 | trace_kvm_pio(1, port, size, count); |
3936 | 3990 | ||
3937 | vcpu->arch.pio.port = port; | 3991 | vcpu->arch.pio.port = port; |
3938 | vcpu->arch.pio.in = 0; | 3992 | vcpu->arch.pio.in = 0; |
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | |||
3973 | return X86EMUL_CONTINUE; | 4027 | return X86EMUL_CONTINUE; |
3974 | 4028 | ||
3975 | if (kvm_x86_ops->has_wbinvd_exit()) { | 4029 | if (kvm_x86_ops->has_wbinvd_exit()) { |
3976 | preempt_disable(); | 4030 | int cpu = get_cpu(); |
4031 | |||
4032 | cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); | ||
3977 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, | 4033 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, |
3978 | wbinvd_ipi, NULL, 1); | 4034 | wbinvd_ipi, NULL, 1); |
3979 | preempt_enable(); | 4035 | put_cpu(); |
3980 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); | 4036 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); |
3981 | } | 4037 | } else |
3982 | wbinvd(); | 4038 | wbinvd(); |
3983 | return X86EMUL_CONTINUE; | 4039 | return X86EMUL_CONTINUE; |
3984 | } | 4040 | } |
3985 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); | 4041 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); |
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | |||
4019 | value = vcpu->arch.cr2; | 4075 | value = vcpu->arch.cr2; |
4020 | break; | 4076 | break; |
4021 | case 3: | 4077 | case 3: |
4022 | value = vcpu->arch.cr3; | 4078 | value = kvm_read_cr3(vcpu); |
4023 | break; | 4079 | break; |
4024 | case 4: | 4080 | case 4: |
4025 | value = kvm_read_cr4(vcpu); | 4081 | value = kvm_read_cr4(vcpu); |
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | |||
4053 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); | 4109 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); |
4054 | break; | 4110 | break; |
4055 | case 8: | 4111 | case 8: |
4056 | res = __kvm_set_cr8(vcpu, val & 0xfUL); | 4112 | res = kvm_set_cr8(vcpu, val); |
4057 | break; | 4113 | break; |
4058 | default: | 4114 | default: |
4059 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4115 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | |||
4206 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) | 4262 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) |
4207 | { | 4263 | { |
4208 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4264 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4209 | if (ctxt->exception == PF_VECTOR) | 4265 | if (ctxt->exception.vector == PF_VECTOR) |
4210 | kvm_propagate_fault(vcpu); | 4266 | kvm_propagate_fault(vcpu, &ctxt->exception); |
4211 | else if (ctxt->error_code_valid) | 4267 | else if (ctxt->exception.error_code_valid) |
4212 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); | 4268 | kvm_queue_exception_e(vcpu, ctxt->exception.vector, |
4269 | ctxt->exception.error_code); | ||
4213 | else | 4270 | else |
4214 | kvm_queue_exception(vcpu, ctxt->exception); | 4271 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
4215 | } | 4272 | } |
4216 | 4273 | ||
4217 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | 4274 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) |
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); | |||
4267 | 4324 | ||
4268 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) | 4325 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) |
4269 | { | 4326 | { |
4327 | int r = EMULATE_DONE; | ||
4328 | |||
4270 | ++vcpu->stat.insn_emulation_fail; | 4329 | ++vcpu->stat.insn_emulation_fail; |
4271 | trace_kvm_emulate_insn_failed(vcpu); | 4330 | trace_kvm_emulate_insn_failed(vcpu); |
4272 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 4331 | if (!is_guest_mode(vcpu)) { |
4273 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | 4332 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
4274 | vcpu->run->internal.ndata = 0; | 4333 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
4334 | vcpu->run->internal.ndata = 0; | ||
4335 | r = EMULATE_FAIL; | ||
4336 | } | ||
4275 | kvm_queue_exception(vcpu, UD_VECTOR); | 4337 | kvm_queue_exception(vcpu, UD_VECTOR); |
4276 | return EMULATE_FAIL; | 4338 | |
4339 | return r; | ||
4277 | } | 4340 | } |
4278 | 4341 | ||
4279 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | 4342 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4302 | return false; | 4365 | return false; |
4303 | } | 4366 | } |
4304 | 4367 | ||
4305 | int emulate_instruction(struct kvm_vcpu *vcpu, | 4368 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
4306 | unsigned long cr2, | 4369 | unsigned long cr2, |
4307 | u16 error_code, | 4370 | int emulation_type, |
4308 | int emulation_type) | 4371 | void *insn, |
4372 | int insn_len) | ||
4309 | { | 4373 | { |
4310 | int r; | 4374 | int r; |
4311 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4375 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
4323 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4387 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
4324 | init_emulate_ctxt(vcpu); | 4388 | init_emulate_ctxt(vcpu); |
4325 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4389 | vcpu->arch.emulate_ctxt.interruptibility = 0; |
4326 | vcpu->arch.emulate_ctxt.exception = -1; | 4390 | vcpu->arch.emulate_ctxt.have_exception = false; |
4327 | vcpu->arch.emulate_ctxt.perm_ok = false; | 4391 | vcpu->arch.emulate_ctxt.perm_ok = false; |
4328 | 4392 | ||
4329 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt); | 4393 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); |
4330 | if (r == X86EMUL_PROPAGATE_FAULT) | 4394 | if (r == X86EMUL_PROPAGATE_FAULT) |
4331 | goto done; | 4395 | goto done; |
4332 | 4396 | ||
@@ -4389,7 +4453,7 @@ restart: | |||
4389 | } | 4453 | } |
4390 | 4454 | ||
4391 | done: | 4455 | done: |
4392 | if (vcpu->arch.emulate_ctxt.exception >= 0) { | 4456 | if (vcpu->arch.emulate_ctxt.have_exception) { |
4393 | inject_emulated_exception(vcpu); | 4457 | inject_emulated_exception(vcpu); |
4394 | r = EMULATE_DONE; | 4458 | r = EMULATE_DONE; |
4395 | } else if (vcpu->arch.pio.count) { | 4459 | } else if (vcpu->arch.pio.count) { |
@@ -4413,7 +4477,7 @@ done: | |||
4413 | 4477 | ||
4414 | return r; | 4478 | return r; |
4415 | } | 4479 | } |
4416 | EXPORT_SYMBOL_GPL(emulate_instruction); | 4480 | EXPORT_SYMBOL_GPL(x86_emulate_instruction); |
4417 | 4481 | ||
4418 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) | 4482 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) |
4419 | { | 4483 | { |
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque) | |||
4653 | 4717 | ||
4654 | kvm_x86_ops = ops; | 4718 | kvm_x86_ops = ops; |
4655 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | 4719 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); |
4656 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); | ||
4657 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 4720 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
4658 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 4721 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
4659 | 4722 | ||
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5116 | vcpu->fpu_active = 0; | 5179 | vcpu->fpu_active = 0; |
5117 | kvm_x86_ops->fpu_deactivate(vcpu); | 5180 | kvm_x86_ops->fpu_deactivate(vcpu); |
5118 | } | 5181 | } |
5182 | if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { | ||
5183 | /* Page is swapped out. Do synthetic halt */ | ||
5184 | vcpu->arch.apf.halted = true; | ||
5185 | r = 1; | ||
5186 | goto out; | ||
5187 | } | ||
5119 | } | 5188 | } |
5120 | 5189 | ||
5121 | r = kvm_mmu_reload(vcpu); | 5190 | r = kvm_mmu_reload(vcpu); |
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5244 | 5313 | ||
5245 | r = 1; | 5314 | r = 1; |
5246 | while (r > 0) { | 5315 | while (r > 0) { |
5247 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 5316 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
5317 | !vcpu->arch.apf.halted) | ||
5248 | r = vcpu_enter_guest(vcpu); | 5318 | r = vcpu_enter_guest(vcpu); |
5249 | else { | 5319 | else { |
5250 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 5320 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); |
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5257 | vcpu->arch.mp_state = | 5327 | vcpu->arch.mp_state = |
5258 | KVM_MP_STATE_RUNNABLE; | 5328 | KVM_MP_STATE_RUNNABLE; |
5259 | case KVM_MP_STATE_RUNNABLE: | 5329 | case KVM_MP_STATE_RUNNABLE: |
5330 | vcpu->arch.apf.halted = false; | ||
5260 | break; | 5331 | break; |
5261 | case KVM_MP_STATE_SIPI_RECEIVED: | 5332 | case KVM_MP_STATE_SIPI_RECEIVED: |
5262 | default: | 5333 | default: |
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5278 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5349 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
5279 | ++vcpu->stat.request_irq_exits; | 5350 | ++vcpu->stat.request_irq_exits; |
5280 | } | 5351 | } |
5352 | |||
5353 | kvm_check_async_pf_completion(vcpu); | ||
5354 | |||
5281 | if (signal_pending(current)) { | 5355 | if (signal_pending(current)) { |
5282 | r = -EINTR; | 5356 | r = -EINTR; |
5283 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5357 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5302 | int r; | 5376 | int r; |
5303 | sigset_t sigsaved; | 5377 | sigset_t sigsaved; |
5304 | 5378 | ||
5379 | if (!tsk_used_math(current) && init_fpu(current)) | ||
5380 | return -ENOMEM; | ||
5381 | |||
5305 | if (vcpu->sigset_active) | 5382 | if (vcpu->sigset_active) |
5306 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | 5383 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); |
5307 | 5384 | ||
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5313 | } | 5390 | } |
5314 | 5391 | ||
5315 | /* re-sync apic's tpr */ | 5392 | /* re-sync apic's tpr */ |
5316 | if (!irqchip_in_kernel(vcpu->kvm)) | 5393 | if (!irqchip_in_kernel(vcpu->kvm)) { |
5317 | kvm_set_cr8(vcpu, kvm_run->cr8); | 5394 | if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { |
5395 | r = -EINVAL; | ||
5396 | goto out; | ||
5397 | } | ||
5398 | } | ||
5318 | 5399 | ||
5319 | if (vcpu->arch.pio.count || vcpu->mmio_needed) { | 5400 | if (vcpu->arch.pio.count || vcpu->mmio_needed) { |
5320 | if (vcpu->mmio_needed) { | 5401 | if (vcpu->mmio_needed) { |
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5323 | vcpu->mmio_needed = 0; | 5404 | vcpu->mmio_needed = 0; |
5324 | } | 5405 | } |
5325 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 5406 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
5326 | r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); | 5407 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); |
5327 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5408 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
5328 | if (r != EMULATE_DONE) { | 5409 | if (r != EMULATE_DONE) { |
5329 | r = 0; | 5410 | r = 0; |
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
5436 | 5517 | ||
5437 | sregs->cr0 = kvm_read_cr0(vcpu); | 5518 | sregs->cr0 = kvm_read_cr0(vcpu); |
5438 | sregs->cr2 = vcpu->arch.cr2; | 5519 | sregs->cr2 = vcpu->arch.cr2; |
5439 | sregs->cr3 = vcpu->arch.cr3; | 5520 | sregs->cr3 = kvm_read_cr3(vcpu); |
5440 | sregs->cr4 = kvm_read_cr4(vcpu); | 5521 | sregs->cr4 = kvm_read_cr4(vcpu); |
5441 | sregs->cr8 = kvm_get_cr8(vcpu); | 5522 | sregs->cr8 = kvm_get_cr8(vcpu); |
5442 | sregs->efer = vcpu->arch.efer; | 5523 | sregs->efer = vcpu->arch.efer; |
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5504 | kvm_x86_ops->set_gdt(vcpu, &dt); | 5585 | kvm_x86_ops->set_gdt(vcpu, &dt); |
5505 | 5586 | ||
5506 | vcpu->arch.cr2 = sregs->cr2; | 5587 | vcpu->arch.cr2 = sregs->cr2; |
5507 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 5588 | mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; |
5508 | vcpu->arch.cr3 = sregs->cr3; | 5589 | vcpu->arch.cr3 = sregs->cr3; |
5590 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
5509 | 5591 | ||
5510 | kvm_set_cr8(vcpu, sregs->cr8); | 5592 | kvm_set_cr8(vcpu, sregs->cr8); |
5511 | 5593 | ||
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5522 | if (sregs->cr4 & X86_CR4_OSXSAVE) | 5604 | if (sregs->cr4 & X86_CR4_OSXSAVE) |
5523 | update_cpuid(vcpu); | 5605 | update_cpuid(vcpu); |
5524 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { | 5606 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { |
5525 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); | 5607 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
5526 | mmu_reset_needed = 1; | 5608 | mmu_reset_needed = 1; |
5527 | } | 5609 | } |
5528 | 5610 | ||
@@ -5773,6 +5855,8 @@ free_vcpu: | |||
5773 | 5855 | ||
5774 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 5856 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
5775 | { | 5857 | { |
5858 | vcpu->arch.apf.msr_val = 0; | ||
5859 | |||
5776 | vcpu_load(vcpu); | 5860 | vcpu_load(vcpu); |
5777 | kvm_mmu_unload(vcpu); | 5861 | kvm_mmu_unload(vcpu); |
5778 | vcpu_put(vcpu); | 5862 | vcpu_put(vcpu); |
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
5792 | vcpu->arch.dr7 = DR7_FIXED_1; | 5876 | vcpu->arch.dr7 = DR7_FIXED_1; |
5793 | 5877 | ||
5794 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5878 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5879 | vcpu->arch.apf.msr_val = 0; | ||
5880 | |||
5881 | kvm_clear_async_pf_completion_queue(vcpu); | ||
5882 | kvm_async_pf_hash_reset(vcpu); | ||
5883 | vcpu->arch.apf.halted = false; | ||
5795 | 5884 | ||
5796 | return kvm_x86_ops->vcpu_reset(vcpu); | 5885 | return kvm_x86_ops->vcpu_reset(vcpu); |
5797 | } | 5886 | } |
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5881 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) | 5970 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) |
5882 | goto fail_free_mce_banks; | 5971 | goto fail_free_mce_banks; |
5883 | 5972 | ||
5973 | kvm_async_pf_hash_reset(vcpu); | ||
5974 | |||
5884 | return 0; | 5975 | return 0; |
5885 | fail_free_mce_banks: | 5976 | fail_free_mce_banks: |
5886 | kfree(vcpu->arch.mce_banks); | 5977 | kfree(vcpu->arch.mce_banks); |
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
5906 | free_page((unsigned long)vcpu->arch.pio_data); | 5997 | free_page((unsigned long)vcpu->arch.pio_data); |
5907 | } | 5998 | } |
5908 | 5999 | ||
5909 | struct kvm *kvm_arch_create_vm(void) | 6000 | int kvm_arch_init_vm(struct kvm *kvm) |
5910 | { | 6001 | { |
5911 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
5912 | |||
5913 | if (!kvm) | ||
5914 | return ERR_PTR(-ENOMEM); | ||
5915 | |||
5916 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6002 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
5917 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6003 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
5918 | 6004 | ||
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
5921 | 6007 | ||
5922 | spin_lock_init(&kvm->arch.tsc_write_lock); | 6008 | spin_lock_init(&kvm->arch.tsc_write_lock); |
5923 | 6009 | ||
5924 | return kvm; | 6010 | return 0; |
5925 | } | 6011 | } |
5926 | 6012 | ||
5927 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 6013 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
5939 | /* | 6025 | /* |
5940 | * Unpin any mmu pages first. | 6026 | * Unpin any mmu pages first. |
5941 | */ | 6027 | */ |
5942 | kvm_for_each_vcpu(i, vcpu, kvm) | 6028 | kvm_for_each_vcpu(i, vcpu, kvm) { |
6029 | kvm_clear_async_pf_completion_queue(vcpu); | ||
5943 | kvm_unload_vcpu_mmu(vcpu); | 6030 | kvm_unload_vcpu_mmu(vcpu); |
6031 | } | ||
5944 | kvm_for_each_vcpu(i, vcpu, kvm) | 6032 | kvm_for_each_vcpu(i, vcpu, kvm) |
5945 | kvm_arch_vcpu_free(vcpu); | 6033 | kvm_arch_vcpu_free(vcpu); |
5946 | 6034 | ||
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
5964 | kfree(kvm->arch.vpic); | 6052 | kfree(kvm->arch.vpic); |
5965 | kfree(kvm->arch.vioapic); | 6053 | kfree(kvm->arch.vioapic); |
5966 | kvm_free_vcpus(kvm); | 6054 | kvm_free_vcpus(kvm); |
5967 | kvm_free_physmem(kvm); | ||
5968 | if (kvm->arch.apic_access_page) | 6055 | if (kvm->arch.apic_access_page) |
5969 | put_page(kvm->arch.apic_access_page); | 6056 | put_page(kvm->arch.apic_access_page); |
5970 | if (kvm->arch.ept_identity_pagetable) | 6057 | if (kvm->arch.ept_identity_pagetable) |
5971 | put_page(kvm->arch.ept_identity_pagetable); | 6058 | put_page(kvm->arch.ept_identity_pagetable); |
5972 | cleanup_srcu_struct(&kvm->srcu); | ||
5973 | kfree(kvm); | ||
5974 | } | 6059 | } |
5975 | 6060 | ||
5976 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 6061 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
6051 | 6136 | ||
6052 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 6137 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
6053 | { | 6138 | { |
6054 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 6139 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
6140 | !vcpu->arch.apf.halted) | ||
6141 | || !list_empty_careful(&vcpu->async_pf.done) | ||
6055 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED | 6142 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
6056 | || vcpu->arch.nmi_pending || | 6143 | || vcpu->arch.nmi_pending || |
6057 | (kvm_arch_interrupt_allowed(vcpu) && | 6144 | (kvm_arch_interrupt_allowed(vcpu) && |
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
6110 | } | 6197 | } |
6111 | EXPORT_SYMBOL_GPL(kvm_set_rflags); | 6198 | EXPORT_SYMBOL_GPL(kvm_set_rflags); |
6112 | 6199 | ||
6200 | void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) | ||
6201 | { | ||
6202 | int r; | ||
6203 | |||
6204 | if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || | ||
6205 | is_error_page(work->page)) | ||
6206 | return; | ||
6207 | |||
6208 | r = kvm_mmu_reload(vcpu); | ||
6209 | if (unlikely(r)) | ||
6210 | return; | ||
6211 | |||
6212 | if (!vcpu->arch.mmu.direct_map && | ||
6213 | work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) | ||
6214 | return; | ||
6215 | |||
6216 | vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); | ||
6217 | } | ||
6218 | |||
6219 | static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) | ||
6220 | { | ||
6221 | return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); | ||
6222 | } | ||
6223 | |||
6224 | static inline u32 kvm_async_pf_next_probe(u32 key) | ||
6225 | { | ||
6226 | return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); | ||
6227 | } | ||
6228 | |||
6229 | static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6230 | { | ||
6231 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6232 | |||
6233 | while (vcpu->arch.apf.gfns[key] != ~0) | ||
6234 | key = kvm_async_pf_next_probe(key); | ||
6235 | |||
6236 | vcpu->arch.apf.gfns[key] = gfn; | ||
6237 | } | ||
6238 | |||
6239 | static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6240 | { | ||
6241 | int i; | ||
6242 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6243 | |||
6244 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && | ||
6245 | (vcpu->arch.apf.gfns[key] != gfn && | ||
6246 | vcpu->arch.apf.gfns[key] != ~0); i++) | ||
6247 | key = kvm_async_pf_next_probe(key); | ||
6248 | |||
6249 | return key; | ||
6250 | } | ||
6251 | |||
6252 | bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6253 | { | ||
6254 | return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; | ||
6255 | } | ||
6256 | |||
6257 | static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6258 | { | ||
6259 | u32 i, j, k; | ||
6260 | |||
6261 | i = j = kvm_async_pf_gfn_slot(vcpu, gfn); | ||
6262 | while (true) { | ||
6263 | vcpu->arch.apf.gfns[i] = ~0; | ||
6264 | do { | ||
6265 | j = kvm_async_pf_next_probe(j); | ||
6266 | if (vcpu->arch.apf.gfns[j] == ~0) | ||
6267 | return; | ||
6268 | k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); | ||
6269 | /* | ||
6270 | * k lies cyclically in ]i,j] | ||
6271 | * | i.k.j | | ||
6272 | * |....j i.k.| or |.k..j i...| | ||
6273 | */ | ||
6274 | } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); | ||
6275 | vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; | ||
6276 | i = j; | ||
6277 | } | ||
6278 | } | ||
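
The four helpers above implement a small open-addressed hash table of the gfns with outstanding async page faults: the table holds roundup_pow_of_two(ASYNC_PF_PER_VCPU) slots, ~0 marks a free slot, lookups probe linearly, and deletion walks the rest of the probe chain so that any entry whose home bucket lies cyclically outside ]i,j] (and would become unreachable once slot i is emptied) is pulled back into the hole. A minimal user-space sketch of the same scheme, with the table size and hash function chosen only for the demo (the kernel uses hash_32() over ASYNC_PF_PER_VCPU slots):

    #include <stdint.h>
    #include <stdio.h>

    #define TABLE_SIZE 64           /* stands in for roundup_pow_of_two(ASYNC_PF_PER_VCPU) */
    #define EMPTY      (~0ULL)      /* same free-slot sentinel the kernel uses */

    static uint64_t slots[TABLE_SIZE];

    static uint32_t hash_fn(uint64_t gfn)
    {
            /* any spreading hash will do for the demo; the kernel uses hash_32() */
            return (uint32_t)((gfn * 0x9E3779B97F4A7C15ULL) >> 58);   /* 6 bits, 0..63 */
    }

    static uint32_t next_probe(uint32_t key)
    {
            return (key + 1) & (TABLE_SIZE - 1);    /* linear probing with wrap-around */
    }

    static void add_gfn(uint64_t gfn)
    {
            uint32_t key = hash_fn(gfn);

            while (slots[key] != EMPTY)
                    key = next_probe(key);
            slots[key] = gfn;
    }

    static uint32_t find_slot(uint64_t gfn)
    {
            uint32_t key = hash_fn(gfn);
            int i;

            for (i = 0; i < TABLE_SIZE && slots[key] != gfn && slots[key] != EMPTY; i++)
                    key = next_probe(key);
            return key;
    }

    /* Deletion re-places entries whose probe chain would otherwise cross the hole. */
    static void del_gfn(uint64_t gfn)       /* caller guarantees gfn is present */
    {
            uint32_t i, j, k;

            i = j = find_slot(gfn);
            for (;;) {
                    slots[i] = EMPTY;
                    do {
                            j = next_probe(j);
                            if (slots[j] == EMPTY)
                                    return;
                            k = hash_fn(slots[j]);
                            /* entries whose home k lies cyclically in ]i,j] may stay put */
                    } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
                    slots[i] = slots[j];    /* pull the stranded entry into the hole */
                    i = j;
            }
    }

    int main(void)
    {
            int n;

            for (n = 0; n < TABLE_SIZE; n++)
                    slots[n] = EMPTY;
            add_gfn(0x1000); add_gfn(0x2000); add_gfn(0x3000);
            del_gfn(0x2000);
            printf("0x1000 present: %d\n", slots[find_slot(0x1000)] == 0x1000);
            printf("0x2000 present: %d\n", slots[find_slot(0x2000)] == 0x2000);
            return 0;
    }
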
6279 | |||
6280 | static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) | ||
6281 | { | ||
6282 | |||
6283 | return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, | ||
6284 | sizeof(val)); | ||
6285 | } | ||
6286 | |||
6287 | void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, | ||
6288 | struct kvm_async_pf *work) | ||
6289 | { | ||
6290 | struct x86_exception fault; | ||
6291 | |||
6292 | trace_kvm_async_pf_not_present(work->arch.token, work->gva); | ||
6293 | kvm_add_async_pf_gfn(vcpu, work->arch.gfn); | ||
6294 | |||
6295 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || | ||
6296 | (vcpu->arch.apf.send_user_only && | ||
6297 | kvm_x86_ops->get_cpl(vcpu) == 0)) | ||
6298 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
6299 | else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { | ||
6300 | fault.vector = PF_VECTOR; | ||
6301 | fault.error_code_valid = true; | ||
6302 | fault.error_code = 0; | ||
6303 | fault.nested_page_fault = false; | ||
6304 | fault.address = work->arch.token; | ||
6305 | kvm_inject_page_fault(vcpu, &fault); | ||
6306 | } | ||
6307 | } | ||
6308 | |||
6309 | void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | ||
6310 | struct kvm_async_pf *work) | ||
6311 | { | ||
6312 | struct x86_exception fault; | ||
6313 | |||
6314 | trace_kvm_async_pf_ready(work->arch.token, work->gva); | ||
6315 | if (is_error_page(work->page)) | ||
6316 | work->arch.token = ~0; /* broadcast wakeup */ | ||
6317 | else | ||
6318 | kvm_del_async_pf_gfn(vcpu, work->arch.gfn); | ||
6319 | |||
6320 | if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && | ||
6321 | !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { | ||
6322 | fault.vector = PF_VECTOR; | ||
6323 | fault.error_code_valid = true; | ||
6324 | fault.error_code = 0; | ||
6325 | fault.nested_page_fault = false; | ||
6326 | fault.address = work->arch.token; | ||
6327 | kvm_inject_page_fault(vcpu, &fault); | ||
6328 | } | ||
6329 | vcpu->arch.apf.halted = false; | ||
6330 | } | ||
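
Together, kvm_arch_async_page_not_present() and kvm_arch_async_page_present() form the guest-visible half of the protocol: the host writes a reason word through the cache the guest registered via the async-PF MSR, then injects a #PF whose fault address carries the token rather than a real faulting address (token ~0 is the broadcast wakeup used for error pages). A rough, purely illustrative sketch of what the cooperating guest side does with that information; the real guest handler lives in arch/x86/kernel/kvm.c and is not part of this hunk, and wait_for_token()/wake_token() are hypothetical stand-ins:

    #include <stdio.h>

    #define KVM_PV_REASON_PAGE_NOT_PRESENT 1   /* reason values added by this series */
    #define KVM_PV_REASON_PAGE_READY       2

    /* Stands in for the per-vCPU page the guest registers through the async-PF MSR. */
    static unsigned int apf_reason;

    /* Hypothetical helpers: a real guest parks and wakes tasks keyed by the token. */
    static void wait_for_token(unsigned long token) { printf("sleep on token %#lx\n", token); }
    static void wake_token(unsigned long token)     { printf("wake token %#lx\n", token); }

    /* Called from the guest #PF path with the injected fault address as 'token'. */
    static int handle_async_pf(unsigned long token)
    {
            unsigned int reason = apf_reason;

            if (!reason)
                    return 0;       /* ordinary fault: let the normal handler run */
            apf_reason = 0;

            if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT)
                    wait_for_token(token);
            else if (reason == KVM_PV_REASON_PAGE_READY)
                    wake_token(token);      /* token ~0 means "wake everything" */
            return 1;
    }

    int main(void)
    {
            apf_reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
            handle_async_pf(0x1234);
            apf_reason = KVM_PV_REASON_PAGE_READY;
            handle_async_pf(0x1234);
            return 0;
    }
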
6331 | |||
6332 | bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) | ||
6333 | { | ||
6334 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) | ||
6335 | return true; | ||
6336 | else | ||
6337 | return !kvm_event_needs_reinjection(vcpu) && | ||
6338 | kvm_x86_ops->interrupt_allowed(vcpu); | ||
6339 | } | ||
6340 | |||
6113 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | 6341 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); |
6114 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | 6342 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); |
6115 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | 6343 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 919ae53adc5c..ea2dc1a2e13d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
@@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo { | |||
540 | #endif | 540 | #endif |
541 | #define KVM_CAP_PPC_GET_PVINFO 57 | 541 | #define KVM_CAP_PPC_GET_PVINFO 57 |
542 | #define KVM_CAP_PPC_IRQ_LEVEL 58 | 542 | #define KVM_CAP_PPC_IRQ_LEVEL 58 |
543 | #define KVM_CAP_ASYNC_PF 59 | ||
543 | 544 | ||
544 | #ifdef KVM_CAP_IRQ_ROUTING | 545 | #ifdef KVM_CAP_IRQ_ROUTING |
545 | 546 | ||
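
KVM_CAP_ASYNC_PF (59) lets userspace discover whether the host supports asynchronous page faults before exposing the feature to a guest. A minimal probe through the standard KVM_CHECK_EXTENSION ioctl might look like this; the fallback #define only matters when building against headers that predate this series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #ifndef KVM_CAP_ASYNC_PF
    #define KVM_CAP_ASYNC_PF 59     /* value introduced by this series */
    #endif

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            if (kvm < 0) {
                    perror("open /dev/kvm");
                    return 1;
            }

            /* KVM_CHECK_EXTENSION returns > 0 when the capability is available. */
            int ret = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ASYNC_PF);
            printf("KVM_CAP_ASYNC_PF: %s\n", ret > 0 ? "available" : "not available");
            return 0;
    }
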
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a0557422715e..b5021db21858 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/preempt.h> | 17 | #include <linux/preempt.h> |
18 | #include <linux/msi.h> | 18 | #include <linux/msi.h> |
19 | #include <linux/slab.h> | ||
20 | #include <linux/rcupdate.h> | ||
19 | #include <asm/signal.h> | 21 | #include <asm/signal.h> |
20 | 22 | ||
21 | #include <linux/kvm.h> | 23 | #include <linux/kvm.h> |
@@ -40,6 +42,7 @@ | |||
40 | #define KVM_REQ_KICK 9 | 42 | #define KVM_REQ_KICK 9 |
41 | #define KVM_REQ_DEACTIVATE_FPU 10 | 43 | #define KVM_REQ_DEACTIVATE_FPU 10 |
42 | #define KVM_REQ_EVENT 11 | 44 | #define KVM_REQ_EVENT 11 |
45 | #define KVM_REQ_APF_HALT 12 | ||
43 | 46 | ||
44 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 | 47 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 |
45 | 48 | ||
@@ -74,6 +77,27 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, | |||
74 | int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, | 77 | int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, |
75 | struct kvm_io_device *dev); | 78 | struct kvm_io_device *dev); |
76 | 79 | ||
80 | #ifdef CONFIG_KVM_ASYNC_PF | ||
81 | struct kvm_async_pf { | ||
82 | struct work_struct work; | ||
83 | struct list_head link; | ||
84 | struct list_head queue; | ||
85 | struct kvm_vcpu *vcpu; | ||
86 | struct mm_struct *mm; | ||
87 | gva_t gva; | ||
88 | unsigned long addr; | ||
89 | struct kvm_arch_async_pf arch; | ||
90 | struct page *page; | ||
91 | bool done; | ||
92 | }; | ||
93 | |||
94 | void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); | ||
95 | void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); | ||
96 | int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | ||
97 | struct kvm_arch_async_pf *arch); | ||
98 | int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); | ||
99 | #endif | ||
100 | |||
77 | struct kvm_vcpu { | 101 | struct kvm_vcpu { |
78 | struct kvm *kvm; | 102 | struct kvm *kvm; |
79 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 103 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -104,6 +128,15 @@ struct kvm_vcpu { | |||
104 | gpa_t mmio_phys_addr; | 128 | gpa_t mmio_phys_addr; |
105 | #endif | 129 | #endif |
106 | 130 | ||
131 | #ifdef CONFIG_KVM_ASYNC_PF | ||
132 | struct { | ||
133 | u32 queued; | ||
134 | struct list_head queue; | ||
135 | struct list_head done; | ||
136 | spinlock_t lock; | ||
137 | } async_pf; | ||
138 | #endif | ||
139 | |||
107 | struct kvm_vcpu_arch arch; | 140 | struct kvm_vcpu_arch arch; |
108 | }; | 141 | }; |
109 | 142 | ||
@@ -113,16 +146,19 @@ struct kvm_vcpu { | |||
113 | */ | 146 | */ |
114 | #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) | 147 | #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) |
115 | 148 | ||
149 | struct kvm_lpage_info { | ||
150 | unsigned long rmap_pde; | ||
151 | int write_count; | ||
152 | }; | ||
153 | |||
116 | struct kvm_memory_slot { | 154 | struct kvm_memory_slot { |
117 | gfn_t base_gfn; | 155 | gfn_t base_gfn; |
118 | unsigned long npages; | 156 | unsigned long npages; |
119 | unsigned long flags; | 157 | unsigned long flags; |
120 | unsigned long *rmap; | 158 | unsigned long *rmap; |
121 | unsigned long *dirty_bitmap; | 159 | unsigned long *dirty_bitmap; |
122 | struct { | 160 | unsigned long *dirty_bitmap_head; |
123 | unsigned long rmap_pde; | 161 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; |
124 | int write_count; | ||
125 | } *lpage_info[KVM_NR_PAGE_SIZES - 1]; | ||
126 | unsigned long userspace_addr; | 162 | unsigned long userspace_addr; |
127 | int user_alloc; | 163 | int user_alloc; |
128 | int id; | 164 | int id; |
@@ -169,6 +205,7 @@ struct kvm_irq_routing_table {}; | |||
169 | 205 | ||
170 | struct kvm_memslots { | 206 | struct kvm_memslots { |
171 | int nmemslots; | 207 | int nmemslots; |
208 | u64 generation; | ||
172 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | 209 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + |
173 | KVM_PRIVATE_MEM_SLOTS]; | 210 | KVM_PRIVATE_MEM_SLOTS]; |
174 | }; | 211 | }; |
@@ -206,6 +243,10 @@ struct kvm { | |||
206 | 243 | ||
207 | struct mutex irq_lock; | 244 | struct mutex irq_lock; |
208 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | 245 | #ifdef CONFIG_HAVE_KVM_IRQCHIP |
246 | /* | ||
247 | * Update side is protected by irq_lock and, | ||
248 | * if configured, irqfds.lock. | ||
249 | */ | ||
209 | struct kvm_irq_routing_table __rcu *irq_routing; | 250 | struct kvm_irq_routing_table __rcu *irq_routing; |
210 | struct hlist_head mask_notifier_list; | 251 | struct hlist_head mask_notifier_list; |
211 | struct hlist_head irq_ack_notifier_list; | 252 | struct hlist_head irq_ack_notifier_list; |
@@ -216,6 +257,7 @@ struct kvm { | |||
216 | unsigned long mmu_notifier_seq; | 257 | unsigned long mmu_notifier_seq; |
217 | long mmu_notifier_count; | 258 | long mmu_notifier_count; |
218 | #endif | 259 | #endif |
260 | long tlbs_dirty; | ||
219 | }; | 261 | }; |
220 | 262 | ||
221 | /* The guest did something we don't support. */ | 263 | /* The guest did something we don't support. */ |
@@ -302,7 +344,11 @@ void kvm_set_page_accessed(struct page *page); | |||
302 | 344 | ||
303 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); | 345 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); |
304 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); | 346 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); |
347 | pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, | ||
348 | bool write_fault, bool *writable); | ||
305 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); | 349 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); |
350 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | ||
351 | bool *writable); | ||
306 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 352 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, |
307 | struct kvm_memory_slot *slot, gfn_t gfn); | 353 | struct kvm_memory_slot *slot, gfn_t gfn); |
308 | int memslot_id(struct kvm *kvm, gfn_t gfn); | 354 | int memslot_id(struct kvm *kvm, gfn_t gfn); |
@@ -321,18 +367,25 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | |||
321 | int offset, int len); | 367 | int offset, int len); |
322 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | 368 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, |
323 | unsigned long len); | 369 | unsigned long len); |
370 | int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
371 | void *data, unsigned long len); | ||
372 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
373 | gpa_t gpa); | ||
324 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); | 374 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); |
325 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); | 375 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); |
326 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | 376 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); |
327 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); | 377 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); |
328 | unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); | 378 | unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); |
329 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | 379 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); |
380 | void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, | ||
381 | gfn_t gfn); | ||
330 | 382 | ||
331 | void kvm_vcpu_block(struct kvm_vcpu *vcpu); | 383 | void kvm_vcpu_block(struct kvm_vcpu *vcpu); |
332 | void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); | 384 | void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); |
333 | void kvm_resched(struct kvm_vcpu *vcpu); | 385 | void kvm_resched(struct kvm_vcpu *vcpu); |
334 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | 386 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); |
335 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | 387 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); |
388 | |||
336 | void kvm_flush_remote_tlbs(struct kvm *kvm); | 389 | void kvm_flush_remote_tlbs(struct kvm *kvm); |
337 | void kvm_reload_remote_mmus(struct kvm *kvm); | 390 | void kvm_reload_remote_mmus(struct kvm *kvm); |
338 | 391 | ||
@@ -398,7 +451,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); | |||
398 | 451 | ||
399 | void kvm_free_physmem(struct kvm *kvm); | 452 | void kvm_free_physmem(struct kvm *kvm); |
400 | 453 | ||
401 | struct kvm *kvm_arch_create_vm(void); | 454 | #ifndef __KVM_HAVE_ARCH_VM_ALLOC |
455 | static inline struct kvm *kvm_arch_alloc_vm(void) | ||
456 | { | ||
457 | return kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
458 | } | ||
459 | |||
460 | static inline void kvm_arch_free_vm(struct kvm *kvm) | ||
461 | { | ||
462 | kfree(kvm); | ||
463 | } | ||
464 | #endif | ||
465 | |||
466 | int kvm_arch_init_vm(struct kvm *kvm); | ||
402 | void kvm_arch_destroy_vm(struct kvm *kvm); | 467 | void kvm_arch_destroy_vm(struct kvm *kvm); |
403 | void kvm_free_all_assigned_devices(struct kvm *kvm); | 468 | void kvm_free_all_assigned_devices(struct kvm *kvm); |
404 | void kvm_arch_sync_events(struct kvm *kvm); | 469 | void kvm_arch_sync_events(struct kvm *kvm); |
@@ -414,16 +479,8 @@ struct kvm_irq_ack_notifier { | |||
414 | void (*irq_acked)(struct kvm_irq_ack_notifier *kian); | 479 | void (*irq_acked)(struct kvm_irq_ack_notifier *kian); |
415 | }; | 480 | }; |
416 | 481 | ||
417 | #define KVM_ASSIGNED_MSIX_PENDING 0x1 | ||
418 | struct kvm_guest_msix_entry { | ||
419 | u32 vector; | ||
420 | u16 entry; | ||
421 | u16 flags; | ||
422 | }; | ||
423 | |||
424 | struct kvm_assigned_dev_kernel { | 482 | struct kvm_assigned_dev_kernel { |
425 | struct kvm_irq_ack_notifier ack_notifier; | 483 | struct kvm_irq_ack_notifier ack_notifier; |
426 | struct work_struct interrupt_work; | ||
427 | struct list_head list; | 484 | struct list_head list; |
428 | int assigned_dev_id; | 485 | int assigned_dev_id; |
429 | int host_segnr; | 486 | int host_segnr; |
@@ -434,13 +491,14 @@ struct kvm_assigned_dev_kernel { | |||
434 | bool host_irq_disabled; | 491 | bool host_irq_disabled; |
435 | struct msix_entry *host_msix_entries; | 492 | struct msix_entry *host_msix_entries; |
436 | int guest_irq; | 493 | int guest_irq; |
437 | struct kvm_guest_msix_entry *guest_msix_entries; | 494 | struct msix_entry *guest_msix_entries; |
438 | unsigned long irq_requested_type; | 495 | unsigned long irq_requested_type; |
439 | int irq_source_id; | 496 | int irq_source_id; |
440 | int flags; | 497 | int flags; |
441 | struct pci_dev *dev; | 498 | struct pci_dev *dev; |
442 | struct kvm *kvm; | 499 | struct kvm *kvm; |
443 | spinlock_t assigned_dev_lock; | 500 | spinlock_t intx_lock; |
501 | char irq_name[32]; | ||
444 | }; | 502 | }; |
445 | 503 | ||
446 | struct kvm_irq_mask_notifier { | 504 | struct kvm_irq_mask_notifier { |
@@ -462,6 +520,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, | |||
462 | unsigned long *deliver_bitmask); | 520 | unsigned long *deliver_bitmask); |
463 | #endif | 521 | #endif |
464 | int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); | 522 | int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); |
523 | int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, | ||
524 | int irq_source_id, int level); | ||
465 | void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); | 525 | void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); |
466 | void kvm_register_irq_ack_notifier(struct kvm *kvm, | 526 | void kvm_register_irq_ack_notifier(struct kvm *kvm, |
467 | struct kvm_irq_ack_notifier *kian); | 527 | struct kvm_irq_ack_notifier *kian); |
@@ -603,17 +663,28 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} | |||
603 | void kvm_eventfd_init(struct kvm *kvm); | 663 | void kvm_eventfd_init(struct kvm *kvm); |
604 | int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); | 664 | int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); |
605 | void kvm_irqfd_release(struct kvm *kvm); | 665 | void kvm_irqfd_release(struct kvm *kvm); |
666 | void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); | ||
606 | int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); | 667 | int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); |
607 | 668 | ||
608 | #else | 669 | #else |
609 | 670 | ||
610 | static inline void kvm_eventfd_init(struct kvm *kvm) {} | 671 | static inline void kvm_eventfd_init(struct kvm *kvm) {} |
672 | |||
611 | static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) | 673 | static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) |
612 | { | 674 | { |
613 | return -EINVAL; | 675 | return -EINVAL; |
614 | } | 676 | } |
615 | 677 | ||
616 | static inline void kvm_irqfd_release(struct kvm *kvm) {} | 678 | static inline void kvm_irqfd_release(struct kvm *kvm) {} |
679 | |||
680 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | ||
681 | static inline void kvm_irq_routing_update(struct kvm *kvm, | ||
682 | struct kvm_irq_routing_table *irq_rt) | ||
683 | { | ||
684 | rcu_assign_pointer(kvm->irq_routing, irq_rt); | ||
685 | } | ||
686 | #endif | ||
687 | |||
617 | static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) | 688 | static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) |
618 | { | 689 | { |
619 | return -ENOSYS; | 690 | return -ENOSYS; |
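
The old kvm_arch_create_vm() hook is split into a generic allocation step (kvm_arch_alloc_vm()/kvm_arch_free_vm(), defaulting to kzalloc/kfree) and an arch initialization step (kvm_arch_init_vm()). An architecture that needed a differently allocated struct kvm could define __KVM_HAVE_ARCH_VM_ALLOC in its asm/kvm_host.h and supply its own pair; the sketch below is illustrative only, and struct my_arch_kvm is a made-up container type, not something this series adds:

    /* Hypothetical asm/kvm_host.h fragment; not from this series. */
    #define __KVM_HAVE_ARCH_VM_ALLOC

    #include <linux/vmalloc.h>

    struct my_arch_kvm {
            struct kvm kvm;         /* must come first so the cast below is valid */
            /* large arch-private tables, etc. */
    };

    static inline struct kvm *kvm_arch_alloc_vm(void)
    {
            return (struct kvm *)vzalloc(sizeof(struct my_arch_kvm));
    }

    static inline void kvm_arch_free_vm(struct kvm *kvm)
    {
            vfree(kvm);
    }
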
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 7ac0d4eee430..fa7cc7244cbd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h | |||
@@ -67,4 +67,11 @@ struct kvm_lapic_irq { | |||
67 | u32 dest_id; | 67 | u32 dest_id; |
68 | }; | 68 | }; |
69 | 69 | ||
70 | struct gfn_to_hva_cache { | ||
71 | u64 generation; | ||
72 | gpa_t gpa; | ||
73 | unsigned long hva; | ||
74 | struct kvm_memory_slot *memslot; | ||
75 | }; | ||
76 | |||
70 | #endif /* __KVM_TYPES_H__ */ | 77 | #endif /* __KVM_TYPES_H__ */ |
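
struct gfn_to_hva_cache pairs a guest physical address with its resolved hva and the memslot generation that resolution was made under, so hot paths such as apf_put_user() above can write guest memory without redoing the memslot lookup; kvm_write_guest_cached() falls back to a fresh lookup when the generation has moved on. A small host-side sketch of the intended usage, where the example_cache field and both helper names are illustrative, not part of this series:

    /* Host-side sketch; 'example_cache' is an imagined field in kvm_vcpu_arch. */
    static int example_register_area(struct kvm_vcpu *vcpu, gpa_t gpa)
    {
            /* Resolve gpa once and remember the hva plus the memslot generation. */
            return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.example_cache, gpa);
    }

    static int example_post_value(struct kvm_vcpu *vcpu, u32 val)
    {
            /* Hot path: reuses the cached hva while the memslot generation matches. */
            return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.example_cache,
                                          &val, sizeof(val));
    }
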
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 6dd3a51ab1cb..46e3cd8e197a 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h | |||
@@ -6,6 +6,36 @@ | |||
6 | #undef TRACE_SYSTEM | 6 | #undef TRACE_SYSTEM |
7 | #define TRACE_SYSTEM kvm | 7 | #define TRACE_SYSTEM kvm |
8 | 8 | ||
9 | #define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } | ||
10 | |||
11 | #define kvm_trace_exit_reason \ | ||
12 | ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ | ||
13 | ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ | ||
14 | ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ | ||
15 | ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ | ||
16 | ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) | ||
17 | |||
18 | TRACE_EVENT(kvm_userspace_exit, | ||
19 | TP_PROTO(__u32 reason, int errno), | ||
20 | TP_ARGS(reason, errno), | ||
21 | |||
22 | TP_STRUCT__entry( | ||
23 | __field( __u32, reason ) | ||
24 | __field( int, errno ) | ||
25 | ), | ||
26 | |||
27 | TP_fast_assign( | ||
28 | __entry->reason = reason; | ||
29 | __entry->errno = errno; | ||
30 | ), | ||
31 | |||
32 | TP_printk("reason %s (%d)", | ||
33 | __entry->errno < 0 ? | ||
34 | (__entry->errno == -EINTR ? "restart" : "error") : | ||
35 | __print_symbolic(__entry->reason, kvm_trace_exit_reason), | ||
36 | __entry->errno < 0 ? -__entry->errno : __entry->reason) | ||
37 | ); | ||
38 | |||
9 | #if defined(__KVM_HAVE_IOAPIC) | 39 | #if defined(__KVM_HAVE_IOAPIC) |
10 | TRACE_EVENT(kvm_set_irq, | 40 | TRACE_EVENT(kvm_set_irq, |
11 | TP_PROTO(unsigned int gsi, int level, int irq_source_id), | 41 | TP_PROTO(unsigned int gsi, int level, int irq_source_id), |
@@ -185,6 +215,97 @@ TRACE_EVENT(kvm_age_page, | |||
185 | __entry->referenced ? "YOUNG" : "OLD") | 215 | __entry->referenced ? "YOUNG" : "OLD") |
186 | ); | 216 | ); |
187 | 217 | ||
218 | #ifdef CONFIG_KVM_ASYNC_PF | ||
219 | DECLARE_EVENT_CLASS(kvm_async_get_page_class, | ||
220 | |||
221 | TP_PROTO(u64 gva, u64 gfn), | ||
222 | |||
223 | TP_ARGS(gva, gfn), | ||
224 | |||
225 | TP_STRUCT__entry( | ||
226 | __field(__u64, gva) | ||
227 | __field(u64, gfn) | ||
228 | ), | ||
229 | |||
230 | TP_fast_assign( | ||
231 | __entry->gva = gva; | ||
232 | __entry->gfn = gfn; | ||
233 | ), | ||
234 | |||
235 | TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) | ||
236 | ); | ||
237 | |||
238 | DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, | ||
239 | |||
240 | TP_PROTO(u64 gva, u64 gfn), | ||
241 | |||
242 | TP_ARGS(gva, gfn) | ||
243 | ); | ||
244 | |||
245 | DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, | ||
246 | |||
247 | TP_PROTO(u64 gva, u64 gfn), | ||
248 | |||
249 | TP_ARGS(gva, gfn) | ||
250 | ); | ||
251 | |||
252 | DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, | ||
253 | |||
254 | TP_PROTO(u64 token, u64 gva), | ||
255 | |||
256 | TP_ARGS(token, gva), | ||
257 | |||
258 | TP_STRUCT__entry( | ||
259 | __field(__u64, token) | ||
260 | __field(__u64, gva) | ||
261 | ), | ||
262 | |||
263 | TP_fast_assign( | ||
264 | __entry->token = token; | ||
265 | __entry->gva = gva; | ||
266 | ), | ||
267 | |||
268 | TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) | ||
269 | |||
270 | ); | ||
271 | |||
272 | DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, | ||
273 | |||
274 | TP_PROTO(u64 token, u64 gva), | ||
275 | |||
276 | TP_ARGS(token, gva) | ||
277 | ); | ||
278 | |||
279 | DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, | ||
280 | |||
281 | TP_PROTO(u64 token, u64 gva), | ||
282 | |||
283 | TP_ARGS(token, gva) | ||
284 | ); | ||
285 | |||
286 | TRACE_EVENT( | ||
287 | kvm_async_pf_completed, | ||
288 | TP_PROTO(unsigned long address, struct page *page, u64 gva), | ||
289 | TP_ARGS(address, page, gva), | ||
290 | |||
291 | TP_STRUCT__entry( | ||
292 | __field(unsigned long, address) | ||
293 | __field(pfn_t, pfn) | ||
294 | __field(u64, gva) | ||
295 | ), | ||
296 | |||
297 | TP_fast_assign( | ||
298 | __entry->address = address; | ||
299 | __entry->pfn = page ? page_to_pfn(page) : 0; | ||
300 | __entry->gva = gva; | ||
301 | ), | ||
302 | |||
303 | TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, | ||
304 | __entry->address, __entry->pfn) | ||
305 | ); | ||
306 | |||
307 | #endif | ||
308 | |||
188 | #endif /* _TRACE_KVM_MAIN_H */ | 309 | #endif /* _TRACE_KVM_MAIN_H */ |
189 | 310 | ||
190 | /* This part must be outside protection */ | 311 | /* This part must be outside protection */ |
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7f1178f6b839..f63ccb0a5982 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig | |||
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE | |||
15 | 15 | ||
16 | config KVM_MMIO | 16 | config KVM_MMIO |
17 | bool | 17 | bool |
18 | |||
19 | config KVM_ASYNC_PF | ||
20 | bool | ||
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 7c98928b09d9..ae72ae604c89 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c | |||
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel | |||
55 | return index; | 55 | return index; |
56 | } | 56 | } |
57 | 57 | ||
58 | static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) | 58 | static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) |
59 | { | 59 | { |
60 | struct kvm_assigned_dev_kernel *assigned_dev; | 60 | struct kvm_assigned_dev_kernel *assigned_dev = dev_id; |
61 | int i; | 61 | u32 vector; |
62 | int index; | ||
62 | 63 | ||
63 | assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, | 64 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { |
64 | interrupt_work); | 65 | spin_lock(&assigned_dev->intx_lock); |
66 | disable_irq_nosync(irq); | ||
67 | assigned_dev->host_irq_disabled = true; | ||
68 | spin_unlock(&assigned_dev->intx_lock); | ||
69 | } | ||
65 | 70 | ||
66 | spin_lock_irq(&assigned_dev->assigned_dev_lock); | ||
67 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | 71 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { |
68 | struct kvm_guest_msix_entry *guest_entries = | 72 | index = find_index_from_host_irq(assigned_dev, irq); |
69 | assigned_dev->guest_msix_entries; | 73 | if (index >= 0) { |
70 | for (i = 0; i < assigned_dev->entries_nr; i++) { | 74 | vector = assigned_dev-> |
71 | if (!(guest_entries[i].flags & | 75 | guest_msix_entries[index].vector; |
72 | KVM_ASSIGNED_MSIX_PENDING)) | ||
73 | continue; | ||
74 | guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; | ||
75 | kvm_set_irq(assigned_dev->kvm, | 76 | kvm_set_irq(assigned_dev->kvm, |
76 | assigned_dev->irq_source_id, | 77 | assigned_dev->irq_source_id, vector, 1); |
77 | guest_entries[i].vector, 1); | ||
78 | } | 78 | } |
79 | } else | 79 | } else |
80 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, | 80 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, |
81 | assigned_dev->guest_irq, 1); | 81 | assigned_dev->guest_irq, 1); |
82 | 82 | ||
83 | spin_unlock_irq(&assigned_dev->assigned_dev_lock); | ||
84 | } | ||
85 | |||
86 | static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) | ||
87 | { | ||
88 | unsigned long flags; | ||
89 | struct kvm_assigned_dev_kernel *assigned_dev = | ||
90 | (struct kvm_assigned_dev_kernel *) dev_id; | ||
91 | |||
92 | spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); | ||
93 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | ||
94 | int index = find_index_from_host_irq(assigned_dev, irq); | ||
95 | if (index < 0) | ||
96 | goto out; | ||
97 | assigned_dev->guest_msix_entries[index].flags |= | ||
98 | KVM_ASSIGNED_MSIX_PENDING; | ||
99 | } | ||
100 | |||
101 | schedule_work(&assigned_dev->interrupt_work); | ||
102 | |||
103 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { | ||
104 | disable_irq_nosync(irq); | ||
105 | assigned_dev->host_irq_disabled = true; | ||
106 | } | ||
107 | |||
108 | out: | ||
109 | spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); | ||
110 | return IRQ_HANDLED; | 83 | return IRQ_HANDLED; |
111 | } | 84 | } |
112 | 85 | ||
@@ -114,7 +87,6 @@ out: | |||
114 | static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) | 87 | static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) |
115 | { | 88 | { |
116 | struct kvm_assigned_dev_kernel *dev; | 89 | struct kvm_assigned_dev_kernel *dev; |
117 | unsigned long flags; | ||
118 | 90 | ||
119 | if (kian->gsi == -1) | 91 | if (kian->gsi == -1) |
120 | return; | 92 | return; |
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
127 | /* The guest irq may be shared so this ack may be | 99 | /* The guest irq may be shared so this ack may be |
128 | * from another device. | 100 | * from another device. |
129 | */ | 101 | */ |
130 | spin_lock_irqsave(&dev->assigned_dev_lock, flags); | 102 | spin_lock(&dev->intx_lock); |
131 | if (dev->host_irq_disabled) { | 103 | if (dev->host_irq_disabled) { |
132 | enable_irq(dev->host_irq); | 104 | enable_irq(dev->host_irq); |
133 | dev->host_irq_disabled = false; | 105 | dev->host_irq_disabled = false; |
134 | } | 106 | } |
135 | spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); | 107 | spin_unlock(&dev->intx_lock); |
136 | } | 108 | } |
137 | 109 | ||
138 | static void deassign_guest_irq(struct kvm *kvm, | 110 | static void deassign_guest_irq(struct kvm *kvm, |
@@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm, | |||
141 | kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); | 113 | kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); |
142 | assigned_dev->ack_notifier.gsi = -1; | 114 | assigned_dev->ack_notifier.gsi = -1; |
143 | 115 | ||
116 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, | ||
117 | assigned_dev->guest_irq, 0); | ||
118 | |||
144 | if (assigned_dev->irq_source_id != -1) | 119 | if (assigned_dev->irq_source_id != -1) |
145 | kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); | 120 | kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); |
146 | assigned_dev->irq_source_id = -1; | 121 | assigned_dev->irq_source_id = -1; |
@@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm, | |||
152 | struct kvm_assigned_dev_kernel *assigned_dev) | 127 | struct kvm_assigned_dev_kernel *assigned_dev) |
153 | { | 128 | { |
154 | /* | 129 | /* |
155 | * In kvm_free_device_irq, cancel_work_sync return true if: | 130 | * We disable irq here to prevent further events. |
156 | * 1. work is scheduled, and then cancelled. | ||
157 | * 2. work callback is executed. | ||
158 | * | ||
159 | * The first one ensured that the irq is disabled and no more events | ||
160 | * would happen. But for the second one, the irq may be enabled (e.g. | ||
161 | * for MSI). So we disable irq here to prevent further events. | ||
162 | * | 131 | * |
163 | * Note this may result in a nested disable if the interrupt type is | 132 | * Note this may result in a nested disable if the interrupt type is |
164 | * INTx, but that's OK because we are about to free it. | 133 | * INTx, but that's OK because we are about to free it. |
165 | * | 134 | * |
166 | * If this function is a part of VM destroy, please ensure that till | 135 | * If this function is a part of VM destroy, please ensure that till |
167 | * now, the kvm state is still legal for probably we also have to wait | 136 | * now, the kvm state is still legal for probably we also have to wait |
168 | * interrupt_work done. | 137 | * on a currently running IRQ handler. |
169 | */ | 138 | */ |
170 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | 139 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { |
171 | int i; | 140 | int i; |
172 | for (i = 0; i < assigned_dev->entries_nr; i++) | 141 | for (i = 0; i < assigned_dev->entries_nr; i++) |
173 | disable_irq_nosync(assigned_dev-> | 142 | disable_irq(assigned_dev->host_msix_entries[i].vector); |
174 | host_msix_entries[i].vector); | ||
175 | |||
176 | cancel_work_sync(&assigned_dev->interrupt_work); | ||
177 | 143 | ||
178 | for (i = 0; i < assigned_dev->entries_nr; i++) | 144 | for (i = 0; i < assigned_dev->entries_nr; i++) |
179 | free_irq(assigned_dev->host_msix_entries[i].vector, | 145 | free_irq(assigned_dev->host_msix_entries[i].vector, |
@@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm, | |||
185 | pci_disable_msix(assigned_dev->dev); | 151 | pci_disable_msix(assigned_dev->dev); |
186 | } else { | 152 | } else { |
187 | /* Deal with MSI and INTx */ | 153 | /* Deal with MSI and INTx */ |
188 | disable_irq_nosync(assigned_dev->host_irq); | 154 | disable_irq(assigned_dev->host_irq); |
189 | cancel_work_sync(&assigned_dev->interrupt_work); | ||
190 | 155 | ||
191 | free_irq(assigned_dev->host_irq, (void *)assigned_dev); | 156 | free_irq(assigned_dev->host_irq, (void *)assigned_dev); |
192 | 157 | ||
@@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm, | |||
232 | { | 197 | { |
233 | kvm_free_assigned_irq(kvm, assigned_dev); | 198 | kvm_free_assigned_irq(kvm, assigned_dev); |
234 | 199 | ||
235 | pci_reset_function(assigned_dev->dev); | 200 | __pci_reset_function(assigned_dev->dev); |
201 | pci_restore_state(assigned_dev->dev); | ||
236 | 202 | ||
237 | pci_release_regions(assigned_dev->dev); | 203 | pci_release_regions(assigned_dev->dev); |
238 | pci_disable_device(assigned_dev->dev); | 204 | pci_disable_device(assigned_dev->dev); |
@@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, | |||
265 | * on the same interrupt line is not a happy situation: there | 231 | * on the same interrupt line is not a happy situation: there |
266 | * are going to be long delays in accepting, acking, etc. | 232 | * are going to be long delays in accepting, acking, etc. |
267 | */ | 233 | */ |
268 | if (request_irq(dev->host_irq, kvm_assigned_dev_intr, | 234 | if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, |
269 | 0, "kvm_assigned_intx_device", (void *)dev)) | 235 | IRQF_ONESHOT, dev->irq_name, (void *)dev)) |
270 | return -EIO; | 236 | return -EIO; |
271 | return 0; | 237 | return 0; |
272 | } | 238 | } |
@@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, | |||
284 | } | 250 | } |
285 | 251 | ||
286 | dev->host_irq = dev->dev->irq; | 252 | dev->host_irq = dev->dev->irq; |
287 | if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, | 253 | if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, |
288 | "kvm_assigned_msi_device", (void *)dev)) { | 254 | 0, dev->irq_name, (void *)dev)) { |
289 | pci_disable_msi(dev->dev); | 255 | pci_disable_msi(dev->dev); |
290 | return -EIO; | 256 | return -EIO; |
291 | } | 257 | } |
@@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, | |||
310 | return r; | 276 | return r; |
311 | 277 | ||
312 | for (i = 0; i < dev->entries_nr; i++) { | 278 | for (i = 0; i < dev->entries_nr; i++) { |
313 | r = request_irq(dev->host_msix_entries[i].vector, | 279 | r = request_threaded_irq(dev->host_msix_entries[i].vector, |
314 | kvm_assigned_dev_intr, 0, | 280 | NULL, kvm_assigned_dev_thread, |
315 | "kvm_assigned_msix_device", | 281 | 0, dev->irq_name, (void *)dev); |
316 | (void *)dev); | ||
317 | if (r) | 282 | if (r) |
318 | goto err; | 283 | goto err; |
319 | } | 284 | } |
@@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm, | |||
370 | if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) | 335 | if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) |
371 | return r; | 336 | return r; |
372 | 337 | ||
338 | snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", | ||
339 | pci_name(dev->dev)); | ||
340 | |||
373 | switch (host_irq_type) { | 341 | switch (host_irq_type) { |
374 | case KVM_DEV_IRQ_HOST_INTX: | 342 | case KVM_DEV_IRQ_HOST_INTX: |
375 | r = assigned_device_enable_host_intx(kvm, dev); | 343 | r = assigned_device_enable_host_intx(kvm, dev); |
@@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, | |||
547 | } | 515 | } |
548 | 516 | ||
549 | pci_reset_function(dev); | 517 | pci_reset_function(dev); |
518 | pci_save_state(dev); | ||
550 | 519 | ||
551 | match->assigned_dev_id = assigned_dev->assigned_dev_id; | 520 | match->assigned_dev_id = assigned_dev->assigned_dev_id; |
552 | match->host_segnr = assigned_dev->segnr; | 521 | match->host_segnr = assigned_dev->segnr; |
@@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, | |||
554 | match->host_devfn = assigned_dev->devfn; | 523 | match->host_devfn = assigned_dev->devfn; |
555 | match->flags = assigned_dev->flags; | 524 | match->flags = assigned_dev->flags; |
556 | match->dev = dev; | 525 | match->dev = dev; |
557 | spin_lock_init(&match->assigned_dev_lock); | 526 | spin_lock_init(&match->intx_lock); |
558 | match->irq_source_id = -1; | 527 | match->irq_source_id = -1; |
559 | match->kvm = kvm; | 528 | match->kvm = kvm; |
560 | match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; | 529 | match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; |
561 | INIT_WORK(&match->interrupt_work, | ||
562 | kvm_assigned_dev_interrupt_work_handler); | ||
563 | 530 | ||
564 | list_add(&match->list, &kvm->arch.assigned_dev_head); | 531 | list_add(&match->list, &kvm->arch.assigned_dev_head); |
565 | 532 | ||
@@ -579,6 +546,7 @@ out: | |||
579 | mutex_unlock(&kvm->lock); | 546 | mutex_unlock(&kvm->lock); |
580 | return r; | 547 | return r; |
581 | out_list_del: | 548 | out_list_del: |
549 | pci_restore_state(dev); | ||
582 | list_del(&match->list); | 550 | list_del(&match->list); |
583 | pci_release_regions(dev); | 551 | pci_release_regions(dev); |
584 | out_disable: | 552 | out_disable: |
@@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, | |||
651 | r = -ENOMEM; | 619 | r = -ENOMEM; |
652 | goto msix_nr_out; | 620 | goto msix_nr_out; |
653 | } | 621 | } |
654 | adev->guest_msix_entries = kzalloc( | 622 | adev->guest_msix_entries = |
655 | sizeof(struct kvm_guest_msix_entry) * | 623 | kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, |
656 | entry_nr->entry_nr, GFP_KERNEL); | 624 | GFP_KERNEL); |
657 | if (!adev->guest_msix_entries) { | 625 | if (!adev->guest_msix_entries) { |
658 | kfree(adev->host_msix_entries); | 626 | kfree(adev->host_msix_entries); |
659 | r = -ENOMEM; | 627 | r = -ENOMEM; |
@@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
706 | unsigned long arg) | 674 | unsigned long arg) |
707 | { | 675 | { |
708 | void __user *argp = (void __user *)arg; | 676 | void __user *argp = (void __user *)arg; |
709 | int r = -ENOTTY; | 677 | int r; |
710 | 678 | ||
711 | switch (ioctl) { | 679 | switch (ioctl) { |
712 | case KVM_ASSIGN_PCI_DEVICE: { | 680 | case KVM_ASSIGN_PCI_DEVICE: { |
@@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
724 | r = -EOPNOTSUPP; | 692 | r = -EOPNOTSUPP; |
725 | break; | 693 | break; |
726 | } | 694 | } |
727 | #ifdef KVM_CAP_ASSIGN_DEV_IRQ | ||
728 | case KVM_ASSIGN_DEV_IRQ: { | 695 | case KVM_ASSIGN_DEV_IRQ: { |
729 | struct kvm_assigned_irq assigned_irq; | 696 | struct kvm_assigned_irq assigned_irq; |
730 | 697 | ||
@@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
747 | goto out; | 714 | goto out; |
748 | break; | 715 | break; |
749 | } | 716 | } |
750 | #endif | ||
751 | #ifdef KVM_CAP_DEVICE_DEASSIGNMENT | ||
752 | case KVM_DEASSIGN_PCI_DEVICE: { | 717 | case KVM_DEASSIGN_PCI_DEVICE: { |
753 | struct kvm_assigned_pci_dev assigned_dev; | 718 | struct kvm_assigned_pci_dev assigned_dev; |
754 | 719 | ||
@@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
760 | goto out; | 725 | goto out; |
761 | break; | 726 | break; |
762 | } | 727 | } |
763 | #endif | ||
764 | #ifdef KVM_CAP_IRQ_ROUTING | 728 | #ifdef KVM_CAP_IRQ_ROUTING |
765 | case KVM_SET_GSI_ROUTING: { | 729 | case KVM_SET_GSI_ROUTING: { |
766 | struct kvm_irq_routing routing; | 730 | struct kvm_irq_routing routing; |
@@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, | |||
813 | break; | 777 | break; |
814 | } | 778 | } |
815 | #endif | 779 | #endif |
780 | default: | ||
781 | r = -ENOTTY; | ||
782 | break; | ||
816 | } | 783 | } |
817 | out: | 784 | out: |
818 | return r; | 785 | return r; |
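
The deferred interrupt_work path is replaced by threaded interrupt handlers: request_threaded_irq() is called with a NULL hard handler, so injection into the guest runs in a kernel thread, and IRQF_ONESHOT keeps an INTx line masked until that thread has finished (the MSI and MSI-X paths pass 0 instead). The generic pattern, reduced to a sketch with placeholder names:

    #include <linux/interrupt.h>

    struct my_dev { int dummy; };                     /* placeholder device state */

    static void handle_event(struct my_dev *dev) { }  /* placeholder; may sleep */

    static irqreturn_t my_dev_thread(int irq, void *dev_id)
    {
            struct my_dev *dev = dev_id;

            handle_event(dev);      /* runs in a kernel thread, so sleeping is fine */
            return IRQ_HANDLED;
    }

    static int my_dev_setup_irq(struct my_dev *dev, int irq)
    {
            /*
             * NULL hard handler: the core acks the interrupt and wakes the thread.
             * IRQF_ONESHOT keeps a level-triggered (INTx) line masked until the
             * thread returns, which is why the INTx path above no longer needs
             * the old deferred-work bookkeeping.
             */
            return request_threaded_irq(irq, NULL, my_dev_thread,
                                        IRQF_ONESHOT, "my_dev", dev);
    }
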
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 000000000000..74268b4c2ee1 --- /dev/null +++ b/virt/kvm/async_pf.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * kvm asynchronous fault support | ||
3 | * | ||
4 | * Copyright 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Author: | ||
7 | * Gleb Natapov <gleb@redhat.com> | ||
8 | * | ||
9 | * This file is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of version 2 of the GNU General Public License | ||
11 | * as published by the Free Software Foundation. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software Foundation, | ||
20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
21 | */ | ||
22 | |||
23 | #include <linux/kvm_host.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/mmu_context.h> | ||
27 | |||
28 | #include "async_pf.h" | ||
29 | #include <trace/events/kvm.h> | ||
30 | |||
31 | static struct kmem_cache *async_pf_cache; | ||
32 | |||
33 | int kvm_async_pf_init(void) | ||
34 | { | ||
35 | async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); | ||
36 | |||
37 | if (!async_pf_cache) | ||
38 | return -ENOMEM; | ||
39 | |||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | void kvm_async_pf_deinit(void) | ||
44 | { | ||
45 | if (async_pf_cache) | ||
46 | kmem_cache_destroy(async_pf_cache); | ||
47 | async_pf_cache = NULL; | ||
48 | } | ||
49 | |||
50 | void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) | ||
51 | { | ||
52 | INIT_LIST_HEAD(&vcpu->async_pf.done); | ||
53 | INIT_LIST_HEAD(&vcpu->async_pf.queue); | ||
54 | spin_lock_init(&vcpu->async_pf.lock); | ||
55 | } | ||
56 | |||
57 | static void async_pf_execute(struct work_struct *work) | ||
58 | { | ||
59 | struct page *page = NULL; | ||
60 | struct kvm_async_pf *apf = | ||
61 | container_of(work, struct kvm_async_pf, work); | ||
62 | struct mm_struct *mm = apf->mm; | ||
63 | struct kvm_vcpu *vcpu = apf->vcpu; | ||
64 | unsigned long addr = apf->addr; | ||
65 | gva_t gva = apf->gva; | ||
66 | |||
67 | might_sleep(); | ||
68 | |||
69 | use_mm(mm); | ||
70 | down_read(&mm->mmap_sem); | ||
71 | get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); | ||
72 | up_read(&mm->mmap_sem); | ||
73 | unuse_mm(mm); | ||
74 | |||
75 | spin_lock(&vcpu->async_pf.lock); | ||
76 | list_add_tail(&apf->link, &vcpu->async_pf.done); | ||
77 | apf->page = page; | ||
78 | apf->done = true; | ||
79 | spin_unlock(&vcpu->async_pf.lock); | ||
80 | |||
81 | /* | ||
82 | * apf may be freed by kvm_check_async_pf_completion() after | ||
83 | * this point | ||
84 | */ | ||
85 | |||
86 | trace_kvm_async_pf_completed(addr, page, gva); | ||
87 | |||
88 | if (waitqueue_active(&vcpu->wq)) | ||
89 | wake_up_interruptible(&vcpu->wq); | ||
90 | |||
91 | mmdrop(mm); | ||
92 | kvm_put_kvm(vcpu->kvm); | ||
93 | } | ||
94 | |||
95 | void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) | ||
96 | { | ||
97 | /* cancel outstanding work queue item */ | ||
98 | while (!list_empty(&vcpu->async_pf.queue)) { | ||
99 | struct kvm_async_pf *work = | ||
100 | list_entry(vcpu->async_pf.queue.next, | ||
101 | typeof(*work), queue); | ||
102 | cancel_work_sync(&work->work); | ||
103 | list_del(&work->queue); | ||
104 | if (!work->done) /* work was canceled */ | ||
105 | kmem_cache_free(async_pf_cache, work); | ||
106 | } | ||
107 | |||
108 | spin_lock(&vcpu->async_pf.lock); | ||
109 | while (!list_empty(&vcpu->async_pf.done)) { | ||
110 | struct kvm_async_pf *work = | ||
111 | list_entry(vcpu->async_pf.done.next, | ||
112 | typeof(*work), link); | ||
113 | list_del(&work->link); | ||
114 | if (work->page) | ||
115 | put_page(work->page); | ||
116 | kmem_cache_free(async_pf_cache, work); | ||
117 | } | ||
118 | spin_unlock(&vcpu->async_pf.lock); | ||
119 | |||
120 | vcpu->async_pf.queued = 0; | ||
121 | } | ||
122 | |||
123 | void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) | ||
124 | { | ||
125 | struct kvm_async_pf *work; | ||
126 | |||
127 | while (!list_empty_careful(&vcpu->async_pf.done) && | ||
128 | kvm_arch_can_inject_async_page_present(vcpu)) { | ||
129 | spin_lock(&vcpu->async_pf.lock); | ||
130 | work = list_first_entry(&vcpu->async_pf.done, typeof(*work), | ||
131 | link); | ||
132 | list_del(&work->link); | ||
133 | spin_unlock(&vcpu->async_pf.lock); | ||
134 | |||
135 | if (work->page) | ||
136 | kvm_arch_async_page_ready(vcpu, work); | ||
137 | kvm_arch_async_page_present(vcpu, work); | ||
138 | |||
139 | list_del(&work->queue); | ||
140 | vcpu->async_pf.queued--; | ||
141 | if (work->page) | ||
142 | put_page(work->page); | ||
143 | kmem_cache_free(async_pf_cache, work); | ||
144 | } | ||
145 | } | ||
146 | |||
147 | int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | ||
148 | struct kvm_arch_async_pf *arch) | ||
149 | { | ||
150 | struct kvm_async_pf *work; | ||
151 | |||
152 | if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) | ||
153 | return 0; | ||
154 | |||
155 | /* setup delayed work */ | ||
156 | |||
157 | /* | ||
158 | * allocate with GFP_NOWAIT: if we are going to sleep anyway, we | ||
159 | * may as well sleep while faulting the page in | ||
160 | */ | ||
161 | work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); | ||
162 | if (!work) | ||
163 | return 0; | ||
164 | |||
165 | work->page = NULL; | ||
166 | work->done = false; | ||
167 | work->vcpu = vcpu; | ||
168 | work->gva = gva; | ||
169 | work->addr = gfn_to_hva(vcpu->kvm, gfn); | ||
170 | work->arch = *arch; | ||
171 | work->mm = current->mm; | ||
172 | atomic_inc(&work->mm->mm_count); | ||
173 | kvm_get_kvm(work->vcpu->kvm); | ||
174 | |||
175 | /* this can't really happen; otherwise gfn_to_pfn_async | ||
176 | would have succeeded */ | ||
177 | if (unlikely(kvm_is_error_hva(work->addr))) | ||
178 | goto retry_sync; | ||
179 | |||
180 | INIT_WORK(&work->work, async_pf_execute); | ||
181 | if (!schedule_work(&work->work)) | ||
182 | goto retry_sync; | ||
183 | |||
184 | list_add_tail(&work->queue, &vcpu->async_pf.queue); | ||
185 | vcpu->async_pf.queued++; | ||
186 | kvm_arch_async_page_not_present(vcpu, work); | ||
187 | return 1; | ||
188 | retry_sync: | ||
189 | kvm_put_kvm(work->vcpu->kvm); | ||
190 | mmdrop(work->mm); | ||
191 | kmem_cache_free(async_pf_cache, work); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) | ||
196 | { | ||
197 | struct kvm_async_pf *work; | ||
198 | |||
199 | if (!list_empty_careful(&vcpu->async_pf.done)) | ||
200 | return 0; | ||
201 | |||
202 | work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); | ||
203 | if (!work) | ||
204 | return -ENOMEM; | ||
205 | |||
206 | work->page = bad_page; | ||
207 | get_page(bad_page); | ||
208 | INIT_LIST_HEAD(&work->queue); /* for list_del to work */ | ||
209 | |||
210 | spin_lock(&vcpu->async_pf.lock); | ||
211 | list_add_tail(&work->link, &vcpu->async_pf.done); | ||
212 | spin_unlock(&vcpu->async_pf.lock); | ||
213 | |||
214 | vcpu->async_pf.queued++; | ||
215 | return 0; | ||
216 | } | ||
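
This file is the arch-independent half of async PF: kvm_setup_async_pf() takes references on the mm and the VM, queues a work item that faults the page in under use_mm(), and completed items collect on vcpu->async_pf.done until kvm_check_async_pf_completion() hands them to the arch callbacks. How an architecture is expected to drive it can be sketched roughly as below; this is illustrative pseudocode rather than the x86 call sites from this merge, and every gfn_is_*/map_* helper is a placeholder:

    /* Illustrative arch-side flow only; the real x86 call sites are in mmu.c/x86.c. */
    static int arch_guest_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
    {
            struct kvm_arch_async_pf arch_info = { /* token, gfn, cr3, ... */ };

            if (gfn_is_already_mapped(vcpu, gfn))           /* placeholder check */
                    return map_now(vcpu, gfn);              /* placeholder fast path */

            /* Queue the slow fault; on success a "page not present" event is sent. */
            if (kvm_setup_async_pf(vcpu, gva, gfn, &arch_info))
                    return 0;                               /* guest may schedule other work */

            return map_now(vcpu, gfn);                      /* queue full: fault synchronously */
    }

    static void arch_before_guest_entry(struct kvm_vcpu *vcpu)
    {
            /* Deliver any completed async faults ("page ready" events) to the guest. */
            kvm_check_async_pf_completion(vcpu);
    }
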
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 000000000000..e7ef6447cb82 --- /dev/null +++ b/virt/kvm/async_pf.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* | ||
2 | * kvm asynchronous fault support | ||
3 | * | ||
4 | * Copyright 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Author: | ||
7 | * Gleb Natapov <gleb@redhat.com> | ||
8 | * | ||
9 | * This file is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of version 2 of the GNU General Public License | ||
11 | * as published by the Free Software Foundation. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software Foundation, | ||
20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
21 | */ | ||
22 | |||
23 | #ifndef __KVM_ASYNC_PF_H__ | ||
24 | #define __KVM_ASYNC_PF_H__ | ||
25 | |||
26 | #ifdef CONFIG_KVM_ASYNC_PF | ||
27 | int kvm_async_pf_init(void); | ||
28 | void kvm_async_pf_deinit(void); | ||
29 | void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); | ||
30 | #else | ||
31 | #define kvm_async_pf_init() (0) | ||
32 | #define kvm_async_pf_deinit() do{}while(0) | ||
33 | #define kvm_async_pf_vcpu_init(C) do{}while(0) | ||
34 | #endif | ||
35 | |||
36 | #endif | ||
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c1f1e3c62984..2ca4535f4fb7 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c | |||
@@ -44,14 +44,19 @@ | |||
44 | */ | 44 | */ |
45 | 45 | ||
46 | struct _irqfd { | 46 | struct _irqfd { |
47 | struct kvm *kvm; | 47 | /* Used for MSI fast-path */ |
48 | struct eventfd_ctx *eventfd; | 48 | struct kvm *kvm; |
49 | int gsi; | 49 | wait_queue_t wait; |
50 | struct list_head list; | 50 | /* Update side is protected by irqfds.lock */ |
51 | poll_table pt; | 51 | struct kvm_kernel_irq_routing_entry __rcu *irq_entry; |
52 | wait_queue_t wait; | 52 | /* Used for level IRQ fast-path */ |
53 | struct work_struct inject; | 53 | int gsi; |
54 | struct work_struct shutdown; | 54 | struct work_struct inject; |
55 | /* Used for setup/shutdown */ | ||
56 | struct eventfd_ctx *eventfd; | ||
57 | struct list_head list; | ||
58 | poll_table pt; | ||
59 | struct work_struct shutdown; | ||
55 | }; | 60 | }; |
56 | 61 | ||
57 | static struct workqueue_struct *irqfd_cleanup_wq; | 62 | static struct workqueue_struct *irqfd_cleanup_wq; |
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | |||
125 | { | 130 | { |
126 | struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); | 131 | struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); |
127 | unsigned long flags = (unsigned long)key; | 132 | unsigned long flags = (unsigned long)key; |
133 | struct kvm_kernel_irq_routing_entry *irq; | ||
134 | struct kvm *kvm = irqfd->kvm; | ||
128 | 135 | ||
129 | if (flags & POLLIN) | 136 | if (flags & POLLIN) { |
137 | rcu_read_lock(); | ||
138 | irq = rcu_dereference(irqfd->irq_entry); | ||
130 | /* An event has been signaled, inject an interrupt */ | 139 | /* An event has been signaled, inject an interrupt */ |
131 | schedule_work(&irqfd->inject); | 140 | if (irq) |
141 | kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); | ||
142 | else | ||
143 | schedule_work(&irqfd->inject); | ||
144 | rcu_read_unlock(); | ||
145 | } | ||
132 | 146 | ||
133 | if (flags & POLLHUP) { | 147 | if (flags & POLLHUP) { |
134 | /* The eventfd is closing, detach from KVM */ | 148 | /* The eventfd is closing, detach from KVM */ |
135 | struct kvm *kvm = irqfd->kvm; | ||
136 | unsigned long flags; | 149 | unsigned long flags; |
137 | 150 | ||
138 | spin_lock_irqsave(&kvm->irqfds.lock, flags); | 151 | spin_lock_irqsave(&kvm->irqfds.lock, flags); |
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, | |||
163 | add_wait_queue(wqh, &irqfd->wait); | 176 | add_wait_queue(wqh, &irqfd->wait); |
164 | } | 177 | } |
165 | 178 | ||
179 | /* Must be called under irqfds.lock */ | ||
180 | static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, | ||
181 | struct kvm_irq_routing_table *irq_rt) | ||
182 | { | ||
183 | struct kvm_kernel_irq_routing_entry *e; | ||
184 | struct hlist_node *n; | ||
185 | |||
186 | if (irqfd->gsi >= irq_rt->nr_rt_entries) { | ||
187 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
188 | return; | ||
189 | } | ||
190 | |||
191 | hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { | ||
192 | /* Only fast-path MSI. */ | ||
193 | if (e->type == KVM_IRQ_ROUTING_MSI) | ||
194 | rcu_assign_pointer(irqfd->irq_entry, e); | ||
195 | else | ||
196 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
197 | } | ||
198 | } | ||
199 | |||
166 | static int | 200 | static int |
167 | kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) | 201 | kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) |
168 | { | 202 | { |
203 | struct kvm_irq_routing_table *irq_rt; | ||
169 | struct _irqfd *irqfd, *tmp; | 204 | struct _irqfd *irqfd, *tmp; |
170 | struct file *file = NULL; | 205 | struct file *file = NULL; |
171 | struct eventfd_ctx *eventfd = NULL; | 206 | struct eventfd_ctx *eventfd = NULL; |
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) | |||
215 | goto fail; | 250 | goto fail; |
216 | } | 251 | } |
217 | 252 | ||
253 | irq_rt = rcu_dereference_protected(kvm->irq_routing, | ||
254 | lockdep_is_held(&kvm->irqfds.lock)); | ||
255 | irqfd_update(kvm, irqfd, irq_rt); | ||
256 | |||
218 | events = file->f_op->poll(file, &irqfd->pt); | 257 | events = file->f_op->poll(file, &irqfd->pt); |
219 | 258 | ||
220 | list_add_tail(&irqfd->list, &kvm->irqfds.items); | 259 | list_add_tail(&irqfd->list, &kvm->irqfds.items); |
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) | |||
271 | spin_lock_irq(&kvm->irqfds.lock); | 310 | spin_lock_irq(&kvm->irqfds.lock); |
272 | 311 | ||
273 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { | 312 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { |
274 | if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) | 313 | if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { |
314 | /* | ||
315 | * This rcu_assign_pointer is needed in case | ||
316 | * another thread calls kvm_irq_routing_update before | ||
317 | * we flush the workqueue below. | ||
318 | * It is paired with the synchronize_rcu done by the | ||
319 | * caller of that function. | ||
320 | */ | ||
321 | rcu_assign_pointer(irqfd->irq_entry, NULL); | ||
275 | irqfd_deactivate(irqfd); | 322 | irqfd_deactivate(irqfd); |
323 | } | ||
276 | } | 324 | } |
277 | 325 | ||
278 | spin_unlock_irq(&kvm->irqfds.lock); | 326 | spin_unlock_irq(&kvm->irqfds.lock); |
@@ -322,6 +370,25 @@ kvm_irqfd_release(struct kvm *kvm) | |||
322 | } | 370 | } |
323 | 371 | ||
324 | /* | 372 | /* |
373 | * Change irq_routing and irqfd. | ||
374 | * Caller must invoke synchronize_rcu afterwards. | ||
375 | */ | ||
376 | void kvm_irq_routing_update(struct kvm *kvm, | ||
377 | struct kvm_irq_routing_table *irq_rt) | ||
378 | { | ||
379 | struct _irqfd *irqfd; | ||
380 | |||
381 | spin_lock_irq(&kvm->irqfds.lock); | ||
382 | |||
383 | rcu_assign_pointer(kvm->irq_routing, irq_rt); | ||
384 | |||
385 | list_for_each_entry(irqfd, &kvm->irqfds.items, list) | ||
386 | irqfd_update(kvm, irqfd, irq_rt); | ||
387 | |||
388 | spin_unlock_irq(&kvm->irqfds.lock); | ||
389 | } | ||
390 | |||
391 | /* | ||
325 | * create a host-wide workqueue for issuing deferred shutdown requests | 392 | * create a host-wide workqueue for issuing deferred shutdown requests |
326 | * aggregated from all vm* instances. We need our own isolated single-thread | 393 | * aggregated from all vm* instances. We need our own isolated single-thread |
327 | * queue to prevent deadlock against flushing the normal work-queue. | 394 | * queue to prevent deadlock against flushing the normal work-queue. |
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8edca9141b78..9f614b4e365f 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c | |||
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, | |||
114 | return r; | 114 | return r; |
115 | } | 115 | } |
116 | 116 | ||
117 | static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, | 117 | int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, |
118 | struct kvm *kvm, int irq_source_id, int level) | 118 | struct kvm *kvm, int irq_source_id, int level) |
119 | { | 119 | { |
120 | struct kvm_lapic_irq irq; | 120 | struct kvm_lapic_irq irq; |
121 | 121 | ||
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm, | |||
409 | 409 | ||
410 | mutex_lock(&kvm->irq_lock); | 410 | mutex_lock(&kvm->irq_lock); |
411 | old = kvm->irq_routing; | 411 | old = kvm->irq_routing; |
412 | rcu_assign_pointer(kvm->irq_routing, new); | 412 | kvm_irq_routing_update(kvm, new); |
413 | mutex_unlock(&kvm->irq_lock); | 413 | mutex_unlock(&kvm->irq_lock); |
414 | |||
414 | synchronize_rcu(); | 415 | synchronize_rcu(); |
415 | 416 | ||
416 | new = old; | 417 | new = old; |
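
kvm_set_irq_routing() now publishes the new table through kvm_irq_routing_update(), which also refreshes every irqfd's cached MSI routing entry under irqfds.lock, and only after synchronize_rcu() returns is the old table released. Stripped of the KVM specifics, the publish-then-retire pattern it relies on looks like this (generic sketch, names invented for the example):

    #include <linux/mutex.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct table {
            int nr_entries;
            /* ... */
    };

    static struct table __rcu *current_table;
    static DEFINE_MUTEX(update_lock);

    static void table_replace(struct table *new)
    {
            struct table *old;

            mutex_lock(&update_lock);
            old = rcu_dereference_protected(current_table,
                                            lockdep_is_held(&update_lock));
            rcu_assign_pointer(current_table, new);   /* publish: readers now see 'new' */
            mutex_unlock(&update_lock);

            synchronize_rcu();      /* wait until no rcu_read_lock() reader can hold 'old' */
            kfree(old);
    }
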
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5225052aebc1..7f686251f711 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <asm-generic/bitops/le.h> | 55 | #include <asm-generic/bitops/le.h> |
56 | 56 | ||
57 | #include "coalesced_mmio.h" | 57 | #include "coalesced_mmio.h" |
58 | #include "async_pf.h" | ||
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/kvm.h> | 61 | #include <trace/events/kvm.h> |
@@ -89,7 +90,8 @@ static void hardware_disable_all(void); | |||
89 | 90 | ||
90 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); | 91 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); |
91 | 92 | ||
92 | static bool kvm_rebooting; | 93 | bool kvm_rebooting; |
94 | EXPORT_SYMBOL_GPL(kvm_rebooting); | ||
93 | 95 | ||
94 | static bool largepages_enabled = true; | 96 | static bool largepages_enabled = true; |
95 | 97 | ||
@@ -167,8 +169,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) | |||
167 | 169 | ||
168 | void kvm_flush_remote_tlbs(struct kvm *kvm) | 170 | void kvm_flush_remote_tlbs(struct kvm *kvm) |
169 | { | 171 | { |
172 | int dirty_count = kvm->tlbs_dirty; | ||
173 | |||
174 | smp_mb(); | ||
170 | if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) | 175 | if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) |
171 | ++kvm->stat.remote_tlb_flush; | 176 | ++kvm->stat.remote_tlb_flush; |
177 | cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); | ||
172 | } | 178 | } |
173 | 179 | ||
174 | void kvm_reload_remote_mmus(struct kvm *kvm) | 180 | void kvm_reload_remote_mmus(struct kvm *kvm) |
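The tlbs_dirty counter introduced here lets the shadow-paging code defer a remote TLB flush instead of sending IPIs immediately: kvm_flush_remote_tlbs() snapshots the counter, issues the flush request, and clears the counter with cmpxchg() only if no new deferral raced in meanwhile. The producer side lives in the MMU code rather than this file; as a sketch, with an invented helper name and my own assumption that it runs under mmu_lock:

	/* hypothetical producer: note that a remote flush is still owed */
	static void defer_remote_tlb_flush(struct kvm *kvm)
	{
		kvm->tlbs_dirty++;	/* assumption: called with kvm->mmu_lock held */
	}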
@@ -186,6 +192,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
186 | vcpu->kvm = kvm; | 192 | vcpu->kvm = kvm; |
187 | vcpu->vcpu_id = id; | 193 | vcpu->vcpu_id = id; |
188 | init_waitqueue_head(&vcpu->wq); | 194 | init_waitqueue_head(&vcpu->wq); |
195 | kvm_async_pf_vcpu_init(vcpu); | ||
189 | 196 | ||
190 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 197 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
191 | if (!page) { | 198 | if (!page) { |
@@ -247,7 +254,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, | |||
247 | idx = srcu_read_lock(&kvm->srcu); | 254 | idx = srcu_read_lock(&kvm->srcu); |
248 | spin_lock(&kvm->mmu_lock); | 255 | spin_lock(&kvm->mmu_lock); |
249 | kvm->mmu_notifier_seq++; | 256 | kvm->mmu_notifier_seq++; |
250 | need_tlb_flush = kvm_unmap_hva(kvm, address); | 257 | need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; |
251 | spin_unlock(&kvm->mmu_lock); | 258 | spin_unlock(&kvm->mmu_lock); |
252 | srcu_read_unlock(&kvm->srcu, idx); | 259 | srcu_read_unlock(&kvm->srcu, idx); |
253 | 260 | ||
@@ -291,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
291 | kvm->mmu_notifier_count++; | 298 | kvm->mmu_notifier_count++; |
292 | for (; start < end; start += PAGE_SIZE) | 299 | for (; start < end; start += PAGE_SIZE) |
293 | need_tlb_flush |= kvm_unmap_hva(kvm, start); | 300 | need_tlb_flush |= kvm_unmap_hva(kvm, start); |
301 | need_tlb_flush |= kvm->tlbs_dirty; | ||
294 | spin_unlock(&kvm->mmu_lock); | 302 | spin_unlock(&kvm->mmu_lock); |
295 | srcu_read_unlock(&kvm->srcu, idx); | 303 | srcu_read_unlock(&kvm->srcu, idx); |
296 | 304 | ||
@@ -381,11 +389,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) | |||
381 | 389 | ||
382 | static struct kvm *kvm_create_vm(void) | 390 | static struct kvm *kvm_create_vm(void) |
383 | { | 391 | { |
384 | int r = 0, i; | 392 | int r, i; |
385 | struct kvm *kvm = kvm_arch_create_vm(); | 393 | struct kvm *kvm = kvm_arch_alloc_vm(); |
386 | 394 | ||
387 | if (IS_ERR(kvm)) | 395 | if (!kvm) |
388 | goto out; | 396 | return ERR_PTR(-ENOMEM); |
397 | |||
398 | r = kvm_arch_init_vm(kvm); | ||
399 | if (r) | ||
400 | goto out_err_nodisable; | ||
389 | 401 | ||
390 | r = hardware_enable_all(); | 402 | r = hardware_enable_all(); |
391 | if (r) | 403 | if (r) |
@@ -399,23 +411,19 @@ static struct kvm *kvm_create_vm(void) | |||
399 | r = -ENOMEM; | 411 | r = -ENOMEM; |
400 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 412 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
401 | if (!kvm->memslots) | 413 | if (!kvm->memslots) |
402 | goto out_err; | 414 | goto out_err_nosrcu; |
403 | if (init_srcu_struct(&kvm->srcu)) | 415 | if (init_srcu_struct(&kvm->srcu)) |
404 | goto out_err; | 416 | goto out_err_nosrcu; |
405 | for (i = 0; i < KVM_NR_BUSES; i++) { | 417 | for (i = 0; i < KVM_NR_BUSES; i++) { |
406 | kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), | 418 | kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), |
407 | GFP_KERNEL); | 419 | GFP_KERNEL); |
408 | if (!kvm->buses[i]) { | 420 | if (!kvm->buses[i]) |
409 | cleanup_srcu_struct(&kvm->srcu); | ||
410 | goto out_err; | 421 | goto out_err; |
411 | } | ||
412 | } | 422 | } |
413 | 423 | ||
414 | r = kvm_init_mmu_notifier(kvm); | 424 | r = kvm_init_mmu_notifier(kvm); |
415 | if (r) { | 425 | if (r) |
416 | cleanup_srcu_struct(&kvm->srcu); | ||
417 | goto out_err; | 426 | goto out_err; |
418 | } | ||
419 | 427 | ||
420 | kvm->mm = current->mm; | 428 | kvm->mm = current->mm; |
421 | atomic_inc(&kvm->mm->mm_count); | 429 | atomic_inc(&kvm->mm->mm_count); |
@@ -429,19 +437,35 @@ static struct kvm *kvm_create_vm(void) | |||
429 | spin_lock(&kvm_lock); | 437 | spin_lock(&kvm_lock); |
430 | list_add(&kvm->vm_list, &vm_list); | 438 | list_add(&kvm->vm_list, &vm_list); |
431 | spin_unlock(&kvm_lock); | 439 | spin_unlock(&kvm_lock); |
432 | out: | 440 | |
433 | return kvm; | 441 | return kvm; |
434 | 442 | ||
435 | out_err: | 443 | out_err: |
444 | cleanup_srcu_struct(&kvm->srcu); | ||
445 | out_err_nosrcu: | ||
436 | hardware_disable_all(); | 446 | hardware_disable_all(); |
437 | out_err_nodisable: | 447 | out_err_nodisable: |
438 | for (i = 0; i < KVM_NR_BUSES; i++) | 448 | for (i = 0; i < KVM_NR_BUSES; i++) |
439 | kfree(kvm->buses[i]); | 449 | kfree(kvm->buses[i]); |
440 | kfree(kvm->memslots); | 450 | kfree(kvm->memslots); |
441 | kfree(kvm); | 451 | kvm_arch_free_vm(kvm); |
442 | return ERR_PTR(r); | 452 | return ERR_PTR(r); |
443 | } | 453 | } |
444 | 454 | ||
455 | static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) | ||
456 | { | ||
457 | if (!memslot->dirty_bitmap) | ||
458 | return; | ||
459 | |||
460 | if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) | ||
461 | vfree(memslot->dirty_bitmap_head); | ||
462 | else | ||
463 | kfree(memslot->dirty_bitmap_head); | ||
464 | |||
465 | memslot->dirty_bitmap = NULL; | ||
466 | memslot->dirty_bitmap_head = NULL; | ||
467 | } | ||
468 | |||
445 | /* | 469 | /* |
446 | * Free any memory in @free but not in @dont. | 470 | * Free any memory in @free but not in @dont. |
447 | */ | 471 | */ |
@@ -454,7 +478,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | |||
454 | vfree(free->rmap); | 478 | vfree(free->rmap); |
455 | 479 | ||
456 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | 480 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
457 | vfree(free->dirty_bitmap); | 481 | kvm_destroy_dirty_bitmap(free); |
458 | 482 | ||
459 | 483 | ||
460 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 484 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
@@ -465,7 +489,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | |||
465 | } | 489 | } |
466 | 490 | ||
467 | free->npages = 0; | 491 | free->npages = 0; |
468 | free->dirty_bitmap = NULL; | ||
469 | free->rmap = NULL; | 492 | free->rmap = NULL; |
470 | } | 493 | } |
471 | 494 | ||
@@ -499,6 +522,9 @@ static void kvm_destroy_vm(struct kvm *kvm) | |||
499 | kvm_arch_flush_shadow(kvm); | 522 | kvm_arch_flush_shadow(kvm); |
500 | #endif | 523 | #endif |
501 | kvm_arch_destroy_vm(kvm); | 524 | kvm_arch_destroy_vm(kvm); |
525 | kvm_free_physmem(kvm); | ||
526 | cleanup_srcu_struct(&kvm->srcu); | ||
527 | kvm_arch_free_vm(kvm); | ||
502 | hardware_disable_all(); | 528 | hardware_disable_all(); |
503 | mmdrop(mm); | 529 | mmdrop(mm); |
504 | } | 530 | } |
@@ -528,6 +554,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) | |||
528 | } | 554 | } |
529 | 555 | ||
530 | /* | 556 | /* |
557 | * Allocation size is twice as large as the actual dirty bitmap size. | ||
558 | * This makes it possible to do double buffering: see x86's | ||
559 | * kvm_vm_ioctl_get_dirty_log(). | ||
560 | */ | ||
561 | static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) | ||
562 | { | ||
563 | unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); | ||
564 | |||
565 | if (dirty_bytes > PAGE_SIZE) | ||
566 | memslot->dirty_bitmap = vzalloc(dirty_bytes); | ||
567 | else | ||
568 | memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); | ||
569 | |||
570 | if (!memslot->dirty_bitmap) | ||
571 | return -ENOMEM; | ||
572 | |||
573 | memslot->dirty_bitmap_head = memslot->dirty_bitmap; | ||
574 | return 0; | ||
575 | } | ||
576 | |||
577 | /* | ||
531 | * Allocate some memory and give it an address in the guest physical address | 578 | * Allocate some memory and give it an address in the guest physical address |
532 | * space. | 579 | * space. |
533 | * | 580 | * |
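The doubled allocation in kvm_create_dirty_bitmap() exists so the x86 GET_DIRTY_LOG path can flip to a clean half without allocating memory while dirty logging is live; dirty_bitmap_head keeps pointing at the start of the whole allocation so kvm_destroy_dirty_bitmap() can free it regardless of which half is currently in use. A sketch of how the consumer is expected to flip halves, with the copy-to-user and the memslots republish elided (field names follow this hunk):

	unsigned long *next = memslot->dirty_bitmap_head;
	unsigned long n = kvm_dirty_bitmap_bytes(memslot);

	if (memslot->dirty_bitmap == next)	/* first half live? switch to the second */
		next += n / sizeof(long);
	memset(next, 0, n);			/* hand out a clean buffer */
	memslot->dirty_bitmap = next;		/* republished via a fresh memslots copy in the real code */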
@@ -604,13 +651,11 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
604 | /* Allocate if a slot is being created */ | 651 | /* Allocate if a slot is being created */ |
605 | #ifndef CONFIG_S390 | 652 | #ifndef CONFIG_S390 |
606 | if (npages && !new.rmap) { | 653 | if (npages && !new.rmap) { |
607 | new.rmap = vmalloc(npages * sizeof(*new.rmap)); | 654 | new.rmap = vzalloc(npages * sizeof(*new.rmap)); |
608 | 655 | ||
609 | if (!new.rmap) | 656 | if (!new.rmap) |
610 | goto out_free; | 657 | goto out_free; |
611 | 658 | ||
612 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | ||
613 | |||
614 | new.user_alloc = user_alloc; | 659 | new.user_alloc = user_alloc; |
615 | new.userspace_addr = mem->userspace_addr; | 660 | new.userspace_addr = mem->userspace_addr; |
616 | } | 661 | } |
@@ -633,14 +678,11 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
633 | >> KVM_HPAGE_GFN_SHIFT(level)); | 678 | >> KVM_HPAGE_GFN_SHIFT(level)); |
634 | lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); | 679 | lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); |
635 | 680 | ||
636 | new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); | 681 | new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); |
637 | 682 | ||
638 | if (!new.lpage_info[i]) | 683 | if (!new.lpage_info[i]) |
639 | goto out_free; | 684 | goto out_free; |
640 | 685 | ||
641 | memset(new.lpage_info[i], 0, | ||
642 | lpages * sizeof(*new.lpage_info[i])); | ||
643 | |||
644 | if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) | 686 | if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) |
645 | new.lpage_info[i][0].write_count = 1; | 687 | new.lpage_info[i][0].write_count = 1; |
646 | if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) | 688 | if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) |
@@ -661,12 +703,8 @@ skip_lpage: | |||
661 | 703 | ||
662 | /* Allocate page dirty bitmap if needed */ | 704 | /* Allocate page dirty bitmap if needed */ |
663 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | 705 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { |
664 | unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); | 706 | if (kvm_create_dirty_bitmap(&new) < 0) |
665 | |||
666 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
667 | if (!new.dirty_bitmap) | ||
668 | goto out_free; | 707 | goto out_free; |
669 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
670 | /* destroy any largepage mappings for dirty tracking */ | 708 | /* destroy any largepage mappings for dirty tracking */ |
671 | if (old.npages) | 709 | if (old.npages) |
672 | flush_shadow = 1; | 710 | flush_shadow = 1; |
@@ -685,6 +723,7 @@ skip_lpage: | |||
685 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 723 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
686 | if (mem->slot >= slots->nmemslots) | 724 | if (mem->slot >= slots->nmemslots) |
687 | slots->nmemslots = mem->slot + 1; | 725 | slots->nmemslots = mem->slot + 1; |
726 | slots->generation++; | ||
688 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; | 727 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; |
689 | 728 | ||
690 | old_memslots = kvm->memslots; | 729 | old_memslots = kvm->memslots; |
@@ -719,6 +758,7 @@ skip_lpage: | |||
719 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 758 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
720 | if (mem->slot >= slots->nmemslots) | 759 | if (mem->slot >= slots->nmemslots) |
721 | slots->nmemslots = mem->slot + 1; | 760 | slots->nmemslots = mem->slot + 1; |
761 | slots->generation++; | ||
722 | 762 | ||
723 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | 763 | /* actual memory is freed via old in kvm_free_physmem_slot below */ |
724 | if (!npages) { | 764 | if (!npages) { |
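Both memslot update paths now bump slots->generation, giving cached gfn-to-hva translations a lock-free way to notice that the slot layout changed. The check kvm_write_guest_cached() performs further down boils down to this (the helper name is hypothetical):

	/* has the memslot layout changed since this cache entry was filled? */
	static bool ghc_is_stale(struct kvm *kvm, struct gfn_to_hva_cache *ghc)
	{
		return kvm_memslots(kvm)->generation != ghc->generation;
	}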
@@ -849,10 +889,10 @@ int kvm_is_error_hva(unsigned long addr) | |||
849 | } | 889 | } |
850 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | 890 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); |
851 | 891 | ||
852 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 892 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, |
893 | gfn_t gfn) | ||
853 | { | 894 | { |
854 | int i; | 895 | int i; |
855 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
856 | 896 | ||
857 | for (i = 0; i < slots->nmemslots; ++i) { | 897 | for (i = 0; i < slots->nmemslots; ++i) { |
858 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | 898 | struct kvm_memory_slot *memslot = &slots->memslots[i]; |
@@ -863,6 +903,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | |||
863 | } | 903 | } |
864 | return NULL; | 904 | return NULL; |
865 | } | 905 | } |
906 | |||
907 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
908 | { | ||
909 | return __gfn_to_memslot(kvm_memslots(kvm), gfn); | ||
910 | } | ||
866 | EXPORT_SYMBOL_GPL(gfn_to_memslot); | 911 | EXPORT_SYMBOL_GPL(gfn_to_memslot); |
867 | 912 | ||
868 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | 913 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) |
@@ -925,12 +970,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) | |||
925 | return memslot - slots->memslots; | 970 | return memslot - slots->memslots; |
926 | } | 971 | } |
927 | 972 | ||
928 | static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, | 973 | static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, |
929 | gfn_t *nr_pages) | 974 | gfn_t *nr_pages) |
930 | { | 975 | { |
931 | struct kvm_memory_slot *slot; | ||
932 | |||
933 | slot = gfn_to_memslot(kvm, gfn); | ||
934 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) | 976 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID) |
935 | return bad_hva(); | 977 | return bad_hva(); |
936 | 978 | ||
@@ -942,28 +984,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, | |||
942 | 984 | ||
943 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | 985 | unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) |
944 | { | 986 | { |
945 | return gfn_to_hva_many(kvm, gfn, NULL); | 987 | return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); |
946 | } | 988 | } |
947 | EXPORT_SYMBOL_GPL(gfn_to_hva); | 989 | EXPORT_SYMBOL_GPL(gfn_to_hva); |
948 | 990 | ||
949 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) | 991 | static pfn_t get_fault_pfn(void) |
992 | { | ||
993 | get_page(fault_page); | ||
994 | return fault_pfn; | ||
995 | } | ||
996 | |||
997 | static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, | ||
998 | bool *async, bool write_fault, bool *writable) | ||
950 | { | 999 | { |
951 | struct page *page[1]; | 1000 | struct page *page[1]; |
952 | int npages; | 1001 | int npages = 0; |
953 | pfn_t pfn; | 1002 | pfn_t pfn; |
954 | 1003 | ||
955 | if (atomic) | 1004 | /* we can do it either atomically or asynchronously, not both */ |
1005 | BUG_ON(atomic && async); | ||
1006 | |||
1007 | BUG_ON(!write_fault && !writable); | ||
1008 | |||
1009 | if (writable) | ||
1010 | *writable = true; | ||
1011 | |||
1012 | if (atomic || async) | ||
956 | npages = __get_user_pages_fast(addr, 1, 1, page); | 1013 | npages = __get_user_pages_fast(addr, 1, 1, page); |
957 | else { | 1014 | |
1015 | if (unlikely(npages != 1) && !atomic) { | ||
958 | might_sleep(); | 1016 | might_sleep(); |
959 | npages = get_user_pages_fast(addr, 1, 1, page); | 1017 | |
1018 | if (writable) | ||
1019 | *writable = write_fault; | ||
1020 | |||
1021 | npages = get_user_pages_fast(addr, 1, write_fault, page); | ||
1022 | |||
1023 | /* map read fault as writable if possible */ | ||
1024 | if (unlikely(!write_fault) && npages == 1) { | ||
1025 | struct page *wpage[1]; | ||
1026 | |||
1027 | npages = __get_user_pages_fast(addr, 1, 1, wpage); | ||
1028 | if (npages == 1) { | ||
1029 | *writable = true; | ||
1030 | put_page(page[0]); | ||
1031 | page[0] = wpage[0]; | ||
1032 | } | ||
1033 | npages = 1; | ||
1034 | } | ||
960 | } | 1035 | } |
961 | 1036 | ||
962 | if (unlikely(npages != 1)) { | 1037 | if (unlikely(npages != 1)) { |
963 | struct vm_area_struct *vma; | 1038 | struct vm_area_struct *vma; |
964 | 1039 | ||
965 | if (atomic) | 1040 | if (atomic) |
966 | goto return_fault_page; | 1041 | return get_fault_pfn(); |
967 | 1042 | ||
968 | down_read(&current->mm->mmap_sem); | 1043 | down_read(&current->mm->mmap_sem); |
969 | if (is_hwpoison_address(addr)) { | 1044 | if (is_hwpoison_address(addr)) { |
@@ -972,19 +1047,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) | |||
972 | return page_to_pfn(hwpoison_page); | 1047 | return page_to_pfn(hwpoison_page); |
973 | } | 1048 | } |
974 | 1049 | ||
975 | vma = find_vma(current->mm, addr); | 1050 | vma = find_vma_intersection(current->mm, addr, addr+1); |
976 | 1051 | ||
977 | if (vma == NULL || addr < vma->vm_start || | 1052 | if (vma == NULL) |
978 | !(vma->vm_flags & VM_PFNMAP)) { | 1053 | pfn = get_fault_pfn(); |
979 | up_read(&current->mm->mmap_sem); | 1054 | else if ((vma->vm_flags & VM_PFNMAP)) { |
980 | return_fault_page: | 1055 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + |
981 | get_page(fault_page); | 1056 | vma->vm_pgoff; |
982 | return page_to_pfn(fault_page); | 1057 | BUG_ON(!kvm_is_mmio_pfn(pfn)); |
1058 | } else { | ||
1059 | if (async && (vma->vm_flags & VM_WRITE)) | ||
1060 | *async = true; | ||
1061 | pfn = get_fault_pfn(); | ||
983 | } | 1062 | } |
984 | |||
985 | pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
986 | up_read(&current->mm->mmap_sem); | 1063 | up_read(&current->mm->mmap_sem); |
987 | BUG_ON(!kvm_is_mmio_pfn(pfn)); | ||
988 | } else | 1064 | } else |
989 | pfn = page_to_pfn(page[0]); | 1065 | pfn = page_to_pfn(page[0]); |
990 | 1066 | ||
@@ -993,40 +1069,58 @@ return_fault_page: | |||
993 | 1069 | ||
994 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) | 1070 | pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) |
995 | { | 1071 | { |
996 | return hva_to_pfn(kvm, addr, true); | 1072 | return hva_to_pfn(kvm, addr, true, NULL, true, NULL); |
997 | } | 1073 | } |
998 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); | 1074 | EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); |
999 | 1075 | ||
1000 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) | 1076 | static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, |
1077 | bool write_fault, bool *writable) | ||
1001 | { | 1078 | { |
1002 | unsigned long addr; | 1079 | unsigned long addr; |
1003 | 1080 | ||
1081 | if (async) | ||
1082 | *async = false; | ||
1083 | |||
1004 | addr = gfn_to_hva(kvm, gfn); | 1084 | addr = gfn_to_hva(kvm, gfn); |
1005 | if (kvm_is_error_hva(addr)) { | 1085 | if (kvm_is_error_hva(addr)) { |
1006 | get_page(bad_page); | 1086 | get_page(bad_page); |
1007 | return page_to_pfn(bad_page); | 1087 | return page_to_pfn(bad_page); |
1008 | } | 1088 | } |
1009 | 1089 | ||
1010 | return hva_to_pfn(kvm, addr, atomic); | 1090 | return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); |
1011 | } | 1091 | } |
1012 | 1092 | ||
1013 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) | 1093 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) |
1014 | { | 1094 | { |
1015 | return __gfn_to_pfn(kvm, gfn, true); | 1095 | return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); |
1016 | } | 1096 | } |
1017 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); | 1097 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); |
1018 | 1098 | ||
1099 | pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, | ||
1100 | bool write_fault, bool *writable) | ||
1101 | { | ||
1102 | return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); | ||
1103 | } | ||
1104 | EXPORT_SYMBOL_GPL(gfn_to_pfn_async); | ||
1105 | |||
1019 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) | 1106 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) |
1020 | { | 1107 | { |
1021 | return __gfn_to_pfn(kvm, gfn, false); | 1108 | return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); |
1022 | } | 1109 | } |
1023 | EXPORT_SYMBOL_GPL(gfn_to_pfn); | 1110 | EXPORT_SYMBOL_GPL(gfn_to_pfn); |
1024 | 1111 | ||
1112 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | ||
1113 | bool *writable) | ||
1114 | { | ||
1115 | return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); | ||
1116 | } | ||
1117 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); | ||
1118 | |||
1025 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, | 1119 | pfn_t gfn_to_pfn_memslot(struct kvm *kvm, |
1026 | struct kvm_memory_slot *slot, gfn_t gfn) | 1120 | struct kvm_memory_slot *slot, gfn_t gfn) |
1027 | { | 1121 | { |
1028 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); | 1122 | unsigned long addr = gfn_to_hva_memslot(slot, gfn); |
1029 | return hva_to_pfn(kvm, addr, false); | 1123 | return hva_to_pfn(kvm, addr, false, NULL, true, NULL); |
1030 | } | 1124 | } |
1031 | 1125 | ||
1032 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | 1126 | int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, |
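gfn_to_pfn_async() and gfn_to_pfn_prot() are the hooks the async page fault work builds on: the async variant never sleeps and instead reports through *async that the page could be brought in later, while write_fault/writable let a read fault be mapped writable when that is cheap. A caller sketch, loosely modeled on how the x86 MMU is expected to consume the pair; the function name is made up:

	static pfn_t fault_in_pfn(struct kvm *kvm, gfn_t gfn, bool write, bool *writable)
	{
		bool async;
		pfn_t pfn;

		pfn = gfn_to_pfn_async(kvm, gfn, &async, write, writable);
		if (!async)
			return pfn;	/* resolved without sleeping (or an error pfn) */

		/* page needs I/O: a real caller could queue an async page fault
		 * here instead of blocking the vcpu, then retry later */
		return gfn_to_pfn_prot(kvm, gfn, write, writable);
	}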
@@ -1035,7 +1129,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, | |||
1035 | unsigned long addr; | 1129 | unsigned long addr; |
1036 | gfn_t entry; | 1130 | gfn_t entry; |
1037 | 1131 | ||
1038 | addr = gfn_to_hva_many(kvm, gfn, &entry); | 1132 | addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); |
1039 | if (kvm_is_error_hva(addr)) | 1133 | if (kvm_is_error_hva(addr)) |
1040 | return -1; | 1134 | return -1; |
1041 | 1135 | ||
@@ -1219,9 +1313,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | |||
1219 | return 0; | 1313 | return 0; |
1220 | } | 1314 | } |
1221 | 1315 | ||
1316 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
1317 | gpa_t gpa) | ||
1318 | { | ||
1319 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
1320 | int offset = offset_in_page(gpa); | ||
1321 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1322 | |||
1323 | ghc->gpa = gpa; | ||
1324 | ghc->generation = slots->generation; | ||
1325 | ghc->memslot = __gfn_to_memslot(slots, gfn); | ||
1326 | ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); | ||
1327 | if (!kvm_is_error_hva(ghc->hva)) | ||
1328 | ghc->hva += offset; | ||
1329 | else | ||
1330 | return -EFAULT; | ||
1331 | |||
1332 | return 0; | ||
1333 | } | ||
1334 | EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); | ||
1335 | |||
1336 | int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
1337 | void *data, unsigned long len) | ||
1338 | { | ||
1339 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
1340 | int r; | ||
1341 | |||
1342 | if (slots->generation != ghc->generation) | ||
1343 | kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); | ||
1344 | |||
1345 | if (kvm_is_error_hva(ghc->hva)) | ||
1346 | return -EFAULT; | ||
1347 | |||
1348 | r = copy_to_user((void __user *)ghc->hva, data, len); | ||
1349 | if (r) | ||
1350 | return -EFAULT; | ||
1351 | mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); | ||
1352 | |||
1353 | return 0; | ||
1354 | } | ||
1355 | EXPORT_SYMBOL_GPL(kvm_write_guest_cached); | ||
1356 | |||
1222 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) | 1357 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) |
1223 | { | 1358 | { |
1224 | return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); | 1359 | return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, |
1360 | offset, len); | ||
1225 | } | 1361 | } |
1226 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); | 1362 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); |
1227 | 1363 | ||
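kvm_gfn_to_hva_cache_init()/kvm_write_guest_cached() let a hot path write into guest memory without re-walking the memslots on every call; the generation check above re-resolves the hva only when the layout changed. A hedged usage sketch (the caller and the value are invented; in this series the real user is the x86 async page fault code):

	static int publish_val(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			       gpa_t gpa, u32 val)
	{
		int r;

		r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa);	/* once, at setup */
		if (r)
			return r;

		return kvm_write_guest_cached(kvm, ghc, &val, sizeof(val));
	}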
@@ -1244,11 +1380,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) | |||
1244 | } | 1380 | } |
1245 | EXPORT_SYMBOL_GPL(kvm_clear_guest); | 1381 | EXPORT_SYMBOL_GPL(kvm_clear_guest); |
1246 | 1382 | ||
1247 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | 1383 | void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, |
1384 | gfn_t gfn) | ||
1248 | { | 1385 | { |
1249 | struct kvm_memory_slot *memslot; | ||
1250 | |||
1251 | memslot = gfn_to_memslot(kvm, gfn); | ||
1252 | if (memslot && memslot->dirty_bitmap) { | 1386 | if (memslot && memslot->dirty_bitmap) { |
1253 | unsigned long rel_gfn = gfn - memslot->base_gfn; | 1387 | unsigned long rel_gfn = gfn - memslot->base_gfn; |
1254 | 1388 | ||
@@ -1256,6 +1390,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | |||
1256 | } | 1390 | } |
1257 | } | 1391 | } |
1258 | 1392 | ||
1393 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
1394 | { | ||
1395 | struct kvm_memory_slot *memslot; | ||
1396 | |||
1397 | memslot = gfn_to_memslot(kvm, gfn); | ||
1398 | mark_page_dirty_in_slot(kvm, memslot, gfn); | ||
1399 | } | ||
1400 | |||
1259 | /* | 1401 | /* |
1260 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | 1402 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. |
1261 | */ | 1403 | */ |
@@ -1457,6 +1599,7 @@ static long kvm_vcpu_ioctl(struct file *filp, | |||
1457 | if (arg) | 1599 | if (arg) |
1458 | goto out; | 1600 | goto out; |
1459 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); | 1601 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); |
1602 | trace_kvm_userspace_exit(vcpu->run->exit_reason, r); | ||
1460 | break; | 1603 | break; |
1461 | case KVM_GET_REGS: { | 1604 | case KVM_GET_REGS: { |
1462 | struct kvm_regs *kvm_regs; | 1605 | struct kvm_regs *kvm_regs; |
@@ -1824,7 +1967,7 @@ static struct file_operations kvm_vm_fops = { | |||
1824 | 1967 | ||
1825 | static int kvm_dev_ioctl_create_vm(void) | 1968 | static int kvm_dev_ioctl_create_vm(void) |
1826 | { | 1969 | { |
1827 | int fd, r; | 1970 | int r; |
1828 | struct kvm *kvm; | 1971 | struct kvm *kvm; |
1829 | 1972 | ||
1830 | kvm = kvm_create_vm(); | 1973 | kvm = kvm_create_vm(); |
@@ -1837,11 +1980,11 @@ static int kvm_dev_ioctl_create_vm(void) | |||
1837 | return r; | 1980 | return r; |
1838 | } | 1981 | } |
1839 | #endif | 1982 | #endif |
1840 | fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); | 1983 | r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); |
1841 | if (fd < 0) | 1984 | if (r < 0) |
1842 | kvm_put_kvm(kvm); | 1985 | kvm_put_kvm(kvm); |
1843 | 1986 | ||
1844 | return fd; | 1987 | return r; |
1845 | } | 1988 | } |
1846 | 1989 | ||
1847 | static long kvm_dev_ioctl_check_extension_generic(long arg) | 1990 | static long kvm_dev_ioctl_check_extension_generic(long arg) |
@@ -1922,7 +2065,7 @@ static struct miscdevice kvm_dev = { | |||
1922 | &kvm_chardev_ops, | 2065 | &kvm_chardev_ops, |
1923 | }; | 2066 | }; |
1924 | 2067 | ||
1925 | static void hardware_enable(void *junk) | 2068 | static void hardware_enable_nolock(void *junk) |
1926 | { | 2069 | { |
1927 | int cpu = raw_smp_processor_id(); | 2070 | int cpu = raw_smp_processor_id(); |
1928 | int r; | 2071 | int r; |
@@ -1942,7 +2085,14 @@ static void hardware_enable(void *junk) | |||
1942 | } | 2085 | } |
1943 | } | 2086 | } |
1944 | 2087 | ||
1945 | static void hardware_disable(void *junk) | 2088 | static void hardware_enable(void *junk) |
2089 | { | ||
2090 | spin_lock(&kvm_lock); | ||
2091 | hardware_enable_nolock(junk); | ||
2092 | spin_unlock(&kvm_lock); | ||
2093 | } | ||
2094 | |||
2095 | static void hardware_disable_nolock(void *junk) | ||
1946 | { | 2096 | { |
1947 | int cpu = raw_smp_processor_id(); | 2097 | int cpu = raw_smp_processor_id(); |
1948 | 2098 | ||
@@ -1952,13 +2102,20 @@ static void hardware_disable(void *junk) | |||
1952 | kvm_arch_hardware_disable(NULL); | 2102 | kvm_arch_hardware_disable(NULL); |
1953 | } | 2103 | } |
1954 | 2104 | ||
2105 | static void hardware_disable(void *junk) | ||
2106 | { | ||
2107 | spin_lock(&kvm_lock); | ||
2108 | hardware_disable_nolock(junk); | ||
2109 | spin_unlock(&kvm_lock); | ||
2110 | } | ||
2111 | |||
1955 | static void hardware_disable_all_nolock(void) | 2112 | static void hardware_disable_all_nolock(void) |
1956 | { | 2113 | { |
1957 | BUG_ON(!kvm_usage_count); | 2114 | BUG_ON(!kvm_usage_count); |
1958 | 2115 | ||
1959 | kvm_usage_count--; | 2116 | kvm_usage_count--; |
1960 | if (!kvm_usage_count) | 2117 | if (!kvm_usage_count) |
1961 | on_each_cpu(hardware_disable, NULL, 1); | 2118 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
1962 | } | 2119 | } |
1963 | 2120 | ||
1964 | static void hardware_disable_all(void) | 2121 | static void hardware_disable_all(void) |
@@ -1977,7 +2134,7 @@ static int hardware_enable_all(void) | |||
1977 | kvm_usage_count++; | 2134 | kvm_usage_count++; |
1978 | if (kvm_usage_count == 1) { | 2135 | if (kvm_usage_count == 1) { |
1979 | atomic_set(&hardware_enable_failed, 0); | 2136 | atomic_set(&hardware_enable_failed, 0); |
1980 | on_each_cpu(hardware_enable, NULL, 1); | 2137 | on_each_cpu(hardware_enable_nolock, NULL, 1); |
1981 | 2138 | ||
1982 | if (atomic_read(&hardware_enable_failed)) { | 2139 | if (atomic_read(&hardware_enable_failed)) { |
1983 | hardware_disable_all_nolock(); | 2140 | hardware_disable_all_nolock(); |
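The hardware_enable()/hardware_disable() split into locked wrappers and _nolock workers matters because hardware_enable_all() and hardware_disable_all_nolock() invoke on_each_cpu() while kvm_lock is already held, and on_each_cpu() runs the callback synchronously on the local CPU; handing it the locked variant would self-deadlock on kvm_lock. Only the CPU hotplug notifier, which holds nothing, now goes through the wrapper instead of open-coding the spin_lock itself. Schematically (a sketch of the constraint, not new code):

	spin_lock(&kvm_lock);
	/* ... */
	on_each_cpu(hardware_enable_nolock, NULL, 1);	/* must be the _nolock flavour here */
	spin_unlock(&kvm_lock);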
@@ -2008,27 +2165,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | |||
2008 | case CPU_STARTING: | 2165 | case CPU_STARTING: |
2009 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | 2166 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", |
2010 | cpu); | 2167 | cpu); |
2011 | spin_lock(&kvm_lock); | ||
2012 | hardware_enable(NULL); | 2168 | hardware_enable(NULL); |
2013 | spin_unlock(&kvm_lock); | ||
2014 | break; | 2169 | break; |
2015 | } | 2170 | } |
2016 | return NOTIFY_OK; | 2171 | return NOTIFY_OK; |
2017 | } | 2172 | } |
2018 | 2173 | ||
2019 | 2174 | ||
2020 | asmlinkage void kvm_handle_fault_on_reboot(void) | 2175 | asmlinkage void kvm_spurious_fault(void) |
2021 | { | 2176 | { |
2022 | if (kvm_rebooting) { | ||
2023 | /* spin while reset goes on */ | ||
2024 | local_irq_enable(); | ||
2025 | while (true) | ||
2026 | cpu_relax(); | ||
2027 | } | ||
2028 | /* Fault while not rebooting. We want the trace. */ | 2177 | /* Fault while not rebooting. We want the trace. */ |
2029 | BUG(); | 2178 | BUG(); |
2030 | } | 2179 | } |
2031 | EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); | 2180 | EXPORT_SYMBOL_GPL(kvm_spurious_fault); |
2032 | 2181 | ||
2033 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | 2182 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, |
2034 | void *v) | 2183 | void *v) |
@@ -2041,7 +2190,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | |||
2041 | */ | 2190 | */ |
2042 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | 2191 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); |
2043 | kvm_rebooting = true; | 2192 | kvm_rebooting = true; |
2044 | on_each_cpu(hardware_disable, NULL, 1); | 2193 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
2045 | return NOTIFY_OK; | 2194 | return NOTIFY_OK; |
2046 | } | 2195 | } |
2047 | 2196 | ||
@@ -2211,7 +2360,7 @@ static void kvm_exit_debug(void) | |||
2211 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | 2360 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) |
2212 | { | 2361 | { |
2213 | if (kvm_usage_count) | 2362 | if (kvm_usage_count) |
2214 | hardware_disable(NULL); | 2363 | hardware_disable_nolock(NULL); |
2215 | return 0; | 2364 | return 0; |
2216 | } | 2365 | } |
2217 | 2366 | ||
@@ -2219,7 +2368,7 @@ static int kvm_resume(struct sys_device *dev) | |||
2219 | { | 2368 | { |
2220 | if (kvm_usage_count) { | 2369 | if (kvm_usage_count) { |
2221 | WARN_ON(spin_is_locked(&kvm_lock)); | 2370 | WARN_ON(spin_is_locked(&kvm_lock)); |
2222 | hardware_enable(NULL); | 2371 | hardware_enable_nolock(NULL); |
2223 | } | 2372 | } |
2224 | return 0; | 2373 | return 0; |
2225 | } | 2374 | } |
@@ -2336,6 +2485,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2336 | goto out_free_5; | 2485 | goto out_free_5; |
2337 | } | 2486 | } |
2338 | 2487 | ||
2488 | r = kvm_async_pf_init(); | ||
2489 | if (r) | ||
2490 | goto out_free; | ||
2491 | |||
2339 | kvm_chardev_ops.owner = module; | 2492 | kvm_chardev_ops.owner = module; |
2340 | kvm_vm_fops.owner = module; | 2493 | kvm_vm_fops.owner = module; |
2341 | kvm_vcpu_fops.owner = module; | 2494 | kvm_vcpu_fops.owner = module; |
@@ -2343,7 +2496,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2343 | r = misc_register(&kvm_dev); | 2496 | r = misc_register(&kvm_dev); |
2344 | if (r) { | 2497 | if (r) { |
2345 | printk(KERN_ERR "kvm: misc device register failed\n"); | 2498 | printk(KERN_ERR "kvm: misc device register failed\n"); |
2346 | goto out_free; | 2499 | goto out_unreg; |
2347 | } | 2500 | } |
2348 | 2501 | ||
2349 | kvm_preempt_ops.sched_in = kvm_sched_in; | 2502 | kvm_preempt_ops.sched_in = kvm_sched_in; |
@@ -2353,6 +2506,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2353 | 2506 | ||
2354 | return 0; | 2507 | return 0; |
2355 | 2508 | ||
2509 | out_unreg: | ||
2510 | kvm_async_pf_deinit(); | ||
2356 | out_free: | 2511 | out_free: |
2357 | kmem_cache_destroy(kvm_vcpu_cache); | 2512 | kmem_cache_destroy(kvm_vcpu_cache); |
2358 | out_free_5: | 2513 | out_free_5: |
@@ -2385,11 +2540,12 @@ void kvm_exit(void) | |||
2385 | kvm_exit_debug(); | 2540 | kvm_exit_debug(); |
2386 | misc_deregister(&kvm_dev); | 2541 | misc_deregister(&kvm_dev); |
2387 | kmem_cache_destroy(kvm_vcpu_cache); | 2542 | kmem_cache_destroy(kvm_vcpu_cache); |
2543 | kvm_async_pf_deinit(); | ||
2388 | sysdev_unregister(&kvm_sysdev); | 2544 | sysdev_unregister(&kvm_sysdev); |
2389 | sysdev_class_unregister(&kvm_sysdev_class); | 2545 | sysdev_class_unregister(&kvm_sysdev_class); |
2390 | unregister_reboot_notifier(&kvm_reboot_notifier); | 2546 | unregister_reboot_notifier(&kvm_reboot_notifier); |
2391 | unregister_cpu_notifier(&kvm_cpu_notifier); | 2547 | unregister_cpu_notifier(&kvm_cpu_notifier); |
2392 | on_each_cpu(hardware_disable, NULL, 1); | 2548 | on_each_cpu(hardware_disable_nolock, NULL, 1); |
2393 | kvm_arch_hardware_unsetup(); | 2549 | kvm_arch_hardware_unsetup(); |
2394 | kvm_arch_exit(); | 2550 | kvm_arch_exit(); |
2395 | free_cpumask_var(cpus_hardware_enabled); | 2551 | free_cpumask_var(cpus_hardware_enabled); |