| author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 20:43:43 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 20:43:43 -0400 |
| commit | 69def9f05dfce3281bb06599057e6b8097385d39 (patch) | |
| tree | 7d826b22924268ddbfad101993b248996d40e2ec | |
| parent | 353f6dd2dec992ddd34620a94b051b0f76227379 (diff) | |
| parent | 8e616fc8d343bd7f0f0a0c22407fdcb77f6d22b1 (diff) | |
Merge branch 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (202 commits)
MAINTAINERS: update KVM entry
KVM: correct error-handling code
KVM: fix compile warnings on s390
KVM: VMX: Check cpl before emulating debug register access
KVM: fix misreporting of coalesced interrupts by kvm tracer
KVM: x86: drop duplicate kvm_flush_remote_tlb calls
KVM: VMX: call vmx_load_host_state() only if msr is cached
KVM: VMX: Conditionally reload debug register 6
KVM: Use thread debug register storage instead of kvm specific data
KVM guest: do not batch pte updates from interrupt context
KVM: Fix coalesced interrupt reporting in IOAPIC
KVM guest: fix bogus wallclock physical address calculation
KVM: VMX: Fix cr8 exiting control clobbering by EPT
KVM: Optimize kvm_mmu_unprotect_page_virt() for tdp
KVM: Document KVM_CAP_IRQCHIP
KVM: Protect update_cr8_intercept() when running without an apic
KVM: VMX: Fix EPT with WP bit change during paging
KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors
KVM: x86 emulator: Add adc and sbb missing decoder flags
KVM: Add missing #include
...
80 files changed, 5692 insertions, 2160 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 1c058b552e9..aafca0a8f66 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
| @@ -193,7 +193,7 @@ Code Seq# Include File Comments | |||
| 193 | 0xAD 00 Netfilter device in development: | 193 | 0xAD 00 Netfilter device in development: |
| 194 | <mailto:rusty@rustcorp.com.au> | 194 | <mailto:rusty@rustcorp.com.au> |
| 195 | 0xAE all linux/kvm.h Kernel-based Virtual Machine | 195 | 0xAE all linux/kvm.h Kernel-based Virtual Machine |
| 196 | <mailto:kvm-devel@lists.sourceforge.net> | 196 | <mailto:kvm@vger.kernel.org> |
| 197 | 0xB0 all RATIO devices in development: | 197 | 0xB0 all RATIO devices in development: |
| 198 | <mailto:vgo@ratio.de> | 198 | <mailto:vgo@ratio.de> |
| 199 | 0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca> | 199 | 0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca> |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cb3a169e372..3a238644c81 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -57,6 +57,7 @@ parameter is applicable: | |||
| 57 | ISAPNP ISA PnP code is enabled. | 57 | ISAPNP ISA PnP code is enabled. |
| 58 | ISDN Appropriate ISDN support is enabled. | 58 | ISDN Appropriate ISDN support is enabled. |
| 59 | JOY Appropriate joystick support is enabled. | 59 | JOY Appropriate joystick support is enabled. |
| 60 | KVM Kernel Virtual Machine support is enabled. | ||
| 60 | LIBATA Libata driver is enabled | 61 | LIBATA Libata driver is enabled |
| 61 | LP Printer support is enabled. | 62 | LP Printer support is enabled. |
| 62 | LOOP Loopback device support is enabled. | 63 | LOOP Loopback device support is enabled. |
| @@ -1098,6 +1099,44 @@ and is between 256 and 4096 characters. It is defined in the file | |||
| 1098 | kstack=N [X86] Print N words from the kernel stack | 1099 | kstack=N [X86] Print N words from the kernel stack |
| 1099 | in oops dumps. | 1100 | in oops dumps. |
| 1100 | 1101 | ||
| 1102 | kvm.ignore_msrs= [KVM] Ignore guest accesses to unhandled MSRs. | ||
| 1103 | Default is 0 (don't ignore, but inject #GP) | ||
| 1104 | |||
| 1105 | kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging. | ||
| 1106 | Default is 1 (enabled) | ||
| 1107 | |||
| 1108 | kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. | ||
| 1109 | Default is 0 (off) | ||
| 1110 | |||
| 1111 | kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) | ||
| 1112 | for all guests. | ||
| 1113 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode | ||
| 1114 | |||
| 1115 | kvm-intel.bypass_guest_pf= | ||
| 1116 | [KVM,Intel] Disables bypassing of guest page faults | ||
| 1117 | on Intel chips. Default is 1 (enabled) | ||
| 1118 | |||
| 1119 | kvm-intel.ept= [KVM,Intel] Disable extended page tables | ||
| 1120 | (virtualized MMU) support on capable Intel chips. | ||
| 1121 | Default is 1 (enabled) | ||
| 1122 | |||
| 1123 | kvm-intel.emulate_invalid_guest_state= | ||
| 1124 | [KVM,Intel] Enable emulation of invalid guest states | ||
| 1125 | Default is 0 (disabled) | ||
| 1126 | |||
| 1127 | kvm-intel.flexpriority= | ||
| 1128 | [KVM,Intel] Disable FlexPriority feature (TPR shadow). | ||
| 1129 | Default is 1 (enabled) | ||
| 1130 | |||
| 1131 | kvm-intel.unrestricted_guest= | ||
| 1132 | [KVM,Intel] Disable unrestricted guest feature | ||
| 1133 | (virtualized real and unpaged mode) on capable | ||
| 1134 | Intel chips. Default is 1 (enabled) | ||
| 1135 | |||
| 1136 | kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification | ||
| 1137 | feature (tagged TLBs) on capable Intel chips. | ||
| 1138 | Default is 1 (enabled) | ||
| 1139 | |||
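As a rough illustration of how these module parameters are consumed, the following hedged C fragment shows the pattern the KVM modules use to expose such a knob; the variable name and permissions here are generic examples, not the exact upstream declarations.

```c
/* Sketch of a module parameter like kvm.ignore_msrs as declared inside a
 * kernel module; the exact variable names in kvm.ko may differ. */
#include <linux/module.h>
#include <linux/moduleparam.h>

static int ignore_msrs;                    /* default 0: inject #GP */
module_param(ignore_msrs, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_msrs, "Ignore guest accesses to unhandled MSRs");
```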
| 1101 | l2cr= [PPC] | 1140 | l2cr= [PPC] |
| 1102 | 1141 | ||
| 1103 | l3cr= [PPC] | 1142 | l3cr= [PPC] |
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt new file mode 100644 index 00000000000..5a4bc8cf6d0 --- /dev/null +++ b/Documentation/kvm/api.txt | |||
| @@ -0,0 +1,759 @@ | |||
| 1 | The Definitive KVM (Kernel-based Virtual Machine) API Documentation | ||
| 2 | =================================================================== | ||
| 3 | |||
| 4 | 1. General description | ||
| 5 | |||
| 6 | The kvm API is a set of ioctls that are issued to control various aspects | ||
| 7 | of a virtual machine. The ioctls belong to three classes | ||
| 8 | |||
| 9 | - System ioctls: These query and set global attributes which affect the | ||
| 10 | whole kvm subsystem. In addition a system ioctl is used to create | ||
| 11 | virtual machines | ||
| 12 | |||
| 13 | - VM ioctls: These query and set attributes that affect an entire virtual | ||
| 14 | machine, for example memory layout. In addition a VM ioctl is used to | ||
| 15 | create virtual cpus (vcpus). | ||
| 16 | |||
| 17 | Only run VM ioctls from the same process (address space) that was used | ||
| 18 | to create the VM. | ||
| 19 | |||
| 20 | - vcpu ioctls: These query and set attributes that control the operation | ||
| 21 | of a single virtual cpu. | ||
| 22 | |||
| 23 | Only run vcpu ioctls from the same thread that was used to create the | ||
| 24 | vcpu. | ||
| 25 | |||
| 26 | 2. File descriptors | ||
| 27 | |||
| 28 | The kvm API is centered around file descriptors. An initial | ||
| 29 | open("/dev/kvm") obtains a handle to the kvm subsystem; this handle | ||
| 30 | can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this | ||
| 31 | handle will create a VM file descriptor which can be used to issue VM | ||
| 32 | ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu | ||
| 33 | and return a file descriptor pointing to it. Finally, ioctls on a vcpu | ||
| 34 | fd can be used to control the vcpu, including the important task of | ||
| 35 | actually running guest code. | ||
| 36 | |||
| 37 | In general file descriptors can be migrated among processes by means | ||
| 38 | of fork() and the SCM_RIGHTS facility of unix domain sockets. These | ||
| 39 | kinds of tricks are explicitly not supported by kvm. While they will | ||
| 40 | not cause harm to the host, their actual behavior is not guaranteed by | ||
| 41 | the API. The only supported use is one virtual machine per process, | ||
| 42 | and one vcpu per thread. | ||
| 43 | |||
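A minimal userspace sketch of the fd hierarchy just described (open /dev/kvm for system ioctls, create a VM fd, then a vcpu fd) might look like the following. Error handling is abbreviated, the example assumes <linux/kvm.h> is installed, and it also maps the kvm_run block described in sections 4.5 and 5.

```c
/* Sketch only: system fd -> VM fd -> vcpu fd, as described above. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
    int sys_fd = open("/dev/kvm", O_RDWR);           /* system ioctls */
    if (sys_fd < 0 || ioctl(sys_fd, KVM_GET_API_VERSION, 0) != 12) {
        fprintf(stderr, "kvm unavailable or unsupported API version\n");
        return 1;
    }

    int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);      /* VM ioctls */
    int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);   /* vcpu ioctls, id 0 */

    /* The shared kvm_run block is mapped from the vcpu fd at offset 0. */
    int mmap_size = ioctl(sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, (size_t)mmap_size,
                               PROT_READ | PROT_WRITE, MAP_SHARED,
                               vcpu_fd, 0);
    if (run == MAP_FAILED)
        return 1;

    /* ... set up guest memory and registers, then issue KVM_RUN ... */
    return 0;
}
```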
| 44 | 3. Extensions | ||
| 45 | |||
| 46 | As of Linux 2.6.22, the KVM ABI has been stabilized: no backward | ||
| 47 | incompatible changes are allowed. However, there is an extension | ||
| 48 | facility that allows backward-compatible extensions to the API to be | ||
| 49 | queried and used. | ||
| 50 | |||
| 51 | The extension mechanism is not based on the Linux version number. | ||
| 52 | Instead, kvm defines extension identifiers and a facility to query | ||
| 53 | whether a particular extension identifier is available. If it is, a | ||
| 54 | set of ioctls is available for application use. | ||
| 55 | |||
| 56 | 4. API description | ||
| 57 | |||
| 58 | This section describes ioctls that can be used to control kvm guests. | ||
| 59 | For each ioctl, the following information is provided along with a | ||
| 60 | description: | ||
| 61 | |||
| 62 | Capability: which KVM extension provides this ioctl. Can be 'basic', | ||
| 63 | which means that it will be provided by any kernel that supports | ||
| 64 | API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which | ||
| 65 | means availability needs to be checked with KVM_CHECK_EXTENSION | ||
| 66 | (see section 4.4). | ||
| 67 | |||
| 68 | Architectures: which instruction set architectures provide this ioctl. | ||
| 69 | x86 includes both i386 and x86_64. | ||
| 70 | |||
| 71 | Type: system, vm, or vcpu. | ||
| 72 | |||
| 73 | Parameters: what parameters are accepted by the ioctl. | ||
| 74 | |||
| 75 | Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) | ||
| 76 | are not detailed, but errors with specific meanings are. | ||
| 77 | |||
| 78 | 4.1 KVM_GET_API_VERSION | ||
| 79 | |||
| 80 | Capability: basic | ||
| 81 | Architectures: all | ||
| 82 | Type: system ioctl | ||
| 83 | Parameters: none | ||
| 84 | Returns: the constant KVM_API_VERSION (=12) | ||
| 85 | |||
| 86 | This identifies the API version as the stable kvm API. It is not | ||
| 87 | expected that this number will change. However, Linux 2.6.20 and | ||
| 88 | 2.6.21 report earlier versions; these are not documented and not | ||
| 89 | supported. Applications should refuse to run if KVM_GET_API_VERSION | ||
| 90 | returns a value other than 12. If this check passes, all ioctls | ||
| 91 | described as 'basic' will be available. | ||
| 92 | |||
| 93 | 4.2 KVM_CREATE_VM | ||
| 94 | |||
| 95 | Capability: basic | ||
| 96 | Architectures: all | ||
| 97 | Type: system ioctl | ||
| 98 | Parameters: none | ||
| 99 | Returns: a VM fd that can be used to control the new virtual machine. | ||
| 100 | |||
| 101 | The new VM has no virtual cpus and no memory. An mmap() of a VM fd | ||
| 102 | will access the virtual machine's physical address space; offset zero | ||
| 103 | corresponds to guest physical address zero. Use of mmap() on a VM fd | ||
| 104 | is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is | ||
| 105 | available. | ||
| 106 | |||
| 107 | 4.3 KVM_GET_MSR_INDEX_LIST | ||
| 108 | |||
| 109 | Capability: basic | ||
| 110 | Architectures: x86 | ||
| 111 | Type: system | ||
| 112 | Parameters: struct kvm_msr_list (in/out) | ||
| 113 | Returns: 0 on success; -1 on error | ||
| 114 | Errors: | ||
| 115 | E2BIG: the msr index list is too big to fit in the array specified by | ||
| 116 | the user. | ||
| 117 | |||
| 118 | struct kvm_msr_list { | ||
| 119 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 120 | __u32 indices[0]; | ||
| 121 | }; | ||
| 122 | |||
| 123 | This ioctl returns the guest msrs that are supported. The list varies | ||
| 124 | by kvm version and host processor, but does not change otherwise. The | ||
| 125 | user fills in the size of the indices array in nmsrs, and in return | ||
| 126 | kvm adjusts nmsrs to reflect the actual number of msrs and fills in | ||
| 127 | the indices array with their numbers. | ||
| 128 | |||
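A hedged sketch of the nmsrs handshake described above follows; the 256-entry upper bound is an arbitrary assumption, and a real caller would retry with a larger array if the ioctl fails with E2BIG.

```c
/* Sketch of the nmsrs fill-in protocol for KVM_GET_MSR_INDEX_LIST. */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

struct kvm_msr_list *get_msr_list(int sys_fd)
{
    struct kvm_msr_list *list;

    list = calloc(1, sizeof(*list) + 256 * sizeof(__u32));
    if (!list)
        return NULL;
    list->nmsrs = 256;                  /* size of the indices array */
    if (ioctl(sys_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
        free(list);                     /* e.g. E2BIG if 256 is too small */
        return NULL;
    }
    /* kvm has adjusted nmsrs and filled indices[0..nmsrs-1] */
    return list;
}
```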
| 129 | 4.4 KVM_CHECK_EXTENSION | ||
| 130 | |||
| 131 | Capability: basic | ||
| 132 | Architectures: all | ||
| 133 | Type: system ioctl | ||
| 134 | Parameters: extension identifier (KVM_CAP_*) | ||
| 135 | Returns: 0 if unsupported; 1 (or some other positive integer) if supported | ||
| 136 | |||
| 137 | The API allows the application to query about extensions to the core | ||
| 138 | kvm API. Userspace passes an extension identifier (an integer) and | ||
| 139 | receives an integer that describes the extension availability. | ||
| 140 | Generally 0 means no and 1 means yes, but some extensions may report | ||
| 141 | additional information in the integer return value. | ||
| 142 | |||
| 143 | 4.5 KVM_GET_VCPU_MMAP_SIZE | ||
| 144 | |||
| 145 | Capability: basic | ||
| 146 | Architectures: all | ||
| 147 | Type: system ioctl | ||
| 148 | Parameters: none | ||
| 149 | Returns: size of vcpu mmap area, in bytes | ||
| 150 | |||
| 151 | The KVM_RUN ioctl (cf.) communicates with userspace via a shared | ||
| 152 | memory region. This ioctl returns the size of that region. See the | ||
| 153 | KVM_RUN documentation for details. | ||
| 154 | |||
| 155 | 4.6 KVM_SET_MEMORY_REGION | ||
| 156 | |||
| 157 | Capability: basic | ||
| 158 | Architectures: all | ||
| 159 | Type: vm ioctl | ||
| 160 | Parameters: struct kvm_memory_region (in) | ||
| 161 | Returns: 0 on success, -1 on error | ||
| 162 | |||
| 163 | struct kvm_memory_region { | ||
| 164 | __u32 slot; | ||
| 165 | __u32 flags; | ||
| 166 | __u64 guest_phys_addr; | ||
| 167 | __u64 memory_size; /* bytes */ | ||
| 168 | }; | ||
| 169 | |||
| 170 | /* for kvm_memory_region::flags */ | ||
| 171 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
| 172 | |||
| 173 | This ioctl allows the user to create or modify a guest physical memory | ||
| 174 | slot. When changing an existing slot, it may be moved in the guest | ||
| 175 | physical memory space, or its flags may be modified. It may not be | ||
| 176 | resized. Slots may not overlap. | ||
| 177 | |||
| 178 | The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which | ||
| 179 | instructs kvm to keep track of writes to memory within the slot. See | ||
| 180 | the KVM_GET_DIRTY_LOG ioctl. | ||
| 181 | |||
| 182 | It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead | ||
| 183 | of this API, if available. This newer API allows placing guest memory | ||
| 184 | at specified locations in the host address space, yielding better | ||
| 185 | control and easy access. | ||
| 186 | |||
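As a rough illustration, the following sketch registers one slot with dirty-page logging enabled; the slot number, guest physical address, and 16 MiB size are example values only.

```c
/* Illustrative KVM_SET_MEMORY_REGION call with dirty logging enabled. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int add_slot_with_dirty_log(int vm_fd)
{
    struct kvm_memory_region region = {
        .slot            = 0,
        .flags           = KVM_MEM_LOG_DIRTY_PAGES,  /* track guest writes */
        .guest_phys_addr = 0,                        /* start of guest RAM */
        .memory_size     = 16 * 1024 * 1024,         /* 16 MiB, in bytes */
    };

    return ioctl(vm_fd, KVM_SET_MEMORY_REGION, &region);
}
```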
| 187 | 4.6 KVM_CREATE_VCPU | ||
| 188 | |||
| 189 | Capability: basic | ||
| 190 | Architectures: all | ||
| 191 | Type: vm ioctl | ||
| 192 | Parameters: vcpu id (apic id on x86) | ||
| 193 | Returns: vcpu fd on success, -1 on error | ||
| 194 | |||
| 195 | This API adds a vcpu to a virtual machine. The vcpu id is a small integer | ||
| 196 | in the range [0, max_vcpus). | ||
| 197 | |||
| 198 | 4.7 KVM_GET_DIRTY_LOG (vm ioctl) | ||
| 199 | |||
| 200 | Capability: basic | ||
| 201 | Architectures: x86 | ||
| 202 | Type: vm ioctl | ||
| 203 | Parameters: struct kvm_dirty_log (in/out) | ||
| 204 | Returns: 0 on success, -1 on error | ||
| 205 | |||
| 206 | /* for KVM_GET_DIRTY_LOG */ | ||
| 207 | struct kvm_dirty_log { | ||
| 208 | __u32 slot; | ||
| 209 | __u32 padding; | ||
| 210 | union { | ||
| 211 | void __user *dirty_bitmap; /* one bit per page */ | ||
| 212 | __u64 padding; | ||
| 213 | }; | ||
| 214 | }; | ||
| 215 | |||
| 216 | Given a memory slot, return a bitmap containing any pages dirtied | ||
| 217 | since the last call to this ioctl. Bit 0 is the first page in the | ||
| 218 | memory slot. Ensure the entire structure is cleared to avoid padding | ||
| 219 | issues. | ||
| 220 | |||
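A sketch of retrieving and scanning the dirty bitmap for one slot is shown below; the 4 KiB page size and bitmap layout (bit 0 of byte 0 is the slot's first page) follow the description above, while the slot size is an assumed example value supplied by the caller.

```c
/* Sketch: fetch the dirty bitmap for one slot and list dirty pages. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void dump_dirty_pages(int vm_fd, __u32 slot, size_t slot_bytes)
{
    size_t npages = slot_bytes / 4096;
    size_t bitmap_bytes = (npages + 7) / 8;
    unsigned char *bitmap = calloc(1, bitmap_bytes);
    struct kvm_dirty_log log;

    memset(&log, 0, sizeof(log));       /* clear padding, as advised above */
    log.slot = slot;
    log.dirty_bitmap = bitmap;

    if (bitmap && ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) == 0) {
        for (size_t i = 0; i < npages; i++)
            if (bitmap[i / 8] & (1u << (i % 8)))
                printf("page %zu dirty\n", i);
    }
    free(bitmap);
}
```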
| 221 | 4.8 KVM_SET_MEMORY_ALIAS | ||
| 222 | |||
| 223 | Capability: basic | ||
| 224 | Architectures: x86 | ||
| 225 | Type: vm ioctl | ||
| 226 | Parameters: struct kvm_memory_alias (in) | ||
| 227 | Returns: 0 (success), -1 (error) | ||
| 228 | |||
| 229 | struct kvm_memory_alias { | ||
| 230 | __u32 slot; /* this has a different namespace than memory slots */ | ||
| 231 | __u32 flags; | ||
| 232 | __u64 guest_phys_addr; | ||
| 233 | __u64 memory_size; | ||
| 234 | __u64 target_phys_addr; | ||
| 235 | }; | ||
| 236 | |||
| 237 | Defines a guest physical address space region as an alias to another | ||
| 238 | region. Useful for aliased addresses, for example the VGA low memory | ||
| 239 | window. Should not be used with userspace memory. | ||
| 240 | |||
| 241 | 4.9 KVM_RUN | ||
| 242 | |||
| 243 | Capability: basic | ||
| 244 | Architectures: all | ||
| 245 | Type: vcpu ioctl | ||
| 246 | Parameters: none | ||
| 247 | Returns: 0 on success, -1 on error | ||
| 248 | Errors: | ||
| 249 | EINTR: an unmasked signal is pending | ||
| 250 | |||
| 251 | This ioctl is used to run a guest virtual cpu. While there are no | ||
| 252 | explicit parameters, there is an implicit parameter block that can be | ||
| 253 | obtained by mmap()ing the vcpu fd at offset 0, with the size given by | ||
| 254 | KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct | ||
| 255 | kvm_run' (see below). | ||
| 256 | |||
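The parameter block drives a typical run loop. The following sketch handles only two exit reasons and treats port 0x3f8 as an example serial port; it assumes 'run' was obtained by mmap()ing the vcpu fd as described above.

```c
/* Minimal run loop over the shared kvm_run block; sketch only. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void run_loop(int vcpu_fd, struct kvm_run *run)
{
    for (;;) {
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
            break;                               /* e.g. EINTR */

        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            /* the data sits inside the mmap()ed block at data_offset */
            if (run->io.direction == KVM_EXIT_IO_OUT &&
                run->io.port == 0x3f8 && run->io.size == 1)
                putchar(*((char *)run + run->io.data_offset));
            break;
        case KVM_EXIT_HLT:
            return;                              /* guest halted */
        default:
            fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
            return;
        }
    }
}
```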
| 257 | 4.10 KVM_GET_REGS | ||
| 258 | |||
| 259 | Capability: basic | ||
| 260 | Architectures: all | ||
| 261 | Type: vcpu ioctl | ||
| 262 | Parameters: struct kvm_regs (out) | ||
| 263 | Returns: 0 on success, -1 on error | ||
| 264 | |||
| 265 | Reads the general purpose registers from the vcpu. | ||
| 266 | |||
| 267 | /* x86 */ | ||
| 268 | struct kvm_regs { | ||
| 269 | /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ | ||
| 270 | __u64 rax, rbx, rcx, rdx; | ||
| 271 | __u64 rsi, rdi, rsp, rbp; | ||
| 272 | __u64 r8, r9, r10, r11; | ||
| 273 | __u64 r12, r13, r14, r15; | ||
| 274 | __u64 rip, rflags; | ||
| 275 | }; | ||
| 276 | |||
| 277 | 4.11 KVM_SET_REGS | ||
| 278 | |||
| 279 | Capability: basic | ||
| 280 | Architectures: all | ||
| 281 | Type: vcpu ioctl | ||
| 282 | Parameters: struct kvm_regs (in) | ||
| 283 | Returns: 0 on success, -1 on error | ||
| 284 | |||
| 285 | Writes the general purpose registers into the vcpu. | ||
| 286 | |||
| 287 | See KVM_GET_REGS for the data structure. | ||
| 288 | |||
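A small sketch of priming a vcpu before its first KVM_RUN follows; the entry point value is the caller's choice, and the read-modify-write pattern simply preserves registers the caller does not care about.

```c
/* Sketch: set the initial instruction pointer via KVM_GET_REGS/KVM_SET_REGS. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_entry_point(int vcpu_fd, __u64 rip)
{
    struct kvm_regs regs;

    /* read-modify-write keeps the registers we do not touch intact */
    if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
        return -1;
    regs.rip = rip;            /* e.g. the guest's load address */
    regs.rflags = 0x2;         /* bit 1 of rflags is reserved and must be set */
    return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}
```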
| 289 | 4.12 KVM_GET_SREGS | ||
| 290 | |||
| 291 | Capability: basic | ||
| 292 | Architectures: x86 | ||
| 293 | Type: vcpu ioctl | ||
| 294 | Parameters: struct kvm_sregs (out) | ||
| 295 | Returns: 0 on success, -1 on error | ||
| 296 | |||
| 297 | Reads special registers from the vcpu. | ||
| 298 | |||
| 299 | /* x86 */ | ||
| 300 | struct kvm_sregs { | ||
| 301 | struct kvm_segment cs, ds, es, fs, gs, ss; | ||
| 302 | struct kvm_segment tr, ldt; | ||
| 303 | struct kvm_dtable gdt, idt; | ||
| 304 | __u64 cr0, cr2, cr3, cr4, cr8; | ||
| 305 | __u64 efer; | ||
| 306 | __u64 apic_base; | ||
| 307 | __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; | ||
| 308 | }; | ||
| 309 | |||
| 310 | interrupt_bitmap is a bitmap of pending external interrupts. At most | ||
| 311 | one bit may be set. This interrupt has been acknowledged by the APIC | ||
| 312 | but not yet injected into the cpu core. | ||
| 313 | |||
| 314 | 4.13 KVM_SET_SREGS | ||
| 315 | |||
| 316 | Capability: basic | ||
| 317 | Architectures: x86 | ||
| 318 | Type: vcpu ioctl | ||
| 319 | Parameters: struct kvm_sregs (in) | ||
| 320 | Returns: 0 on success, -1 on error | ||
| 321 | |||
| 322 | Writes special registers into the vcpu. See KVM_GET_SREGS for the | ||
| 323 | data structures. | ||
| 324 | |||
| 325 | 4.14 KVM_TRANSLATE | ||
| 326 | |||
| 327 | Capability: basic | ||
| 328 | Architectures: x86 | ||
| 329 | Type: vcpu ioctl | ||
| 330 | Parameters: struct kvm_translation (in/out) | ||
| 331 | Returns: 0 on success, -1 on error | ||
| 332 | |||
| 333 | Translates a virtual address according to the vcpu's current address | ||
| 334 | translation mode. | ||
| 335 | |||
| 336 | struct kvm_translation { | ||
| 337 | /* in */ | ||
| 338 | __u64 linear_address; | ||
| 339 | |||
| 340 | /* out */ | ||
| 341 | __u64 physical_address; | ||
| 342 | __u8 valid; | ||
| 343 | __u8 writeable; | ||
| 344 | __u8 usermode; | ||
| 345 | __u8 pad[5]; | ||
| 346 | }; | ||
| 347 | |||
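As an illustration, the following helper translates one guest-virtual address and prints the result; it is a sketch only, using nothing beyond the structure defined above.

```c
/* Sketch: translate a guest-virtual address in the vcpu's current mode. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void translate(int vcpu_fd, __u64 gva)
{
    struct kvm_translation tr;

    memset(&tr, 0, sizeof(tr));
    tr.linear_address = gva;
    if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
        printf("0x%llx -> 0x%llx%s\n", (unsigned long long)gva,
               (unsigned long long)tr.physical_address,
               tr.writeable ? " (writable)" : "");
    else
        printf("0x%llx not mapped\n", (unsigned long long)gva);
}
```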
| 348 | 4.15 KVM_INTERRUPT | ||
| 349 | |||
| 350 | Capability: basic | ||
| 351 | Architectures: x86 | ||
| 352 | Type: vcpu ioctl | ||
| 353 | Parameters: struct kvm_interrupt (in) | ||
| 354 | Returns: 0 on success, -1 on error | ||
| 355 | |||
| 356 | Queues a hardware interrupt vector to be injected. This is only | ||
| 357 | useful if in-kernel local APIC is not used. | ||
| 358 | |||
| 359 | /* for KVM_INTERRUPT */ | ||
| 360 | struct kvm_interrupt { | ||
| 361 | /* in */ | ||
| 362 | __u32 irq; | ||
| 363 | }; | ||
| 364 | |||
| 365 | Note 'irq' is an interrupt vector, not an interrupt pin or line. | ||
| 366 | |||
| 367 | 4.16 KVM_DEBUG_GUEST | ||
| 368 | |||
| 369 | Capability: basic | ||
| 370 | Architectures: none | ||
| 371 | Type: vcpu ioctl | ||
| 372 | Parameters: none | ||
| 373 | Returns: -1 on error | ||
| 374 | |||
| 375 | Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. | ||
| 376 | |||
| 377 | 4.17 KVM_GET_MSRS | ||
| 378 | |||
| 379 | Capability: basic | ||
| 380 | Architectures: x86 | ||
| 381 | Type: vcpu ioctl | ||
| 382 | Parameters: struct kvm_msrs (in/out) | ||
| 383 | Returns: 0 on success, -1 on error | ||
| 384 | |||
| 385 | Reads model-specific registers from the vcpu. Supported msr indices can | ||
| 386 | be obtained using KVM_GET_MSR_INDEX_LIST. | ||
| 387 | |||
| 388 | struct kvm_msrs { | ||
| 389 | __u32 nmsrs; /* number of msrs in entries */ | ||
| 390 | __u32 pad; | ||
| 391 | |||
| 392 | struct kvm_msr_entry entries[0]; | ||
| 393 | }; | ||
| 394 | |||
| 395 | struct kvm_msr_entry { | ||
| 396 | __u32 index; | ||
| 397 | __u32 reserved; | ||
| 398 | __u64 data; | ||
| 399 | }; | ||
| 400 | |||
| 401 | Application code should set the 'nmsrs' member (which indicates the | ||
| 402 | size of the entries array) and the 'index' member of each array entry. | ||
| 403 | kvm will fill in the 'data' member. | ||
| 404 | |||
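A sketch of reading a single MSR follows. The header-plus-entry wrapper struct mirrors the flexible-array layout of struct kvm_msrs above; the index would normally come from KVM_GET_MSR_INDEX_LIST.

```c
/* Sketch: read one MSR with KVM_GET_MSRS. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int read_one_msr(int vcpu_fd, __u32 index, __u64 *value)
{
    struct {
        struct kvm_msrs hdr;
        struct kvm_msr_entry entry;     /* sits where hdr.entries[0] would */
    } req;

    memset(&req, 0, sizeof(req));
    req.hdr.nmsrs = 1;                  /* one entry follows the header */
    req.entry.index = index;
    /* the ioctl returns the number of msrs actually read */
    if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
        return -1;
    *value = req.entry.data;            /* filled in by kvm */
    return 0;
}
```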
| 405 | 4.18 KVM_SET_MSRS | ||
| 406 | |||
| 407 | Capability: basic | ||
| 408 | Architectures: x86 | ||
| 409 | Type: vcpu ioctl | ||
| 410 | Parameters: struct kvm_msrs (in) | ||
| 411 | Returns: 0 on success, -1 on error | ||
| 412 | |||
| 413 | Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the | ||
| 414 | data structures. | ||
| 415 | |||
| 416 | Application code should set the 'nmsrs' member (which indicates the | ||
| 417 | size of the entries array), and the 'index' and 'data' members of each | ||
| 418 | array entry. | ||
| 419 | |||
| 420 | 4.19 KVM_SET_CPUID | ||
| 421 | |||
| 422 | Capability: basic | ||
| 423 | Architectures: x86 | ||
| 424 | Type: vcpu ioctl | ||
| 425 | Parameters: struct kvm_cpuid (in) | ||
| 426 | Returns: 0 on success, -1 on error | ||
| 427 | |||
| 428 | Defines the vcpu responses to the cpuid instruction. Applications | ||
| 429 | should use the KVM_SET_CPUID2 ioctl if available. | ||
| 430 | |||
| 431 | |||
| 432 | struct kvm_cpuid_entry { | ||
| 433 | __u32 function; | ||
| 434 | __u32 eax; | ||
| 435 | __u32 ebx; | ||
| 436 | __u32 ecx; | ||
| 437 | __u32 edx; | ||
| 438 | __u32 padding; | ||
| 439 | }; | ||
| 440 | |||
| 441 | /* for KVM_SET_CPUID */ | ||
| 442 | struct kvm_cpuid { | ||
| 443 | __u32 nent; | ||
| 444 | __u32 padding; | ||
| 445 | struct kvm_cpuid_entry entries[0]; | ||
| 446 | }; | ||
| 447 | |||
| 448 | 4.20 KVM_SET_SIGNAL_MASK | ||
| 449 | |||
| 450 | Capability: basic | ||
| 451 | Architectures: x86 | ||
| 452 | Type: vcpu ioctl | ||
| 453 | Parameters: struct kvm_signal_mask (in) | ||
| 454 | Returns: 0 on success, -1 on error | ||
| 455 | |||
| 456 | Defines which signals are blocked during execution of KVM_RUN. This | ||
| 457 | signal mask temporarily overrides the thread's signal mask. Any | ||
| 458 | unblocked signal received (except SIGKILL and SIGSTOP, which retain | ||
| 459 | their traditional behaviour) will cause KVM_RUN to return with -EINTR. | ||
| 460 | |||
| 461 | Note the signal will only be delivered if not blocked by the original | ||
| 462 | signal mask. | ||
| 463 | |||
| 464 | /* for KVM_SET_SIGNAL_MASK */ | ||
| 465 | struct kvm_signal_mask { | ||
| 466 | __u32 len; | ||
| 467 | __u8 sigset[0]; | ||
| 468 | }; | ||
| 469 | |||
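The following sketch blocks SIGUSR1, and only SIGUSR1, while the guest runs. The 8-byte sigset length and the bit layout (bit n-1 for signal n, least significant bits first) match the kernel sigset on little-endian x86-64 and are assumptions that would need checking on other configurations.

```c
/* Sketch: apply a KVM_RUN-time signal mask that blocks SIGUSR1. */
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int mask_sigusr1_during_run(int vcpu_fd)
{
    struct {
        struct kvm_signal_mask hdr;
        __u8 sigset[8];                /* kernel-side sigset bytes */
    } mask;

    memset(&mask, 0, sizeof(mask));
    mask.hdr.len = 8;                  /* kernel sigset size in bytes */
    mask.sigset[(SIGUSR1 - 1) / 8] = 1 << ((SIGUSR1 - 1) % 8);
    return ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, &mask.hdr);
}
```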
| 470 | 4.21 KVM_GET_FPU | ||
| 471 | |||
| 472 | Capability: basic | ||
| 473 | Architectures: x86 | ||
| 474 | Type: vcpu ioctl | ||
| 475 | Parameters: struct kvm_fpu (out) | ||
| 476 | Returns: 0 on success, -1 on error | ||
| 477 | |||
| 478 | Reads the floating point state from the vcpu. | ||
| 479 | |||
| 480 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
| 481 | struct kvm_fpu { | ||
| 482 | __u8 fpr[8][16]; | ||
| 483 | __u16 fcw; | ||
| 484 | __u16 fsw; | ||
| 485 | __u8 ftwx; /* in fxsave format */ | ||
| 486 | __u8 pad1; | ||
| 487 | __u16 last_opcode; | ||
| 488 | __u64 last_ip; | ||
| 489 | __u64 last_dp; | ||
| 490 | __u8 xmm[16][16]; | ||
| 491 | __u32 mxcsr; | ||
| 492 | __u32 pad2; | ||
| 493 | }; | ||
| 494 | |||
| 495 | 4.22 KVM_SET_FPU | ||
| 496 | |||
| 497 | Capability: basic | ||
| 498 | Architectures: x86 | ||
| 499 | Type: vcpu ioctl | ||
| 500 | Parameters: struct kvm_fpu (in) | ||
| 501 | Returns: 0 on success, -1 on error | ||
| 502 | |||
| 503 | Writes the floating point state to the vcpu. | ||
| 504 | |||
| 505 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
| 506 | struct kvm_fpu { | ||
| 507 | __u8 fpr[8][16]; | ||
| 508 | __u16 fcw; | ||
| 509 | __u16 fsw; | ||
| 510 | __u8 ftwx; /* in fxsave format */ | ||
| 511 | __u8 pad1; | ||
| 512 | __u16 last_opcode; | ||
| 513 | __u64 last_ip; | ||
| 514 | __u64 last_dp; | ||
| 515 | __u8 xmm[16][16]; | ||
| 516 | __u32 mxcsr; | ||
| 517 | __u32 pad2; | ||
| 518 | }; | ||
| 519 | |||
| 520 | 4.23 KVM_CREATE_IRQCHIP | ||
| 521 | |||
| 522 | Capability: KVM_CAP_IRQCHIP | ||
| 523 | Architectures: x86, ia64 | ||
| 524 | Type: vm ioctl | ||
| 525 | Parameters: none | ||
| 526 | Returns: 0 on success, -1 on error | ||
| 527 | |||
| 528 | Creates an interrupt controller model in the kernel. On x86, creates a virtual | ||
| 529 | ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a | ||
| 530 | local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 | ||
| 531 | only go to the IOAPIC. On ia64, an IOSAPIC is created. | ||
| 532 | |||
| 533 | 4.24 KVM_IRQ_LINE | ||
| 534 | |||
| 535 | Capability: KVM_CAP_IRQCHIP | ||
| 536 | Architectures: x86, ia64 | ||
| 537 | Type: vm ioctl | ||
| 538 | Parameters: struct kvm_irq_level | ||
| 539 | Returns: 0 on success, -1 on error | ||
| 540 | |||
| 541 | Sets the level of a GSI input to the interrupt controller model in the kernel. | ||
| 542 | Requires that an interrupt controller model has been previously created with | ||
| 543 | KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level | ||
| 544 | to be set to 1 and then back to 0. | ||
| 545 | |||
| 546 | struct kvm_irq_level { | ||
| 547 | union { | ||
| 548 | __u32 irq; /* GSI */ | ||
| 549 | __s32 status; /* not used for KVM_IRQ_LEVEL */ | ||
| 550 | }; | ||
| 551 | __u32 level; /* 0 or 1 */ | ||
| 552 | }; | ||
| 553 | |||
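A sketch of delivering an edge-triggered interrupt by pulsing a GSI, as the note above requires (level 1 and then back to 0), might look like this; the GSI number is whatever the caller passes.

```c
/* Sketch: pulse a GSI on the in-kernel interrupt controller. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pulse_gsi(int vm_fd, __u32 gsi)
{
    struct kvm_irq_level irq;

    memset(&irq, 0, sizeof(irq));
    irq.irq = gsi;

    irq.level = 1;                          /* assert */
    if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
        return -1;
    irq.level = 0;                          /* deassert */
    return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}
```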
| 554 | 4.25 KVM_GET_IRQCHIP | ||
| 555 | |||
| 556 | Capability: KVM_CAP_IRQCHIP | ||
| 557 | Architectures: x86, ia64 | ||
| 558 | Type: vm ioctl | ||
| 559 | Parameters: struct kvm_irqchip (in/out) | ||
| 560 | Returns: 0 on success, -1 on error | ||
| 561 | |||
| 562 | Reads the state of a kernel interrupt controller created with | ||
| 563 | KVM_CREATE_IRQCHIP into a buffer provided by the caller. | ||
| 564 | |||
| 565 | struct kvm_irqchip { | ||
| 566 | __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ | ||
| 567 | __u32 pad; | ||
| 568 | union { | ||
| 569 | char dummy[512]; /* reserving space */ | ||
| 570 | struct kvm_pic_state pic; | ||
| 571 | struct kvm_ioapic_state ioapic; | ||
| 572 | } chip; | ||
| 573 | }; | ||
| 574 | |||
| 575 | 4.26 KVM_SET_IRQCHIP | ||
| 576 | |||
| 577 | Capability: KVM_CAP_IRQCHIP | ||
| 578 | Architectures: x86, ia64 | ||
| 579 | Type: vm ioctl | ||
| 580 | Parameters: struct kvm_irqchip (in) | ||
| 581 | Returns: 0 on success, -1 on error | ||
| 582 | |||
| 583 | Sets the state of a kernel interrupt controller created with | ||
| 584 | KVM_CREATE_IRQCHIP from a buffer provided by the caller. | ||
| 585 | |||
| 586 | struct kvm_irqchip { | ||
| 587 | __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ | ||
| 588 | __u32 pad; | ||
| 589 | union { | ||
| 590 | char dummy[512]; /* reserving space */ | ||
| 591 | struct kvm_pic_state pic; | ||
| 592 | struct kvm_ioapic_state ioapic; | ||
| 593 | } chip; | ||
| 594 | }; | ||
| 595 | |||
| 596 | 5. The kvm_run structure | ||
| 597 | |||
| 598 | Application code obtains a pointer to the kvm_run structure by | ||
| 599 | mmap()ing a vcpu fd. From that point, application code can control | ||
| 600 | execution by changing fields in kvm_run prior to calling the KVM_RUN | ||
| 601 | ioctl, and obtain information about the reason KVM_RUN returned by | ||
| 602 | looking up structure members. | ||
| 603 | |||
| 604 | struct kvm_run { | ||
| 605 | /* in */ | ||
| 606 | __u8 request_interrupt_window; | ||
| 607 | |||
| 608 | Request that KVM_RUN return when it becomes possible to inject external | ||
| 609 | interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. | ||
| 610 | |||
| 611 | __u8 padding1[7]; | ||
| 612 | |||
| 613 | /* out */ | ||
| 614 | __u32 exit_reason; | ||
| 615 | |||
| 616 | When KVM_RUN has returned successfully (return value 0), this informs | ||
| 617 | application code why KVM_RUN has returned. Allowable values for this | ||
| 618 | field are detailed below. | ||
| 619 | |||
| 620 | __u8 ready_for_interrupt_injection; | ||
| 621 | |||
| 622 | If request_interrupt_window has been specified, this field indicates | ||
| 623 | an interrupt can be injected now with KVM_INTERRUPT. | ||
| 624 | |||
| 625 | __u8 if_flag; | ||
| 626 | |||
| 627 | The value of the current interrupt flag. Only valid if in-kernel | ||
| 628 | local APIC is not used. | ||
| 629 | |||
| 630 | __u8 padding2[2]; | ||
| 631 | |||
| 632 | /* in (pre_kvm_run), out (post_kvm_run) */ | ||
| 633 | __u64 cr8; | ||
| 634 | |||
| 635 | The value of the cr8 register. Only valid if in-kernel local APIC is | ||
| 636 | not used. Both input and output. | ||
| 637 | |||
| 638 | __u64 apic_base; | ||
| 639 | |||
| 640 | The value of the APIC BASE msr. Only valid if in-kernel local | ||
| 641 | APIC is not used. Both input and output. | ||
| 642 | |||
| 643 | union { | ||
| 644 | /* KVM_EXIT_UNKNOWN */ | ||
| 645 | struct { | ||
| 646 | __u64 hardware_exit_reason; | ||
| 647 | } hw; | ||
| 648 | |||
| 649 | If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown | ||
| 650 | reasons. Further architecture-specific information is available in | ||
| 651 | hardware_exit_reason. | ||
| 652 | |||
| 653 | /* KVM_EXIT_FAIL_ENTRY */ | ||
| 654 | struct { | ||
| 655 | __u64 hardware_entry_failure_reason; | ||
| 656 | } fail_entry; | ||
| 657 | |||
| 658 | If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due | ||
| 659 | to unknown reasons. Further architecture-specific information is | ||
| 660 | available in hardware_entry_failure_reason. | ||
| 661 | |||
| 662 | /* KVM_EXIT_EXCEPTION */ | ||
| 663 | struct { | ||
| 664 | __u32 exception; | ||
| 665 | __u32 error_code; | ||
| 666 | } ex; | ||
| 667 | |||
| 668 | Unused. | ||
| 669 | |||
| 670 | /* KVM_EXIT_IO */ | ||
| 671 | struct { | ||
| 672 | #define KVM_EXIT_IO_IN 0 | ||
| 673 | #define KVM_EXIT_IO_OUT 1 | ||
| 674 | __u8 direction; | ||
| 675 | __u8 size; /* bytes */ | ||
| 676 | __u16 port; | ||
| 677 | __u32 count; | ||
| 678 | __u64 data_offset; /* relative to kvm_run start */ | ||
| 679 | } io; | ||
| 680 | |||
| 681 | If exit_reason is KVM_EXIT_IO, then the vcpu has | ||
| 682 | executed a port I/O instruction which could not be satisfied by kvm. | ||
| 683 | data_offset describes where the data is located (KVM_EXIT_IO_OUT) or | ||
| 684 | where kvm expects application code to place the data for the next | ||
| 685 | KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. | ||
| 686 | |||
| 687 | struct { | ||
| 688 | struct kvm_debug_exit_arch arch; | ||
| 689 | } debug; | ||
| 690 | |||
| 691 | Unused. | ||
| 692 | |||
| 693 | /* KVM_EXIT_MMIO */ | ||
| 694 | struct { | ||
| 695 | __u64 phys_addr; | ||
| 696 | __u8 data[8]; | ||
| 697 | __u32 len; | ||
| 698 | __u8 is_write; | ||
| 699 | } mmio; | ||
| 700 | |||
| 701 | If exit_reason is KVM_EXIT_MMIO, then the vcpu has | ||
| 702 | executed a memory-mapped I/O instruction which could not be satisfied | ||
| 703 | by kvm. The 'data' member contains the written data if 'is_write' is | ||
| 704 | true, and should be filled by application code otherwise. | ||
| 705 | |||
| 706 | /* KVM_EXIT_HYPERCALL */ | ||
| 707 | struct { | ||
| 708 | __u64 nr; | ||
| 709 | __u64 args[6]; | ||
| 710 | __u64 ret; | ||
| 711 | __u32 longmode; | ||
| 712 | __u32 pad; | ||
| 713 | } hypercall; | ||
| 714 | |||
| 715 | Unused. | ||
| 716 | |||
| 717 | /* KVM_EXIT_TPR_ACCESS */ | ||
| 718 | struct { | ||
| 719 | __u64 rip; | ||
| 720 | __u32 is_write; | ||
| 721 | __u32 pad; | ||
| 722 | } tpr_access; | ||
| 723 | |||
| 724 | To be documented (KVM_TPR_ACCESS_REPORTING). | ||
| 725 | |||
| 726 | /* KVM_EXIT_S390_SIEIC */ | ||
| 727 | struct { | ||
| 728 | __u8 icptcode; | ||
| 729 | __u64 mask; /* psw upper half */ | ||
| 730 | __u64 addr; /* psw lower half */ | ||
| 731 | __u16 ipa; | ||
| 732 | __u32 ipb; | ||
| 733 | } s390_sieic; | ||
| 734 | |||
| 735 | s390 specific. | ||
| 736 | |||
| 737 | /* KVM_EXIT_S390_RESET */ | ||
| 738 | #define KVM_S390_RESET_POR 1 | ||
| 739 | #define KVM_S390_RESET_CLEAR 2 | ||
| 740 | #define KVM_S390_RESET_SUBSYSTEM 4 | ||
| 741 | #define KVM_S390_RESET_CPU_INIT 8 | ||
| 742 | #define KVM_S390_RESET_IPL 16 | ||
| 743 | __u64 s390_reset_flags; | ||
| 744 | |||
| 745 | s390 specific. | ||
| 746 | |||
| 747 | /* KVM_EXIT_DCR */ | ||
| 748 | struct { | ||
| 749 | __u32 dcrn; | ||
| 750 | __u32 data; | ||
| 751 | __u8 is_write; | ||
| 752 | } dcr; | ||
| 753 | |||
| 754 | powerpc specific. | ||
| 755 | |||
| 756 | /* Fix the size of the union. */ | ||
| 757 | char padding[256]; | ||
| 758 | }; | ||
| 759 | }; | ||
diff --git a/MAINTAINERS b/MAINTAINERS index e95cb772f93..15169365c33 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -2926,6 +2926,7 @@ F: include/linux/sunrpc/ | |||
| 2926 | 2926 | ||
| 2927 | KERNEL VIRTUAL MACHINE (KVM) | 2927 | KERNEL VIRTUAL MACHINE (KVM) |
| 2928 | M: Avi Kivity <avi@redhat.com> | 2928 | M: Avi Kivity <avi@redhat.com> |
| 2929 | M: Marcelo Tosatti <mtosatti@redhat.com> | ||
| 2929 | L: kvm@vger.kernel.org | 2930 | L: kvm@vger.kernel.org |
| 2930 | W: http://kvm.qumranet.com | 2931 | W: http://kvm.qumranet.com |
| 2931 | S: Supported | 2932 | S: Supported |
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 5f43697aed3..d9b6325a932 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h | |||
| @@ -235,7 +235,8 @@ struct kvm_vm_data { | |||
| 235 | #define KVM_REQ_PTC_G 32 | 235 | #define KVM_REQ_PTC_G 32 |
| 236 | #define KVM_REQ_RESUME 33 | 236 | #define KVM_REQ_RESUME 33 |
| 237 | 237 | ||
| 238 | #define KVM_PAGES_PER_HPAGE 1 | 238 | #define KVM_NR_PAGE_SIZES 1 |
| 239 | #define KVM_PAGES_PER_HPAGE(x) 1 | ||
| 239 | 240 | ||
| 240 | struct kvm; | 241 | struct kvm; |
| 241 | struct kvm_vcpu; | 242 | struct kvm_vcpu; |
| @@ -465,7 +466,6 @@ struct kvm_arch { | |||
| 465 | unsigned long metaphysical_rr4; | 466 | unsigned long metaphysical_rr4; |
| 466 | unsigned long vmm_init_rr; | 467 | unsigned long vmm_init_rr; |
| 467 | 468 | ||
| 468 | int online_vcpus; | ||
| 469 | int is_sn2; | 469 | int is_sn2; |
| 470 | 470 | ||
| 471 | struct kvm_ioapic *vioapic; | 471 | struct kvm_ioapic *vioapic; |
diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h index 0d6d8ca07b8..1588aee781a 100644 --- a/arch/ia64/include/asm/kvm_para.h +++ b/arch/ia64/include/asm/kvm_para.h | |||
| @@ -19,9 +19,13 @@ | |||
| 19 | * | 19 | * |
| 20 | */ | 20 | */ |
| 21 | 21 | ||
| 22 | #ifdef __KERNEL__ | ||
| 23 | |||
| 22 | static inline unsigned int kvm_arch_para_features(void) | 24 | static inline unsigned int kvm_arch_para_features(void) |
| 23 | { | 25 | { |
| 24 | return 0; | 26 | return 0; |
| 25 | } | 27 | } |
| 26 | 28 | ||
| 27 | #endif | 29 | #endif |
| 30 | |||
| 31 | #endif | ||
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 64d52093787..ef3e7be29ca 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | # | 1 | # |
| 2 | # KVM configuration | 2 | # KVM configuration |
| 3 | # | 3 | # |
| 4 | config HAVE_KVM | ||
| 5 | bool | ||
| 6 | 4 | ||
| 7 | config HAVE_KVM_IRQCHIP | 5 | source "virt/kvm/Kconfig" |
| 8 | bool | ||
| 9 | default y | ||
| 10 | 6 | ||
| 11 | menuconfig VIRTUALIZATION | 7 | menuconfig VIRTUALIZATION |
| 12 | bool "Virtualization" | 8 | bool "Virtualization" |
| @@ -28,6 +24,8 @@ config KVM | |||
| 28 | depends on PCI | 24 | depends on PCI |
| 29 | select PREEMPT_NOTIFIERS | 25 | select PREEMPT_NOTIFIERS |
| 30 | select ANON_INODES | 26 | select ANON_INODES |
| 27 | select HAVE_KVM_IRQCHIP | ||
| 28 | select KVM_APIC_ARCHITECTURE | ||
| 31 | ---help--- | 29 | ---help--- |
| 32 | Support hosting fully virtualized guest machines using hardware | 30 | Support hosting fully virtualized guest machines using hardware |
| 33 | virtualization extensions. You will need a fairly recent | 31 | virtualization extensions. You will need a fairly recent |
| @@ -49,9 +47,6 @@ config KVM_INTEL | |||
| 49 | Provides support for KVM on Itanium 2 processors equipped with the VT | 47 | Provides support for KVM on Itanium 2 processors equipped with the VT |
| 50 | extensions. | 48 | extensions. |
| 51 | 49 | ||
| 52 | config KVM_TRACE | ||
| 53 | bool | ||
| 54 | |||
| 55 | source drivers/virtio/Kconfig | 50 | source drivers/virtio/Kconfig |
| 56 | 51 | ||
| 57 | endif # VIRTUALIZATION | 52 | endif # VIRTUALIZATION |
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 80c57b0a21c..0ad09f05efa 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c | |||
| @@ -210,16 +210,6 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
| 210 | 210 | ||
| 211 | } | 211 | } |
| 212 | 212 | ||
| 213 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
| 214 | gpa_t addr, int len, int is_write) | ||
| 215 | { | ||
| 216 | struct kvm_io_device *dev; | ||
| 217 | |||
| 218 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write); | ||
| 219 | |||
| 220 | return dev; | ||
| 221 | } | ||
| 222 | |||
| 223 | static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 213 | static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 224 | { | 214 | { |
| 225 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 215 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
| @@ -231,6 +221,7 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 231 | { | 221 | { |
| 232 | struct kvm_mmio_req *p; | 222 | struct kvm_mmio_req *p; |
| 233 | struct kvm_io_device *mmio_dev; | 223 | struct kvm_io_device *mmio_dev; |
| 224 | int r; | ||
| 234 | 225 | ||
| 235 | p = kvm_get_vcpu_ioreq(vcpu); | 226 | p = kvm_get_vcpu_ioreq(vcpu); |
| 236 | 227 | ||
| @@ -247,16 +238,13 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 247 | kvm_run->exit_reason = KVM_EXIT_MMIO; | 238 | kvm_run->exit_reason = KVM_EXIT_MMIO; |
| 248 | return 0; | 239 | return 0; |
| 249 | mmio: | 240 | mmio: |
| 250 | mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir); | 241 | if (p->dir) |
| 251 | if (mmio_dev) { | 242 | r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, |
| 252 | if (!p->dir) | 243 | p->size, &p->data); |
| 253 | kvm_iodevice_write(mmio_dev, p->addr, p->size, | 244 | else |
| 254 | &p->data); | 245 | r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, |
| 255 | else | 246 | p->size, &p->data); |
| 256 | kvm_iodevice_read(mmio_dev, p->addr, p->size, | 247 | if (r) |
| 257 | &p->data); | ||
| 258 | |||
| 259 | } else | ||
| 260 | printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); | 248 | printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); |
| 261 | p->state = STATE_IORESP_READY; | 249 | p->state = STATE_IORESP_READY; |
| 262 | 250 | ||
| @@ -337,13 +325,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, | |||
| 337 | { | 325 | { |
| 338 | union ia64_lid lid; | 326 | union ia64_lid lid; |
| 339 | int i; | 327 | int i; |
| 328 | struct kvm_vcpu *vcpu; | ||
| 340 | 329 | ||
| 341 | for (i = 0; i < kvm->arch.online_vcpus; i++) { | 330 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 342 | if (kvm->vcpus[i]) { | 331 | lid.val = VCPU_LID(vcpu); |
| 343 | lid.val = VCPU_LID(kvm->vcpus[i]); | 332 | if (lid.id == id && lid.eid == eid) |
| 344 | if (lid.id == id && lid.eid == eid) | 333 | return vcpu; |
| 345 | return kvm->vcpus[i]; | ||
| 346 | } | ||
| 347 | } | 334 | } |
| 348 | 335 | ||
| 349 | return NULL; | 336 | return NULL; |
| @@ -409,21 +396,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 409 | struct kvm *kvm = vcpu->kvm; | 396 | struct kvm *kvm = vcpu->kvm; |
| 410 | struct call_data call_data; | 397 | struct call_data call_data; |
| 411 | int i; | 398 | int i; |
| 399 | struct kvm_vcpu *vcpui; | ||
| 412 | 400 | ||
| 413 | call_data.ptc_g_data = p->u.ptc_g_data; | 401 | call_data.ptc_g_data = p->u.ptc_g_data; |
| 414 | 402 | ||
| 415 | for (i = 0; i < kvm->arch.online_vcpus; i++) { | 403 | kvm_for_each_vcpu(i, vcpui, kvm) { |
| 416 | if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state == | 404 | if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED || |
| 417 | KVM_MP_STATE_UNINITIALIZED || | 405 | vcpu == vcpui) |
| 418 | vcpu == kvm->vcpus[i]) | ||
| 419 | continue; | 406 | continue; |
| 420 | 407 | ||
| 421 | if (waitqueue_active(&kvm->vcpus[i]->wq)) | 408 | if (waitqueue_active(&vcpui->wq)) |
| 422 | wake_up_interruptible(&kvm->vcpus[i]->wq); | 409 | wake_up_interruptible(&vcpui->wq); |
| 423 | 410 | ||
| 424 | if (kvm->vcpus[i]->cpu != -1) { | 411 | if (vcpui->cpu != -1) { |
| 425 | call_data.vcpu = kvm->vcpus[i]; | 412 | call_data.vcpu = vcpui; |
| 426 | smp_call_function_single(kvm->vcpus[i]->cpu, | 413 | smp_call_function_single(vcpui->cpu, |
| 427 | vcpu_global_purge, &call_data, 1); | 414 | vcpu_global_purge, &call_data, 1); |
| 428 | } else | 415 | } else |
| 429 | printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n"); | 416 | printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n"); |
| @@ -852,8 +839,6 @@ struct kvm *kvm_arch_create_vm(void) | |||
| 852 | 839 | ||
| 853 | kvm_init_vm(kvm); | 840 | kvm_init_vm(kvm); |
| 854 | 841 | ||
| 855 | kvm->arch.online_vcpus = 0; | ||
| 856 | |||
| 857 | return kvm; | 842 | return kvm; |
| 858 | 843 | ||
| 859 | } | 844 | } |
| @@ -1000,10 +985,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1000 | goto out; | 985 | goto out; |
| 1001 | if (irqchip_in_kernel(kvm)) { | 986 | if (irqchip_in_kernel(kvm)) { |
| 1002 | __s32 status; | 987 | __s32 status; |
| 1003 | mutex_lock(&kvm->lock); | 988 | mutex_lock(&kvm->irq_lock); |
| 1004 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | 989 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, |
| 1005 | irq_event.irq, irq_event.level); | 990 | irq_event.irq, irq_event.level); |
| 1006 | mutex_unlock(&kvm->lock); | 991 | mutex_unlock(&kvm->irq_lock); |
| 1007 | if (ioctl == KVM_IRQ_LINE_STATUS) { | 992 | if (ioctl == KVM_IRQ_LINE_STATUS) { |
| 1008 | irq_event.status = status; | 993 | irq_event.status = status; |
| 1009 | if (copy_to_user(argp, &irq_event, | 994 | if (copy_to_user(argp, &irq_event, |
| @@ -1216,7 +1201,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
| 1216 | if (IS_ERR(vmm_vcpu)) | 1201 | if (IS_ERR(vmm_vcpu)) |
| 1217 | return PTR_ERR(vmm_vcpu); | 1202 | return PTR_ERR(vmm_vcpu); |
| 1218 | 1203 | ||
| 1219 | if (vcpu->vcpu_id == 0) { | 1204 | if (kvm_vcpu_is_bsp(vcpu)) { |
| 1220 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 1205 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
| 1221 | 1206 | ||
| 1222 | /*Set entry address for first run.*/ | 1207 | /*Set entry address for first run.*/ |
| @@ -1224,7 +1209,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
| 1224 | 1209 | ||
| 1225 | /*Initialize itc offset for vcpus*/ | 1210 | /*Initialize itc offset for vcpus*/ |
| 1226 | itc_offset = 0UL - kvm_get_itc(vcpu); | 1211 | itc_offset = 0UL - kvm_get_itc(vcpu); |
| 1227 | for (i = 0; i < kvm->arch.online_vcpus; i++) { | 1212 | for (i = 0; i < KVM_MAX_VCPUS; i++) { |
| 1228 | v = (struct kvm_vcpu *)((char *)vcpu + | 1213 | v = (struct kvm_vcpu *)((char *)vcpu + |
| 1229 | sizeof(struct kvm_vcpu_data) * i); | 1214 | sizeof(struct kvm_vcpu_data) * i); |
| 1230 | v->arch.itc_offset = itc_offset; | 1215 | v->arch.itc_offset = itc_offset; |
| @@ -1356,8 +1341,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | |||
| 1356 | goto fail; | 1341 | goto fail; |
| 1357 | } | 1342 | } |
| 1358 | 1343 | ||
| 1359 | kvm->arch.online_vcpus++; | ||
| 1360 | |||
| 1361 | return vcpu; | 1344 | return vcpu; |
| 1362 | fail: | 1345 | fail: |
| 1363 | return ERR_PTR(r); | 1346 | return ERR_PTR(r); |
| @@ -1952,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu) | |||
| 1952 | return find_highest_bits((int *)&vpd->irr[0]); | 1935 | return find_highest_bits((int *)&vpd->irr[0]); |
| 1953 | } | 1936 | } |
| 1954 | 1937 | ||
| 1955 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) | ||
| 1956 | { | ||
| 1957 | if (kvm_highest_pending_irq(vcpu) != -1) | ||
| 1958 | return 1; | ||
| 1959 | return 0; | ||
| 1960 | } | ||
| 1961 | |||
| 1962 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
| 1963 | { | ||
| 1964 | /* do real check here */ | ||
| 1965 | return 1; | ||
| 1966 | } | ||
| 1967 | |||
| 1968 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) | 1938 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) |
| 1969 | { | 1939 | { |
| 1970 | return vcpu->arch.timer_fired; | 1940 | return vcpu->arch.timer_fired; |
| @@ -1977,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | |||
| 1977 | 1947 | ||
| 1978 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 1948 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
| 1979 | { | 1949 | { |
| 1980 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE; | 1950 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) || |
| 1951 | (kvm_highest_pending_irq(vcpu) != -1); | ||
| 1981 | } | 1952 | } |
| 1982 | 1953 | ||
| 1983 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, | 1954 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, |
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index cc406d064a0..dce75b70cdd 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c | |||
| @@ -830,8 +830,8 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val) | |||
| 830 | 830 | ||
| 831 | kvm = (struct kvm *)KVM_VM_BASE; | 831 | kvm = (struct kvm *)KVM_VM_BASE; |
| 832 | 832 | ||
| 833 | if (vcpu->vcpu_id == 0) { | 833 | if (kvm_vcpu_is_bsp(vcpu)) { |
| 834 | for (i = 0; i < kvm->arch.online_vcpus; i++) { | 834 | for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) { |
| 835 | v = (struct kvm_vcpu *)((char *)vcpu + | 835 | v = (struct kvm_vcpu *)((char *)vcpu + |
| 836 | sizeof(struct kvm_vcpu_data) * i); | 836 | sizeof(struct kvm_vcpu_data) * i); |
| 837 | VMX(v, itc_offset) = itc_offset; | 837 | VMX(v, itc_offset) = itc_offset; |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fddc3ed715f..c9c930ed11d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
| @@ -34,7 +34,8 @@ | |||
| 34 | #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 | 34 | #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 |
| 35 | 35 | ||
| 36 | /* We don't currently support large pages. */ | 36 | /* We don't currently support large pages. */ |
| 37 | #define KVM_PAGES_PER_HPAGE (1UL << 31) | 37 | #define KVM_NR_PAGE_SIZES 1 |
| 38 | #define KVM_PAGES_PER_HPAGE(x) (1UL<<31) | ||
| 38 | 39 | ||
| 39 | struct kvm; | 40 | struct kvm; |
| 40 | struct kvm_run; | 41 | struct kvm_run; |
| @@ -153,7 +154,6 @@ struct kvm_vcpu_arch { | |||
| 153 | u32 pid; | 154 | u32 pid; |
| 154 | u32 swap_pid; | 155 | u32 swap_pid; |
| 155 | 156 | ||
| 156 | u32 pvr; | ||
| 157 | u32 ccr0; | 157 | u32 ccr0; |
| 158 | u32 ccr1; | 158 | u32 ccr1; |
| 159 | u32 dbcr0; | 159 | u32 dbcr0; |
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 0cef809cec2..f4d1b55aa70 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c | |||
| @@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) | |||
| 138 | kmem_cache_free(kvm_vcpu_cache, vcpu_44x); | 138 | kmem_cache_free(kvm_vcpu_cache, vcpu_44x); |
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | static int kvmppc_44x_init(void) | 141 | static int __init kvmppc_44x_init(void) |
| 142 | { | 142 | { |
| 143 | int r; | 143 | int r; |
| 144 | 144 | ||
| @@ -149,7 +149,7 @@ static int kvmppc_44x_init(void) | |||
| 149 | return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE); | 149 | return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE); |
| 150 | } | 150 | } |
| 151 | 151 | ||
| 152 | static void kvmppc_44x_exit(void) | 152 | static void __exit kvmppc_44x_exit(void) |
| 153 | { | 153 | { |
| 154 | kvmppc_booke_exit(); | 154 | kvmppc_booke_exit(); |
| 155 | } | 155 | } |
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 4a16f472cc1..ff3cb63b811 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include "timing.h" | 30 | #include "timing.h" |
| 31 | 31 | ||
| 32 | #include "44x_tlb.h" | 32 | #include "44x_tlb.h" |
| 33 | #include "trace.h" | ||
| 33 | 34 | ||
| 34 | #ifndef PPC44x_TLBE_SIZE | 35 | #ifndef PPC44x_TLBE_SIZE |
| 35 | #define PPC44x_TLBE_SIZE PPC44x_TLB_4K | 36 | #define PPC44x_TLBE_SIZE PPC44x_TLB_4K |
| @@ -263,7 +264,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, | |||
| 263 | 264 | ||
| 264 | /* XXX set tlb_44x_index to stlb_index? */ | 265 | /* XXX set tlb_44x_index to stlb_index? */ |
| 265 | 266 | ||
| 266 | KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler); | 267 | trace_kvm_stlb_inval(stlb_index); |
| 267 | } | 268 | } |
| 268 | 269 | ||
| 269 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) | 270 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) |
| @@ -365,8 +366,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, | |||
| 365 | /* Insert shadow mapping into hardware TLB. */ | 366 | /* Insert shadow mapping into hardware TLB. */ |
| 366 | kvmppc_44x_tlbe_set_modified(vcpu_44x, victim); | 367 | kvmppc_44x_tlbe_set_modified(vcpu_44x, victim); |
| 367 | kvmppc_44x_tlbwe(victim, &stlbe); | 368 | kvmppc_44x_tlbwe(victim, &stlbe); |
| 368 | KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1, | 369 | trace_kvm_stlb_write(victim, stlbe.tid, stlbe.word0, stlbe.word1, |
| 369 | stlbe.word2, handler); | 370 | stlbe.word2); |
| 370 | } | 371 | } |
| 371 | 372 | ||
| 372 | /* For a particular guest TLB entry, invalidate the corresponding host TLB | 373 | /* For a particular guest TLB entry, invalidate the corresponding host TLB |
| @@ -485,8 +486,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) | |||
| 485 | kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); | 486 | kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); |
| 486 | } | 487 | } |
| 487 | 488 | ||
| 488 | KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0, | 489 | trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1, |
| 489 | tlbe->word1, tlbe->word2, handler); | 490 | tlbe->word2); |
| 490 | 491 | ||
| 491 | kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); | 492 | kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); |
| 492 | return EMULATE_DONE; | 493 | return EMULATE_DONE; |
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 5a152a52796..c2992684661 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig | |||
| @@ -2,8 +2,7 @@ | |||
| 2 | # KVM configuration | 2 | # KVM configuration |
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | config HAVE_KVM_IRQCHIP | 5 | source "virt/kvm/Kconfig" |
| 6 | bool | ||
| 7 | 6 | ||
| 8 | menuconfig VIRTUALIZATION | 7 | menuconfig VIRTUALIZATION |
| 9 | bool "Virtualization" | 8 | bool "Virtualization" |
| @@ -59,17 +58,6 @@ config KVM_E500 | |||
| 59 | 58 | ||
| 60 | If unsure, say N. | 59 | If unsure, say N. |
| 61 | 60 | ||
| 62 | config KVM_TRACE | ||
| 63 | bool "KVM trace support" | ||
| 64 | depends on KVM && MARKERS && SYSFS | ||
| 65 | select RELAY | ||
| 66 | select DEBUG_FS | ||
| 67 | default n | ||
| 68 | ---help--- | ||
| 69 | This option allows reading a trace of kvm-related events through | ||
| 70 | relayfs. Note the ABI is not considered stable and will be | ||
| 71 | modified in future updates. | ||
| 72 | |||
| 73 | source drivers/virtio/Kconfig | 61 | source drivers/virtio/Kconfig |
| 74 | 62 | ||
| 75 | endif # VIRTUALIZATION | 63 | endif # VIRTUALIZATION |
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 459c7ee580f..37655fe19f2 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile | |||
| @@ -8,7 +8,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm | |||
| 8 | 8 | ||
| 9 | common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) | 9 | common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) |
| 10 | 10 | ||
| 11 | common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) | 11 | CFLAGS_44x_tlb.o := -I. |
| 12 | CFLAGS_e500_tlb.o := -I. | ||
| 13 | CFLAGS_emulate.o := -I. | ||
| 12 | 14 | ||
| 13 | kvm-objs := $(common-objs-y) powerpc.o emulate.o | 15 | kvm-objs := $(common-objs-y) powerpc.o emulate.o |
| 14 | obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o | 16 | obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o |
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 642e4204cf2..e7bf4d02948 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c | |||
| @@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | |||
| 520 | return kvmppc_core_vcpu_translate(vcpu, tr); | 520 | return kvmppc_core_vcpu_translate(vcpu, tr); |
| 521 | } | 521 | } |
| 522 | 522 | ||
| 523 | int kvmppc_booke_init(void) | 523 | int __init kvmppc_booke_init(void) |
| 524 | { | 524 | { |
| 525 | unsigned long ivor[16]; | 525 | unsigned long ivor[16]; |
| 526 | unsigned long max_ivor = 0; | 526 | unsigned long max_ivor = 0; |
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index d8067fd81cd..64949eef43f 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c | |||
| @@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 60 | 60 | ||
| 61 | kvmppc_e500_tlb_setup(vcpu_e500); | 61 | kvmppc_e500_tlb_setup(vcpu_e500); |
| 62 | 62 | ||
| 63 | /* Use the same core vertion as host's */ | ||
| 64 | vcpu->arch.pvr = mfspr(SPRN_PVR); | ||
| 65 | |||
| 66 | return 0; | 63 | return 0; |
| 67 | } | 64 | } |
| 68 | 65 | ||
| @@ -132,7 +129,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) | |||
| 132 | kmem_cache_free(kvm_vcpu_cache, vcpu_e500); | 129 | kmem_cache_free(kvm_vcpu_cache, vcpu_e500); |
| 133 | } | 130 | } |
| 134 | 131 | ||
| 135 | static int kvmppc_e500_init(void) | 132 | static int __init kvmppc_e500_init(void) |
| 136 | { | 133 | { |
| 137 | int r, i; | 134 | int r, i; |
| 138 | unsigned long ivor[3]; | 135 | unsigned long ivor[3]; |
| @@ -160,7 +157,7 @@ static int kvmppc_e500_init(void) | |||
| 160 | return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); | 157 | return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); |
| 161 | } | 158 | } |
| 162 | 159 | ||
| 163 | static void kvmppc_e500_exit(void) | 160 | static void __init kvmppc_e500_exit(void) |
| 164 | { | 161 | { |
| 165 | kvmppc_booke_exit(); | 162 | kvmppc_booke_exit(); |
| 166 | } | 163 | } |
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 3f760414b9f..be95b8d8e3b 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c | |||
| @@ -180,6 +180,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) | |||
| 180 | case SPRN_MMUCSR0: | 180 | case SPRN_MMUCSR0: |
| 181 | vcpu->arch.gpr[rt] = 0; break; | 181 | vcpu->arch.gpr[rt] = 0; break; |
| 182 | 182 | ||
| 183 | case SPRN_MMUCFG: | ||
| 184 | vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break; | ||
| 185 | |||
| 183 | /* extra exceptions */ | 186 | /* extra exceptions */ |
| 184 | case SPRN_IVOR32: | 187 | case SPRN_IVOR32: |
| 185 | vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; | 188 | vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; |
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 0e773fc2d5e..fb1e1dc11ba 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | 22 | ||
| 23 | #include "../mm/mmu_decl.h" | 23 | #include "../mm/mmu_decl.h" |
| 24 | #include "e500_tlb.h" | 24 | #include "e500_tlb.h" |
| 25 | #include "trace.h" | ||
| 25 | 26 | ||
| 26 | #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) | 27 | #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) |
| 27 | 28 | ||
| @@ -224,9 +225,8 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
| 224 | 225 | ||
| 225 | kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); | 226 | kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); |
| 226 | stlbe->mas1 = 0; | 227 | stlbe->mas1 = 0; |
| 227 | KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel), | 228 | trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, |
| 228 | stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, | 229 | stlbe->mas3, stlbe->mas7); |
| 229 | handler); | ||
| 230 | } | 230 | } |
| 231 | 231 | ||
| 232 | static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | 232 | static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, |
| @@ -269,7 +269,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, | |||
| 269 | tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; | 269 | tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; |
| 270 | victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; | 270 | victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; |
| 271 | pidsel = (vcpu_e500->mas4 >> 16) & 0xf; | 271 | pidsel = (vcpu_e500->mas4 >> 16) & 0xf; |
| 272 | tsized = (vcpu_e500->mas4 >> 8) & 0xf; | 272 | tsized = (vcpu_e500->mas4 >> 7) & 0x1f; |
| 273 | 273 | ||
| 274 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | 274 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) |
| 275 | | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); | 275 | | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); |
| @@ -309,7 +309,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
| 309 | vcpu_e500->shadow_pages[tlbsel][esel] = new_page; | 309 | vcpu_e500->shadow_pages[tlbsel][esel] = new_page; |
| 310 | 310 | ||
| 311 | /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ | 311 | /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ |
| 312 | stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K) | 312 | stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) |
| 313 | | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; | 313 | | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; |
| 314 | stlbe->mas2 = (gvaddr & MAS2_EPN) | 314 | stlbe->mas2 = (gvaddr & MAS2_EPN) |
| 315 | | e500_shadow_mas2_attrib(gtlbe->mas2, | 315 | | e500_shadow_mas2_attrib(gtlbe->mas2, |
| @@ -319,9 +319,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
| 319 | vcpu_e500->vcpu.arch.msr & MSR_PR); | 319 | vcpu_e500->vcpu.arch.msr & MSR_PR); |
| 320 | stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; | 320 | stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; |
| 321 | 321 | ||
| 322 | KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel), | 322 | trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, |
| 323 | stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, | 323 | stlbe->mas3, stlbe->mas7); |
| 324 | handler); | ||
| 325 | } | 324 | } |
| 326 | 325 | ||
| 327 | /* XXX only map the one-one case, for now use TLB0 */ | 326 | /* XXX only map the one-one case, for now use TLB0 */ |
| @@ -535,9 +534,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | |||
| 535 | gtlbe->mas3 = vcpu_e500->mas3; | 534 | gtlbe->mas3 = vcpu_e500->mas3; |
| 536 | gtlbe->mas7 = vcpu_e500->mas7; | 535 | gtlbe->mas7 = vcpu_e500->mas7; |
| 537 | 536 | ||
| 538 | KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0, | 537 | trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, |
| 539 | gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7, | 538 | gtlbe->mas3, gtlbe->mas7); |
| 540 | handler); | ||
| 541 | 539 | ||
| 542 | /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ | 540 | /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ |
| 543 | if (tlbe_is_host_safe(vcpu, gtlbe)) { | 541 | if (tlbe_is_host_safe(vcpu, gtlbe)) { |
| @@ -545,7 +543,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | |||
| 545 | case 0: | 543 | case 0: |
| 546 | /* TLB0 */ | 544 | /* TLB0 */ |
| 547 | gtlbe->mas1 &= ~MAS1_TSIZE(~0); | 545 | gtlbe->mas1 &= ~MAS1_TSIZE(~0); |
| 548 | gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K); | 546 | gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); |
| 549 | 547 | ||
| 550 | stlbsel = 0; | 548 | stlbsel = 0; |
| 551 | sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); | 549 | sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); |
| @@ -679,14 +677,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) | |||
| 679 | 677 | ||
| 680 | /* Insert large initial mapping for guest. */ | 678 | /* Insert large initial mapping for guest. */ |
| 681 | tlbe = &vcpu_e500->guest_tlb[1][0]; | 679 | tlbe = &vcpu_e500->guest_tlb[1][0]; |
| 682 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M); | 680 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); |
| 683 | tlbe->mas2 = 0; | 681 | tlbe->mas2 = 0; |
| 684 | tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; | 682 | tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; |
| 685 | tlbe->mas7 = 0; | 683 | tlbe->mas7 = 0; |
| 686 | 684 | ||
| 687 | /* 4K map for serial output. Used by kernel wrapper. */ | 685 | /* 4K map for serial output. Used by kernel wrapper. */ |
| 688 | tlbe = &vcpu_e500->guest_tlb[1][1]; | 686 | tlbe = &vcpu_e500->guest_tlb[1][1]; |
| 689 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K); | 687 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); |
| 690 | tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; | 688 | tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; |
| 691 | tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; | 689 | tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; |
| 692 | tlbe->mas7 = 0; | 690 | tlbe->mas7 = 0; |
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 45b064b7690..d28e3010a5e 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #define __KVM_E500_TLB_H__ | 16 | #define __KVM_E500_TLB_H__ |
| 17 | 17 | ||
| 18 | #include <linux/kvm_host.h> | 18 | #include <linux/kvm_host.h> |
| 19 | #include <asm/mmu-fsl-booke.h> | 19 | #include <asm/mmu-book3e.h> |
| 20 | #include <asm/tlb.h> | 20 | #include <asm/tlb.h> |
| 21 | #include <asm/kvm_e500.h> | 21 | #include <asm/kvm_e500.h> |
| 22 | 22 | ||
| @@ -59,7 +59,7 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); | |||
| 59 | /* TLB helper functions */ | 59 | /* TLB helper functions */ |
| 60 | static inline unsigned int get_tlb_size(const struct tlbe *tlbe) | 60 | static inline unsigned int get_tlb_size(const struct tlbe *tlbe) |
| 61 | { | 61 | { |
| 62 | return (tlbe->mas1 >> 8) & 0xf; | 62 | return (tlbe->mas1 >> 7) & 0x1f; |
| 63 | } | 63 | } |
| 64 | 64 | ||
| 65 | static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) | 65 | static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) |
| @@ -70,7 +70,7 @@ static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) | |||
| 70 | static inline u64 get_tlb_bytes(const struct tlbe *tlbe) | 70 | static inline u64 get_tlb_bytes(const struct tlbe *tlbe) |
| 71 | { | 71 | { |
| 72 | unsigned int pgsize = get_tlb_size(tlbe); | 72 | unsigned int pgsize = get_tlb_size(tlbe); |
| 73 | return 1ULL << 10 << (pgsize << 1); | 73 | return 1ULL << 10 << pgsize; |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | static inline gva_t get_tlb_end(const struct tlbe *tlbe) | 76 | static inline gva_t get_tlb_end(const struct tlbe *tlbe) |
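The e500 hunks above all follow from one change of page-size encoding: the old Freescale BookE layout keeps a 4-bit TSIZE field at bit 8 of MAS1 with the page size growing by powers of four, while the Book3E layout used from here on keeps a 5-bit TSIZE at bit 7 growing by powers of two. That is why the field extraction becomes (mas1 >> 7) & 0x1f and why get_tlb_bytes() drops the extra doubling. A standalone sketch of the arithmetic, assuming BOOKE_PAGESZ_4K == 1 and BOOK3E_PAGESZ_4K == 2 (neither constant is shown in these hunks):

    #include <stdio.h>

    /* Assumed encodings, for illustration only. */
    #define BOOKE_PAGESZ_4K    1    /* old: bytes = 1 KiB << (2 * tsize) */
    #define BOOK3E_PAGESZ_4K   2    /* new: bytes = 1 KiB << tsize       */

    int main(void)
    {
        unsigned long long old_bytes = 1ULL << 10 << (BOOKE_PAGESZ_4K << 1);
        unsigned long long new_bytes = 1ULL << 10 << BOOK3E_PAGESZ_4K;

        /* Both encodings describe the same 4 KiB page: prints "4096 4096". */
        printf("%llu %llu\n", old_bytes, new_bytes);
        return 0;
    }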
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index a561d6e8da1..7737146af3f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <asm/kvm_ppc.h> | 29 | #include <asm/kvm_ppc.h> |
| 30 | #include <asm/disassemble.h> | 30 | #include <asm/disassemble.h> |
| 31 | #include "timing.h" | 31 | #include "timing.h" |
| 32 | #include "trace.h" | ||
| 32 | 33 | ||
| 33 | #define OP_TRAP 3 | 34 | #define OP_TRAP 3 |
| 34 | 35 | ||
| @@ -187,7 +188,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 187 | case SPRN_SRR1: | 188 | case SPRN_SRR1: |
| 188 | vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; | 189 | vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; |
| 189 | case SPRN_PVR: | 190 | case SPRN_PVR: |
| 190 | vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; | 191 | vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break; |
| 192 | case SPRN_PIR: | ||
| 193 | vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break; | ||
| 191 | 194 | ||
| 192 | /* Note: mftb and TBRL/TBWL are user-accessible, so | 195 | /* Note: mftb and TBRL/TBWL are user-accessible, so |
| 193 | * the guest can always access the real TB anyways. | 196 | * the guest can always access the real TB anyways. |
| @@ -417,7 +420,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 417 | } | 420 | } |
| 418 | } | 421 | } |
| 419 | 422 | ||
| 420 | KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit); | 423 | trace_kvm_ppc_instr(inst, vcpu->arch.pc, emulated); |
| 421 | 424 | ||
| 422 | if (advance) | 425 | if (advance) |
| 423 | vcpu->arch.pc += 4; /* Advance past emulated instruction. */ | 426 | vcpu->arch.pc += 4; /* Advance past emulated instruction. */ |
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2cf915e51e7..2a4551f78f6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
| @@ -31,25 +31,17 @@ | |||
| 31 | #include "timing.h" | 31 | #include "timing.h" |
| 32 | #include "../mm/mmu_decl.h" | 32 | #include "../mm/mmu_decl.h" |
| 33 | 33 | ||
| 34 | #define CREATE_TRACE_POINTS | ||
| 35 | #include "trace.h" | ||
| 36 | |||
| 34 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | 37 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) |
| 35 | { | 38 | { |
| 36 | return gfn; | 39 | return gfn; |
| 37 | } | 40 | } |
| 38 | 41 | ||
| 39 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | ||
| 40 | { | ||
| 41 | return !!(v->arch.pending_exceptions); | ||
| 42 | } | ||
| 43 | |||
| 44 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
| 45 | { | ||
| 46 | /* do real check here */ | ||
| 47 | return 1; | ||
| 48 | } | ||
| 49 | |||
| 50 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) | 42 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) |
| 51 | { | 43 | { |
| 52 | return !(v->arch.msr & MSR_WE); | 44 | return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); |
| 53 | } | 45 | } |
| 54 | 46 | ||
| 55 | 47 | ||
| @@ -122,13 +114,17 @@ struct kvm *kvm_arch_create_vm(void) | |||
| 122 | static void kvmppc_free_vcpus(struct kvm *kvm) | 114 | static void kvmppc_free_vcpus(struct kvm *kvm) |
| 123 | { | 115 | { |
| 124 | unsigned int i; | 116 | unsigned int i; |
| 117 | struct kvm_vcpu *vcpu; | ||
| 125 | 118 | ||
| 126 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 119 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 127 | if (kvm->vcpus[i]) { | 120 | kvm_arch_vcpu_free(vcpu); |
| 128 | kvm_arch_vcpu_free(kvm->vcpus[i]); | 121 | |
| 129 | kvm->vcpus[i] = NULL; | 122 | mutex_lock(&kvm->lock); |
| 130 | } | 123 | for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) |
| 131 | } | 124 | kvm->vcpus[i] = NULL; |
| 125 | |||
| 126 | atomic_set(&kvm->online_vcpus, 0); | ||
| 127 | mutex_unlock(&kvm->lock); | ||
| 132 | } | 128 | } |
| 133 | 129 | ||
| 134 | void kvm_arch_sync_events(struct kvm *kvm) | 130 | void kvm_arch_sync_events(struct kvm *kvm) |
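kvmppc_free_vcpus() now walks only the vcpus that were actually created and then clears the online_vcpus count under kvm->lock, instead of probing all KVM_MAX_VCPUS slots. This relies on the generic kvm_for_each_vcpu() helper from include/linux/kvm_host.h; roughly, and only as an approximation of the real macro, which may differ in detail:

    /* Approximate shape of the iterator assumed above (illustrative only). */
    #define kvm_for_each_vcpu(idx, vcpup, kvm)                              \
        for ((idx) = 0;                                                     \
             (idx) < atomic_read(&(kvm)->online_vcpus) &&                   \
             ((vcpup) = kvm_get_vcpu((kvm), (idx))) != NULL;                \
             (idx)++)

The same pattern replaces the open-coded loop in the s390 kvm_free_vcpus() further down.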
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h new file mode 100644 index 00000000000..67f219de045 --- /dev/null +++ b/arch/powerpc/kvm/trace.h | |||
| @@ -0,0 +1,104 @@ | |||
| 1 | #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 2 | #define _TRACE_KVM_H | ||
| 3 | |||
| 4 | #include <linux/tracepoint.h> | ||
| 5 | |||
| 6 | #undef TRACE_SYSTEM | ||
| 7 | #define TRACE_SYSTEM kvm | ||
| 8 | #define TRACE_INCLUDE_PATH . | ||
| 9 | #define TRACE_INCLUDE_FILE trace | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Tracepoint for guest mode entry. | ||
| 13 | */ | ||
| 14 | TRACE_EVENT(kvm_ppc_instr, | ||
| 15 | TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate), | ||
| 16 | TP_ARGS(inst, pc, emulate), | ||
| 17 | |||
| 18 | TP_STRUCT__entry( | ||
| 19 | __field( unsigned int, inst ) | ||
| 20 | __field( unsigned long, pc ) | ||
| 21 | __field( unsigned int, emulate ) | ||
| 22 | ), | ||
| 23 | |||
| 24 | TP_fast_assign( | ||
| 25 | __entry->inst = inst; | ||
| 26 | __entry->pc = pc; | ||
| 27 | __entry->emulate = emulate; | ||
| 28 | ), | ||
| 29 | |||
| 30 | TP_printk("inst %u pc 0x%lx emulate %u\n", | ||
| 31 | __entry->inst, __entry->pc, __entry->emulate) | ||
| 32 | ); | ||
| 33 | |||
| 34 | TRACE_EVENT(kvm_stlb_inval, | ||
| 35 | TP_PROTO(unsigned int stlb_index), | ||
| 36 | TP_ARGS(stlb_index), | ||
| 37 | |||
| 38 | TP_STRUCT__entry( | ||
| 39 | __field( unsigned int, stlb_index ) | ||
| 40 | ), | ||
| 41 | |||
| 42 | TP_fast_assign( | ||
| 43 | __entry->stlb_index = stlb_index; | ||
| 44 | ), | ||
| 45 | |||
| 46 | TP_printk("stlb_index %u", __entry->stlb_index) | ||
| 47 | ); | ||
| 48 | |||
| 49 | TRACE_EVENT(kvm_stlb_write, | ||
| 50 | TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0, | ||
| 51 | unsigned int word1, unsigned int word2), | ||
| 52 | TP_ARGS(victim, tid, word0, word1, word2), | ||
| 53 | |||
| 54 | TP_STRUCT__entry( | ||
| 55 | __field( unsigned int, victim ) | ||
| 56 | __field( unsigned int, tid ) | ||
| 57 | __field( unsigned int, word0 ) | ||
| 58 | __field( unsigned int, word1 ) | ||
| 59 | __field( unsigned int, word2 ) | ||
| 60 | ), | ||
| 61 | |||
| 62 | TP_fast_assign( | ||
| 63 | __entry->victim = victim; | ||
| 64 | __entry->tid = tid; | ||
| 65 | __entry->word0 = word0; | ||
| 66 | __entry->word1 = word1; | ||
| 67 | __entry->word2 = word2; | ||
| 68 | ), | ||
| 69 | |||
| 70 | TP_printk("victim %u tid %u w0 %u w1 %u w2 %u", | ||
| 71 | __entry->victim, __entry->tid, __entry->word0, | ||
| 72 | __entry->word1, __entry->word2) | ||
| 73 | ); | ||
| 74 | |||
| 75 | TRACE_EVENT(kvm_gtlb_write, | ||
| 76 | TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0, | ||
| 77 | unsigned int word1, unsigned int word2), | ||
| 78 | TP_ARGS(gtlb_index, tid, word0, word1, word2), | ||
| 79 | |||
| 80 | TP_STRUCT__entry( | ||
| 81 | __field( unsigned int, gtlb_index ) | ||
| 82 | __field( unsigned int, tid ) | ||
| 83 | __field( unsigned int, word0 ) | ||
| 84 | __field( unsigned int, word1 ) | ||
| 85 | __field( unsigned int, word2 ) | ||
| 86 | ), | ||
| 87 | |||
| 88 | TP_fast_assign( | ||
| 89 | __entry->gtlb_index = gtlb_index; | ||
| 90 | __entry->tid = tid; | ||
| 91 | __entry->word0 = word0; | ||
| 92 | __entry->word1 = word1; | ||
| 93 | __entry->word2 = word2; | ||
| 94 | ), | ||
| 95 | |||
| 96 | TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u", | ||
| 97 | __entry->gtlb_index, __entry->tid, __entry->word0, | ||
| 98 | __entry->word1, __entry->word2) | ||
| 99 | ); | ||
| 100 | |||
| 101 | #endif /* _TRACE_KVM_H */ | ||
| 102 | |||
| 103 | /* This part must be outside protection */ | ||
| 104 | #include <trace/define_trace.h> | ||
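The new header uses the standard TRACE_EVENT() machinery. Because it sets TRACE_INCLUDE_PATH to ".", every translation unit that fires one of these events has to find arch/powerpc/kvm/trace.h on its include path, which is what the per-object -I. flags added to the Makefile above provide, and exactly one object per module has to define CREATE_TRACE_POINTS before including it, which powerpc.c does above. A minimal sketch of the calling convention the rest of the series relies on:

    /* In exactly one file of the module (here powerpc.c): */
    #define CREATE_TRACE_POINTS
    #include "trace.h"

    /* In any other user (e500_tlb.c, emulate.c, ...): */
    #include "trace.h"

    /* Firing an event is then an ordinary call that is a cheap no-op
     * unless the tracepoint is enabled at run time: */
    trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
                         stlbe->mas3, stlbe->mas7);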
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 0b2f829f6d5..3dfcaeb5d7f 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h | |||
| @@ -15,15 +15,6 @@ | |||
| 15 | */ | 15 | */ |
| 16 | #include <linux/types.h> | 16 | #include <linux/types.h> |
| 17 | 17 | ||
| 18 | /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ | ||
| 19 | struct kvm_pic_state { | ||
| 20 | /* no PIC for s390 */ | ||
| 21 | }; | ||
| 22 | |||
| 23 | struct kvm_ioapic_state { | ||
| 24 | /* no IOAPIC for s390 */ | ||
| 25 | }; | ||
| 26 | |||
| 27 | /* for KVM_GET_REGS and KVM_SET_REGS */ | 18 | /* for KVM_GET_REGS and KVM_SET_REGS */ |
| 28 | struct kvm_regs { | 19 | struct kvm_regs { |
| 29 | /* general purpose regs for s390 */ | 20 | /* general purpose regs for s390 */ |
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 698988f6940..27605b62b98 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * asm-s390/kvm_host.h - definition for kernel virtual machines on s390 | 2 | * asm-s390/kvm_host.h - definition for kernel virtual machines on s390 |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -40,7 +40,11 @@ struct sca_block { | |||
| 40 | struct sca_entry cpu[64]; | 40 | struct sca_entry cpu[64]; |
| 41 | } __attribute__((packed)); | 41 | } __attribute__((packed)); |
| 42 | 42 | ||
| 43 | #define KVM_PAGES_PER_HPAGE 256 | 43 | #define KVM_NR_PAGE_SIZES 2 |
| 44 | #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) | ||
| 45 | #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) | ||
| 46 | #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) | ||
| 47 | #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) | ||
| 44 | 48 | ||
| 45 | #define CPUSTAT_HOST 0x80000000 | 49 | #define CPUSTAT_HOST 0x80000000 |
| 46 | #define CPUSTAT_WAIT 0x10000000 | 50 | #define CPUSTAT_WAIT 0x10000000 |
| @@ -182,8 +186,9 @@ struct kvm_s390_interrupt_info { | |||
| 182 | }; | 186 | }; |
| 183 | 187 | ||
| 184 | /* for local_interrupt.action_flags */ | 188 | /* for local_interrupt.action_flags */ |
| 185 | #define ACTION_STORE_ON_STOP 1 | 189 | #define ACTION_STORE_ON_STOP (1<<0) |
| 186 | #define ACTION_STOP_ON_STOP 2 | 190 | #define ACTION_STOP_ON_STOP (1<<1) |
| 191 | #define ACTION_RELOADVCPU_ON_STOP (1<<2) | ||
| 187 | 192 | ||
| 188 | struct kvm_s390_local_interrupt { | 193 | struct kvm_s390_local_interrupt { |
| 189 | spinlock_t lock; | 194 | spinlock_t lock; |
| @@ -227,8 +232,6 @@ struct kvm_vm_stat { | |||
| 227 | }; | 232 | }; |
| 228 | 233 | ||
| 229 | struct kvm_arch{ | 234 | struct kvm_arch{ |
| 230 | unsigned long guest_origin; | ||
| 231 | unsigned long guest_memsize; | ||
| 232 | struct sca_block *sca; | 235 | struct sca_block *sca; |
| 233 | debug_info_t *dbf; | 236 | debug_info_t *dbf; |
| 234 | struct kvm_s390_float_interrupt float_int; | 237 | struct kvm_s390_float_interrupt float_int; |
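The parameterised hugepage macros keep the old s390 behaviour as the x == 2 case: with 4 KiB base pages (PAGE_SHIFT == 12, assumed here and not shown in the hunk), KVM_HPAGE_SHIFT(2) is 20, i.e. 1 MiB, and KVM_PAGES_PER_HPAGE(2) is 256, which matches the constant the hunk removes. A standalone check of that arithmetic:

    #include <stdio.h>

    #define PAGE_SHIFT              12                  /* assumed base page size */
    #define PAGE_SIZE               (1UL << PAGE_SHIFT)
    #define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + ((x) - 1) * 8)
    #define KVM_HPAGE_SIZE(x)       (1UL << KVM_HPAGE_SHIFT(x))
    #define KVM_PAGES_PER_HPAGE(x)  (KVM_HPAGE_SIZE(x) / PAGE_SIZE)

    int main(void)
    {
        /* Level 2 is the 1 MiB segment size; this prints 256. */
        printf("%lu\n", KVM_PAGES_PER_HPAGE(2));
        return 0;
    }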
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 2c503796b61..6964db226f8 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h | |||
| @@ -13,6 +13,8 @@ | |||
| 13 | #ifndef __S390_KVM_PARA_H | 13 | #ifndef __S390_KVM_PARA_H |
| 14 | #define __S390_KVM_PARA_H | 14 | #define __S390_KVM_PARA_H |
| 15 | 15 | ||
| 16 | #ifdef __KERNEL__ | ||
| 17 | |||
| 16 | /* | 18 | /* |
| 17 | * Hypercalls for KVM on s390. The calling convention is similar to the | 19 | * Hypercalls for KVM on s390. The calling convention is similar to the |
| 18 | * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 | 20 | * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 |
| @@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void) | |||
| 147 | return 0; | 149 | return 0; |
| 148 | } | 150 | } |
| 149 | 151 | ||
| 152 | #endif | ||
| 153 | |||
| 150 | #endif /* __S390_KVM_PARA_H */ | 154 | #endif /* __S390_KVM_PARA_H */ |
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 3e260b7e37b..bf164fc2186 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig | |||
| @@ -1,11 +1,7 @@ | |||
| 1 | # | 1 | # |
| 2 | # KVM configuration | 2 | # KVM configuration |
| 3 | # | 3 | # |
| 4 | config HAVE_KVM | 4 | source "virt/kvm/Kconfig" |
| 5 | bool | ||
| 6 | |||
| 7 | config HAVE_KVM_IRQCHIP | ||
| 8 | bool | ||
| 9 | 5 | ||
| 10 | menuconfig VIRTUALIZATION | 6 | menuconfig VIRTUALIZATION |
| 11 | bool "Virtualization" | 7 | bool "Virtualization" |
| @@ -38,9 +34,6 @@ config KVM | |||
| 38 | 34 | ||
| 39 | If unsure, say N. | 35 | If unsure, say N. |
| 40 | 36 | ||
| 41 | config KVM_TRACE | ||
| 42 | bool | ||
| 43 | |||
| 44 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | 37 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under |
| 45 | # the virtualization menu. | 38 | # the virtualization menu. |
| 46 | source drivers/virtio/Kconfig | 39 | source drivers/virtio/Kconfig |
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index ed60f3a74a8..03c716a0f01 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * gaccess.h - access guest memory | 2 | * gaccess.h - access guest memory |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -16,13 +16,14 @@ | |||
| 16 | #include <linux/compiler.h> | 16 | #include <linux/compiler.h> |
| 17 | #include <linux/kvm_host.h> | 17 | #include <linux/kvm_host.h> |
| 18 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
| 19 | #include "kvm-s390.h" | ||
| 19 | 20 | ||
| 20 | static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, | 21 | static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, |
| 21 | unsigned long guestaddr) | 22 | unsigned long guestaddr) |
| 22 | { | 23 | { |
| 23 | unsigned long prefix = vcpu->arch.sie_block->prefix; | 24 | unsigned long prefix = vcpu->arch.sie_block->prefix; |
| 24 | unsigned long origin = vcpu->kvm->arch.guest_origin; | 25 | unsigned long origin = vcpu->arch.sie_block->gmsor; |
| 25 | unsigned long memsize = vcpu->kvm->arch.guest_memsize; | 26 | unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); |
| 26 | 27 | ||
| 27 | if (guestaddr < 2 * PAGE_SIZE) | 28 | if (guestaddr < 2 * PAGE_SIZE) |
| 28 | guestaddr += prefix; | 29 | guestaddr += prefix; |
| @@ -158,8 +159,8 @@ static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest, | |||
| 158 | const void *from, unsigned long n) | 159 | const void *from, unsigned long n) |
| 159 | { | 160 | { |
| 160 | unsigned long prefix = vcpu->arch.sie_block->prefix; | 161 | unsigned long prefix = vcpu->arch.sie_block->prefix; |
| 161 | unsigned long origin = vcpu->kvm->arch.guest_origin; | 162 | unsigned long origin = vcpu->arch.sie_block->gmsor; |
| 162 | unsigned long memsize = vcpu->kvm->arch.guest_memsize; | 163 | unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); |
| 163 | 164 | ||
| 164 | if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE)) | 165 | if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE)) |
| 165 | goto slowpath; | 166 | goto slowpath; |
| @@ -209,8 +210,8 @@ static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to, | |||
| 209 | unsigned long guestsrc, unsigned long n) | 210 | unsigned long guestsrc, unsigned long n) |
| 210 | { | 211 | { |
| 211 | unsigned long prefix = vcpu->arch.sie_block->prefix; | 212 | unsigned long prefix = vcpu->arch.sie_block->prefix; |
| 212 | unsigned long origin = vcpu->kvm->arch.guest_origin; | 213 | unsigned long origin = vcpu->arch.sie_block->gmsor; |
| 213 | unsigned long memsize = vcpu->kvm->arch.guest_memsize; | 214 | unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); |
| 214 | 215 | ||
| 215 | if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE)) | 216 | if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE)) |
| 216 | goto slowpath; | 217 | goto slowpath; |
| @@ -244,8 +245,8 @@ static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu, | |||
| 244 | unsigned long guestdest, | 245 | unsigned long guestdest, |
| 245 | const void *from, unsigned long n) | 246 | const void *from, unsigned long n) |
| 246 | { | 247 | { |
| 247 | unsigned long origin = vcpu->kvm->arch.guest_origin; | 248 | unsigned long origin = vcpu->arch.sie_block->gmsor; |
| 248 | unsigned long memsize = vcpu->kvm->arch.guest_memsize; | 249 | unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); |
| 249 | 250 | ||
| 250 | if (guestdest + n > memsize) | 251 | if (guestdest + n > memsize) |
| 251 | return -EFAULT; | 252 | return -EFAULT; |
| @@ -262,8 +263,8 @@ static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to, | |||
| 262 | unsigned long guestsrc, | 263 | unsigned long guestsrc, |
| 263 | unsigned long n) | 264 | unsigned long n) |
| 264 | { | 265 | { |
| 265 | unsigned long origin = vcpu->kvm->arch.guest_origin; | 266 | unsigned long origin = vcpu->arch.sie_block->gmsor; |
| 266 | unsigned long memsize = vcpu->kvm->arch.guest_memsize; | 267 | unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu); |
| 267 | 268 | ||
| 268 | if (guestsrc + n > memsize) | 269 | if (guestsrc + n > memsize) |
| 269 | return -EFAULT; | 270 | return -EFAULT; |
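All of these helpers now take the guest origin from the vcpu's SIE block (gmsor) and the guest size from kvm_s390_vcpu_get_memsize() instead of the removed per-VM guest_origin/guest_memsize fields; the translation itself is unchanged. In the part of __guestaddr_to_user() visible above, a guest real address below the first two pages is relocated by the prefix register and the result is offset by the origin. A simplified standalone model of just that arithmetic (the unshown remainder of the real helper presumably also maps the prefix area back to absolute zero and bounds-checks against memsize):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Simplified model of the visible part of __guestaddr_to_user(). */
    static unsigned long guest_to_user(unsigned long guestaddr,
                                       unsigned long prefix,
                                       unsigned long origin)
    {
        if (guestaddr < 2 * PAGE_SIZE)      /* low core lives at the prefix */
            guestaddr += prefix;
        return origin + guestaddr;
    }

    int main(void)
    {
        /* Made-up numbers: guest address 0x1000 with a 2 MiB prefix. */
        printf("%#lx\n", guest_to_user(0x1000, 0x200000, 0x40000000UL));
        return 0;
    }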
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 98997ccba50..ba9d8a7bc1a 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * intercept.c - in-kernel handling for sie intercepts | 2 | * intercept.c - in-kernel handling for sie intercepts |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -128,7 +128,7 @@ static int handle_noop(struct kvm_vcpu *vcpu) | |||
| 128 | 128 | ||
| 129 | static int handle_stop(struct kvm_vcpu *vcpu) | 129 | static int handle_stop(struct kvm_vcpu *vcpu) |
| 130 | { | 130 | { |
| 131 | int rc; | 131 | int rc = 0; |
| 132 | 132 | ||
| 133 | vcpu->stat.exit_stop_request++; | 133 | vcpu->stat.exit_stop_request++; |
| 134 | atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); | 134 | atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); |
| @@ -141,12 +141,18 @@ static int handle_stop(struct kvm_vcpu *vcpu) | |||
| 141 | rc = -ENOTSUPP; | 141 | rc = -ENOTSUPP; |
| 142 | } | 142 | } |
| 143 | 143 | ||
| 144 | if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { | ||
| 145 | vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; | ||
| 146 | rc = SIE_INTERCEPT_RERUNVCPU; | ||
| 147 | vcpu->run->exit_reason = KVM_EXIT_INTR; | ||
| 148 | } | ||
| 149 | |||
| 144 | if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { | 150 | if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { |
| 145 | vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; | 151 | vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; |
| 146 | VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); | 152 | VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); |
| 147 | rc = -ENOTSUPP; | 153 | rc = -ENOTSUPP; |
| 148 | } else | 154 | } |
| 149 | rc = 0; | 155 | |
| 150 | spin_unlock_bh(&vcpu->arch.local_int.lock); | 156 | spin_unlock_bh(&vcpu->arch.local_int.lock); |
| 151 | return rc; | 157 | return rc; |
| 152 | } | 158 | } |
| @@ -158,9 +164,9 @@ static int handle_validity(struct kvm_vcpu *vcpu) | |||
| 158 | 164 | ||
| 159 | vcpu->stat.exit_validity++; | 165 | vcpu->stat.exit_validity++; |
| 160 | if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix | 166 | if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix |
| 161 | <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){ | 167 | <= kvm_s390_vcpu_get_memsize(vcpu) - 2*PAGE_SIZE)) { |
| 162 | rc = fault_in_pages_writeable((char __user *) | 168 | rc = fault_in_pages_writeable((char __user *) |
| 163 | vcpu->kvm->arch.guest_origin + | 169 | vcpu->arch.sie_block->gmsor + |
| 164 | vcpu->arch.sie_block->prefix, | 170 | vcpu->arch.sie_block->prefix, |
| 165 | 2*PAGE_SIZE); | 171 | 2*PAGE_SIZE); |
| 166 | if (rc) | 172 | if (rc) |
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4d613415c43..2c2f9835341 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c | |||
| @@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) | |||
| 283 | return 1; | 283 | return 1; |
| 284 | } | 284 | } |
| 285 | 285 | ||
| 286 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) | 286 | static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) |
| 287 | { | 287 | { |
| 288 | struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; | 288 | struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; |
| 289 | struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; | 289 | struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; |
| @@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) | |||
| 320 | return rc; | 320 | return rc; |
| 321 | } | 321 | } |
| 322 | 322 | ||
| 323 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
| 324 | { | ||
| 325 | /* do real check here */ | ||
| 326 | return 1; | ||
| 327 | } | ||
| 328 | |||
| 329 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) | 323 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) |
| 330 | { | 324 | { |
| 331 | return 0; | 325 | return 0; |
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 90d9d1ba258..07ced89740d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * s390host.c -- hosting zSeries kernel virtual machines | 2 | * s390host.c -- hosting zSeries kernel virtual machines |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -10,6 +10,7 @@ | |||
| 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> | 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> |
| 11 | * Christian Borntraeger <borntraeger@de.ibm.com> | 11 | * Christian Borntraeger <borntraeger@de.ibm.com> |
| 12 | * Heiko Carstens <heiko.carstens@de.ibm.com> | 12 | * Heiko Carstens <heiko.carstens@de.ibm.com> |
| 13 | * Christian Ehrhardt <ehrhardt@de.ibm.com> | ||
| 13 | */ | 14 | */ |
| 14 | 15 | ||
| 15 | #include <linux/compiler.h> | 16 | #include <linux/compiler.h> |
| @@ -210,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | |||
| 210 | static void kvm_free_vcpus(struct kvm *kvm) | 211 | static void kvm_free_vcpus(struct kvm *kvm) |
| 211 | { | 212 | { |
| 212 | unsigned int i; | 213 | unsigned int i; |
| 214 | struct kvm_vcpu *vcpu; | ||
| 213 | 215 | ||
| 214 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 216 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 215 | if (kvm->vcpus[i]) { | 217 | kvm_arch_vcpu_destroy(vcpu); |
| 216 | kvm_arch_vcpu_destroy(kvm->vcpus[i]); | 218 | |
| 217 | kvm->vcpus[i] = NULL; | 219 | mutex_lock(&kvm->lock); |
| 218 | } | 220 | for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) |
| 219 | } | 221 | kvm->vcpus[i] = NULL; |
| 222 | |||
| 223 | atomic_set(&kvm->online_vcpus, 0); | ||
| 224 | mutex_unlock(&kvm->lock); | ||
| 220 | } | 225 | } |
| 221 | 226 | ||
| 222 | void kvm_arch_sync_events(struct kvm *kvm) | 227 | void kvm_arch_sync_events(struct kvm *kvm) |
| @@ -278,16 +283,10 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) | |||
| 278 | vcpu->arch.sie_block->gbea = 1; | 283 | vcpu->arch.sie_block->gbea = 1; |
| 279 | } | 284 | } |
| 280 | 285 | ||
| 281 | /* The current code can have up to 256 pages for virtio */ | ||
| 282 | #define VIRTIODESCSPACE (256ul * 4096ul) | ||
| 283 | |||
| 284 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | 286 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
| 285 | { | 287 | { |
| 286 | atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); | 288 | atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); |
| 287 | vcpu->arch.sie_block->gmslm = vcpu->kvm->arch.guest_memsize + | 289 | set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests); |
| 288 | vcpu->kvm->arch.guest_origin + | ||
| 289 | VIRTIODESCSPACE - 1ul; | ||
| 290 | vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin; | ||
| 291 | vcpu->arch.sie_block->ecb = 2; | 290 | vcpu->arch.sie_block->ecb = 2; |
| 292 | vcpu->arch.sie_block->eca = 0xC1002001U; | 291 | vcpu->arch.sie_block->eca = 0xC1002001U; |
| 293 | vcpu->arch.sie_block->fac = (int) (long) facilities; | 292 | vcpu->arch.sie_block->fac = (int) (long) facilities; |
| @@ -319,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | |||
| 319 | BUG_ON(!kvm->arch.sca); | 318 | BUG_ON(!kvm->arch.sca); |
| 320 | if (!kvm->arch.sca->cpu[id].sda) | 319 | if (!kvm->arch.sca->cpu[id].sda) |
| 321 | kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; | 320 | kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; |
| 322 | else | ||
| 323 | BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */ | ||
| 324 | vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); | 321 | vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); |
| 325 | vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; | 322 | vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; |
| 326 | 323 | ||
| @@ -490,9 +487,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 490 | 487 | ||
| 491 | vcpu_load(vcpu); | 488 | vcpu_load(vcpu); |
| 492 | 489 | ||
| 490 | rerun_vcpu: | ||
| 491 | if (vcpu->requests) | ||
| 492 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | ||
| 493 | kvm_s390_vcpu_set_mem(vcpu); | ||
| 494 | |||
| 493 | /* verify, that memory has been registered */ | 495 | /* verify, that memory has been registered */ |
| 494 | if (!vcpu->kvm->arch.guest_memsize) { | 496 | if (!vcpu->arch.sie_block->gmslm) { |
| 495 | vcpu_put(vcpu); | 497 | vcpu_put(vcpu); |
| 498 | VCPU_EVENT(vcpu, 3, "%s", "no memory registered to run vcpu"); | ||
| 496 | return -EINVAL; | 499 | return -EINVAL; |
| 497 | } | 500 | } |
| 498 | 501 | ||
| @@ -509,6 +512,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 509 | vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; | 512 | vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; |
| 510 | break; | 513 | break; |
| 511 | case KVM_EXIT_UNKNOWN: | 514 | case KVM_EXIT_UNKNOWN: |
| 515 | case KVM_EXIT_INTR: | ||
| 512 | case KVM_EXIT_S390_RESET: | 516 | case KVM_EXIT_S390_RESET: |
| 513 | break; | 517 | break; |
| 514 | default: | 518 | default: |
| @@ -522,8 +526,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 522 | rc = kvm_handle_sie_intercept(vcpu); | 526 | rc = kvm_handle_sie_intercept(vcpu); |
| 523 | } while (!signal_pending(current) && !rc); | 527 | } while (!signal_pending(current) && !rc); |
| 524 | 528 | ||
| 525 | if (signal_pending(current) && !rc) | 529 | if (rc == SIE_INTERCEPT_RERUNVCPU) |
| 530 | goto rerun_vcpu; | ||
| 531 | |||
| 532 | if (signal_pending(current) && !rc) { | ||
| 533 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
| 526 | rc = -EINTR; | 534 | rc = -EINTR; |
| 535 | } | ||
| 527 | 536 | ||
| 528 | if (rc == -ENOTSUPP) { | 537 | if (rc == -ENOTSUPP) { |
| 529 | /* intercept cannot be handled in-kernel, prepare kvm-run */ | 538 | /* intercept cannot be handled in-kernel, prepare kvm-run */ |
| @@ -676,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
| 676 | int user_alloc) | 685 | int user_alloc) |
| 677 | { | 686 | { |
| 678 | int i; | 687 | int i; |
| 688 | struct kvm_vcpu *vcpu; | ||
| 679 | 689 | ||
| 680 | /* A few sanity checks. We can have exactly one memory slot which has | 690 | /* A few sanity checks. We can have exactly one memory slot which has |
| 681 | to start at guest virtual zero and which has to be located at a | 691 | to start at guest virtual zero and which has to be located at a |
| @@ -684,7 +694,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
| 684 | vmas. It is okay to mmap() and munmap() stuff in this slot after | 694 | vmas. It is okay to mmap() and munmap() stuff in this slot after |
| 685 | doing this call at any time */ | 695 | doing this call at any time */ |
| 686 | 696 | ||
| 687 | if (mem->slot || kvm->arch.guest_memsize) | 697 | if (mem->slot) |
| 688 | return -EINVAL; | 698 | return -EINVAL; |
| 689 | 699 | ||
| 690 | if (mem->guest_phys_addr) | 700 | if (mem->guest_phys_addr) |
| @@ -699,36 +709,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
| 699 | if (!user_alloc) | 709 | if (!user_alloc) |
| 700 | return -EINVAL; | 710 | return -EINVAL; |
| 701 | 711 | ||
| 702 | /* lock all vcpus */ | 712 | /* request update of sie control block for all available vcpus */ |
| 703 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 713 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 704 | if (!kvm->vcpus[i]) | 714 | if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) |
| 705 | continue; | 715 | continue; |
| 706 | if (!mutex_trylock(&kvm->vcpus[i]->mutex)) | 716 | kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); |
| 707 | goto fail_out; | ||
| 708 | } | ||
| 709 | |||
| 710 | kvm->arch.guest_origin = mem->userspace_addr; | ||
| 711 | kvm->arch.guest_memsize = mem->memory_size; | ||
| 712 | |||
| 713 | /* update sie control blocks, and unlock all vcpus */ | ||
| 714 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
| 715 | if (kvm->vcpus[i]) { | ||
| 716 | kvm->vcpus[i]->arch.sie_block->gmsor = | ||
| 717 | kvm->arch.guest_origin; | ||
| 718 | kvm->vcpus[i]->arch.sie_block->gmslm = | ||
| 719 | kvm->arch.guest_memsize + | ||
| 720 | kvm->arch.guest_origin + | ||
| 721 | VIRTIODESCSPACE - 1ul; | ||
| 722 | mutex_unlock(&kvm->vcpus[i]->mutex); | ||
| 723 | } | ||
| 724 | } | 717 | } |
| 725 | 718 | ||
| 726 | return 0; | 719 | return 0; |
| 727 | |||
| 728 | fail_out: | ||
| 729 | for (; i >= 0; i--) | ||
| 730 | mutex_unlock(&kvm->vcpus[i]->mutex); | ||
| 731 | return -EINVAL; | ||
| 732 | } | 720 | } |
| 733 | 721 | ||
| 734 | void kvm_arch_flush_shadow(struct kvm *kvm) | 722 | void kvm_arch_flush_shadow(struct kvm *kvm) |
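Memory registration no longer trylocks every vcpu and patches gmsor/gmslm directly. Instead kvm_arch_set_memory_region() marks each existing vcpu with KVM_REQ_MMU_RELOAD and injects a SIGP stop carrying the new ACTION_RELOADVCPU_ON_STOP bit; handle_stop() converts that into SIE_INTERCEPT_RERUNVCPU, and the run loop re-enters through the rerun_vcpu label, where the pending request makes it call kvm_s390_vcpu_set_mem() before going back into SIE. Abridged from the hunks above, the resulting control flow is roughly:

        /* kvm_arch_vcpu_ioctl_run(), abridged: */
    rerun_vcpu:
        if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
            kvm_s390_vcpu_set_mem(vcpu);        /* re-read memslot 0 */

        do {
            /* ... enter SIE, then ... */
            rc = kvm_handle_sie_intercept(vcpu);
        } while (!signal_pending(current) && !rc);

        if (rc == SIE_INTERCEPT_RERUNVCPU)
            goto rerun_vcpu;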
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 748fee87232..ec5eee7c25d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kvm_s390.h - definition for kvm on s390 | 2 | * kvm_s390.h - definition for kvm on s390 |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -9,6 +9,7 @@ | |||
| 9 | * | 9 | * |
| 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> | 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> |
| 11 | * Christian Borntraeger <borntraeger@de.ibm.com> | 11 | * Christian Borntraeger <borntraeger@de.ibm.com> |
| 12 | * Christian Ehrhardt <ehrhardt@de.ibm.com> | ||
| 12 | */ | 13 | */ |
| 13 | 14 | ||
| 14 | #ifndef ARCH_S390_KVM_S390_H | 15 | #ifndef ARCH_S390_KVM_S390_H |
| @@ -18,8 +19,13 @@ | |||
| 18 | #include <linux/kvm.h> | 19 | #include <linux/kvm.h> |
| 19 | #include <linux/kvm_host.h> | 20 | #include <linux/kvm_host.h> |
| 20 | 21 | ||
| 22 | /* The current code can have up to 256 pages for virtio */ | ||
| 23 | #define VIRTIODESCSPACE (256ul * 4096ul) | ||
| 24 | |||
| 21 | typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); | 25 | typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); |
| 22 | 26 | ||
| 27 | /* negative values are error codes, positive values for internal conditions */ | ||
| 28 | #define SIE_INTERCEPT_RERUNVCPU (1<<0) | ||
| 23 | int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); | 29 | int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); |
| 24 | 30 | ||
| 25 | #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ | 31 | #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ |
| @@ -50,6 +56,30 @@ int kvm_s390_inject_vm(struct kvm *kvm, | |||
| 50 | int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, | 56 | int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, |
| 51 | struct kvm_s390_interrupt *s390int); | 57 | struct kvm_s390_interrupt *s390int); |
| 52 | int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); | 58 | int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); |
| 59 | int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); | ||
| 60 | |||
| 61 | static inline int kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu) | ||
| 62 | { | ||
| 63 | return vcpu->arch.sie_block->gmslm | ||
| 64 | - vcpu->arch.sie_block->gmsor | ||
| 65 | - VIRTIODESCSPACE + 1ul; | ||
| 66 | } | ||
| 67 | |||
| 68 | static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) | ||
| 69 | { | ||
| 70 | struct kvm_memory_slot *mem; | ||
| 71 | |||
| 72 | down_read(&vcpu->kvm->slots_lock); | ||
| 73 | mem = &vcpu->kvm->memslots[0]; | ||
| 74 | |||
| 75 | vcpu->arch.sie_block->gmsor = mem->userspace_addr; | ||
| 76 | vcpu->arch.sie_block->gmslm = | ||
| 77 | mem->userspace_addr + | ||
| 78 | (mem->npages << PAGE_SHIFT) + | ||
| 79 | VIRTIODESCSPACE - 1ul; | ||
| 80 | |||
| 81 | up_read(&vcpu->kvm->slots_lock); | ||
| 82 | } | ||
| 53 | 83 | ||
| 54 | /* implemented in priv.c */ | 84 | /* implemented in priv.c */ |
| 55 | int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); | 85 | int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); |
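kvm_s390_vcpu_get_memsize() is the exact inverse of kvm_s390_vcpu_set_mem(): set_mem programs gmsor = userspace_addr and gmslm = userspace_addr + (npages << PAGE_SHIFT) + VIRTIODESCSPACE - 1, so subtracting gmsor and the virtio window and adding one recovers the registered size. A standalone check of that round trip (the addresses are made up):

    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define VIRTIODESCSPACE (256ul * 4096ul)

    int main(void)
    {
        unsigned long userspace_addr = 0x40000000ul;    /* arbitrary */
        unsigned long npages = 0x20000ul;               /* 512 MiB guest */

        unsigned long gmsor = userspace_addr;
        unsigned long gmslm = userspace_addr + (npages << PAGE_SHIFT)
                              + VIRTIODESCSPACE - 1ul;

        unsigned long memsize = gmslm - gmsor - VIRTIODESCSPACE + 1ul;

        /* Prints 1: the round trip recovers npages << PAGE_SHIFT. */
        printf("%d\n", memsize == (npages << PAGE_SHIFT));
        return 0;
    }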
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 0ef81d6776e..40c8c6748cf 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * sigp.c - handling interprocessor communication | 2 | * sigp.c - handling interprocessor communication |
| 3 | * | 3 | * |
| 4 | * Copyright IBM Corp. 2008 | 4 | * Copyright IBM Corp. 2008,2009 |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License (version 2 only) | 7 | * it under the terms of the GNU General Public License (version 2 only) |
| @@ -9,6 +9,7 @@ | |||
| 9 | * | 9 | * |
| 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> | 10 | * Author(s): Carsten Otte <cotte@de.ibm.com> |
| 11 | * Christian Borntraeger <borntraeger@de.ibm.com> | 11 | * Christian Borntraeger <borntraeger@de.ibm.com> |
| 12 | * Christian Ehrhardt <ehrhardt@de.ibm.com> | ||
| 12 | */ | 13 | */ |
| 13 | 14 | ||
| 14 | #include <linux/kvm.h> | 15 | #include <linux/kvm.h> |
| @@ -107,46 +108,57 @@ unlock: | |||
| 107 | return rc; | 108 | return rc; |
| 108 | } | 109 | } |
| 109 | 110 | ||
| 110 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store) | 111 | static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) |
| 111 | { | 112 | { |
| 112 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | ||
| 113 | struct kvm_s390_local_interrupt *li; | ||
| 114 | struct kvm_s390_interrupt_info *inti; | 113 | struct kvm_s390_interrupt_info *inti; |
| 115 | int rc; | ||
| 116 | |||
| 117 | if (cpu_addr >= KVM_MAX_VCPUS) | ||
| 118 | return 3; /* not operational */ | ||
| 119 | 114 | ||
| 120 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 115 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
| 121 | if (!inti) | 116 | if (!inti) |
| 122 | return -ENOMEM; | 117 | return -ENOMEM; |
| 123 | |||
| 124 | inti->type = KVM_S390_SIGP_STOP; | 118 | inti->type = KVM_S390_SIGP_STOP; |
| 125 | 119 | ||
| 126 | spin_lock(&fi->lock); | ||
| 127 | li = fi->local_int[cpu_addr]; | ||
| 128 | if (li == NULL) { | ||
| 129 | rc = 3; /* not operational */ | ||
| 130 | kfree(inti); | ||
| 131 | goto unlock; | ||
| 132 | } | ||
| 133 | spin_lock_bh(&li->lock); | 120 | spin_lock_bh(&li->lock); |
| 134 | list_add_tail(&inti->list, &li->list); | 121 | list_add_tail(&inti->list, &li->list); |
| 135 | atomic_set(&li->active, 1); | 122 | atomic_set(&li->active, 1); |
| 136 | atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); | 123 | atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); |
| 137 | if (store) | 124 | li->action_bits |= action; |
| 138 | li->action_bits |= ACTION_STORE_ON_STOP; | ||
| 139 | li->action_bits |= ACTION_STOP_ON_STOP; | ||
| 140 | if (waitqueue_active(&li->wq)) | 125 | if (waitqueue_active(&li->wq)) |
| 141 | wake_up_interruptible(&li->wq); | 126 | wake_up_interruptible(&li->wq); |
| 142 | spin_unlock_bh(&li->lock); | 127 | spin_unlock_bh(&li->lock); |
| 143 | rc = 0; /* order accepted */ | 128 | |
| 129 | return 0; /* order accepted */ | ||
| 130 | } | ||
| 131 | |||
| 132 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) | ||
| 133 | { | ||
| 134 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | ||
| 135 | struct kvm_s390_local_interrupt *li; | ||
| 136 | int rc; | ||
| 137 | |||
| 138 | if (cpu_addr >= KVM_MAX_VCPUS) | ||
| 139 | return 3; /* not operational */ | ||
| 140 | |||
| 141 | spin_lock(&fi->lock); | ||
| 142 | li = fi->local_int[cpu_addr]; | ||
| 143 | if (li == NULL) { | ||
| 144 | rc = 3; /* not operational */ | ||
| 145 | goto unlock; | ||
| 146 | } | ||
| 147 | |||
| 148 | rc = __inject_sigp_stop(li, action); | ||
| 149 | |||
| 144 | unlock: | 150 | unlock: |
| 145 | spin_unlock(&fi->lock); | 151 | spin_unlock(&fi->lock); |
| 146 | VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); | 152 | VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); |
| 147 | return rc; | 153 | return rc; |
| 148 | } | 154 | } |
| 149 | 155 | ||
| 156 | int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action) | ||
| 157 | { | ||
| 158 | struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; | ||
| 159 | return __inject_sigp_stop(li, action); | ||
| 160 | } | ||
| 161 | |||
| 150 | static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) | 162 | static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) |
| 151 | { | 163 | { |
| 152 | int rc; | 164 | int rc; |
| @@ -177,9 +189,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
| 177 | /* make sure that the new value is valid memory */ | 189 | /* make sure that the new value is valid memory */ |
| 178 | address = address & 0x7fffe000u; | 190 | address = address & 0x7fffe000u; |
| 179 | if ((copy_from_guest(vcpu, &tmp, | 191 | if ((copy_from_guest(vcpu, &tmp, |
| 180 | (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) || | 192 | (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || |
| 181 | (copy_from_guest(vcpu, &tmp, (u64) (address + | 193 | (copy_from_guest(vcpu, &tmp, (u64) (address + |
| 182 | vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) { | 194 | vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { |
| 183 | *reg |= SIGP_STAT_INVALID_PARAMETER; | 195 | *reg |= SIGP_STAT_INVALID_PARAMETER; |
| 184 | return 1; /* invalid parameter */ | 196 | return 1; /* invalid parameter */ |
| 185 | } | 197 | } |
| @@ -262,11 +274,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) | |||
| 262 | break; | 274 | break; |
| 263 | case SIGP_STOP: | 275 | case SIGP_STOP: |
| 264 | vcpu->stat.instruction_sigp_stop++; | 276 | vcpu->stat.instruction_sigp_stop++; |
| 265 | rc = __sigp_stop(vcpu, cpu_addr, 0); | 277 | rc = __sigp_stop(vcpu, cpu_addr, ACTION_STOP_ON_STOP); |
| 266 | break; | 278 | break; |
| 267 | case SIGP_STOP_STORE_STATUS: | 279 | case SIGP_STOP_STORE_STATUS: |
| 268 | vcpu->stat.instruction_sigp_stop++; | 280 | vcpu->stat.instruction_sigp_stop++; |
| 269 | rc = __sigp_stop(vcpu, cpu_addr, 1); | 281 | rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP); |
| 270 | break; | 282 | break; |
| 271 | case SIGP_SET_ARCH: | 283 | case SIGP_SET_ARCH: |
| 272 | vcpu->stat.instruction_sigp_arch++; | 284 | vcpu->stat.instruction_sigp_arch++; |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7386bfa4f4b..3b62da926de 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | 15 | ||
| 16 | #define APIC_LVR 0x30 | 16 | #define APIC_LVR 0x30 |
| 17 | #define APIC_LVR_MASK 0xFF00FF | 17 | #define APIC_LVR_MASK 0xFF00FF |
| 18 | #define APIC_LVR_DIRECTED_EOI (1 << 24) | ||
| 18 | #define GET_APIC_VERSION(x) ((x) & 0xFFu) | 19 | #define GET_APIC_VERSION(x) ((x) & 0xFFu) |
| 19 | #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) | 20 | #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) |
| 20 | #ifdef CONFIG_X86_32 | 21 | #ifdef CONFIG_X86_32 |
| @@ -41,6 +42,7 @@ | |||
| 41 | #define APIC_DFR_CLUSTER 0x0FFFFFFFul | 42 | #define APIC_DFR_CLUSTER 0x0FFFFFFFul |
| 42 | #define APIC_DFR_FLAT 0xFFFFFFFFul | 43 | #define APIC_DFR_FLAT 0xFFFFFFFFul |
| 43 | #define APIC_SPIV 0xF0 | 44 | #define APIC_SPIV 0xF0 |
| 45 | #define APIC_SPIV_DIRECTED_EOI (1 << 12) | ||
| 44 | #define APIC_SPIV_FOCUS_DISABLED (1 << 9) | 46 | #define APIC_SPIV_FOCUS_DISABLED (1 << 9) |
| 45 | #define APIC_SPIV_APIC_ENABLED (1 << 8) | 47 | #define APIC_SPIV_APIC_ENABLED (1 << 8) |
| 46 | #define APIC_ISR 0x100 | 48 | #define APIC_ISR 0x100 |
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b1956..4a5fe914dc5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
| @@ -17,6 +17,8 @@ | |||
| 17 | #define __KVM_HAVE_USER_NMI | 17 | #define __KVM_HAVE_USER_NMI |
| 18 | #define __KVM_HAVE_GUEST_DEBUG | 18 | #define __KVM_HAVE_GUEST_DEBUG |
| 19 | #define __KVM_HAVE_MSIX | 19 | #define __KVM_HAVE_MSIX |
| 20 | #define __KVM_HAVE_MCE | ||
| 21 | #define __KVM_HAVE_PIT_STATE2 | ||
| 20 | 22 | ||
| 21 | /* Architectural interrupt line count. */ | 23 | /* Architectural interrupt line count. */ |
| 22 | #define KVM_NR_INTERRUPTS 256 | 24 | #define KVM_NR_INTERRUPTS 256 |
| @@ -236,6 +238,14 @@ struct kvm_pit_state { | |||
| 236 | struct kvm_pit_channel_state channels[3]; | 238 | struct kvm_pit_channel_state channels[3]; |
| 237 | }; | 239 | }; |
| 238 | 240 | ||
| 241 | #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 | ||
| 242 | |||
| 243 | struct kvm_pit_state2 { | ||
| 244 | struct kvm_pit_channel_state channels[3]; | ||
| 245 | __u32 flags; | ||
| 246 | __u32 reserved[9]; | ||
| 247 | }; | ||
| 248 | |||
| 239 | struct kvm_reinject_control { | 249 | struct kvm_reinject_control { |
| 240 | __u8 pit_reinject; | 250 | __u8 pit_reinject; |
| 241 | __u8 reserved[31]; | 251 | __u8 reserved[31]; |
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c42311..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5..3be000435fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
| 17 | #include <linux/tracepoint.h> | ||
| 17 | 18 | ||
| 18 | #include <linux/kvm.h> | 19 | #include <linux/kvm.h> |
| 19 | #include <linux/kvm_para.h> | 20 | #include <linux/kvm_para.h> |
| @@ -37,12 +38,14 @@ | |||
| 37 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ | 38 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ |
| 38 | 0xFFFFFF0000000000ULL) | 39 | 0xFFFFFF0000000000ULL) |
| 39 | 40 | ||
| 40 | #define KVM_GUEST_CR0_MASK \ | 41 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
| 41 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ | 42 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
| 42 | | X86_CR0_NW | X86_CR0_CD) | 43 | #define KVM_GUEST_CR0_MASK \ |
| 44 | (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) | ||
| 45 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ | ||
| 46 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) | ||
| 43 | #define KVM_VM_CR0_ALWAYS_ON \ | 47 | #define KVM_VM_CR0_ALWAYS_ON \ |
| 44 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ | 48 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) |
| 45 | | X86_CR0_MP) | ||
| 46 | #define KVM_GUEST_CR4_MASK \ | 49 | #define KVM_GUEST_CR4_MASK \ |
| 47 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) | 50 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) |
| 48 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | 51 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) |
| @@ -51,12 +54,12 @@ | |||
| 51 | #define INVALID_PAGE (~(hpa_t)0) | 54 | #define INVALID_PAGE (~(hpa_t)0) |
| 52 | #define UNMAPPED_GVA (~(gpa_t)0) | 55 | #define UNMAPPED_GVA (~(gpa_t)0) |
| 53 | 56 | ||
| 54 | /* shadow tables are PAE even on non-PAE hosts */ | 57 | /* KVM Hugepage definitions for x86 */ |
| 55 | #define KVM_HPAGE_SHIFT 21 | 58 | #define KVM_NR_PAGE_SIZES 3 |
| 56 | #define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) | 59 | #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) |
| 57 | #define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) | 60 | #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) |
| 58 | 61 | #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) | |
| 59 | #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) | 62 | #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) |
| 60 | 63 | ||
| 61 | #define DE_VECTOR 0 | 64 | #define DE_VECTOR 0 |
| 62 | #define DB_VECTOR 1 | 65 | #define DB_VECTOR 1 |
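This is the x86 counterpart of the s390 hugepage change above, with a step of 9 bits per level instead of 8: assuming PAGE_SHIFT == 12, KVM_HPAGE_SHIFT(1/2/3) is 12/21/30, i.e. 4 KiB, 2 MiB and 1 GiB, so the old hard-coded shift of 21 becomes the level-2 case and KVM_NR_PAGE_SIZES == 3 leaves room for the 1 GiB pages gated by the gb_page_enable() hook added further down. A standalone check:

    #include <stdio.h>

    #define PAGE_SHIFT              12                  /* assumed */
    #define PAGE_SIZE               (1UL << PAGE_SHIFT)
    #define KVM_NR_PAGE_SIZES       3
    #define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + (((x) - 1) * 9))
    #define KVM_HPAGE_SIZE(x)       (1UL << KVM_HPAGE_SHIFT(x))
    #define KVM_PAGES_PER_HPAGE(x)  (KVM_HPAGE_SIZE(x) / PAGE_SIZE)

    int main(void)
    {
        int x;

        /* Prints 4096/1, 2097152/512, 1073741824/262144. */
        for (x = 1; x <= KVM_NR_PAGE_SIZES; x++)
            printf("level %d: %lu bytes, %lu base pages\n",
                   x, KVM_HPAGE_SIZE(x), KVM_PAGES_PER_HPAGE(x));
        return 0;
    }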
| @@ -120,6 +123,10 @@ enum kvm_reg { | |||
| 120 | NR_VCPU_REGS | 123 | NR_VCPU_REGS |
| 121 | }; | 124 | }; |
| 122 | 125 | ||
| 126 | enum kvm_reg_ex { | ||
| 127 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, | ||
| 128 | }; | ||
| 129 | |||
| 123 | enum { | 130 | enum { |
| 124 | VCPU_SREG_ES, | 131 | VCPU_SREG_ES, |
| 125 | VCPU_SREG_CS, | 132 | VCPU_SREG_CS, |
| @@ -131,7 +138,7 @@ enum { | |||
| 131 | VCPU_SREG_LDTR, | 138 | VCPU_SREG_LDTR, |
| 132 | }; | 139 | }; |
| 133 | 140 | ||
| 134 | #include <asm/kvm_x86_emulate.h> | 141 | #include <asm/kvm_emulate.h> |
| 135 | 142 | ||
| 136 | #define KVM_NR_MEM_OBJS 40 | 143 | #define KVM_NR_MEM_OBJS 40 |
| 137 | 144 | ||
| @@ -308,7 +315,6 @@ struct kvm_vcpu_arch { | |||
| 308 | struct { | 315 | struct { |
| 309 | gfn_t gfn; /* presumed gfn during guest pte update */ | 316 | gfn_t gfn; /* presumed gfn during guest pte update */ |
| 310 | pfn_t pfn; /* pfn corresponding to that gfn */ | 317 | pfn_t pfn; /* pfn corresponding to that gfn */ |
| 311 | int largepage; | ||
| 312 | unsigned long mmu_seq; | 318 | unsigned long mmu_seq; |
| 313 | } update_pte; | 319 | } update_pte; |
| 314 | 320 | ||
| @@ -334,16 +340,6 @@ struct kvm_vcpu_arch { | |||
| 334 | u8 nr; | 340 | u8 nr; |
| 335 | } interrupt; | 341 | } interrupt; |
| 336 | 342 | ||
| 337 | struct { | ||
| 338 | int vm86_active; | ||
| 339 | u8 save_iopl; | ||
| 340 | struct kvm_save_segment { | ||
| 341 | u16 selector; | ||
| 342 | unsigned long base; | ||
| 343 | u32 limit; | ||
| 344 | u32 ar; | ||
| 345 | } tr, es, ds, fs, gs; | ||
| 346 | } rmode; | ||
| 347 | int halt_request; /* real mode on Intel only */ | 343 | int halt_request; /* real mode on Intel only */ |
| 348 | 344 | ||
| 349 | int cpuid_nent; | 345 | int cpuid_nent; |
| @@ -366,13 +362,15 @@ struct kvm_vcpu_arch { | |||
| 366 | u32 pat; | 362 | u32 pat; |
| 367 | 363 | ||
| 368 | int switch_db_regs; | 364 | int switch_db_regs; |
| 369 | unsigned long host_db[KVM_NR_DB_REGS]; | ||
| 370 | unsigned long host_dr6; | ||
| 371 | unsigned long host_dr7; | ||
| 372 | unsigned long db[KVM_NR_DB_REGS]; | 365 | unsigned long db[KVM_NR_DB_REGS]; |
| 373 | unsigned long dr6; | 366 | unsigned long dr6; |
| 374 | unsigned long dr7; | 367 | unsigned long dr7; |
| 375 | unsigned long eff_db[KVM_NR_DB_REGS]; | 368 | unsigned long eff_db[KVM_NR_DB_REGS]; |
| 369 | |||
| 370 | u64 mcg_cap; | ||
| 371 | u64 mcg_status; | ||
| 372 | u64 mcg_ctl; | ||
| 373 | u64 *mce_banks; | ||
| 376 | }; | 374 | }; |
| 377 | 375 | ||
| 378 | struct kvm_mem_alias { | 376 | struct kvm_mem_alias { |
| @@ -409,6 +407,7 @@ struct kvm_arch{ | |||
| 409 | 407 | ||
| 410 | struct page *ept_identity_pagetable; | 408 | struct page *ept_identity_pagetable; |
| 411 | bool ept_identity_pagetable_done; | 409 | bool ept_identity_pagetable_done; |
| 410 | gpa_t ept_identity_map_addr; | ||
| 412 | 411 | ||
| 413 | unsigned long irq_sources_bitmap; | 412 | unsigned long irq_sources_bitmap; |
| 414 | unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; | 413 | unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; |
| @@ -526,6 +525,9 @@ struct kvm_x86_ops { | |||
| 526 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | 525 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); |
| 527 | int (*get_tdp_level)(void); | 526 | int (*get_tdp_level)(void); |
| 528 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); | 527 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); |
| 528 | bool (*gb_page_enable)(void); | ||
| 529 | |||
| 530 | const struct trace_print_flags *exit_reasons_str; | ||
| 529 | }; | 531 | }; |
| 530 | 532 | ||
| 531 | extern struct kvm_x86_ops *kvm_x86_ops; | 533 | extern struct kvm_x86_ops *kvm_x86_ops; |
| @@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | |||
| 618 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 620 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
| 619 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | 621 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, |
| 620 | u32 error_code); | 622 | u32 error_code); |
| 623 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | ||
| 621 | 624 | ||
| 622 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 625 | int kvm_pic_set_irq(void *opaque, int irq, int level); |
| 623 | 626 | ||
| @@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | |||
| 752 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | 755 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); |
| 753 | } | 756 | } |
| 754 | 757 | ||
| 755 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | ||
| 756 | |||
| 757 | #define TSS_IOPB_BASE_OFFSET 0x66 | 758 | #define TSS_IOPB_BASE_OFFSET 0x66 |
| 758 | #define TSS_BASE_SIZE 0x68 | 759 | #define TSS_BASE_SIZE 0x68 |
| 759 | #define TSS_IOPB_SIZE (65536 / 8) | 760 | #define TSS_IOPB_SIZE (65536 / 8) |
| @@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); | |||
| 796 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 797 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
| 797 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 798 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
| 798 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); | 799 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); |
| 800 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); | ||
| 801 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); | ||
| 802 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
| 799 | 803 | ||
| 800 | #endif /* _ASM_X86_KVM_HOST_H */ | 804 | #endif /* _ASM_X86_KVM_HOST_H */ |
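The kvm_host.h hunk above replaces the single 2 MB hugepage constants with per-level macros keyed by `KVM_NR_PAGE_SIZES`. As an illustrative standalone sketch (not part of the merge) of what those macros evaluate to, assuming the x86 `PAGE_SHIFT` of 12; in the kernel the page constants come from `<asm/page.h>`:

```c
/* Standalone sketch of the per-level hugepage macros added to kvm_host.h.
 * PAGE_SHIFT/PAGE_SIZE are assumed to be the x86 values (4 KiB base pages). */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)

#define KVM_NR_PAGE_SIZES       3
#define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + (((x) - 1) * 9))
#define KVM_HPAGE_SIZE(x)       (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_HPAGE_MASK(x)       (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x)  (KVM_HPAGE_SIZE(x) / PAGE_SIZE)

int main(void)
{
	int level;

	/* level 1 = 4 KiB, level 2 = 2 MiB, level 3 = 1 GiB */
	for (level = 1; level <= KVM_NR_PAGE_SIZES; level++)
		printf("level %d: shift %d, size %lu KiB, %lu base pages\n",
		       level, KVM_HPAGE_SHIFT(level),
		       KVM_HPAGE_SIZE(level) >> 10,
		       KVM_PAGES_PER_HPAGE(level));
	return 0;
}
```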
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae09..c584076a47f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | #ifndef _ASM_X86_KVM_PARA_H | 1 | #ifndef _ASM_X86_KVM_PARA_H |
| 2 | #define _ASM_X86_KVM_PARA_H | 2 | #define _ASM_X86_KVM_PARA_H |
| 3 | 3 | ||
| 4 | #include <linux/types.h> | ||
| 5 | |||
| 4 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It | 6 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It |
| 5 | * should be used to determine that a VM is running under KVM. | 7 | * should be used to determine that a VM is running under KVM. |
| 6 | */ | 8 | */ |
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b5..bd5549034a9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
| @@ -374,6 +374,7 @@ | |||
| 374 | /* AMD-V MSRs */ | 374 | /* AMD-V MSRs */ |
| 375 | 375 | ||
| 376 | #define MSR_VM_CR 0xc0010114 | 376 | #define MSR_VM_CR 0xc0010114 |
| 377 | #define MSR_VM_IGNNE 0xc0010115 | ||
| 377 | #define MSR_VM_HSAVE_PA 0xc0010117 | 378 | #define MSR_VM_HSAVE_PA 0xc0010117 |
| 378 | 379 | ||
| 379 | #endif /* _ASM_X86_MSR_INDEX_H */ | 380 | #endif /* _ASM_X86_MSR_INDEX_H */ |
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e..272514c2d45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
| @@ -55,6 +55,7 @@ | |||
| 55 | #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 | 55 | #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 |
| 56 | #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 | 56 | #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 |
| 57 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | 57 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 |
| 58 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 | ||
| 58 | 59 | ||
| 59 | 60 | ||
| 60 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 61 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
| @@ -351,9 +352,16 @@ enum vmcs_field { | |||
| 351 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 | 352 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 |
| 352 | #define VMX_EPT_EXTENT_CONTEXT 1 | 353 | #define VMX_EPT_EXTENT_CONTEXT 1 |
| 353 | #define VMX_EPT_EXTENT_GLOBAL 2 | 354 | #define VMX_EPT_EXTENT_GLOBAL 2 |
| 355 | |||
| 356 | #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) | ||
| 357 | #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) | ||
| 358 | #define VMX_EPTP_UC_BIT (1ull << 8) | ||
| 359 | #define VMX_EPTP_WB_BIT (1ull << 14) | ||
| 360 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) | ||
| 354 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) | 361 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) |
| 355 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 362 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
| 356 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 363 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
| 364 | |||
| 357 | #define VMX_EPT_DEFAULT_GAW 3 | 365 | #define VMX_EPT_DEFAULT_GAW 3 |
| 358 | #define VMX_EPT_MAX_GAW 0x4 | 366 | #define VMX_EPT_MAX_GAW 0x4 |
| 359 | #define VMX_EPT_MT_EPTE_SHIFT 3 | 367 | #define VMX_EPT_MT_EPTE_SHIFT 3 |
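The vmx.h hunk above adds definitions for EPT capability bits (execute-only mappings, 4-level page walks, UC/WB EPTP memory types, 2 MB EPT pages). A minimal sketch of how such bits might be tested; the capability word here is a made-up example, whereas the real value is read from the VMX EPT/VPID capability MSR during hardware setup:

```c
/* Sketch only: testing the new EPT capability bits against a hypothetical
 * capability word (normally read from the VMX EPT/VPID capability MSR). */
#include <stdio.h>
#include <stdint.h>

#define VMX_EPT_EXECUTE_ONLY_BIT	(1ull)
#define VMX_EPT_PAGE_WALK_4_BIT		(1ull << 6)
#define VMX_EPTP_UC_BIT			(1ull << 8)
#define VMX_EPTP_WB_BIT			(1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT		(1ull << 16)

int main(void)
{
	/* hypothetical capability value, for illustration only */
	uint64_t vmx_capability_ept = VMX_EPT_PAGE_WALK_4_BIT |
				      VMX_EPTP_WB_BIT |
				      VMX_EPT_2MB_PAGE_BIT;

	if (vmx_capability_ept & VMX_EPT_2MB_PAGE_BIT)
		printf("EPT supports 2MB pages\n");
	if (!(vmx_capability_ept & VMX_EPT_EXECUTE_ONLY_BIT))
		printf("execute-only EPT mappings not available\n");
	return 0;
}
```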
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f61..63b0ec8d3d4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
| @@ -34,7 +34,6 @@ | |||
| 34 | struct kvm_para_state { | 34 | struct kvm_para_state { |
| 35 | u8 mmu_queue[MMU_QUEUE_SIZE]; | 35 | u8 mmu_queue[MMU_QUEUE_SIZE]; |
| 36 | int mmu_queue_len; | 36 | int mmu_queue_len; |
| 37 | enum paravirt_lazy_mode mode; | ||
| 38 | }; | 37 | }; |
| 39 | 38 | ||
| 40 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | 39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); |
| @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) | |||
| 77 | { | 76 | { |
| 78 | struct kvm_para_state *state = kvm_para_state(); | 77 | struct kvm_para_state *state = kvm_para_state(); |
| 79 | 78 | ||
| 80 | if (state->mode != PARAVIRT_LAZY_MMU) { | 79 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { |
| 81 | kvm_mmu_op(buffer, len); | 80 | kvm_mmu_op(buffer, len); |
| 82 | return; | 81 | return; |
| 83 | } | 82 | } |
| @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) | |||
| 185 | 184 | ||
| 186 | static void kvm_enter_lazy_mmu(void) | 185 | static void kvm_enter_lazy_mmu(void) |
| 187 | { | 186 | { |
| 188 | struct kvm_para_state *state = kvm_para_state(); | ||
| 189 | |||
| 190 | paravirt_enter_lazy_mmu(); | 187 | paravirt_enter_lazy_mmu(); |
| 191 | state->mode = paravirt_get_lazy_mode(); | ||
| 192 | } | 188 | } |
| 193 | 189 | ||
| 194 | static void kvm_leave_lazy_mmu(void) | 190 | static void kvm_leave_lazy_mmu(void) |
| @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) | |||
| 197 | 193 | ||
| 198 | mmu_queue_flush(state); | 194 | mmu_queue_flush(state); |
| 199 | paravirt_leave_lazy_mmu(); | 195 | paravirt_leave_lazy_mmu(); |
| 200 | state->mode = paravirt_get_lazy_mode(); | ||
| 201 | } | 196 | } |
| 202 | 197 | ||
| 203 | static void __init paravirt_ops_setup(void) | 198 | static void __init paravirt_ops_setup(void) |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..e5efcdcca31 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
| @@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void) | |||
| 50 | struct timespec ts; | 50 | struct timespec ts; |
| 51 | int low, high; | 51 | int low, high; |
| 52 | 52 | ||
| 53 | low = (int)__pa(&wall_clock); | 53 | low = (int)__pa_symbol(&wall_clock); |
| 54 | high = ((u64)__pa(&wall_clock) >> 32); | 54 | high = ((u64)__pa_symbol(&wall_clock) >> 32); |
| 55 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | 55 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); |
| 56 | 56 | ||
| 57 | vcpu_time = &get_cpu_var(hv_clock); | 57 | vcpu_time = &get_cpu_var(hv_clock); |
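The kvmclock.c fix above switches the wall-clock address calculation from `__pa()` to `__pa_symbol()`, since `wall_clock` is a kernel symbol; the resulting physical address is then split into a low/high pair for the MSR write. A small sketch of that split, with a made-up address and a stub standing in for `native_write_msr()`:

```c
/* Sketch of the low/high split written to MSR_KVM_WALL_CLOCK. The address
 * is a made-up example; in the kernel it is __pa_symbol(&wall_clock). */
#include <stdio.h>
#include <stdint.h>

#define MSR_KVM_WALL_CLOCK	0x11	/* value as defined in asm/kvm_para.h */

static void write_msr_stub(uint32_t msr, uint32_t low, uint32_t high)
{
	/* stand-in for native_write_msr(msr, low, high) */
	printf("wrmsr 0x%x <- edx:eax = %08x:%08x\n", msr, high, low);
}

int main(void)
{
	uint64_t wall_clock_pa = 0x100203000ull;	/* hypothetical physical address */
	uint32_t low  = (uint32_t)wall_clock_pa;
	uint32_t high = (uint32_t)(wall_clock_pa >> 32);

	write_msr_stub(MSR_KVM_WALL_CLOCK, low, high);
	return 0;
}
```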
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09e0c6..b84e571f417 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | # | 1 | # |
| 2 | # KVM configuration | 2 | # KVM configuration |
| 3 | # | 3 | # |
| 4 | config HAVE_KVM | ||
| 5 | bool | ||
| 6 | 4 | ||
| 7 | config HAVE_KVM_IRQCHIP | 5 | source "virt/kvm/Kconfig" |
| 8 | bool | ||
| 9 | default y | ||
| 10 | 6 | ||
| 11 | menuconfig VIRTUALIZATION | 7 | menuconfig VIRTUALIZATION |
| 12 | bool "Virtualization" | 8 | bool "Virtualization" |
| @@ -29,6 +25,9 @@ config KVM | |||
| 29 | select PREEMPT_NOTIFIERS | 25 | select PREEMPT_NOTIFIERS |
| 30 | select MMU_NOTIFIER | 26 | select MMU_NOTIFIER |
| 31 | select ANON_INODES | 27 | select ANON_INODES |
| 28 | select HAVE_KVM_IRQCHIP | ||
| 29 | select HAVE_KVM_EVENTFD | ||
| 30 | select KVM_APIC_ARCHITECTURE | ||
| 32 | ---help--- | 31 | ---help--- |
| 33 | Support hosting fully virtualized guest machines using hardware | 32 | Support hosting fully virtualized guest machines using hardware |
| 34 | virtualization extensions. You will need a fairly recent | 33 | virtualization extensions. You will need a fairly recent |
| @@ -63,18 +62,6 @@ config KVM_AMD | |||
| 63 | To compile this as a module, choose M here: the module | 62 | To compile this as a module, choose M here: the module |
| 64 | will be called kvm-amd. | 63 | will be called kvm-amd. |
| 65 | 64 | ||
| 66 | config KVM_TRACE | ||
| 67 | bool "KVM trace support" | ||
| 68 | depends on KVM && SYSFS | ||
| 69 | select MARKERS | ||
| 70 | select RELAY | ||
| 71 | select DEBUG_FS | ||
| 72 | default n | ||
| 73 | ---help--- | ||
| 74 | This option allows reading a trace of kvm-related events through | ||
| 75 | relayfs. Note the ABI is not considered stable and will be | ||
| 76 | modified in future updates. | ||
| 77 | |||
| 78 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | 65 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under |
| 79 | # the virtualization menu. | 66 | # the virtualization menu. |
| 80 | source drivers/lguest/Kconfig | 67 | source drivers/lguest/Kconfig |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b43c4efafe8..0e7fe78d0f7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
| @@ -1,22 +1,19 @@ | |||
| 1 | # | ||
| 2 | # Makefile for Kernel-based Virtual Machine module | ||
| 3 | # | ||
| 4 | |||
| 5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | ||
| 6 | coalesced_mmio.o irq_comm.o) | ||
| 7 | ifeq ($(CONFIG_KVM_TRACE),y) | ||
| 8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | ||
| 9 | endif | ||
| 10 | ifeq ($(CONFIG_IOMMU_API),y) | ||
| 11 | common-objs += $(addprefix ../../../virt/kvm/, iommu.o) | ||
| 12 | endif | ||
| 13 | 1 | ||
| 14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 2 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
| 15 | 3 | ||
| 16 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ | 4 | CFLAGS_x86.o := -I. |
| 17 | i8254.o timer.o | 5 | CFLAGS_svm.o := -I. |
| 18 | obj-$(CONFIG_KVM) += kvm.o | 6 | CFLAGS_vmx.o := -I. |
| 19 | kvm-intel-objs = vmx.o | 7 | |
| 20 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | 8 | kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ |
| 21 | kvm-amd-objs = svm.o | 9 | coalesced_mmio.o irq_comm.o eventfd.o) |
| 22 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | 10 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) |
| 11 | |||
| 12 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | ||
| 13 | i8254.o timer.o | ||
| 14 | kvm-intel-y += vmx.o | ||
| 15 | kvm-amd-y += svm.o | ||
| 16 | |||
| 17 | obj-$(CONFIG_KVM) += kvm.o | ||
| 18 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
| 19 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c index 616de4628d6..1be5cd640e9 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/emulate.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /****************************************************************************** | 1 | /****************************************************************************** |
| 2 | * x86_emulate.c | 2 | * emulate.c |
| 3 | * | 3 | * |
| 4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | 4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. |
| 5 | * | 5 | * |
| @@ -30,7 +30,9 @@ | |||
| 30 | #define DPRINTF(x...) do {} while (0) | 30 | #define DPRINTF(x...) do {} while (0) |
| 31 | #endif | 31 | #endif |
| 32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
| 33 | #include <asm/kvm_x86_emulate.h> | 33 | #include <asm/kvm_emulate.h> |
| 34 | |||
| 35 | #include "mmu.h" /* for is_long_mode() */ | ||
| 34 | 36 | ||
| 35 | /* | 37 | /* |
| 36 | * Opcode effective-address decode tables. | 38 | * Opcode effective-address decode tables. |
| @@ -60,6 +62,7 @@ | |||
| 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ | 62 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
| 61 | #define SrcOne (7<<4) /* Implied '1' */ | 63 | #define SrcOne (7<<4) /* Implied '1' */ |
| 62 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ | 64 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ |
| 65 | #define SrcImmU (9<<4) /* Immediate operand, unsigned */ | ||
| 63 | #define SrcMask (0xf<<4) | 66 | #define SrcMask (0xf<<4) |
| 64 | /* Generic ModRM decode. */ | 67 | /* Generic ModRM decode. */ |
| 65 | #define ModRM (1<<8) | 68 | #define ModRM (1<<8) |
| @@ -97,11 +100,11 @@ static u32 opcode_table[256] = { | |||
| 97 | /* 0x10 - 0x17 */ | 100 | /* 0x10 - 0x17 */ |
| 98 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 101 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
| 99 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 102 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
| 100 | 0, 0, 0, 0, | 103 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, |
| 101 | /* 0x18 - 0x1F */ | 104 | /* 0x18 - 0x1F */ |
| 102 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 105 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
| 103 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 106 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
| 104 | 0, 0, 0, 0, | 107 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, |
| 105 | /* 0x20 - 0x27 */ | 108 | /* 0x20 - 0x27 */ |
| 106 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 109 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
| 107 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
| @@ -195,7 +198,7 @@ static u32 opcode_table[256] = { | |||
| 195 | ByteOp | SrcImmUByte, SrcImmUByte, | 198 | ByteOp | SrcImmUByte, SrcImmUByte, |
| 196 | /* 0xE8 - 0xEF */ | 199 | /* 0xE8 - 0xEF */ |
| 197 | SrcImm | Stack, SrcImm | ImplicitOps, | 200 | SrcImm | Stack, SrcImm | ImplicitOps, |
| 198 | SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, | 201 | SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, |
| 199 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 202 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
| 200 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 203 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
| 201 | /* 0xF0 - 0xF7 */ | 204 | /* 0xF0 - 0xF7 */ |
| @@ -208,7 +211,7 @@ static u32 opcode_table[256] = { | |||
| 208 | 211 | ||
| 209 | static u32 twobyte_table[256] = { | 212 | static u32 twobyte_table[256] = { |
| 210 | /* 0x00 - 0x0F */ | 213 | /* 0x00 - 0x0F */ |
| 211 | 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, | 214 | 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, |
| 212 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | 215 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, |
| 213 | /* 0x10 - 0x1F */ | 216 | /* 0x10 - 0x1F */ |
| 214 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | 217 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, |
| @@ -216,7 +219,9 @@ static u32 twobyte_table[256] = { | |||
| 216 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | 219 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, |
| 217 | 0, 0, 0, 0, 0, 0, 0, 0, | 220 | 0, 0, 0, 0, 0, 0, 0, 0, |
| 218 | /* 0x30 - 0x3F */ | 221 | /* 0x30 - 0x3F */ |
| 219 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 222 | ImplicitOps, 0, ImplicitOps, 0, |
| 223 | ImplicitOps, ImplicitOps, 0, 0, | ||
| 224 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 220 | /* 0x40 - 0x47 */ | 225 | /* 0x40 - 0x47 */ |
| 221 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 226 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
| 222 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 227 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
| @@ -319,8 +324,11 @@ static u32 group2_table[] = { | |||
| 319 | }; | 324 | }; |
| 320 | 325 | ||
| 321 | /* EFLAGS bit definitions. */ | 326 | /* EFLAGS bit definitions. */ |
| 327 | #define EFLG_VM (1<<17) | ||
| 328 | #define EFLG_RF (1<<16) | ||
| 322 | #define EFLG_OF (1<<11) | 329 | #define EFLG_OF (1<<11) |
| 323 | #define EFLG_DF (1<<10) | 330 | #define EFLG_DF (1<<10) |
| 331 | #define EFLG_IF (1<<9) | ||
| 324 | #define EFLG_SF (1<<7) | 332 | #define EFLG_SF (1<<7) |
| 325 | #define EFLG_ZF (1<<6) | 333 | #define EFLG_ZF (1<<6) |
| 326 | #define EFLG_AF (1<<4) | 334 | #define EFLG_AF (1<<4) |
| @@ -1027,6 +1035,7 @@ done_prefixes: | |||
| 1027 | c->src.type = OP_MEM; | 1035 | c->src.type = OP_MEM; |
| 1028 | break; | 1036 | break; |
| 1029 | case SrcImm: | 1037 | case SrcImm: |
| 1038 | case SrcImmU: | ||
| 1030 | c->src.type = OP_IMM; | 1039 | c->src.type = OP_IMM; |
| 1031 | c->src.ptr = (unsigned long *)c->eip; | 1040 | c->src.ptr = (unsigned long *)c->eip; |
| 1032 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1041 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
| @@ -1044,6 +1053,19 @@ done_prefixes: | |||
| 1044 | c->src.val = insn_fetch(s32, 4, c->eip); | 1053 | c->src.val = insn_fetch(s32, 4, c->eip); |
| 1045 | break; | 1054 | break; |
| 1046 | } | 1055 | } |
| 1056 | if ((c->d & SrcMask) == SrcImmU) { | ||
| 1057 | switch (c->src.bytes) { | ||
| 1058 | case 1: | ||
| 1059 | c->src.val &= 0xff; | ||
| 1060 | break; | ||
| 1061 | case 2: | ||
| 1062 | c->src.val &= 0xffff; | ||
| 1063 | break; | ||
| 1064 | case 4: | ||
| 1065 | c->src.val &= 0xffffffff; | ||
| 1066 | break; | ||
| 1067 | } | ||
| 1068 | } | ||
| 1047 | break; | 1069 | break; |
| 1048 | case SrcImmByte: | 1070 | case SrcImmByte: |
| 1049 | case SrcImmUByte: | 1071 | case SrcImmUByte: |
| @@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) | |||
| 1375 | ctxt->interruptibility = mask; | 1397 | ctxt->interruptibility = mask; |
| 1376 | } | 1398 | } |
| 1377 | 1399 | ||
| 1400 | static inline void | ||
| 1401 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | ||
| 1402 | struct kvm_segment *cs, struct kvm_segment *ss) | ||
| 1403 | { | ||
| 1404 | memset(cs, 0, sizeof(struct kvm_segment)); | ||
| 1405 | kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); | ||
| 1406 | memset(ss, 0, sizeof(struct kvm_segment)); | ||
| 1407 | |||
| 1408 | cs->l = 0; /* will be adjusted later */ | ||
| 1409 | cs->base = 0; /* flat segment */ | ||
| 1410 | cs->g = 1; /* 4kb granularity */ | ||
| 1411 | cs->limit = 0xffffffff; /* 4GB limit */ | ||
| 1412 | cs->type = 0x0b; /* Read, Execute, Accessed */ | ||
| 1413 | cs->s = 1; | ||
| 1414 | cs->dpl = 0; /* will be adjusted later */ | ||
| 1415 | cs->present = 1; | ||
| 1416 | cs->db = 1; | ||
| 1417 | |||
| 1418 | ss->unusable = 0; | ||
| 1419 | ss->base = 0; /* flat segment */ | ||
| 1420 | ss->limit = 0xffffffff; /* 4GB limit */ | ||
| 1421 | ss->g = 1; /* 4kb granularity */ | ||
| 1422 | ss->s = 1; | ||
| 1423 | ss->type = 0x03; /* Read/Write, Accessed */ | ||
| 1424 | ss->db = 1; /* 32bit stack segment */ | ||
| 1425 | ss->dpl = 0; | ||
| 1426 | ss->present = 1; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | static int | ||
| 1430 | emulate_syscall(struct x86_emulate_ctxt *ctxt) | ||
| 1431 | { | ||
| 1432 | struct decode_cache *c = &ctxt->decode; | ||
| 1433 | struct kvm_segment cs, ss; | ||
| 1434 | u64 msr_data; | ||
| 1435 | |||
| 1436 | /* syscall is not available in real mode */ | ||
| 1437 | if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL | ||
| 1438 | || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) | ||
| 1439 | return -1; | ||
| 1440 | |||
| 1441 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
| 1442 | |||
| 1443 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | ||
| 1444 | msr_data >>= 32; | ||
| 1445 | cs.selector = (u16)(msr_data & 0xfffc); | ||
| 1446 | ss.selector = (u16)(msr_data + 8); | ||
| 1447 | |||
| 1448 | if (is_long_mode(ctxt->vcpu)) { | ||
| 1449 | cs.db = 0; | ||
| 1450 | cs.l = 1; | ||
| 1451 | } | ||
| 1452 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
| 1453 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
| 1454 | |||
| 1455 | c->regs[VCPU_REGS_RCX] = c->eip; | ||
| 1456 | if (is_long_mode(ctxt->vcpu)) { | ||
| 1457 | #ifdef CONFIG_X86_64 | ||
| 1458 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | ||
| 1459 | |||
| 1460 | kvm_x86_ops->get_msr(ctxt->vcpu, | ||
| 1461 | ctxt->mode == X86EMUL_MODE_PROT64 ? | ||
| 1462 | MSR_LSTAR : MSR_CSTAR, &msr_data); | ||
| 1463 | c->eip = msr_data; | ||
| 1464 | |||
| 1465 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); | ||
| 1466 | ctxt->eflags &= ~(msr_data | EFLG_RF); | ||
| 1467 | #endif | ||
| 1468 | } else { | ||
| 1469 | /* legacy mode */ | ||
| 1470 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | ||
| 1471 | c->eip = (u32)msr_data; | ||
| 1472 | |||
| 1473 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | return 0; | ||
| 1477 | } | ||
| 1478 | |||
| 1479 | static int | ||
| 1480 | emulate_sysenter(struct x86_emulate_ctxt *ctxt) | ||
| 1481 | { | ||
| 1482 | struct decode_cache *c = &ctxt->decode; | ||
| 1483 | struct kvm_segment cs, ss; | ||
| 1484 | u64 msr_data; | ||
| 1485 | |||
| 1486 | /* inject #UD if LOCK prefix is used */ | ||
| 1487 | if (c->lock_prefix) | ||
| 1488 | return -1; | ||
| 1489 | |||
| 1490 | /* inject #GP if in real mode or paging is disabled */ | ||
| 1491 | if (ctxt->mode == X86EMUL_MODE_REAL || | ||
| 1492 | !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { | ||
| 1493 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1494 | return -1; | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | /* XXX sysenter/sysexit have not been tested in 64bit mode. | ||
| 1498 | * Therefore, we inject an #UD. | ||
| 1499 | */ | ||
| 1500 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 1501 | return -1; | ||
| 1502 | |||
| 1503 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
| 1504 | |||
| 1505 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | ||
| 1506 | switch (ctxt->mode) { | ||
| 1507 | case X86EMUL_MODE_PROT32: | ||
| 1508 | if ((msr_data & 0xfffc) == 0x0) { | ||
| 1509 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1510 | return -1; | ||
| 1511 | } | ||
| 1512 | break; | ||
| 1513 | case X86EMUL_MODE_PROT64: | ||
| 1514 | if (msr_data == 0x0) { | ||
| 1515 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1516 | return -1; | ||
| 1517 | } | ||
| 1518 | break; | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | ||
| 1522 | cs.selector = (u16)msr_data; | ||
| 1523 | cs.selector &= ~SELECTOR_RPL_MASK; | ||
| 1524 | ss.selector = cs.selector + 8; | ||
| 1525 | ss.selector &= ~SELECTOR_RPL_MASK; | ||
| 1526 | if (ctxt->mode == X86EMUL_MODE_PROT64 | ||
| 1527 | || is_long_mode(ctxt->vcpu)) { | ||
| 1528 | cs.db = 0; | ||
| 1529 | cs.l = 1; | ||
| 1530 | } | ||
| 1531 | |||
| 1532 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
| 1533 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
| 1534 | |||
| 1535 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); | ||
| 1536 | c->eip = msr_data; | ||
| 1537 | |||
| 1538 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); | ||
| 1539 | c->regs[VCPU_REGS_RSP] = msr_data; | ||
| 1540 | |||
| 1541 | return 0; | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | static int | ||
| 1545 | emulate_sysexit(struct x86_emulate_ctxt *ctxt) | ||
| 1546 | { | ||
| 1547 | struct decode_cache *c = &ctxt->decode; | ||
| 1548 | struct kvm_segment cs, ss; | ||
| 1549 | u64 msr_data; | ||
| 1550 | int usermode; | ||
| 1551 | |||
| 1552 | /* inject #UD if LOCK prefix is used */ | ||
| 1553 | if (c->lock_prefix) | ||
| 1554 | return -1; | ||
| 1555 | |||
| 1556 | /* inject #GP if in real mode or paging is disabled */ | ||
| 1557 | if (ctxt->mode == X86EMUL_MODE_REAL | ||
| 1558 | || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { | ||
| 1559 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1560 | return -1; | ||
| 1561 | } | ||
| 1562 | |||
| 1563 | /* sysexit must be called from CPL 0 */ | ||
| 1564 | if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { | ||
| 1565 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1566 | return -1; | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
| 1570 | |||
| 1571 | if ((c->rex_prefix & 0x8) != 0x0) | ||
| 1572 | usermode = X86EMUL_MODE_PROT64; | ||
| 1573 | else | ||
| 1574 | usermode = X86EMUL_MODE_PROT32; | ||
| 1575 | |||
| 1576 | cs.dpl = 3; | ||
| 1577 | ss.dpl = 3; | ||
| 1578 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | ||
| 1579 | switch (usermode) { | ||
| 1580 | case X86EMUL_MODE_PROT32: | ||
| 1581 | cs.selector = (u16)(msr_data + 16); | ||
| 1582 | if ((msr_data & 0xfffc) == 0x0) { | ||
| 1583 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1584 | return -1; | ||
| 1585 | } | ||
| 1586 | ss.selector = (u16)(msr_data + 24); | ||
| 1587 | break; | ||
| 1588 | case X86EMUL_MODE_PROT64: | ||
| 1589 | cs.selector = (u16)(msr_data + 32); | ||
| 1590 | if (msr_data == 0x0) { | ||
| 1591 | kvm_inject_gp(ctxt->vcpu, 0); | ||
| 1592 | return -1; | ||
| 1593 | } | ||
| 1594 | ss.selector = cs.selector + 8; | ||
| 1595 | cs.db = 0; | ||
| 1596 | cs.l = 1; | ||
| 1597 | break; | ||
| 1598 | } | ||
| 1599 | cs.selector |= SELECTOR_RPL_MASK; | ||
| 1600 | ss.selector |= SELECTOR_RPL_MASK; | ||
| 1601 | |||
| 1602 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
| 1603 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
| 1604 | |||
| 1605 | c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; | ||
| 1606 | c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 1607 | |||
| 1608 | return 0; | ||
| 1609 | } | ||
| 1610 | |||
| 1378 | int | 1611 | int |
| 1379 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | 1612 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
| 1380 | { | 1613 | { |
| @@ -1970,6 +2203,12 @@ twobyte_insn: | |||
| 1970 | goto cannot_emulate; | 2203 | goto cannot_emulate; |
| 1971 | } | 2204 | } |
| 1972 | break; | 2205 | break; |
| 2206 | case 0x05: /* syscall */ | ||
| 2207 | if (emulate_syscall(ctxt) == -1) | ||
| 2208 | goto cannot_emulate; | ||
| 2209 | else | ||
| 2210 | goto writeback; | ||
| 2211 | break; | ||
| 1973 | case 0x06: | 2212 | case 0x06: |
| 1974 | emulate_clts(ctxt->vcpu); | 2213 | emulate_clts(ctxt->vcpu); |
| 1975 | c->dst.type = OP_NONE; | 2214 | c->dst.type = OP_NONE; |
| @@ -2036,6 +2275,18 @@ twobyte_insn: | |||
| 2036 | rc = X86EMUL_CONTINUE; | 2275 | rc = X86EMUL_CONTINUE; |
| 2037 | c->dst.type = OP_NONE; | 2276 | c->dst.type = OP_NONE; |
| 2038 | break; | 2277 | break; |
| 2278 | case 0x34: /* sysenter */ | ||
| 2279 | if (emulate_sysenter(ctxt) == -1) | ||
| 2280 | goto cannot_emulate; | ||
| 2281 | else | ||
| 2282 | goto writeback; | ||
| 2283 | break; | ||
| 2284 | case 0x35: /* sysexit */ | ||
| 2285 | if (emulate_sysexit(ctxt) == -1) | ||
| 2286 | goto cannot_emulate; | ||
| 2287 | else | ||
| 2288 | goto writeback; | ||
| 2289 | break; | ||
| 2039 | case 0x40 ... 0x4f: /* cmov */ | 2290 | case 0x40 ... 0x4f: /* cmov */ |
| 2040 | c->dst.val = c->dst.orig_val = c->src.val; | 2291 | c->dst.val = c->dst.orig_val = c->src.val; |
| 2041 | if (!test_cc(c->b, ctxt->eflags)) | 2292 | if (!test_cc(c->b, ctxt->eflags)) |
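Among the emulate.c changes above, the new `SrcImmU` decode flag (used for the far-jump entry at opcode 0xEA) masks a fetched immediate back down to its operand size, so it is treated as zero-extended rather than sign-extended. A standalone sketch of that masking step; the immediate fetch itself is simulated, since `insn_fetch()` in the real emulator reads from the guest instruction stream:

```c
/* Sketch of the SrcImmU masking added to the decoder. */
#include <stdio.h>
#include <stdint.h>

static unsigned long mask_imm_unsigned(long sign_extended, int bytes)
{
	unsigned long val = (unsigned long)sign_extended;

	switch (bytes) {
	case 1:
		val &= 0xff;
		break;
	case 2:
		val &= 0xffff;
		break;
	case 4:
		val &= 0xffffffff;
		break;
	}
	return val;
}

int main(void)
{
	/* a 2-byte immediate of 0xfffe sign-extends to -2 ... */
	long fetched = (int16_t)0xfffe;

	/* ... but as SrcImmU it is masked back to the unsigned 0xfffe */
	printf("sign-extended: %ld, as SrcImmU: %#lx\n",
	       fetched, mask_imm_unsigned(fetched, 2));
	return 0;
}
```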
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e00524..82ad523b490 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
| @@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) | |||
| 231 | { | 231 | { |
| 232 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 232 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
| 233 | 233 | ||
| 234 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) | 234 | if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) |
| 235 | return atomic_read(&pit->pit_state.pit_timer.pending); | 235 | return atomic_read(&pit->pit_state.pit_timer.pending); |
| 236 | return 0; | 236 | return 0; |
| 237 | } | 237 | } |
| @@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | |||
| 252 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 252 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
| 253 | struct hrtimer *timer; | 253 | struct hrtimer *timer; |
| 254 | 254 | ||
| 255 | if (vcpu->vcpu_id != 0 || !pit) | 255 | if (!kvm_vcpu_is_bsp(vcpu) || !pit) |
| 256 | return; | 256 | return; |
| 257 | 257 | ||
| 258 | timer = &pit->pit_state.pit_timer.timer; | 258 | timer = &pit->pit_state.pit_timer.timer; |
| @@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | |||
| 294 | pt->timer.function = kvm_timer_fn; | 294 | pt->timer.function = kvm_timer_fn; |
| 295 | pt->t_ops = &kpit_ops; | 295 | pt->t_ops = &kpit_ops; |
| 296 | pt->kvm = ps->pit->kvm; | 296 | pt->kvm = ps->pit->kvm; |
| 297 | pt->vcpu_id = 0; | 297 | pt->vcpu = pt->kvm->bsp_vcpu; |
| 298 | 298 | ||
| 299 | atomic_set(&pt->pending, 0); | 299 | atomic_set(&pt->pending, 0); |
| 300 | ps->irq_ack = 1; | 300 | ps->irq_ack = 1; |
| @@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
| 332 | case 1: | 332 | case 1: |
| 333 | /* FIXME: enhance mode 4 precision */ | 333 | /* FIXME: enhance mode 4 precision */ |
| 334 | case 4: | 334 | case 4: |
| 335 | create_pit_timer(ps, val, 0); | 335 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { |
| 336 | create_pit_timer(ps, val, 0); | ||
| 337 | } | ||
| 336 | break; | 338 | break; |
| 337 | case 2: | 339 | case 2: |
| 338 | case 3: | 340 | case 3: |
| 339 | create_pit_timer(ps, val, 1); | 341 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ |
| 342 | create_pit_timer(ps, val, 1); | ||
| 343 | } | ||
| 340 | break; | 344 | break; |
| 341 | default: | 345 | default: |
| 342 | destroy_pit_timer(&ps->pit_timer); | 346 | destroy_pit_timer(&ps->pit_timer); |
| 343 | } | 347 | } |
| 344 | } | 348 | } |
| 345 | 349 | ||
| 346 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) | 350 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) |
| 351 | { | ||
| 352 | u8 saved_mode; | ||
| 353 | if (hpet_legacy_start) { | ||
| 354 | /* save existing mode for later reenablement */ | ||
| 355 | saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; | ||
| 356 | kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ | ||
| 357 | pit_load_count(kvm, channel, val); | ||
| 358 | kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; | ||
| 359 | } else { | ||
| 360 | pit_load_count(kvm, channel, val); | ||
| 361 | } | ||
| 362 | } | ||
| 363 | |||
| 364 | static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) | ||
| 365 | { | ||
| 366 | return container_of(dev, struct kvm_pit, dev); | ||
| 367 | } | ||
| 368 | |||
| 369 | static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) | ||
| 347 | { | 370 | { |
| 348 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 371 | return container_of(dev, struct kvm_pit, speaker_dev); |
| 349 | pit_load_count(kvm, channel, val); | ||
| 350 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 351 | } | 372 | } |
| 352 | 373 | ||
| 353 | static void pit_ioport_write(struct kvm_io_device *this, | 374 | static inline int pit_in_range(gpa_t addr) |
| 354 | gpa_t addr, int len, const void *data) | ||
| 355 | { | 375 | { |
| 356 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 376 | return ((addr >= KVM_PIT_BASE_ADDRESS) && |
| 377 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | ||
| 378 | } | ||
| 379 | |||
| 380 | static int pit_ioport_write(struct kvm_io_device *this, | ||
| 381 | gpa_t addr, int len, const void *data) | ||
| 382 | { | ||
| 383 | struct kvm_pit *pit = dev_to_pit(this); | ||
| 357 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 384 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
| 358 | struct kvm *kvm = pit->kvm; | 385 | struct kvm *kvm = pit->kvm; |
| 359 | int channel, access; | 386 | int channel, access; |
| 360 | struct kvm_kpit_channel_state *s; | 387 | struct kvm_kpit_channel_state *s; |
| 361 | u32 val = *(u32 *) data; | 388 | u32 val = *(u32 *) data; |
| 389 | if (!pit_in_range(addr)) | ||
| 390 | return -EOPNOTSUPP; | ||
| 362 | 391 | ||
| 363 | val &= 0xff; | 392 | val &= 0xff; |
| 364 | addr &= KVM_PIT_CHANNEL_MASK; | 393 | addr &= KVM_PIT_CHANNEL_MASK; |
| @@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this, | |||
| 421 | } | 450 | } |
| 422 | 451 | ||
| 423 | mutex_unlock(&pit_state->lock); | 452 | mutex_unlock(&pit_state->lock); |
| 453 | return 0; | ||
| 424 | } | 454 | } |
| 425 | 455 | ||
| 426 | static void pit_ioport_read(struct kvm_io_device *this, | 456 | static int pit_ioport_read(struct kvm_io_device *this, |
| 427 | gpa_t addr, int len, void *data) | 457 | gpa_t addr, int len, void *data) |
| 428 | { | 458 | { |
| 429 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 459 | struct kvm_pit *pit = dev_to_pit(this); |
| 430 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 460 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
| 431 | struct kvm *kvm = pit->kvm; | 461 | struct kvm *kvm = pit->kvm; |
| 432 | int ret, count; | 462 | int ret, count; |
| 433 | struct kvm_kpit_channel_state *s; | 463 | struct kvm_kpit_channel_state *s; |
| 464 | if (!pit_in_range(addr)) | ||
| 465 | return -EOPNOTSUPP; | ||
| 434 | 466 | ||
| 435 | addr &= KVM_PIT_CHANNEL_MASK; | 467 | addr &= KVM_PIT_CHANNEL_MASK; |
| 436 | s = &pit_state->channels[addr]; | 468 | s = &pit_state->channels[addr]; |
| @@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this, | |||
| 485 | memcpy(data, (char *)&ret, len); | 517 | memcpy(data, (char *)&ret, len); |
| 486 | 518 | ||
| 487 | mutex_unlock(&pit_state->lock); | 519 | mutex_unlock(&pit_state->lock); |
| 520 | return 0; | ||
| 488 | } | 521 | } |
| 489 | 522 | ||
| 490 | static int pit_in_range(struct kvm_io_device *this, gpa_t addr, | 523 | static int speaker_ioport_write(struct kvm_io_device *this, |
| 491 | int len, int is_write) | 524 | gpa_t addr, int len, const void *data) |
| 492 | { | ||
| 493 | return ((addr >= KVM_PIT_BASE_ADDRESS) && | ||
| 494 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | ||
| 495 | } | ||
| 496 | |||
| 497 | static void speaker_ioport_write(struct kvm_io_device *this, | ||
| 498 | gpa_t addr, int len, const void *data) | ||
| 499 | { | 525 | { |
| 500 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 526 | struct kvm_pit *pit = speaker_to_pit(this); |
| 501 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 527 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
| 502 | struct kvm *kvm = pit->kvm; | 528 | struct kvm *kvm = pit->kvm; |
| 503 | u32 val = *(u32 *) data; | 529 | u32 val = *(u32 *) data; |
| 530 | if (addr != KVM_SPEAKER_BASE_ADDRESS) | ||
| 531 | return -EOPNOTSUPP; | ||
| 504 | 532 | ||
| 505 | mutex_lock(&pit_state->lock); | 533 | mutex_lock(&pit_state->lock); |
| 506 | pit_state->speaker_data_on = (val >> 1) & 1; | 534 | pit_state->speaker_data_on = (val >> 1) & 1; |
| 507 | pit_set_gate(kvm, 2, val & 1); | 535 | pit_set_gate(kvm, 2, val & 1); |
| 508 | mutex_unlock(&pit_state->lock); | 536 | mutex_unlock(&pit_state->lock); |
| 537 | return 0; | ||
| 509 | } | 538 | } |
| 510 | 539 | ||
| 511 | static void speaker_ioport_read(struct kvm_io_device *this, | 540 | static int speaker_ioport_read(struct kvm_io_device *this, |
| 512 | gpa_t addr, int len, void *data) | 541 | gpa_t addr, int len, void *data) |
| 513 | { | 542 | { |
| 514 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 543 | struct kvm_pit *pit = speaker_to_pit(this); |
| 515 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 544 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
| 516 | struct kvm *kvm = pit->kvm; | 545 | struct kvm *kvm = pit->kvm; |
| 517 | unsigned int refresh_clock; | 546 | unsigned int refresh_clock; |
| 518 | int ret; | 547 | int ret; |
| 548 | if (addr != KVM_SPEAKER_BASE_ADDRESS) | ||
| 549 | return -EOPNOTSUPP; | ||
| 519 | 550 | ||
| 520 | /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ | 551 | /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ |
| 521 | refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; | 552 | refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; |
| @@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this, | |||
| 527 | len = sizeof(ret); | 558 | len = sizeof(ret); |
| 528 | memcpy(data, (char *)&ret, len); | 559 | memcpy(data, (char *)&ret, len); |
| 529 | mutex_unlock(&pit_state->lock); | 560 | mutex_unlock(&pit_state->lock); |
| 530 | } | 561 | return 0; |
| 531 | |||
| 532 | static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, | ||
| 533 | int len, int is_write) | ||
| 534 | { | ||
| 535 | return (addr == KVM_SPEAKER_BASE_ADDRESS); | ||
| 536 | } | 562 | } |
| 537 | 563 | ||
| 538 | void kvm_pit_reset(struct kvm_pit *pit) | 564 | void kvm_pit_reset(struct kvm_pit *pit) |
| @@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
| 541 | struct kvm_kpit_channel_state *c; | 567 | struct kvm_kpit_channel_state *c; |
| 542 | 568 | ||
| 543 | mutex_lock(&pit->pit_state.lock); | 569 | mutex_lock(&pit->pit_state.lock); |
| 570 | pit->pit_state.flags = 0; | ||
| 544 | for (i = 0; i < 3; i++) { | 571 | for (i = 0; i < 3; i++) { |
| 545 | c = &pit->pit_state.channels[i]; | 572 | c = &pit->pit_state.channels[i]; |
| 546 | c->mode = 0xff; | 573 | c->mode = 0xff; |
| @@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) | |||
| 563 | } | 590 | } |
| 564 | } | 591 | } |
| 565 | 592 | ||
| 566 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | 593 | static const struct kvm_io_device_ops pit_dev_ops = { |
| 594 | .read = pit_ioport_read, | ||
| 595 | .write = pit_ioport_write, | ||
| 596 | }; | ||
| 597 | |||
| 598 | static const struct kvm_io_device_ops speaker_dev_ops = { | ||
| 599 | .read = speaker_ioport_read, | ||
| 600 | .write = speaker_ioport_write, | ||
| 601 | }; | ||
| 602 | |||
| 603 | /* Caller must have writers lock on slots_lock */ | ||
| 604 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) | ||
| 567 | { | 605 | { |
| 568 | struct kvm_pit *pit; | 606 | struct kvm_pit *pit; |
| 569 | struct kvm_kpit_state *pit_state; | 607 | struct kvm_kpit_state *pit_state; |
| 608 | int ret; | ||
| 570 | 609 | ||
| 571 | pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); | 610 | pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); |
| 572 | if (!pit) | 611 | if (!pit) |
| @@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
| 582 | mutex_lock(&pit->pit_state.lock); | 621 | mutex_lock(&pit->pit_state.lock); |
| 583 | spin_lock_init(&pit->pit_state.inject_lock); | 622 | spin_lock_init(&pit->pit_state.inject_lock); |
| 584 | 623 | ||
| 585 | /* Initialize PIO device */ | ||
| 586 | pit->dev.read = pit_ioport_read; | ||
| 587 | pit->dev.write = pit_ioport_write; | ||
| 588 | pit->dev.in_range = pit_in_range; | ||
| 589 | pit->dev.private = pit; | ||
| 590 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); | ||
| 591 | |||
| 592 | pit->speaker_dev.read = speaker_ioport_read; | ||
| 593 | pit->speaker_dev.write = speaker_ioport_write; | ||
| 594 | pit->speaker_dev.in_range = speaker_in_range; | ||
| 595 | pit->speaker_dev.private = pit; | ||
| 596 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); | ||
| 597 | |||
| 598 | kvm->arch.vpit = pit; | 624 | kvm->arch.vpit = pit; |
| 599 | pit->kvm = kvm; | 625 | pit->kvm = kvm; |
| 600 | 626 | ||
| @@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
| 613 | pit->mask_notifier.func = pit_mask_notifer; | 639 | pit->mask_notifier.func = pit_mask_notifer; |
| 614 | kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); | 640 | kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); |
| 615 | 641 | ||
| 642 | kvm_iodevice_init(&pit->dev, &pit_dev_ops); | ||
| 643 | ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); | ||
| 644 | if (ret < 0) | ||
| 645 | goto fail; | ||
| 646 | |||
| 647 | if (flags & KVM_PIT_SPEAKER_DUMMY) { | ||
| 648 | kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); | ||
| 649 | ret = __kvm_io_bus_register_dev(&kvm->pio_bus, | ||
| 650 | &pit->speaker_dev); | ||
| 651 | if (ret < 0) | ||
| 652 | goto fail_unregister; | ||
| 653 | } | ||
| 654 | |||
| 616 | return pit; | 655 | return pit; |
| 656 | |||
| 657 | fail_unregister: | ||
| 658 | __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); | ||
| 659 | |||
| 660 | fail: | ||
| 661 | if (pit->irq_source_id >= 0) | ||
| 662 | kvm_free_irq_source_id(kvm, pit->irq_source_id); | ||
| 663 | |||
| 664 | kfree(pit); | ||
| 665 | return NULL; | ||
| 617 | } | 666 | } |
| 618 | 667 | ||
| 619 | void kvm_free_pit(struct kvm *kvm) | 668 | void kvm_free_pit(struct kvm *kvm) |
| @@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm) | |||
| 623 | if (kvm->arch.vpit) { | 672 | if (kvm->arch.vpit) { |
| 624 | kvm_unregister_irq_mask_notifier(kvm, 0, | 673 | kvm_unregister_irq_mask_notifier(kvm, 0, |
| 625 | &kvm->arch.vpit->mask_notifier); | 674 | &kvm->arch.vpit->mask_notifier); |
| 675 | kvm_unregister_irq_ack_notifier(kvm, | ||
| 676 | &kvm->arch.vpit->pit_state.irq_ack_notifier); | ||
| 626 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 677 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
| 627 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; | 678 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; |
| 628 | hrtimer_cancel(timer); | 679 | hrtimer_cancel(timer); |
| @@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm) | |||
| 637 | struct kvm_vcpu *vcpu; | 688 | struct kvm_vcpu *vcpu; |
| 638 | int i; | 689 | int i; |
| 639 | 690 | ||
| 640 | mutex_lock(&kvm->lock); | 691 | mutex_lock(&kvm->irq_lock); |
| 641 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); | 692 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); |
| 642 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); | 693 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); |
| 643 | mutex_unlock(&kvm->lock); | 694 | mutex_unlock(&kvm->irq_lock); |
| 644 | 695 | ||
| 645 | /* | 696 | /* |
| 646 | * Provides NMI watchdog support via Virtual Wire mode. | 697 | * Provides NMI watchdog support via Virtual Wire mode. |
| @@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) | |||
| 652 | * VCPU0, and only if its LVT0 is in EXTINT mode. | 703 | * VCPU0, and only if its LVT0 is in EXTINT mode. |
| 653 | */ | 704 | */ |
| 654 | if (kvm->arch.vapics_in_nmi_mode > 0) | 705 | if (kvm->arch.vapics_in_nmi_mode > 0) |
| 655 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 706 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 656 | vcpu = kvm->vcpus[i]; | 707 | kvm_apic_nmi_wd_deliver(vcpu); |
| 657 | if (vcpu) | ||
| 658 | kvm_apic_nmi_wd_deliver(vcpu); | ||
| 659 | } | ||
| 660 | } | 708 | } |
| 661 | 709 | ||
| 662 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | 710 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) |
| @@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | |||
| 665 | struct kvm *kvm = vcpu->kvm; | 713 | struct kvm *kvm = vcpu->kvm; |
| 666 | struct kvm_kpit_state *ps; | 714 | struct kvm_kpit_state *ps; |
| 667 | 715 | ||
| 668 | if (vcpu && pit) { | 716 | if (pit) { |
| 669 | int inject = 0; | 717 | int inject = 0; |
| 670 | ps = &pit->pit_state; | 718 | ps = &pit->pit_state; |
| 671 | 719 | ||
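The i8254.c conversion above moves the PIT to the new `kvm_io_device_ops` model: the device embeds a `kvm_io_device`, handlers recover the container with `container_of()`, and a handler returns `-EOPNOTSUPP` for addresses it does not own so the bus can try the next device. A userspace sketch of that pattern, with simplified stand-ins for the kernel types:

```c
/* Sketch of the kvm_io_device_ops pattern used by the PIT above.
 * Types and the bus are simplified stand-ins, not the kernel definitions. */
#include <stdio.h>
#include <stddef.h>
#include <errno.h>

typedef unsigned long long gpa_t;

struct kvm_io_device;

struct kvm_io_device_ops {
	int (*read)(struct kvm_io_device *dev, gpa_t addr, int len, void *val);
	int (*write)(struct kvm_io_device *dev, gpa_t addr, int len,
		     const void *val);
};

struct kvm_io_device {
	const struct kvm_io_device_ops *ops;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define KVM_PIT_BASE_ADDRESS	0x40
#define KVM_PIT_MEM_LENGTH	4

struct kvm_pit {
	struct kvm_io_device dev;
	unsigned char latch;		/* toy state for the example */
};

static int pit_ioport_read(struct kvm_io_device *this, gpa_t addr, int len,
			   void *val)
{
	struct kvm_pit *pit = container_of(this, struct kvm_pit, dev);

	if (addr < KVM_PIT_BASE_ADDRESS ||
	    addr >= KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)
		return -EOPNOTSUPP;	/* not ours; let the bus keep looking */
	*(unsigned char *)val = pit->latch;
	return 0;
}

static const struct kvm_io_device_ops pit_dev_ops = {
	.read = pit_ioport_read,
};

int main(void)
{
	struct kvm_pit pit = { .dev = { .ops = &pit_dev_ops }, .latch = 0x42 };
	unsigned char v = 0;

	if (pit.dev.ops->read(&pit.dev, 0x40, 1, &v) == 0)
		printf("read 0x%02x from port 0x40\n", v);
	if (pit.dev.ops->read(&pit.dev, 0x61, 1, &v) == -EOPNOTSUPP)
		printf("port 0x61 not claimed by the PIT data device\n");
	return 0;
}
```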
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863ff60b..d4c1c7ffdc0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
| @@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { | |||
| 21 | 21 | ||
| 22 | struct kvm_kpit_state { | 22 | struct kvm_kpit_state { |
| 23 | struct kvm_kpit_channel_state channels[3]; | 23 | struct kvm_kpit_channel_state channels[3]; |
| 24 | u32 flags; | ||
| 24 | struct kvm_timer pit_timer; | 25 | struct kvm_timer pit_timer; |
| 25 | bool is_periodic; | 26 | bool is_periodic; |
| 26 | u32 speaker_data_on; | 27 | u32 speaker_data_on; |
| @@ -49,8 +50,8 @@ struct kvm_pit { | |||
| 49 | #define KVM_PIT_CHANNEL_MASK 0x3 | 50 | #define KVM_PIT_CHANNEL_MASK 0x3 |
| 50 | 51 | ||
| 51 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | 52 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); |
| 52 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | 53 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); |
| 53 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | 54 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); |
| 54 | void kvm_free_pit(struct kvm *kvm); | 55 | void kvm_free_pit(struct kvm *kvm); |
| 55 | void kvm_pit_reset(struct kvm_pit *pit); | 56 | void kvm_pit_reset(struct kvm_pit *pit); |
| 56 | 57 | ||
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c74f1..01f15168280 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
| @@ -30,50 +30,24 @@ | |||
| 30 | #include "irq.h" | 30 | #include "irq.h" |
| 31 | 31 | ||
| 32 | #include <linux/kvm_host.h> | 32 | #include <linux/kvm_host.h> |
| 33 | 33 | #include "trace.h" | |
| 34 | static void pic_lock(struct kvm_pic *s) | ||
| 35 | __acquires(&s->lock) | ||
| 36 | { | ||
| 37 | spin_lock(&s->lock); | ||
| 38 | } | ||
| 39 | |||
| 40 | static void pic_unlock(struct kvm_pic *s) | ||
| 41 | __releases(&s->lock) | ||
| 42 | { | ||
| 43 | struct kvm *kvm = s->kvm; | ||
| 44 | unsigned acks = s->pending_acks; | ||
| 45 | bool wakeup = s->wakeup_needed; | ||
| 46 | struct kvm_vcpu *vcpu; | ||
| 47 | |||
| 48 | s->pending_acks = 0; | ||
| 49 | s->wakeup_needed = false; | ||
| 50 | |||
| 51 | spin_unlock(&s->lock); | ||
| 52 | |||
| 53 | while (acks) { | ||
| 54 | kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), | ||
| 55 | __ffs(acks)); | ||
| 56 | acks &= acks - 1; | ||
| 57 | } | ||
| 58 | |||
| 59 | if (wakeup) { | ||
| 60 | vcpu = s->kvm->vcpus[0]; | ||
| 61 | if (vcpu) | ||
| 62 | kvm_vcpu_kick(vcpu); | ||
| 63 | } | ||
| 64 | } | ||
| 65 | 34 | ||
| 66 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | 35 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) |
| 67 | { | 36 | { |
| 68 | s->isr &= ~(1 << irq); | 37 | s->isr &= ~(1 << irq); |
| 69 | s->isr_ack |= (1 << irq); | 38 | s->isr_ack |= (1 << irq); |
| 39 | if (s != &s->pics_state->pics[0]) | ||
| 40 | irq += 8; | ||
| 41 | kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); | ||
| 70 | } | 42 | } |
| 71 | 43 | ||
| 72 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | 44 | void kvm_pic_clear_isr_ack(struct kvm *kvm) |
| 73 | { | 45 | { |
| 74 | struct kvm_pic *s = pic_irqchip(kvm); | 46 | struct kvm_pic *s = pic_irqchip(kvm); |
| 47 | spin_lock(&s->lock); | ||
| 75 | s->pics[0].isr_ack = 0xff; | 48 | s->pics[0].isr_ack = 0xff; |
| 76 | s->pics[1].isr_ack = 0xff; | 49 | s->pics[1].isr_ack = 0xff; |
| 50 | spin_unlock(&s->lock); | ||
| 77 | } | 51 | } |
| 78 | 52 | ||
| 79 | /* | 53 | /* |
| @@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) | |||
| 174 | 148 | ||
| 175 | void kvm_pic_update_irq(struct kvm_pic *s) | 149 | void kvm_pic_update_irq(struct kvm_pic *s) |
| 176 | { | 150 | { |
| 177 | pic_lock(s); | 151 | spin_lock(&s->lock); |
| 178 | pic_update_irq(s); | 152 | pic_update_irq(s); |
| 179 | pic_unlock(s); | 153 | spin_unlock(&s->lock); |
| 180 | } | 154 | } |
| 181 | 155 | ||
| 182 | int kvm_pic_set_irq(void *opaque, int irq, int level) | 156 | int kvm_pic_set_irq(void *opaque, int irq, int level) |
| @@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) | |||
| 184 | struct kvm_pic *s = opaque; | 158 | struct kvm_pic *s = opaque; |
| 185 | int ret = -1; | 159 | int ret = -1; |
| 186 | 160 | ||
| 187 | pic_lock(s); | 161 | spin_lock(&s->lock); |
| 188 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 162 | if (irq >= 0 && irq < PIC_NUM_PINS) { |
| 189 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | 163 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); |
| 190 | pic_update_irq(s); | 164 | pic_update_irq(s); |
| 165 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, | ||
| 166 | s->pics[irq >> 3].imr, ret == 0); | ||
| 191 | } | 167 | } |
| 192 | pic_unlock(s); | 168 | spin_unlock(&s->lock); |
| 193 | 169 | ||
| 194 | return ret; | 170 | return ret; |
| 195 | } | 171 | } |
| @@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
| 217 | int irq, irq2, intno; | 193 | int irq, irq2, intno; |
| 218 | struct kvm_pic *s = pic_irqchip(kvm); | 194 | struct kvm_pic *s = pic_irqchip(kvm); |
| 219 | 195 | ||
| 220 | pic_lock(s); | 196 | spin_lock(&s->lock); |
| 221 | irq = pic_get_irq(&s->pics[0]); | 197 | irq = pic_get_irq(&s->pics[0]); |
| 222 | if (irq >= 0) { | 198 | if (irq >= 0) { |
| 223 | pic_intack(&s->pics[0], irq); | 199 | pic_intack(&s->pics[0], irq); |
| @@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
| 242 | intno = s->pics[0].irq_base + irq; | 218 | intno = s->pics[0].irq_base + irq; |
| 243 | } | 219 | } |
| 244 | pic_update_irq(s); | 220 | pic_update_irq(s); |
| 245 | pic_unlock(s); | 221 | spin_unlock(&s->lock); |
| 246 | kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); | ||
| 247 | 222 | ||
| 248 | return intno; | 223 | return intno; |
| 249 | } | 224 | } |
| @@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
| 252 | { | 227 | { |
| 253 | int irq, irqbase, n; | 228 | int irq, irqbase, n; |
| 254 | struct kvm *kvm = s->pics_state->irq_request_opaque; | 229 | struct kvm *kvm = s->pics_state->irq_request_opaque; |
| 255 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | 230 | struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; |
| 256 | 231 | ||
| 257 | if (s == &s->pics_state->pics[0]) | 232 | if (s == &s->pics_state->pics[0]) |
| 258 | irqbase = 0; | 233 | irqbase = 0; |
| @@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
| 263 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | 238 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) |
| 264 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) { | 239 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) { |
| 265 | n = irq + irqbase; | 240 | n = irq + irqbase; |
| 266 | s->pics_state->pending_acks |= 1 << n; | 241 | kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); |
| 267 | } | 242 | } |
| 268 | } | 243 | } |
| 269 | s->last_irr = 0; | 244 | s->last_irr = 0; |
| @@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) | |||
| 428 | return s->elcr; | 403 | return s->elcr; |
| 429 | } | 404 | } |
| 430 | 405 | ||
| 431 | static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, | 406 | static int picdev_in_range(gpa_t addr) |
| 432 | int len, int is_write) | ||
| 433 | { | 407 | { |
| 434 | switch (addr) { | 408 | switch (addr) { |
| 435 | case 0x20: | 409 | case 0x20: |
| @@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, | |||
| 444 | } | 418 | } |
| 445 | } | 419 | } |
| 446 | 420 | ||
| 447 | static void picdev_write(struct kvm_io_device *this, | 421 | static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) |
| 422 | { | ||
| 423 | return container_of(dev, struct kvm_pic, dev); | ||
| 424 | } | ||
| 425 | |||
| 426 | static int picdev_write(struct kvm_io_device *this, | ||
| 448 | gpa_t addr, int len, const void *val) | 427 | gpa_t addr, int len, const void *val) |
| 449 | { | 428 | { |
| 450 | struct kvm_pic *s = this->private; | 429 | struct kvm_pic *s = to_pic(this); |
| 451 | unsigned char data = *(unsigned char *)val; | 430 | unsigned char data = *(unsigned char *)val; |
| 431 | if (!picdev_in_range(addr)) | ||
| 432 | return -EOPNOTSUPP; | ||
| 452 | 433 | ||
| 453 | if (len != 1) { | 434 | if (len != 1) { |
| 454 | if (printk_ratelimit()) | 435 | if (printk_ratelimit()) |
| 455 | printk(KERN_ERR "PIC: non byte write\n"); | 436 | printk(KERN_ERR "PIC: non byte write\n"); |
| 456 | return; | 437 | return 0; |
| 457 | } | 438 | } |
| 458 | pic_lock(s); | 439 | spin_lock(&s->lock); |
| 459 | switch (addr) { | 440 | switch (addr) { |
| 460 | case 0x20: | 441 | case 0x20: |
| 461 | case 0x21: | 442 | case 0x21: |
| @@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this, | |||
| 468 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | 449 | elcr_ioport_write(&s->pics[addr & 1], addr, data); |
| 469 | break; | 450 | break; |
| 470 | } | 451 | } |
| 471 | pic_unlock(s); | 452 | spin_unlock(&s->lock); |
| 453 | return 0; | ||
| 472 | } | 454 | } |
| 473 | 455 | ||
| 474 | static void picdev_read(struct kvm_io_device *this, | 456 | static int picdev_read(struct kvm_io_device *this, |
| 475 | gpa_t addr, int len, void *val) | 457 | gpa_t addr, int len, void *val) |
| 476 | { | 458 | { |
| 477 | struct kvm_pic *s = this->private; | 459 | struct kvm_pic *s = to_pic(this); |
| 478 | unsigned char data = 0; | 460 | unsigned char data = 0; |
| 461 | if (!picdev_in_range(addr)) | ||
| 462 | return -EOPNOTSUPP; | ||
| 479 | 463 | ||
| 480 | if (len != 1) { | 464 | if (len != 1) { |
| 481 | if (printk_ratelimit()) | 465 | if (printk_ratelimit()) |
| 482 | printk(KERN_ERR "PIC: non byte read\n"); | 466 | printk(KERN_ERR "PIC: non byte read\n"); |
| 483 | return; | 467 | return 0; |
| 484 | } | 468 | } |
| 485 | pic_lock(s); | 469 | spin_lock(&s->lock); |
| 486 | switch (addr) { | 470 | switch (addr) { |
| 487 | case 0x20: | 471 | case 0x20: |
| 488 | case 0x21: | 472 | case 0x21: |
| @@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this, | |||
| 496 | break; | 480 | break; |
| 497 | } | 481 | } |
| 498 | *(unsigned char *)val = data; | 482 | *(unsigned char *)val = data; |
| 499 | pic_unlock(s); | 483 | spin_unlock(&s->lock); |
| 484 | return 0; | ||
| 500 | } | 485 | } |
| 501 | 486 | ||
| 502 | /* | 487 | /* |
| @@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this, | |||
| 505 | static void pic_irq_request(void *opaque, int level) | 490 | static void pic_irq_request(void *opaque, int level) |
| 506 | { | 491 | { |
| 507 | struct kvm *kvm = opaque; | 492 | struct kvm *kvm = opaque; |
| 508 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | 493 | struct kvm_vcpu *vcpu = kvm->bsp_vcpu; |
| 509 | struct kvm_pic *s = pic_irqchip(kvm); | 494 | struct kvm_pic *s = pic_irqchip(kvm); |
| 510 | int irq = pic_get_irq(&s->pics[0]); | 495 | int irq = pic_get_irq(&s->pics[0]); |
| 511 | 496 | ||
| 512 | s->output = level; | 497 | s->output = level; |
| 513 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { | 498 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
| 514 | s->pics[0].isr_ack &= ~(1 << irq); | 499 | s->pics[0].isr_ack &= ~(1 << irq); |
| 515 | s->wakeup_needed = true; | 500 | kvm_vcpu_kick(vcpu); |
| 516 | } | 501 | } |
| 517 | } | 502 | } |
| 518 | 503 | ||
| 504 | static const struct kvm_io_device_ops picdev_ops = { | ||
| 505 | .read = picdev_read, | ||
| 506 | .write = picdev_write, | ||
| 507 | }; | ||
| 508 | |||
| 519 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | 509 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) |
| 520 | { | 510 | { |
| 521 | struct kvm_pic *s; | 511 | struct kvm_pic *s; |
| 512 | int ret; | ||
| 513 | |||
| 522 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | 514 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); |
| 523 | if (!s) | 515 | if (!s) |
| 524 | return NULL; | 516 | return NULL; |
| @@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
| 534 | /* | 526 | /* |
| 535 | * Initialize PIO device | 527 | * Initialize PIO device |
| 536 | */ | 528 | */ |
| 537 | s->dev.read = picdev_read; | 529 | kvm_iodevice_init(&s->dev, &picdev_ops); |
| 538 | s->dev.write = picdev_write; | 530 | ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); |
| 539 | s->dev.in_range = picdev_in_range; | 531 | if (ret < 0) { |
| 540 | s->dev.private = s; | 532 | kfree(s); |
| 541 | kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); | 533 | return NULL; |
| 534 | } | ||
| 535 | |||
| 542 | return s; | 536 | return s; |
| 543 | } | 537 | } |
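The i8259 hunks above convert the PIC from per-device function pointers (dev.read, dev.write, dev.in_range, dev.private) to a shared const ops table: each handler recovers the enclosing kvm_pic with container_of() and returns -EOPNOTSUPP for addresses it does not own, so the I/O bus can offer the access to another device, and a failed kvm_io_bus_register_dev() is now unwound with kfree(). A minimal standalone sketch of that pattern; the names io_device, my_dev and my_ops are invented for this example and are not the kernel's API:

	#include <errno.h>
	#include <stddef.h>
	#include <stdio.h>

	struct io_device;
	struct io_device_ops {
		int (*read)(struct io_device *dev, unsigned long addr, void *val);
	};
	struct io_device {
		const struct io_device_ops *ops;	/* like kvm_io_device */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct my_dev {
		unsigned long base;			/* plays the role of kvm_pic */
		struct io_device dev;			/* embedded member */
	};

	static int my_read(struct io_device *this, unsigned long addr, void *val)
	{
		/* recover the enclosing structure from the embedded member */
		struct my_dev *d = container_of(this, struct my_dev, dev);

		if (addr != d->base)			/* not ours: let the bus */
			return -EOPNOTSUPP;		/* try the next device   */
		*(unsigned char *)val = 0xff;
		return 0;
	}

	static const struct io_device_ops my_ops = { .read = my_read };

	int main(void)
	{
		struct my_dev d = { .base = 0x20, .dev = { .ops = &my_ops } };
		unsigned char v;

		if (!d.dev.ops->read(&d.dev, 0x20, &v))
			printf("read %#x\n", v);
		return 0;
	}

Because container_of() derives the device pointer from the embedded member, the old private back-pointer becomes unnecessary, which is why the field assignments disappear from kvm_create_pic() above.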
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f593188129..7d6058a2fd3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
| @@ -63,7 +63,6 @@ struct kvm_kpic_state { | |||
| 63 | 63 | ||
| 64 | struct kvm_pic { | 64 | struct kvm_pic { |
| 65 | spinlock_t lock; | 65 | spinlock_t lock; |
| 66 | bool wakeup_needed; | ||
| 67 | unsigned pending_acks; | 66 | unsigned pending_acks; |
| 68 | struct kvm *kvm; | 67 | struct kvm *kvm; |
| 69 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | 68 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d..7bcc5b6a440 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
| @@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | |||
| 29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); | 29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); |
| 30 | } | 30 | } |
| 31 | 31 | ||
| 32 | static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) | ||
| 33 | { | ||
| 34 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
| 35 | (unsigned long *)&vcpu->arch.regs_avail)) | ||
| 36 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); | ||
| 37 | |||
| 38 | return vcpu->arch.pdptrs[index]; | ||
| 39 | } | ||
| 40 | |||
| 32 | #endif | 41 | #endif |
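The new kvm_pdptr_read() above is a lazy cache: a bit in regs_avail records whether the four PDPTEs are already valid, and the backend (kvm_x86_ops->cache_reg) is asked to refill them only on a miss. A self-contained toy version of the same idea; cache_fill() and its dummy values are assumptions of this sketch, and the (addr >> 30) & 3 index reflects 32-bit PAE addressing, where bits 31:30 select the PDPTE:

	#include <stdint.h>
	#include <stdio.h>

	#define REG_PDPTR 0			/* one avail bit per cached register group */

	static uint64_t pdptrs[4];		/* the cached PDPTEs */
	static unsigned long regs_avail;

	static void cache_fill(void)
	{
		/* in KVM this would pull the PDPTEs out of the VMCS/VMCB */
		for (int i = 0; i < 4; i++)
			pdptrs[i] = 0x1000u * (i + 1) | 1;	/* dummy present entries */
		regs_avail |= 1ul << REG_PDPTR;
	}

	static uint64_t pdptr_read(int index)
	{
		if (!(regs_avail & (1ul << REG_PDPTR)))	/* miss: refill once */
			cache_fill();
		return pdptrs[index];
	}

	int main(void)
	{
		uint32_t addr = 0xc0123000;		/* PAE: bits 31:30 pick the PDPTE */
		printf("pdpte = %#llx\n",
		       (unsigned long long)pdptr_read((addr >> 30) & 3));
		return 0;
	}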
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h deleted file mode 100644 index ed66e4c078d..00000000000 --- a/arch/x86/kvm/kvm_svm.h +++ /dev/null | |||
| @@ -1,51 +0,0 @@ | |||
| 1 | #ifndef __KVM_SVM_H | ||
| 2 | #define __KVM_SVM_H | ||
| 3 | |||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/types.h> | ||
| 6 | #include <linux/list.h> | ||
| 7 | #include <linux/kvm_host.h> | ||
| 8 | #include <asm/msr.h> | ||
| 9 | |||
| 10 | #include <asm/svm.h> | ||
| 11 | |||
| 12 | static const u32 host_save_user_msrs[] = { | ||
| 13 | #ifdef CONFIG_X86_64 | ||
| 14 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
| 15 | MSR_FS_BASE, | ||
| 16 | #endif | ||
| 17 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
| 18 | }; | ||
| 19 | |||
| 20 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
| 21 | |||
| 22 | struct kvm_vcpu; | ||
| 23 | |||
| 24 | struct vcpu_svm { | ||
| 25 | struct kvm_vcpu vcpu; | ||
| 26 | struct vmcb *vmcb; | ||
| 27 | unsigned long vmcb_pa; | ||
| 28 | struct svm_cpu_data *svm_data; | ||
| 29 | uint64_t asid_generation; | ||
| 30 | |||
| 31 | u64 next_rip; | ||
| 32 | |||
| 33 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
| 34 | u64 host_gs_base; | ||
| 35 | unsigned long host_cr2; | ||
| 36 | |||
| 37 | u32 *msrpm; | ||
| 38 | struct vmcb *hsave; | ||
| 39 | u64 hsave_msr; | ||
| 40 | |||
| 41 | u64 nested_vmcb; | ||
| 42 | |||
| 43 | /* These are the merged vectors */ | ||
| 44 | u32 *nested_msrpm; | ||
| 45 | |||
| 46 | /* gpa pointers to the real vectors */ | ||
| 47 | u64 nested_vmcb_msrpm; | ||
| 48 | }; | ||
| 49 | |||
| 50 | #endif | ||
| 51 | |||
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 26bd6ba74e1..55c7524dda5 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h | |||
| @@ -6,7 +6,7 @@ struct kvm_timer { | |||
| 6 | bool reinject; | 6 | bool reinject; |
| 7 | struct kvm_timer_ops *t_ops; | 7 | struct kvm_timer_ops *t_ops; |
| 8 | struct kvm *kvm; | 8 | struct kvm *kvm; |
| 9 | int vcpu_id; | 9 | struct kvm_vcpu *vcpu; |
| 10 | }; | 10 | }; |
| 11 | 11 | ||
| 12 | struct kvm_timer_ops { | 12 | struct kvm_timer_ops { |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ae99d83f81a..1ae5ceba7eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
| @@ -32,8 +32,11 @@ | |||
| 32 | #include <asm/current.h> | 32 | #include <asm/current.h> |
| 33 | #include <asm/apicdef.h> | 33 | #include <asm/apicdef.h> |
| 34 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
| 35 | #include <asm/apicdef.h> | ||
| 35 | #include "kvm_cache_regs.h" | 36 | #include "kvm_cache_regs.h" |
| 36 | #include "irq.h" | 37 | #include "irq.h" |
| 38 | #include "trace.h" | ||
| 39 | #include "x86.h" | ||
| 37 | 40 | ||
| 38 | #ifndef CONFIG_X86_64 | 41 | #ifndef CONFIG_X86_64 |
| 39 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) | 42 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) |
| @@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) | |||
| 141 | return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; | 144 | return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; |
| 142 | } | 145 | } |
| 143 | 146 | ||
| 147 | void kvm_apic_set_version(struct kvm_vcpu *vcpu) | ||
| 148 | { | ||
| 149 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
| 150 | struct kvm_cpuid_entry2 *feat; | ||
| 151 | u32 v = APIC_VERSION; | ||
| 152 | |||
| 153 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
| 154 | return; | ||
| 155 | |||
| 156 | feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); | ||
| 157 | if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) | ||
| 158 | v |= APIC_LVR_DIRECTED_EOI; | ||
| 159 | apic_set_reg(apic, APIC_LVR, v); | ||
| 160 | } | ||
| 161 | |||
| 162 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | ||
| 163 | { | ||
| 164 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
| 165 | } | ||
| 166 | |||
| 144 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | 167 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { |
| 145 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ | 168 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ |
| 146 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | 169 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ |
| @@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap) | |||
| 165 | 188 | ||
| 166 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | 189 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) |
| 167 | { | 190 | { |
| 191 | apic->irr_pending = true; | ||
| 168 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); | 192 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); |
| 169 | } | 193 | } |
| 170 | 194 | ||
| 171 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | 195 | static inline int apic_search_irr(struct kvm_lapic *apic) |
| 172 | { | 196 | { |
| 173 | apic_clear_vector(vec, apic->regs + APIC_IRR); | 197 | return find_highest_vector(apic->regs + APIC_IRR); |
| 174 | } | 198 | } |
| 175 | 199 | ||
| 176 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) | 200 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) |
| 177 | { | 201 | { |
| 178 | int result; | 202 | int result; |
| 179 | 203 | ||
| 180 | result = find_highest_vector(apic->regs + APIC_IRR); | 204 | if (!apic->irr_pending) |
| 205 | return -1; | ||
| 206 | |||
| 207 | result = apic_search_irr(apic); | ||
| 181 | ASSERT(result == -1 || result >= 16); | 208 | ASSERT(result == -1 || result >= 16); |
| 182 | 209 | ||
| 183 | return result; | 210 | return result; |
| 184 | } | 211 | } |
| 185 | 212 | ||
| 213 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | ||
| 214 | { | ||
| 215 | apic->irr_pending = false; | ||
| 216 | apic_clear_vector(vec, apic->regs + APIC_IRR); | ||
| 217 | if (apic_search_irr(apic) != -1) | ||
| 218 | apic->irr_pending = true; | ||
| 219 | } | ||
| 220 | |||
| 186 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | 221 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) |
| 187 | { | 222 | { |
| 188 | struct kvm_lapic *apic = vcpu->arch.apic; | 223 | struct kvm_lapic *apic = vcpu->arch.apic; |
| 189 | int highest_irr; | 224 | int highest_irr; |
| 190 | 225 | ||
| 226 | /* This may race with setting of irr in __apic_accept_irq() and | ||
| 227 | * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq | ||
| 228 | * will cause vmexit immediately and the value will be recalculated | ||
| 229 | * on the next vmentry. | ||
| 230 | */ | ||
| 191 | if (!apic) | 231 | if (!apic) |
| 192 | return 0; | 232 | return 0; |
| 193 | highest_irr = apic_find_highest_irr(apic); | 233 | highest_irr = apic_find_highest_irr(apic); |
| 194 | 234 | ||
| 195 | return highest_irr; | 235 | return highest_irr; |
| 196 | } | 236 | } |
| 197 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | ||
| 198 | 237 | ||
| 199 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | 238 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, |
| 200 | int vector, int level, int trig_mode); | 239 | int vector, int level, int trig_mode); |
| @@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) | |||
| 251 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | 290 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) |
| 252 | { | 291 | { |
| 253 | int result = 0; | 292 | int result = 0; |
| 254 | u8 logical_id; | 293 | u32 logical_id; |
| 294 | |||
| 295 | if (apic_x2apic_mode(apic)) { | ||
| 296 | logical_id = apic_get_reg(apic, APIC_LDR); | ||
| 297 | return logical_id & mda; | ||
| 298 | } | ||
| 255 | 299 | ||
| 256 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); | 300 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); |
| 257 | 301 | ||
| @@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
| 331 | break; | 375 | break; |
| 332 | 376 | ||
| 333 | result = !apic_test_and_set_irr(vector, apic); | 377 | result = !apic_test_and_set_irr(vector, apic); |
| 378 | trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, | ||
| 379 | trig_mode, vector, !result); | ||
| 334 | if (!result) { | 380 | if (!result) { |
| 335 | if (trig_mode) | 381 | if (trig_mode) |
| 336 | apic_debug("level trig mode repeatedly for " | 382 | apic_debug("level trig mode repeatedly for " |
| @@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
| 425 | trigger_mode = IOAPIC_LEVEL_TRIG; | 471 | trigger_mode = IOAPIC_LEVEL_TRIG; |
| 426 | else | 472 | else |
| 427 | trigger_mode = IOAPIC_EDGE_TRIG; | 473 | trigger_mode = IOAPIC_EDGE_TRIG; |
| 428 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 474 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { |
| 475 | mutex_lock(&apic->vcpu->kvm->irq_lock); | ||
| 476 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
| 477 | mutex_unlock(&apic->vcpu->kvm->irq_lock); | ||
| 478 | } | ||
| 429 | } | 479 | } |
| 430 | 480 | ||
| 431 | static void apic_send_ipi(struct kvm_lapic *apic) | 481 | static void apic_send_ipi(struct kvm_lapic *apic) |
| @@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic) | |||
| 440 | irq.level = icr_low & APIC_INT_ASSERT; | 490 | irq.level = icr_low & APIC_INT_ASSERT; |
| 441 | irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; | 491 | irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; |
| 442 | irq.shorthand = icr_low & APIC_SHORT_MASK; | 492 | irq.shorthand = icr_low & APIC_SHORT_MASK; |
| 443 | irq.dest_id = GET_APIC_DEST_FIELD(icr_high); | 493 | if (apic_x2apic_mode(apic)) |
| 494 | irq.dest_id = icr_high; | ||
| 495 | else | ||
| 496 | irq.dest_id = GET_APIC_DEST_FIELD(icr_high); | ||
| 497 | |||
| 498 | trace_kvm_apic_ipi(icr_low, irq.dest_id); | ||
| 444 | 499 | ||
| 445 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | 500 | apic_debug("icr_high 0x%x, icr_low 0x%x, " |
| 446 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | 501 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " |
| @@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic) | |||
| 449 | irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, | 504 | irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, |
| 450 | irq.vector); | 505 | irq.vector); |
| 451 | 506 | ||
| 507 | mutex_lock(&apic->vcpu->kvm->irq_lock); | ||
| 452 | kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); | 508 | kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); |
| 509 | mutex_unlock(&apic->vcpu->kvm->irq_lock); | ||
| 453 | } | 510 | } |
| 454 | 511 | ||
| 455 | static u32 apic_get_tmcct(struct kvm_lapic *apic) | 512 | static u32 apic_get_tmcct(struct kvm_lapic *apic) |
| @@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | |||
| 495 | { | 552 | { |
| 496 | u32 val = 0; | 553 | u32 val = 0; |
| 497 | 554 | ||
| 498 | KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); | ||
| 499 | |||
| 500 | if (offset >= LAPIC_MMIO_LENGTH) | 555 | if (offset >= LAPIC_MMIO_LENGTH) |
| 501 | return 0; | 556 | return 0; |
| 502 | 557 | ||
| 503 | switch (offset) { | 558 | switch (offset) { |
| 559 | case APIC_ID: | ||
| 560 | if (apic_x2apic_mode(apic)) | ||
| 561 | val = kvm_apic_id(apic); | ||
| 562 | else | ||
| 563 | val = kvm_apic_id(apic) << 24; | ||
| 564 | break; | ||
| 504 | case APIC_ARBPRI: | 565 | case APIC_ARBPRI: |
| 505 | printk(KERN_WARNING "Access APIC ARBPRI register " | 566 | printk(KERN_WARNING "Access APIC ARBPRI register " |
| 506 | "which is for P6\n"); | 567 | "which is for P6\n"); |
| @@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | |||
| 522 | return val; | 583 | return val; |
| 523 | } | 584 | } |
| 524 | 585 | ||
| 525 | static void apic_mmio_read(struct kvm_io_device *this, | 586 | static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) |
| 526 | gpa_t address, int len, void *data) | 587 | { |
| 588 | return container_of(dev, struct kvm_lapic, dev); | ||
| 589 | } | ||
| 590 | |||
| 591 | static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, | ||
| 592 | void *data) | ||
| 527 | { | 593 | { |
| 528 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
| 529 | unsigned int offset = address - apic->base_address; | ||
| 530 | unsigned char alignment = offset & 0xf; | 594 | unsigned char alignment = offset & 0xf; |
| 531 | u32 result; | 595 | u32 result; |
| 596 | /* this bitmask has a bit cleared for each reserved register */
| 597 | static const u64 rmask = 0x43ff01ffffffe70cULL; | ||
| 532 | 598 | ||
| 533 | if ((alignment + len) > 4) { | 599 | if ((alignment + len) > 4) { |
| 534 | printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", | 600 | apic_debug("KVM_APIC_READ: alignment error %x %d\n", |
| 535 | (unsigned long)address, len); | 601 | offset, len); |
| 536 | return; | 602 | return 1; |
| 537 | } | 603 | } |
| 604 | |||
| 605 | if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { | ||
| 606 | apic_debug("KVM_APIC_READ: read reserved register %x\n", | ||
| 607 | offset); | ||
| 608 | return 1; | ||
| 609 | } | ||
| 610 | |||
| 538 | result = __apic_read(apic, offset & ~0xf); | 611 | result = __apic_read(apic, offset & ~0xf); |
| 539 | 612 | ||
| 613 | trace_kvm_apic_read(offset, result); | ||
| 614 | |||
| 540 | switch (len) { | 615 | switch (len) { |
| 541 | case 1: | 616 | case 1: |
| 542 | case 2: | 617 | case 2: |
| @@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this, | |||
| 548 | "should be 1,2, or 4 instead\n", len); | 623 | "should be 1,2, or 4 instead\n", len); |
| 549 | break; | 624 | break; |
| 550 | } | 625 | } |
| 626 | return 0; | ||
| 627 | } | ||
| 628 | |||
| 629 | static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) | ||
| 630 | { | ||
| 631 | return apic_hw_enabled(apic) && | ||
| 632 | addr >= apic->base_address && | ||
| 633 | addr < apic->base_address + LAPIC_MMIO_LENGTH; | ||
| 634 | } | ||
| 635 | |||
| 636 | static int apic_mmio_read(struct kvm_io_device *this, | ||
| 637 | gpa_t address, int len, void *data) | ||
| 638 | { | ||
| 639 | struct kvm_lapic *apic = to_lapic(this); | ||
| 640 | u32 offset = address - apic->base_address; | ||
| 641 | |||
| 642 | if (!apic_mmio_in_range(apic, address)) | ||
| 643 | return -EOPNOTSUPP; | ||
| 644 | |||
| 645 | apic_reg_read(apic, offset, len, data); | ||
| 646 | |||
| 647 | return 0; | ||
| 551 | } | 648 | } |
| 552 | 649 | ||
| 553 | static void update_divide_count(struct kvm_lapic *apic) | 650 | static void update_divide_count(struct kvm_lapic *apic) |
| @@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
| 573 | 670 | ||
| 574 | if (!apic->lapic_timer.period) | 671 | if (!apic->lapic_timer.period) |
| 575 | return; | 672 | return; |
| 673 | /* | ||
| 674 | * Do not allow the guest to program periodic timers with small | ||
| 675 | * interval, since the hrtimers are not throttled by the host | ||
| 676 | * scheduler. | ||
| 677 | */ | ||
| 678 | if (apic_lvtt_period(apic)) { | ||
| 679 | if (apic->lapic_timer.period < NSEC_PER_MSEC/2) | ||
| 680 | apic->lapic_timer.period = NSEC_PER_MSEC/2; | ||
| 681 | } | ||
| 576 | 682 | ||
| 577 | hrtimer_start(&apic->lapic_timer.timer, | 683 | hrtimer_start(&apic->lapic_timer.timer, |
| 578 | ktime_add_ns(now, apic->lapic_timer.period), | 684 | ktime_add_ns(now, apic->lapic_timer.period), |
| @@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) | |||
| 603 | apic->vcpu->kvm->arch.vapics_in_nmi_mode--; | 709 | apic->vcpu->kvm->arch.vapics_in_nmi_mode--; |
| 604 | } | 710 | } |
| 605 | 711 | ||
| 606 | static void apic_mmio_write(struct kvm_io_device *this, | 712 | static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) |
| 607 | gpa_t address, int len, const void *data) | ||
| 608 | { | 713 | { |
| 609 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | 714 | int ret = 0; |
| 610 | unsigned int offset = address - apic->base_address; | ||
| 611 | unsigned char alignment = offset & 0xf; | ||
| 612 | u32 val; | ||
| 613 | |||
| 614 | /* | ||
| 615 | * APIC register must be aligned on 128-bits boundary. | ||
| 616 | * 32/64/128 bits registers must be accessed thru 32 bits. | ||
| 617 | * Refer SDM 8.4.1 | ||
| 618 | */ | ||
| 619 | if (len != 4 || alignment) { | ||
| 620 | /* Don't shout loud, $infamous_os would cause only noise. */ | ||
| 621 | apic_debug("apic write: bad size=%d %lx\n", | ||
| 622 | len, (long)address); | ||
| 623 | return; | ||
| 624 | } | ||
| 625 | |||
| 626 | val = *(u32 *) data; | ||
| 627 | |||
| 628 | /* too common printing */ | ||
| 629 | if (offset != APIC_EOI) | ||
| 630 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
| 631 | "0x%x\n", __func__, offset, len, val); | ||
| 632 | |||
| 633 | offset &= 0xff0; | ||
| 634 | 715 | ||
| 635 | KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); | 716 | trace_kvm_apic_write(reg, val); |
| 636 | 717 | ||
| 637 | switch (offset) { | 718 | switch (reg) { |
| 638 | case APIC_ID: /* Local APIC ID */ | 719 | case APIC_ID: /* Local APIC ID */ |
| 639 | apic_set_reg(apic, APIC_ID, val); | 720 | if (!apic_x2apic_mode(apic)) |
| 721 | apic_set_reg(apic, APIC_ID, val); | ||
| 722 | else | ||
| 723 | ret = 1; | ||
| 640 | break; | 724 | break; |
| 641 | 725 | ||
| 642 | case APIC_TASKPRI: | 726 | case APIC_TASKPRI: |
| @@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 649 | break; | 733 | break; |
| 650 | 734 | ||
| 651 | case APIC_LDR: | 735 | case APIC_LDR: |
| 652 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | 736 | if (!apic_x2apic_mode(apic)) |
| 737 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | ||
| 738 | else | ||
| 739 | ret = 1; | ||
| 653 | break; | 740 | break; |
| 654 | 741 | ||
| 655 | case APIC_DFR: | 742 | case APIC_DFR: |
| 656 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | 743 | if (!apic_x2apic_mode(apic)) |
| 744 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | ||
| 745 | else | ||
| 746 | ret = 1; | ||
| 657 | break; | 747 | break; |
| 658 | 748 | ||
| 659 | case APIC_SPIV: | 749 | case APIC_SPIV: { |
| 660 | apic_set_reg(apic, APIC_SPIV, val & 0x3ff); | 750 | u32 mask = 0x3ff; |
| 751 | if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) | ||
| 752 | mask |= APIC_SPIV_DIRECTED_EOI; | ||
| 753 | apic_set_reg(apic, APIC_SPIV, val & mask); | ||
| 661 | if (!(val & APIC_SPIV_APIC_ENABLED)) { | 754 | if (!(val & APIC_SPIV_APIC_ENABLED)) { |
| 662 | int i; | 755 | int i; |
| 663 | u32 lvt_val; | 756 | u32 lvt_val; |
| @@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 672 | 765 | ||
| 673 | } | 766 | } |
| 674 | break; | 767 | break; |
| 675 | 768 | } | |
| 676 | case APIC_ICR: | 769 | case APIC_ICR: |
| 677 | /* No delay here, so we always clear the pending bit */ | 770 | /* No delay here, so we always clear the pending bit */ |
| 678 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); | 771 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); |
| @@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 680 | break; | 773 | break; |
| 681 | 774 | ||
| 682 | case APIC_ICR2: | 775 | case APIC_ICR2: |
| 683 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); | 776 | if (!apic_x2apic_mode(apic)) |
| 777 | val &= 0xff000000; | ||
| 778 | apic_set_reg(apic, APIC_ICR2, val); | ||
| 684 | break; | 779 | break; |
| 685 | 780 | ||
| 686 | case APIC_LVT0: | 781 | case APIC_LVT0: |
| @@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 694 | if (!apic_sw_enabled(apic)) | 789 | if (!apic_sw_enabled(apic)) |
| 695 | val |= APIC_LVT_MASKED; | 790 | val |= APIC_LVT_MASKED; |
| 696 | 791 | ||
| 697 | val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; | 792 | val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; |
| 698 | apic_set_reg(apic, offset, val); | 793 | apic_set_reg(apic, reg, val); |
| 699 | 794 | ||
| 700 | break; | 795 | break; |
| 701 | 796 | ||
| @@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 703 | hrtimer_cancel(&apic->lapic_timer.timer); | 798 | hrtimer_cancel(&apic->lapic_timer.timer); |
| 704 | apic_set_reg(apic, APIC_TMICT, val); | 799 | apic_set_reg(apic, APIC_TMICT, val); |
| 705 | start_apic_timer(apic); | 800 | start_apic_timer(apic); |
| 706 | return; | 801 | break; |
| 707 | 802 | ||
| 708 | case APIC_TDCR: | 803 | case APIC_TDCR: |
| 709 | if (val & 4) | 804 | if (val & 4) |
| @@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
| 712 | update_divide_count(apic); | 807 | update_divide_count(apic); |
| 713 | break; | 808 | break; |
| 714 | 809 | ||
| 810 | case APIC_ESR: | ||
| 811 | if (apic_x2apic_mode(apic) && val != 0) { | ||
| 812 | printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); | ||
| 813 | ret = 1; | ||
| 814 | } | ||
| 815 | break; | ||
| 816 | |||
| 817 | case APIC_SELF_IPI: | ||
| 818 | if (apic_x2apic_mode(apic)) { | ||
| 819 | apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); | ||
| 820 | } else | ||
| 821 | ret = 1; | ||
| 822 | break; | ||
| 715 | default: | 823 | default: |
| 716 | apic_debug("Local APIC Write to read-only register %x\n", | 824 | ret = 1; |
| 717 | offset); | ||
| 718 | break; | 825 | break; |
| 719 | } | 826 | } |
| 720 | 827 | if (ret) | |
| 828 | apic_debug("Local APIC Write to read-only register %x\n", reg); | ||
| 829 | return ret; | ||
| 721 | } | 830 | } |
| 722 | 831 | ||
| 723 | static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, | 832 | static int apic_mmio_write(struct kvm_io_device *this, |
| 724 | int len, int size) | 833 | gpa_t address, int len, const void *data) |
| 725 | { | 834 | { |
| 726 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | 835 | struct kvm_lapic *apic = to_lapic(this); |
| 727 | int ret = 0; | 836 | unsigned int offset = address - apic->base_address; |
| 837 | u32 val; | ||
| 728 | 838 | ||
| 839 | if (!apic_mmio_in_range(apic, address)) | ||
| 840 | return -EOPNOTSUPP; | ||
| 729 | 841 | ||
| 730 | if (apic_hw_enabled(apic) && | 842 | /* |
| 731 | (addr >= apic->base_address) && | 843 | * APIC register must be aligned on 128-bits boundary. |
| 732 | (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) | 844 | * 32/64/128 bits registers must be accessed thru 32 bits. |
| 733 | ret = 1; | 845 | * Refer SDM 8.4.1 |
| 846 | */ | ||
| 847 | if (len != 4 || (offset & 0xf)) { | ||
| 848 | /* Don't shout loud, $infamous_os would cause only noise. */ | ||
| 849 | apic_debug("apic write: bad size=%d %lx\n", len, (long)address); | ||
| 850 | return 0; | ||
| 851 | } | ||
| 734 | 852 | ||
| 735 | return ret; | 853 | val = *(u32*)data; |
| 854 | |||
| 855 | /* too common printing */ | ||
| 856 | if (offset != APIC_EOI) | ||
| 857 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
| 858 | "0x%x\n", __func__, offset, len, val); | ||
| 859 | |||
| 860 | apic_reg_write(apic, offset & 0xff0, val); | ||
| 861 | |||
| 862 | return 0; | ||
| 736 | } | 863 | } |
| 737 | 864 | ||
| 738 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | 865 | void kvm_free_lapic(struct kvm_vcpu *vcpu) |
| @@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
| 763 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | 890 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) |
| 764 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); | 891 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); |
| 765 | } | 892 | } |
| 766 | EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); | ||
| 767 | 893 | ||
| 768 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | 894 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) |
| 769 | { | 895 | { |
| @@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | |||
| 776 | 902 | ||
| 777 | return (tpr & 0xf0) >> 4; | 903 | return (tpr & 0xf0) >> 4; |
| 778 | } | 904 | } |
| 779 | EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); | ||
| 780 | 905 | ||
| 781 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | 906 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) |
| 782 | { | 907 | { |
| @@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
| 787 | vcpu->arch.apic_base = value; | 912 | vcpu->arch.apic_base = value; |
| 788 | return; | 913 | return; |
| 789 | } | 914 | } |
| 790 | if (apic->vcpu->vcpu_id) | 915 | |
| 916 | if (!kvm_vcpu_is_bsp(apic->vcpu)) | ||
| 791 | value &= ~MSR_IA32_APICBASE_BSP; | 917 | value &= ~MSR_IA32_APICBASE_BSP; |
| 792 | 918 | ||
| 793 | vcpu->arch.apic_base = value; | 919 | vcpu->arch.apic_base = value; |
| 920 | if (apic_x2apic_mode(apic)) { | ||
| 921 | u32 id = kvm_apic_id(apic); | ||
| 922 | u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); | ||
| 923 | apic_set_reg(apic, APIC_LDR, ldr); | ||
| 924 | } | ||
| 794 | apic->base_address = apic->vcpu->arch.apic_base & | 925 | apic->base_address = apic->vcpu->arch.apic_base & |
| 795 | MSR_IA32_APICBASE_BASE; | 926 | MSR_IA32_APICBASE_BASE; |
| 796 | 927 | ||
| @@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
| 800 | 931 | ||
| 801 | } | 932 | } |
| 802 | 933 | ||
| 803 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) | ||
| 804 | { | ||
| 805 | return vcpu->arch.apic_base; | ||
| 806 | } | ||
| 807 | EXPORT_SYMBOL_GPL(kvm_lapic_get_base); | ||
| 808 | |||
| 809 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) | 934 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) |
| 810 | { | 935 | { |
| 811 | struct kvm_lapic *apic; | 936 | struct kvm_lapic *apic; |
| @@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
| 821 | hrtimer_cancel(&apic->lapic_timer.timer); | 946 | hrtimer_cancel(&apic->lapic_timer.timer); |
| 822 | 947 | ||
| 823 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | 948 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); |
| 824 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 949 | kvm_apic_set_version(apic->vcpu); |
| 825 | 950 | ||
| 826 | for (i = 0; i < APIC_LVT_NUM; i++) | 951 | for (i = 0; i < APIC_LVT_NUM; i++) |
| 827 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); | 952 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); |
| @@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
| 842 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); | 967 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); |
| 843 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 968 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
| 844 | } | 969 | } |
| 970 | apic->irr_pending = false; | ||
| 845 | update_divide_count(apic); | 971 | update_divide_count(apic); |
| 846 | atomic_set(&apic->lapic_timer.pending, 0); | 972 | atomic_set(&apic->lapic_timer.pending, 0); |
| 847 | if (vcpu->vcpu_id == 0) | 973 | if (kvm_vcpu_is_bsp(vcpu)) |
| 848 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 974 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; |
| 849 | apic_update_ppr(apic); | 975 | apic_update_ppr(apic); |
| 850 | 976 | ||
| @@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
| 855 | vcpu, kvm_apic_id(apic), | 981 | vcpu, kvm_apic_id(apic), |
| 856 | vcpu->arch.apic_base, apic->base_address); | 982 | vcpu->arch.apic_base, apic->base_address); |
| 857 | } | 983 | } |
| 858 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); | ||
| 859 | 984 | ||
| 860 | bool kvm_apic_present(struct kvm_vcpu *vcpu) | 985 | bool kvm_apic_present(struct kvm_vcpu *vcpu) |
| 861 | { | 986 | { |
| @@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | |||
| 866 | { | 991 | { |
| 867 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); | 992 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); |
| 868 | } | 993 | } |
| 869 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | ||
| 870 | 994 | ||
| 871 | /* | 995 | /* |
| 872 | *---------------------------------------------------------------------- | 996 | *---------------------------------------------------------------------- |
| @@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = { | |||
| 917 | .is_periodic = lapic_is_periodic, | 1041 | .is_periodic = lapic_is_periodic, |
| 918 | }; | 1042 | }; |
| 919 | 1043 | ||
| 1044 | static const struct kvm_io_device_ops apic_mmio_ops = { | ||
| 1045 | .read = apic_mmio_read, | ||
| 1046 | .write = apic_mmio_write, | ||
| 1047 | }; | ||
| 1048 | |||
| 920 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | 1049 | int kvm_create_lapic(struct kvm_vcpu *vcpu) |
| 921 | { | 1050 | { |
| 922 | struct kvm_lapic *apic; | 1051 | struct kvm_lapic *apic; |
| @@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) | |||
| 945 | apic->lapic_timer.timer.function = kvm_timer_fn; | 1074 | apic->lapic_timer.timer.function = kvm_timer_fn; |
| 946 | apic->lapic_timer.t_ops = &lapic_timer_ops; | 1075 | apic->lapic_timer.t_ops = &lapic_timer_ops; |
| 947 | apic->lapic_timer.kvm = vcpu->kvm; | 1076 | apic->lapic_timer.kvm = vcpu->kvm; |
| 948 | apic->lapic_timer.vcpu_id = vcpu->vcpu_id; | 1077 | apic->lapic_timer.vcpu = vcpu; |
| 949 | 1078 | ||
| 950 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | 1079 | apic->base_address = APIC_DEFAULT_PHYS_BASE; |
| 951 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | 1080 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; |
| 952 | 1081 | ||
| 953 | kvm_lapic_reset(vcpu); | 1082 | kvm_lapic_reset(vcpu); |
| 954 | apic->dev.read = apic_mmio_read; | 1083 | kvm_iodevice_init(&apic->dev, &apic_mmio_ops); |
| 955 | apic->dev.write = apic_mmio_write; | ||
| 956 | apic->dev.in_range = apic_mmio_range; | ||
| 957 | apic->dev.private = apic; | ||
| 958 | 1084 | ||
| 959 | return 0; | 1085 | return 0; |
| 960 | nomem_free_apic: | 1086 | nomem_free_apic: |
| @@ -962,7 +1088,6 @@ nomem_free_apic: | |||
| 962 | nomem: | 1088 | nomem: |
| 963 | return -ENOMEM; | 1089 | return -ENOMEM; |
| 964 | } | 1090 | } |
| 965 | EXPORT_SYMBOL_GPL(kvm_create_lapic); | ||
| 966 | 1091 | ||
| 967 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | 1092 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) |
| 968 | { | 1093 | { |
| @@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | |||
| 985 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | 1110 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); |
| 986 | int r = 0; | 1111 | int r = 0; |
| 987 | 1112 | ||
| 988 | if (vcpu->vcpu_id == 0) { | 1113 | if (kvm_vcpu_is_bsp(vcpu)) { |
| 989 | if (!apic_hw_enabled(vcpu->arch.apic)) | 1114 | if (!apic_hw_enabled(vcpu->arch.apic)) |
| 990 | r = 1; | 1115 | r = 1; |
| 991 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | 1116 | if ((lvt0 & APIC_LVT_MASKED) == 0 && |
| @@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
| 1025 | 1150 | ||
| 1026 | apic->base_address = vcpu->arch.apic_base & | 1151 | apic->base_address = vcpu->arch.apic_base & |
| 1027 | MSR_IA32_APICBASE_BASE; | 1152 | MSR_IA32_APICBASE_BASE; |
| 1028 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 1153 | kvm_apic_set_version(vcpu); |
| 1154 | |||
| 1029 | apic_update_ppr(apic); | 1155 | apic_update_ppr(apic); |
| 1030 | hrtimer_cancel(&apic->lapic_timer.timer); | 1156 | hrtimer_cancel(&apic->lapic_timer.timer); |
| 1031 | update_divide_count(apic); | 1157 | update_divide_count(apic); |
| @@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) | |||
| 1092 | 1218 | ||
| 1093 | vcpu->arch.apic->vapic_addr = vapic_addr; | 1219 | vcpu->arch.apic->vapic_addr = vapic_addr; |
| 1094 | } | 1220 | } |
| 1221 | |||
| 1222 | int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
| 1223 | { | ||
| 1224 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
| 1225 | u32 reg = (msr - APIC_BASE_MSR) << 4; | ||
| 1226 | |||
| 1227 | if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) | ||
| 1228 | return 1; | ||
| 1229 | |||
| 1230 | /* if this is ICR write vector before command */ | ||
| 1231 | if (msr == 0x830) | ||
| 1232 | apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); | ||
| 1233 | return apic_reg_write(apic, reg, (u32)data); | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) | ||
| 1237 | { | ||
| 1238 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
| 1239 | u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; | ||
| 1240 | |||
| 1241 | if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) | ||
| 1242 | return 1; | ||
| 1243 | |||
| 1244 | if (apic_reg_read(apic, reg, 4, &low)) | ||
| 1245 | return 1; | ||
| 1246 | if (msr == 0x830) | ||
| 1247 | apic_reg_read(apic, APIC_ICR2, 4, &high); | ||
| 1248 | |||
| 1249 | *data = (((u64)high) << 32) | low; | ||
| 1250 | |||
| 1251 | return 0; | ||
| 1252 | } | ||
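In the x2apic MSR handlers added above, the register offset is recovered arithmetically from the MSR number, reg = (msr - APIC_BASE_MSR) << 4, and the 64-bit ICR (MSR 0x830) is split so its upper half goes through APIC_ICR2. A worked example of that mapping, assuming APIC_BASE_MSR == 0x800 as defined in the x86 apicdef headers:

	#include <stdint.h>
	#include <stdio.h>

	#define APIC_BASE_MSR 0x800

	int main(void)
	{
		uint32_t msr  = 0x830;				/* x2apic ICR */
		uint32_t reg  = (msr - APIC_BASE_MSR) << 4;	/* (0x30 << 4) == 0x300 */
		uint64_t data = 0x0000000200004031ull;		/* dest 2, vector 0x31 (illustrative) */

		/* low half goes to APIC_ICR (0x300), high half to APIC_ICR2 (0x310) */
		printf("reg=%#x icr=%#x icr2=%#x\n",
		       reg, (uint32_t)data, (uint32_t)(data >> 32));
		return 0;
	}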
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a587f8349c4..40010b09c4a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
| @@ -12,6 +12,7 @@ struct kvm_lapic { | |||
| 12 | struct kvm_timer lapic_timer; | 12 | struct kvm_timer lapic_timer; |
| 13 | u32 divide_count; | 13 | u32 divide_count; |
| 14 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
| 15 | bool irr_pending; | ||
| 15 | struct page *regs_page; | 16 | struct page *regs_page; |
| 16 | void *regs; | 17 | void *regs; |
| 17 | gpa_t vapic_addr; | 18 | gpa_t vapic_addr; |
| @@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | |||
| 28 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | 29 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); |
| 29 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | 30 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); |
| 30 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); | 31 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); |
| 32 | void kvm_apic_set_version(struct kvm_vcpu *vcpu); | ||
| 31 | 33 | ||
| 32 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | 34 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); |
| 33 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | 35 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); |
| @@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); | |||
| 44 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); | 46 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); |
| 45 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); | 47 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); |
| 46 | 48 | ||
| 49 | int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); | ||
| 50 | int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); | ||
| 47 | #endif | 51 | #endif |
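The irr_pending field added to struct kvm_lapic above backs the fast path in apic_find_highest_irr(): it is set whenever a vector is queued in the IRR and cleared on EOI only if a rescan finds the IRR empty, so the common no-interrupt case skips scanning all 256 bits. A standalone toy version of that invariant; the flat irr[] array is an assumption of this sketch, not the kernel's register layout:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t irr[4];		/* 256 interrupt-request bits */
	static bool irr_pending;

	static int search_irr(void)	/* highest set vector, or -1 */
	{
		for (int vec = 255; vec >= 0; vec--)
			if (irr[vec / 64] & (1ull << (vec % 64)))
				return vec;
		return -1;
	}

	static void set_irr(int vec)
	{
		irr_pending = true;	/* may be non-empty from now on */
		irr[vec / 64] |= 1ull << (vec % 64);
	}

	static void clear_irr(int vec)
	{
		irr_pending = false;
		irr[vec / 64] &= ~(1ull << (vec % 64));
		if (search_irr() != -1)	/* something else still queued */
			irr_pending = true;
	}

	static int find_highest_irr(void)
	{
		if (!irr_pending)	/* fast path: nothing queued */
			return -1;
		return search_irr();
	}

	int main(void)
	{
		set_irr(0x31);
		set_irr(0xec);
		clear_irr(0xec);
		printf("%d\n", find_highest_irr());	/* prints 49 (0x31) */
		return 0;
	}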
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b404..eca41ae9f45 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | */ | 18 | */ |
| 19 | 19 | ||
| 20 | #include "mmu.h" | 20 | #include "mmu.h" |
| 21 | #include "kvm_cache_regs.h" | ||
| 21 | 22 | ||
| 22 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
| 23 | #include <linux/types.h> | 24 | #include <linux/types.h> |
| @@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644); | |||
| 107 | 108 | ||
| 108 | #define PT32_LEVEL_MASK(level) \ | 109 | #define PT32_LEVEL_MASK(level) \ |
| 109 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | 110 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) |
| 111 | #define PT32_LVL_OFFSET_MASK(level) \ | ||
| 112 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
| 113 | * PT32_LEVEL_BITS))) - 1)) | ||
| 110 | 114 | ||
| 111 | #define PT32_INDEX(address, level)\ | 115 | #define PT32_INDEX(address, level)\ |
| 112 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | 116 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) |
| @@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644); | |||
| 115 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | 119 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) |
| 116 | #define PT64_DIR_BASE_ADDR_MASK \ | 120 | #define PT64_DIR_BASE_ADDR_MASK \ |
| 117 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | 121 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) |
| 122 | #define PT64_LVL_ADDR_MASK(level) \ | ||
| 123 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
| 124 | * PT64_LEVEL_BITS))) - 1)) | ||
| 125 | #define PT64_LVL_OFFSET_MASK(level) \ | ||
| 126 | (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
| 127 | * PT64_LEVEL_BITS))) - 1)) | ||
| 118 | 128 | ||
| 119 | #define PT32_BASE_ADDR_MASK PAGE_MASK | 129 | #define PT32_BASE_ADDR_MASK PAGE_MASK |
| 120 | #define PT32_DIR_BASE_ADDR_MASK \ | 130 | #define PT32_DIR_BASE_ADDR_MASK \ |
| 121 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | 131 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) |
| 132 | #define PT32_LVL_ADDR_MASK(level) \ | ||
| 133 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
| 134 | * PT32_LEVEL_BITS))) - 1)) | ||
| 122 | 135 | ||
| 123 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | 136 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ |
| 124 | | PT64_NX_MASK) | 137 | | PT64_NX_MASK) |
| @@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644); | |||
| 129 | #define PFERR_RSVD_MASK (1U << 3) | 142 | #define PFERR_RSVD_MASK (1U << 3) |
| 130 | #define PFERR_FETCH_MASK (1U << 4) | 143 | #define PFERR_FETCH_MASK (1U << 4) |
| 131 | 144 | ||
| 145 | #define PT_PDPE_LEVEL 3 | ||
| 132 | #define PT_DIRECTORY_LEVEL 2 | 146 | #define PT_DIRECTORY_LEVEL 2 |
| 133 | #define PT_PAGE_TABLE_LEVEL 1 | 147 | #define PT_PAGE_TABLE_LEVEL 1 |
| 134 | 148 | ||
| @@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644); | |||
| 139 | #define ACC_USER_MASK PT_USER_MASK | 153 | #define ACC_USER_MASK PT_USER_MASK |
| 140 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 154 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
| 141 | 155 | ||
| 156 | #define CREATE_TRACE_POINTS | ||
| 157 | #include "mmutrace.h" | ||
| 158 | |||
| 142 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 159 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
| 143 | 160 | ||
| 144 | struct kvm_rmap_desc { | 161 | struct kvm_rmap_desc { |
| 145 | u64 *shadow_ptes[RMAP_EXT]; | 162 | u64 *sptes[RMAP_EXT]; |
| 146 | struct kvm_rmap_desc *more; | 163 | struct kvm_rmap_desc *more; |
| 147 | }; | 164 | }; |
| 148 | 165 | ||
| @@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte) | |||
| 239 | return pte & PT_WRITABLE_MASK; | 256 | return pte & PT_WRITABLE_MASK; |
| 240 | } | 257 | } |
| 241 | 258 | ||
| 242 | static int is_dirty_pte(unsigned long pte) | 259 | static int is_dirty_gpte(unsigned long pte) |
| 243 | { | 260 | { |
| 244 | return pte & shadow_dirty_mask; | 261 | return pte & PT_DIRTY_MASK; |
| 245 | } | 262 | } |
| 246 | 263 | ||
| 247 | static int is_rmap_pte(u64 pte) | 264 | static int is_rmap_spte(u64 pte) |
| 248 | { | 265 | { |
| 249 | return is_shadow_present_pte(pte); | 266 | return is_shadow_present_pte(pte); |
| 250 | } | 267 | } |
| 251 | 268 | ||
| 269 | static int is_last_spte(u64 pte, int level) | ||
| 270 | { | ||
| 271 | if (level == PT_PAGE_TABLE_LEVEL) | ||
| 272 | return 1; | ||
| 273 | if (is_large_pte(pte)) | ||
| 274 | return 1; | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 252 | static pfn_t spte_to_pfn(u64 pte) | 278 | static pfn_t spte_to_pfn(u64 pte) |
| 253 | { | 279 | { |
| 254 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 280 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
| @@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) | |||
| 261 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | 287 | return (gpte & PT32_DIR_PSE36_MASK) << shift; |
| 262 | } | 288 | } |
| 263 | 289 | ||
| 264 | static void set_shadow_pte(u64 *sptep, u64 spte) | 290 | static void __set_spte(u64 *sptep, u64 spte) |
| 265 | { | 291 | { |
| 266 | #ifdef CONFIG_X86_64 | 292 | #ifdef CONFIG_X86_64 |
| 267 | set_64bit((unsigned long *)sptep, spte); | 293 | set_64bit((unsigned long *)sptep, spte); |
| @@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | |||
| 380 | * Return the pointer to the largepage write count for a given | 406 | * Return the pointer to the largepage write count for a given |
| 381 | * gfn, handling slots that are not large page aligned. | 407 | * gfn, handling slots that are not large page aligned. |
| 382 | */ | 408 | */ |
| 383 | static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) | 409 | static int *slot_largepage_idx(gfn_t gfn, |
| 410 | struct kvm_memory_slot *slot, | ||
| 411 | int level) | ||
| 384 | { | 412 | { |
| 385 | unsigned long idx; | 413 | unsigned long idx; |
| 386 | 414 | ||
| 387 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | 415 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - |
| 388 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | 416 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); |
| 389 | return &slot->lpage_info[idx].write_count; | 417 | return &slot->lpage_info[level - 2][idx].write_count; |
| 390 | } | 418 | } |
| 391 | 419 | ||
| 392 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 420 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
| 393 | { | 421 | { |
| 422 | struct kvm_memory_slot *slot; | ||
| 394 | int *write_count; | 423 | int *write_count; |
| 424 | int i; | ||
| 395 | 425 | ||
| 396 | gfn = unalias_gfn(kvm, gfn); | 426 | gfn = unalias_gfn(kvm, gfn); |
| 397 | write_count = slot_largepage_idx(gfn, | 427 | |
| 398 | gfn_to_memslot_unaliased(kvm, gfn)); | 428 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
| 399 | *write_count += 1; | 429 | for (i = PT_DIRECTORY_LEVEL; |
| 430 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | ||
| 431 | write_count = slot_largepage_idx(gfn, slot, i); | ||
| 432 | *write_count += 1; | ||
| 433 | } | ||
| 400 | } | 434 | } |
| 401 | 435 | ||
| 402 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 436 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
| 403 | { | 437 | { |
| 438 | struct kvm_memory_slot *slot; | ||
| 404 | int *write_count; | 439 | int *write_count; |
| 440 | int i; | ||
| 405 | 441 | ||
| 406 | gfn = unalias_gfn(kvm, gfn); | 442 | gfn = unalias_gfn(kvm, gfn); |
| 407 | write_count = slot_largepage_idx(gfn, | 443 | for (i = PT_DIRECTORY_LEVEL; |
| 408 | gfn_to_memslot_unaliased(kvm, gfn)); | 444 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
| 409 | *write_count -= 1; | 445 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
| 410 | WARN_ON(*write_count < 0); | 446 | write_count = slot_largepage_idx(gfn, slot, i); |
| 447 | *write_count -= 1; | ||
| 448 | WARN_ON(*write_count < 0); | ||
| 449 | } | ||
| 411 | } | 450 | } |
| 412 | 451 | ||
| 413 | static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | 452 | static int has_wrprotected_page(struct kvm *kvm, |
| 453 | gfn_t gfn, | ||
| 454 | int level) | ||
| 414 | { | 455 | { |
| 415 | struct kvm_memory_slot *slot; | 456 | struct kvm_memory_slot *slot; |
| 416 | int *largepage_idx; | 457 | int *largepage_idx; |
| @@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | |||
| 418 | gfn = unalias_gfn(kvm, gfn); | 459 | gfn = unalias_gfn(kvm, gfn); |
| 419 | slot = gfn_to_memslot_unaliased(kvm, gfn); | 460 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
| 420 | if (slot) { | 461 | if (slot) { |
| 421 | largepage_idx = slot_largepage_idx(gfn, slot); | 462 | largepage_idx = slot_largepage_idx(gfn, slot, level); |
| 422 | return *largepage_idx; | 463 | return *largepage_idx; |
| 423 | } | 464 | } |
| 424 | 465 | ||
| 425 | return 1; | 466 | return 1; |
| 426 | } | 467 | } |
| 427 | 468 | ||
| 428 | static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | 469 | static int host_mapping_level(struct kvm *kvm, gfn_t gfn) |
| 429 | { | 470 | { |
| 471 | unsigned long page_size = PAGE_SIZE; | ||
| 430 | struct vm_area_struct *vma; | 472 | struct vm_area_struct *vma; |
| 431 | unsigned long addr; | 473 | unsigned long addr; |
| 432 | int ret = 0; | 474 | int i, ret = 0; |
| 433 | 475 | ||
| 434 | addr = gfn_to_hva(kvm, gfn); | 476 | addr = gfn_to_hva(kvm, gfn); |
| 435 | if (kvm_is_error_hva(addr)) | 477 | if (kvm_is_error_hva(addr)) |
| 436 | return ret; | 478 | return page_size; |
| 437 | 479 | ||
| 438 | down_read(¤t->mm->mmap_sem); | 480 | down_read(¤t->mm->mmap_sem); |
| 439 | vma = find_vma(current->mm, addr); | 481 | vma = find_vma(current->mm, addr); |
| 440 | if (vma && is_vm_hugetlb_page(vma)) | 482 | if (!vma) |
| 441 | ret = 1; | 483 | goto out; |
| 484 | |||
| 485 | page_size = vma_kernel_pagesize(vma); | ||
| 486 | |||
| 487 | out: | ||
| 442 | up_read(¤t->mm->mmap_sem); | 488 | up_read(¤t->mm->mmap_sem); |
| 443 | 489 | ||
| 490 | for (i = PT_PAGE_TABLE_LEVEL; | ||
| 491 | i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { | ||
| 492 | if (page_size >= KVM_HPAGE_SIZE(i)) | ||
| 493 | ret = i; | ||
| 494 | else | ||
| 495 | break; | ||
| 496 | } | ||
| 497 | |||
| 444 | return ret; | 498 | return ret; |
| 445 | } | 499 | } |
| 446 | 500 | ||
| 447 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 501 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
| 448 | { | 502 | { |
| 449 | struct kvm_memory_slot *slot; | 503 | struct kvm_memory_slot *slot; |
| 450 | 504 | int host_level; | |
| 451 | if (has_wrprotected_page(vcpu->kvm, large_gfn)) | 505 | int level = PT_PAGE_TABLE_LEVEL; |
| 452 | return 0; | ||
| 453 | |||
| 454 | if (!host_largepage_backed(vcpu->kvm, large_gfn)) | ||
| 455 | return 0; | ||
| 456 | 506 | ||
| 457 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | 507 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); |
| 458 | if (slot && slot->dirty_bitmap) | 508 | if (slot && slot->dirty_bitmap) |
| 459 | return 0; | 509 | return PT_PAGE_TABLE_LEVEL; |
| 460 | 510 | ||
| 461 | return 1; | 511 | host_level = host_mapping_level(vcpu->kvm, large_gfn); |
| 512 | |||
| 513 | if (host_level == PT_PAGE_TABLE_LEVEL) | ||
| 514 | return host_level; | ||
| 515 | |||
| 516 | for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { | ||
| 517 | |||
| 518 | if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) | ||
| 519 | break; | ||
| 520 | } | ||
| 521 | |||
| 522 | return level - 1; | ||
| 462 | } | 523 | } |
| 463 | 524 | ||
| 464 | /* | 525 | /* |
| @@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
| 466 | * Note: gfn must be unaliased before this function get called | 527 | * Note: gfn must be unaliased before this function get called |
| 467 | */ | 528 | */ |
| 468 | 529 | ||
| 469 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | 530 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
| 470 | { | 531 | { |
| 471 | struct kvm_memory_slot *slot; | 532 | struct kvm_memory_slot *slot; |
| 472 | unsigned long idx; | 533 | unsigned long idx; |
| 473 | 534 | ||
| 474 | slot = gfn_to_memslot(kvm, gfn); | 535 | slot = gfn_to_memslot(kvm, gfn); |
| 475 | if (!lpage) | 536 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
| 476 | return &slot->rmap[gfn - slot->base_gfn]; | 537 | return &slot->rmap[gfn - slot->base_gfn]; |
| 477 | 538 | ||
| 478 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | 539 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - |
| 479 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | 540 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); |
| 480 | 541 | ||
| 481 | return &slot->lpage_info[idx].rmap_pde; | 542 | return &slot->lpage_info[level - 2][idx].rmap_pde; |
| 482 | } | 543 | } |
| 483 | 544 | ||
| 484 | /* | 545 | /* |
| @@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | |||
| 494 | * the spte was not added. | 555 | * the spte was not added. |
| 495 | * | 556 | * |
| 496 | */ | 557 | */ |
| 497 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | 558 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
| 498 | { | 559 | { |
| 499 | struct kvm_mmu_page *sp; | 560 | struct kvm_mmu_page *sp; |
| 500 | struct kvm_rmap_desc *desc; | 561 | struct kvm_rmap_desc *desc; |
| 501 | unsigned long *rmapp; | 562 | unsigned long *rmapp; |
| 502 | int i, count = 0; | 563 | int i, count = 0; |
| 503 | 564 | ||
| 504 | if (!is_rmap_pte(*spte)) | 565 | if (!is_rmap_spte(*spte)) |
| 505 | return count; | 566 | return count; |
| 506 | gfn = unalias_gfn(vcpu->kvm, gfn); | 567 | gfn = unalias_gfn(vcpu->kvm, gfn); |
| 507 | sp = page_header(__pa(spte)); | 568 | sp = page_header(__pa(spte)); |
| 508 | sp->gfns[spte - sp->spt] = gfn; | 569 | sp->gfns[spte - sp->spt] = gfn; |
| 509 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | 570 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
| 510 | if (!*rmapp) { | 571 | if (!*rmapp) { |
| 511 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 572 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); |
| 512 | *rmapp = (unsigned long)spte; | 573 | *rmapp = (unsigned long)spte; |
| 513 | } else if (!(*rmapp & 1)) { | 574 | } else if (!(*rmapp & 1)) { |
| 514 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | 575 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); |
| 515 | desc = mmu_alloc_rmap_desc(vcpu); | 576 | desc = mmu_alloc_rmap_desc(vcpu); |
| 516 | desc->shadow_ptes[0] = (u64 *)*rmapp; | 577 | desc->sptes[0] = (u64 *)*rmapp; |
| 517 | desc->shadow_ptes[1] = spte; | 578 | desc->sptes[1] = spte; |
| 518 | *rmapp = (unsigned long)desc | 1; | 579 | *rmapp = (unsigned long)desc | 1; |
| 519 | } else { | 580 | } else { |
| 520 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 581 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
| 521 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 582 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
| 522 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { | 583 | while (desc->sptes[RMAP_EXT-1] && desc->more) { |
| 523 | desc = desc->more; | 584 | desc = desc->more; |
| 524 | count += RMAP_EXT; | 585 | count += RMAP_EXT; |
| 525 | } | 586 | } |
| 526 | if (desc->shadow_ptes[RMAP_EXT-1]) { | 587 | if (desc->sptes[RMAP_EXT-1]) { |
| 527 | desc->more = mmu_alloc_rmap_desc(vcpu); | 588 | desc->more = mmu_alloc_rmap_desc(vcpu); |
| 528 | desc = desc->more; | 589 | desc = desc->more; |
| 529 | } | 590 | } |
| 530 | for (i = 0; desc->shadow_ptes[i]; ++i) | 591 | for (i = 0; desc->sptes[i]; ++i) |
| 531 | ; | 592 | ; |
| 532 | desc->shadow_ptes[i] = spte; | 593 | desc->sptes[i] = spte; |
| 533 | } | 594 | } |
| 534 | return count; | 595 | return count; |
| 535 | } | 596 | } |
| @@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, | |||
| 541 | { | 602 | { |
| 542 | int j; | 603 | int j; |
| 543 | 604 | ||
| 544 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | 605 | for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) |
| 545 | ; | 606 | ; |
| 546 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | 607 | desc->sptes[i] = desc->sptes[j]; |
| 547 | desc->shadow_ptes[j] = NULL; | 608 | desc->sptes[j] = NULL; |
| 548 | if (j != 0) | 609 | if (j != 0) |
| 549 | return; | 610 | return; |
| 550 | if (!prev_desc && !desc->more) | 611 | if (!prev_desc && !desc->more) |
| 551 | *rmapp = (unsigned long)desc->shadow_ptes[0]; | 612 | *rmapp = (unsigned long)desc->sptes[0]; |
| 552 | else | 613 | else |
| 553 | if (prev_desc) | 614 | if (prev_desc) |
| 554 | prev_desc->more = desc->more; | 615 | prev_desc->more = desc->more; |
| @@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
| 566 | unsigned long *rmapp; | 627 | unsigned long *rmapp; |
| 567 | int i; | 628 | int i; |
| 568 | 629 | ||
| 569 | if (!is_rmap_pte(*spte)) | 630 | if (!is_rmap_spte(*spte)) |
| 570 | return; | 631 | return; |
| 571 | sp = page_header(__pa(spte)); | 632 | sp = page_header(__pa(spte)); |
| 572 | pfn = spte_to_pfn(*spte); | 633 | pfn = spte_to_pfn(*spte); |
| @@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
| 576 | kvm_release_pfn_dirty(pfn); | 637 | kvm_release_pfn_dirty(pfn); |
| 577 | else | 638 | else |
| 578 | kvm_release_pfn_clean(pfn); | 639 | kvm_release_pfn_clean(pfn); |
| 579 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); | 640 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); |
| 580 | if (!*rmapp) { | 641 | if (!*rmapp) { |
| 581 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 642 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); |
| 582 | BUG(); | 643 | BUG(); |
| @@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
| 593 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 654 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
| 594 | prev_desc = NULL; | 655 | prev_desc = NULL; |
| 595 | while (desc) { | 656 | while (desc) { |
| 596 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | 657 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) |
| 597 | if (desc->shadow_ptes[i] == spte) { | 658 | if (desc->sptes[i] == spte) { |
| 598 | rmap_desc_remove_entry(rmapp, | 659 | rmap_desc_remove_entry(rmapp, |
| 599 | desc, i, | 660 | desc, i, |
| 600 | prev_desc); | 661 | prev_desc); |
| @@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | |||
| 625 | prev_desc = NULL; | 686 | prev_desc = NULL; |
| 626 | prev_spte = NULL; | 687 | prev_spte = NULL; |
| 627 | while (desc) { | 688 | while (desc) { |
| 628 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { | 689 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { |
| 629 | if (prev_spte == spte) | 690 | if (prev_spte == spte) |
| 630 | return desc->shadow_ptes[i]; | 691 | return desc->sptes[i]; |
| 631 | prev_spte = desc->shadow_ptes[i]; | 692 | prev_spte = desc->sptes[i]; |
| 632 | } | 693 | } |
| 633 | desc = desc->more; | 694 | desc = desc->more; |
| 634 | } | 695 | } |
| @@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
| 639 | { | 700 | { |
| 640 | unsigned long *rmapp; | 701 | unsigned long *rmapp; |
| 641 | u64 *spte; | 702 | u64 *spte; |
| 642 | int write_protected = 0; | 703 | int i, write_protected = 0; |
| 643 | 704 | ||
| 644 | gfn = unalias_gfn(kvm, gfn); | 705 | gfn = unalias_gfn(kvm, gfn); |
| 645 | rmapp = gfn_to_rmap(kvm, gfn, 0); | 706 | rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); |
| 646 | 707 | ||
| 647 | spte = rmap_next(kvm, rmapp, NULL); | 708 | spte = rmap_next(kvm, rmapp, NULL); |
| 648 | while (spte) { | 709 | while (spte) { |
| @@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
| 650 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 711 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
| 651 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 712 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
| 652 | if (is_writeble_pte(*spte)) { | 713 | if (is_writeble_pte(*spte)) { |
| 653 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | 714 | __set_spte(spte, *spte & ~PT_WRITABLE_MASK); |
| 654 | write_protected = 1; | 715 | write_protected = 1; |
| 655 | } | 716 | } |
| 656 | spte = rmap_next(kvm, rmapp, spte); | 717 | spte = rmap_next(kvm, rmapp, spte); |
| @@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
| 664 | } | 725 | } |
| 665 | 726 | ||
| 666 | /* check for huge page mappings */ | 727 | /* check for huge page mappings */ |
| 667 | rmapp = gfn_to_rmap(kvm, gfn, 1); | 728 | for (i = PT_DIRECTORY_LEVEL; |
| 668 | spte = rmap_next(kvm, rmapp, NULL); | 729 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
| 669 | while (spte) { | 730 | rmapp = gfn_to_rmap(kvm, gfn, i); |
| 670 | BUG_ON(!spte); | 731 | spte = rmap_next(kvm, rmapp, NULL); |
| 671 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 732 | while (spte) { |
| 672 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 733 | BUG_ON(!spte); |
| 673 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 734 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
| 674 | if (is_writeble_pte(*spte)) { | 735 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); |
| 675 | rmap_remove(kvm, spte); | 736 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
| 676 | --kvm->stat.lpages; | 737 | if (is_writeble_pte(*spte)) { |
| 677 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 738 | rmap_remove(kvm, spte); |
| 678 | spte = NULL; | 739 | --kvm->stat.lpages; |
| 679 | write_protected = 1; | 740 | __set_spte(spte, shadow_trap_nonpresent_pte); |
| 741 | spte = NULL; | ||
| 742 | write_protected = 1; | ||
| 743 | } | ||
| 744 | spte = rmap_next(kvm, rmapp, spte); | ||
| 680 | } | 745 | } |
| 681 | spte = rmap_next(kvm, rmapp, spte); | ||
| 682 | } | 746 | } |
| 683 | 747 | ||
| 684 | return write_protected; | 748 | return write_protected; |
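With per-level rmap arrays, rmap_write_protect() now visits one reverse-map chain per supported page size rather than a single "lpage" slot. A self-contained sketch of the level arithmetic implied by the loop bounds, assuming x86's 9 index bits per level; the number of page sizes and the pages_per_hpage() helper (standing in for KVM_PAGES_PER_HPAGE()) are assumptions of the example.

```c
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL 1
#define PT_DIRECTORY_LEVEL  2
#define KVM_NR_PAGE_SIZES   3	/* 4K, 2M, 1G -- an assumption for the sketch */

/* 512^(level-1) small pages per mapping at the given level */
static unsigned long pages_per_hpage(int level)
{
	return 1UL << (9 * (level - 1));
}

int main(void)
{
	unsigned long gfn = 0x12345;
	int level;

	for (level = PT_DIRECTORY_LEVEL;
	     level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level)
		/* array index for this gfn (relative to the memslot base in
		 * the real code) at each large-page level */
		printf("level %d: slot index %lu\n",
		       level, gfn / pages_per_hpage(level));
	return 0;
}
```

The same division reappears below in kvm_handle_hva(), where idx is derived from gfn_offset for each large-page level of lpage_info.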
| @@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
| 693 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 757 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
| 694 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | 758 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); |
| 695 | rmap_remove(kvm, spte); | 759 | rmap_remove(kvm, spte); |
| 696 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 760 | __set_spte(spte, shadow_trap_nonpresent_pte); |
| 697 | need_tlb_flush = 1; | 761 | need_tlb_flush = 1; |
| 698 | } | 762 | } |
| 699 | return need_tlb_flush; | 763 | return need_tlb_flush; |
| @@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
| 702 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | 766 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, |
| 703 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) | 767 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) |
| 704 | { | 768 | { |
| 705 | int i; | 769 | int i, j; |
| 706 | int retval = 0; | 770 | int retval = 0; |
| 707 | 771 | ||
| 708 | /* | 772 | /* |
| @@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
| 721 | end = start + (memslot->npages << PAGE_SHIFT); | 785 | end = start + (memslot->npages << PAGE_SHIFT); |
| 722 | if (hva >= start && hva < end) { | 786 | if (hva >= start && hva < end) { |
| 723 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 787 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
| 788 | |||
| 724 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); | 789 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); |
| 725 | retval |= handler(kvm, | 790 | |
| 726 | &memslot->lpage_info[ | 791 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
| 727 | gfn_offset / | 792 | int idx = gfn_offset; |
| 728 | KVM_PAGES_PER_HPAGE].rmap_pde); | 793 | idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); |
| 794 | retval |= handler(kvm, | ||
| 795 | &memslot->lpage_info[j][idx].rmap_pde); | ||
| 796 | } | ||
| 729 | } | 797 | } |
| 730 | } | 798 | } |
| 731 | 799 | ||
| @@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
| 763 | 831 | ||
| 764 | #define RMAP_RECYCLE_THRESHOLD 1000 | 832 | #define RMAP_RECYCLE_THRESHOLD 1000 |
| 765 | 833 | ||
| 766 | static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) | 834 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
| 767 | { | 835 | { |
| 768 | unsigned long *rmapp; | 836 | unsigned long *rmapp; |
| 837 | struct kvm_mmu_page *sp; | ||
| 838 | |||
| 839 | sp = page_header(__pa(spte)); | ||
| 769 | 840 | ||
| 770 | gfn = unalias_gfn(vcpu->kvm, gfn); | 841 | gfn = unalias_gfn(vcpu->kvm, gfn); |
| 771 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | 842 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
| 772 | 843 | ||
| 773 | kvm_unmap_rmapp(vcpu->kvm, rmapp); | 844 | kvm_unmap_rmapp(vcpu->kvm, rmapp); |
| 774 | kvm_flush_remote_tlbs(vcpu->kvm); | 845 | kvm_flush_remote_tlbs(vcpu->kvm); |
| @@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
| 1109 | return 1; | 1180 | return 1; |
| 1110 | } | 1181 | } |
| 1111 | 1182 | ||
| 1183 | trace_kvm_mmu_sync_page(sp); | ||
| 1112 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) | 1184 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) |
| 1113 | kvm_flush_remote_tlbs(vcpu->kvm); | 1185 | kvm_flush_remote_tlbs(vcpu->kvm); |
| 1114 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1186 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
| @@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
| 1231 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1303 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
| 1232 | role.quadrant = quadrant; | 1304 | role.quadrant = quadrant; |
| 1233 | } | 1305 | } |
| 1234 | pgprintk("%s: looking gfn %lx role %x\n", __func__, | ||
| 1235 | gfn, role.word); | ||
| 1236 | index = kvm_page_table_hashfn(gfn); | 1306 | index = kvm_page_table_hashfn(gfn); |
| 1237 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1307 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
| 1238 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) | 1308 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) |
| @@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
| 1249 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | 1319 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); |
| 1250 | kvm_mmu_mark_parents_unsync(vcpu, sp); | 1320 | kvm_mmu_mark_parents_unsync(vcpu, sp); |
| 1251 | } | 1321 | } |
| 1252 | pgprintk("%s: found\n", __func__); | 1322 | trace_kvm_mmu_get_page(sp, false); |
| 1253 | return sp; | 1323 | return sp; |
| 1254 | } | 1324 | } |
| 1255 | ++vcpu->kvm->stat.mmu_cache_miss; | 1325 | ++vcpu->kvm->stat.mmu_cache_miss; |
| 1256 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | 1326 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); |
| 1257 | if (!sp) | 1327 | if (!sp) |
| 1258 | return sp; | 1328 | return sp; |
| 1259 | pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); | ||
| 1260 | sp->gfn = gfn; | 1329 | sp->gfn = gfn; |
| 1261 | sp->role = role; | 1330 | sp->role = role; |
| 1262 | hlist_add_head(&sp->hash_link, bucket); | 1331 | hlist_add_head(&sp->hash_link, bucket); |
| @@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
| 1269 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | 1338 | vcpu->arch.mmu.prefetch_page(vcpu, sp); |
| 1270 | else | 1339 | else |
| 1271 | nonpaging_prefetch_page(vcpu, sp); | 1340 | nonpaging_prefetch_page(vcpu, sp); |
| 1341 | trace_kvm_mmu_get_page(sp, true); | ||
| 1272 | return sp; | 1342 | return sp; |
| 1273 | } | 1343 | } |
| 1274 | 1344 | ||
| @@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) | |||
| 1292 | { | 1362 | { |
| 1293 | if (iterator->level < PT_PAGE_TABLE_LEVEL) | 1363 | if (iterator->level < PT_PAGE_TABLE_LEVEL) |
| 1294 | return false; | 1364 | return false; |
| 1365 | |||
| 1366 | if (iterator->level == PT_PAGE_TABLE_LEVEL) | ||
| 1367 | if (is_large_pte(*iterator->sptep)) | ||
| 1368 | return false; | ||
| 1369 | |||
| 1295 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); | 1370 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); |
| 1296 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; | 1371 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; |
| 1297 | return true; | 1372 | return true; |
| @@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
| 1312 | 1387 | ||
| 1313 | pt = sp->spt; | 1388 | pt = sp->spt; |
| 1314 | 1389 | ||
| 1315 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
| 1316 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 1317 | if (is_shadow_present_pte(pt[i])) | ||
| 1318 | rmap_remove(kvm, &pt[i]); | ||
| 1319 | pt[i] = shadow_trap_nonpresent_pte; | ||
| 1320 | } | ||
| 1321 | return; | ||
| 1322 | } | ||
| 1323 | |||
| 1324 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 1390 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
| 1325 | ent = pt[i]; | 1391 | ent = pt[i]; |
| 1326 | 1392 | ||
| 1327 | if (is_shadow_present_pte(ent)) { | 1393 | if (is_shadow_present_pte(ent)) { |
| 1328 | if (!is_large_pte(ent)) { | 1394 | if (!is_last_spte(ent, sp->role.level)) { |
| 1329 | ent &= PT64_BASE_ADDR_MASK; | 1395 | ent &= PT64_BASE_ADDR_MASK; |
| 1330 | mmu_page_remove_parent_pte(page_header(ent), | 1396 | mmu_page_remove_parent_pte(page_header(ent), |
| 1331 | &pt[i]); | 1397 | &pt[i]); |
| 1332 | } else { | 1398 | } else { |
| 1333 | --kvm->stat.lpages; | 1399 | if (is_large_pte(ent)) |
| 1400 | --kvm->stat.lpages; | ||
| 1334 | rmap_remove(kvm, &pt[i]); | 1401 | rmap_remove(kvm, &pt[i]); |
| 1335 | } | 1402 | } |
| 1336 | } | 1403 | } |
| @@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | |||
| 1346 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | 1413 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) |
| 1347 | { | 1414 | { |
| 1348 | int i; | 1415 | int i; |
| 1416 | struct kvm_vcpu *vcpu; | ||
| 1349 | 1417 | ||
| 1350 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 1418 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 1351 | if (kvm->vcpus[i]) | 1419 | vcpu->arch.last_pte_updated = NULL; |
| 1352 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
| 1353 | } | 1420 | } |
| 1354 | 1421 | ||
| 1355 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | 1422 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
| @@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
| 1368 | } | 1435 | } |
| 1369 | BUG_ON(!parent_pte); | 1436 | BUG_ON(!parent_pte); |
| 1370 | kvm_mmu_put_page(sp, parent_pte); | 1437 | kvm_mmu_put_page(sp, parent_pte); |
| 1371 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | 1438 | __set_spte(parent_pte, shadow_trap_nonpresent_pte); |
| 1372 | } | 1439 | } |
| 1373 | } | 1440 | } |
| 1374 | 1441 | ||
| @@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, | |||
| 1400 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1467 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
| 1401 | { | 1468 | { |
| 1402 | int ret; | 1469 | int ret; |
| 1470 | |||
| 1471 | trace_kvm_mmu_zap_page(sp); | ||
| 1403 | ++kvm->stat.mmu_shadow_zapped; | 1472 | ++kvm->stat.mmu_shadow_zapped; |
| 1404 | ret = mmu_zap_unsync_children(kvm, sp); | 1473 | ret = mmu_zap_unsync_children(kvm, sp); |
| 1405 | kvm_mmu_page_unlink_children(kvm, sp); | 1474 | kvm_mmu_page_unlink_children(kvm, sp); |
| @@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) | |||
| 1516 | 1585 | ||
| 1517 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 1586 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
| 1518 | if (pt[i] == shadow_notrap_nonpresent_pte) | 1587 | if (pt[i] == shadow_notrap_nonpresent_pte) |
| 1519 | set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); | 1588 | __set_spte(&pt[i], shadow_trap_nonpresent_pte); |
| 1520 | } | 1589 | } |
| 1521 | } | 1590 | } |
| 1522 | 1591 | ||
| @@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
| 1646 | struct kvm_mmu_page *s; | 1715 | struct kvm_mmu_page *s; |
| 1647 | struct hlist_node *node, *n; | 1716 | struct hlist_node *node, *n; |
| 1648 | 1717 | ||
| 1718 | trace_kvm_mmu_unsync_page(sp); | ||
| 1649 | index = kvm_page_table_hashfn(sp->gfn); | 1719 | index = kvm_page_table_hashfn(sp->gfn); |
| 1650 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1720 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
| 1651 | /* don't unsync if pagetable is shadowed with multiple roles */ | 1721 | /* don't unsync if pagetable is shadowed with multiple roles */ |
| @@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
| 1682 | return 0; | 1752 | return 0; |
| 1683 | } | 1753 | } |
| 1684 | 1754 | ||
| 1685 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1755 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
| 1686 | unsigned pte_access, int user_fault, | 1756 | unsigned pte_access, int user_fault, |
| 1687 | int write_fault, int dirty, int largepage, | 1757 | int write_fault, int dirty, int level, |
| 1688 | gfn_t gfn, pfn_t pfn, bool speculative, | 1758 | gfn_t gfn, pfn_t pfn, bool speculative, |
| 1689 | bool can_unsync) | 1759 | bool can_unsync) |
| 1690 | { | 1760 | { |
| @@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
| 1707 | spte |= shadow_nx_mask; | 1777 | spte |= shadow_nx_mask; |
| 1708 | if (pte_access & ACC_USER_MASK) | 1778 | if (pte_access & ACC_USER_MASK) |
| 1709 | spte |= shadow_user_mask; | 1779 | spte |= shadow_user_mask; |
| 1710 | if (largepage) | 1780 | if (level > PT_PAGE_TABLE_LEVEL) |
| 1711 | spte |= PT_PAGE_SIZE_MASK; | 1781 | spte |= PT_PAGE_SIZE_MASK; |
| 1712 | if (tdp_enabled) | 1782 | if (tdp_enabled) |
| 1713 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 1783 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
| @@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
| 1718 | if ((pte_access & ACC_WRITE_MASK) | 1788 | if ((pte_access & ACC_WRITE_MASK) |
| 1719 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1789 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
| 1720 | 1790 | ||
| 1721 | if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { | 1791 | if (level > PT_PAGE_TABLE_LEVEL && |
| 1792 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | ||
| 1722 | ret = 1; | 1793 | ret = 1; |
| 1723 | spte = shadow_trap_nonpresent_pte; | 1794 | spte = shadow_trap_nonpresent_pte; |
| 1724 | goto set_pte; | 1795 | goto set_pte; |
| @@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
| 1732 | * is responsibility of mmu_get_page / kvm_sync_page. | 1803 | * is responsibility of mmu_get_page / kvm_sync_page. |
| 1733 | * Same reasoning can be applied to dirty page accounting. | 1804 | * Same reasoning can be applied to dirty page accounting. |
| 1734 | */ | 1805 | */ |
| 1735 | if (!can_unsync && is_writeble_pte(*shadow_pte)) | 1806 | if (!can_unsync && is_writeble_pte(*sptep)) |
| 1736 | goto set_pte; | 1807 | goto set_pte; |
| 1737 | 1808 | ||
| 1738 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 1809 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
| @@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
| 1749 | mark_page_dirty(vcpu->kvm, gfn); | 1820 | mark_page_dirty(vcpu->kvm, gfn); |
| 1750 | 1821 | ||
| 1751 | set_pte: | 1822 | set_pte: |
| 1752 | set_shadow_pte(shadow_pte, spte); | 1823 | __set_spte(sptep, spte); |
| 1753 | return ret; | 1824 | return ret; |
| 1754 | } | 1825 | } |
| 1755 | 1826 | ||
| 1756 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1827 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
| 1757 | unsigned pt_access, unsigned pte_access, | 1828 | unsigned pt_access, unsigned pte_access, |
| 1758 | int user_fault, int write_fault, int dirty, | 1829 | int user_fault, int write_fault, int dirty, |
| 1759 | int *ptwrite, int largepage, gfn_t gfn, | 1830 | int *ptwrite, int level, gfn_t gfn, |
| 1760 | pfn_t pfn, bool speculative) | 1831 | pfn_t pfn, bool speculative) |
| 1761 | { | 1832 | { |
| 1762 | int was_rmapped = 0; | 1833 | int was_rmapped = 0; |
| 1763 | int was_writeble = is_writeble_pte(*shadow_pte); | 1834 | int was_writeble = is_writeble_pte(*sptep); |
| 1764 | int rmap_count; | 1835 | int rmap_count; |
| 1765 | 1836 | ||
| 1766 | pgprintk("%s: spte %llx access %x write_fault %d" | 1837 | pgprintk("%s: spte %llx access %x write_fault %d" |
| 1767 | " user_fault %d gfn %lx\n", | 1838 | " user_fault %d gfn %lx\n", |
| 1768 | __func__, *shadow_pte, pt_access, | 1839 | __func__, *sptep, pt_access, |
| 1769 | write_fault, user_fault, gfn); | 1840 | write_fault, user_fault, gfn); |
| 1770 | 1841 | ||
| 1771 | if (is_rmap_pte(*shadow_pte)) { | 1842 | if (is_rmap_spte(*sptep)) { |
| 1772 | /* | 1843 | /* |
| 1773 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | 1844 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink |
| 1774 | * the parent of the now unreachable PTE. | 1845 | * the parent of the now unreachable PTE. |
| 1775 | */ | 1846 | */ |
| 1776 | if (largepage && !is_large_pte(*shadow_pte)) { | 1847 | if (level > PT_PAGE_TABLE_LEVEL && |
| 1848 | !is_large_pte(*sptep)) { | ||
| 1777 | struct kvm_mmu_page *child; | 1849 | struct kvm_mmu_page *child; |
| 1778 | u64 pte = *shadow_pte; | 1850 | u64 pte = *sptep; |
| 1779 | 1851 | ||
| 1780 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1852 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
| 1781 | mmu_page_remove_parent_pte(child, shadow_pte); | 1853 | mmu_page_remove_parent_pte(child, sptep); |
| 1782 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | 1854 | } else if (pfn != spte_to_pfn(*sptep)) { |
| 1783 | pgprintk("hfn old %lx new %lx\n", | 1855 | pgprintk("hfn old %lx new %lx\n", |
| 1784 | spte_to_pfn(*shadow_pte), pfn); | 1856 | spte_to_pfn(*sptep), pfn); |
| 1785 | rmap_remove(vcpu->kvm, shadow_pte); | 1857 | rmap_remove(vcpu->kvm, sptep); |
| 1786 | } else | 1858 | } else |
| 1787 | was_rmapped = 1; | 1859 | was_rmapped = 1; |
| 1788 | } | 1860 | } |
| 1789 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | 1861 | |
| 1790 | dirty, largepage, gfn, pfn, speculative, true)) { | 1862 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
| 1863 | dirty, level, gfn, pfn, speculative, true)) { | ||
| 1791 | if (write_fault) | 1864 | if (write_fault) |
| 1792 | *ptwrite = 1; | 1865 | *ptwrite = 1; |
| 1793 | kvm_x86_ops->tlb_flush(vcpu); | 1866 | kvm_x86_ops->tlb_flush(vcpu); |
| 1794 | } | 1867 | } |
| 1795 | 1868 | ||
| 1796 | pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); | 1869 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
| 1797 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | 1870 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", |
| 1798 | is_large_pte(*shadow_pte)? "2MB" : "4kB", | 1871 | is_large_pte(*sptep)? "2MB" : "4kB", |
| 1799 | is_present_pte(*shadow_pte)?"RW":"R", gfn, | 1872 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
| 1800 | *shadow_pte, shadow_pte); | 1873 | *sptep, sptep); |
| 1801 | if (!was_rmapped && is_large_pte(*shadow_pte)) | 1874 | if (!was_rmapped && is_large_pte(*sptep)) |
| 1802 | ++vcpu->kvm->stat.lpages; | 1875 | ++vcpu->kvm->stat.lpages; |
| 1803 | 1876 | ||
| 1804 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1877 | page_header_update_slot(vcpu->kvm, sptep, gfn); |
| 1805 | if (!was_rmapped) { | 1878 | if (!was_rmapped) { |
| 1806 | rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); | 1879 | rmap_count = rmap_add(vcpu, sptep, gfn); |
| 1807 | if (!is_rmap_pte(*shadow_pte)) | 1880 | if (!is_rmap_spte(*sptep)) |
| 1808 | kvm_release_pfn_clean(pfn); | 1881 | kvm_release_pfn_clean(pfn); |
| 1809 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 1882 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
| 1810 | rmap_recycle(vcpu, gfn, largepage); | 1883 | rmap_recycle(vcpu, sptep, gfn); |
| 1811 | } else { | 1884 | } else { |
| 1812 | if (was_writeble) | 1885 | if (was_writeble) |
| 1813 | kvm_release_pfn_dirty(pfn); | 1886 | kvm_release_pfn_dirty(pfn); |
| @@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
| 1815 | kvm_release_pfn_clean(pfn); | 1888 | kvm_release_pfn_clean(pfn); |
| 1816 | } | 1889 | } |
| 1817 | if (speculative) { | 1890 | if (speculative) { |
| 1818 | vcpu->arch.last_pte_updated = shadow_pte; | 1891 | vcpu->arch.last_pte_updated = sptep; |
| 1819 | vcpu->arch.last_pte_gfn = gfn; | 1892 | vcpu->arch.last_pte_gfn = gfn; |
| 1820 | } | 1893 | } |
| 1821 | } | 1894 | } |
| @@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
| 1825 | } | 1898 | } |
| 1826 | 1899 | ||
| 1827 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 1900 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
| 1828 | int largepage, gfn_t gfn, pfn_t pfn) | 1901 | int level, gfn_t gfn, pfn_t pfn) |
| 1829 | { | 1902 | { |
| 1830 | struct kvm_shadow_walk_iterator iterator; | 1903 | struct kvm_shadow_walk_iterator iterator; |
| 1831 | struct kvm_mmu_page *sp; | 1904 | struct kvm_mmu_page *sp; |
| @@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
| 1833 | gfn_t pseudo_gfn; | 1906 | gfn_t pseudo_gfn; |
| 1834 | 1907 | ||
| 1835 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 1908 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
| 1836 | if (iterator.level == PT_PAGE_TABLE_LEVEL | 1909 | if (iterator.level == level) { |
| 1837 | || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { | ||
| 1838 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 1910 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, |
| 1839 | 0, write, 1, &pt_write, | 1911 | 0, write, 1, &pt_write, |
| 1840 | largepage, gfn, pfn, false); | 1912 | level, gfn, pfn, false); |
| 1841 | ++vcpu->stat.pf_fixed; | 1913 | ++vcpu->stat.pf_fixed; |
| 1842 | break; | 1914 | break; |
| 1843 | } | 1915 | } |
| @@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
| 1853 | return -ENOMEM; | 1925 | return -ENOMEM; |
| 1854 | } | 1926 | } |
| 1855 | 1927 | ||
| 1856 | set_shadow_pte(iterator.sptep, | 1928 | __set_spte(iterator.sptep, |
| 1857 | __pa(sp->spt) | 1929 | __pa(sp->spt) |
| 1858 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 1930 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
| 1859 | | shadow_user_mask | shadow_x_mask); | 1931 | | shadow_user_mask | shadow_x_mask); |
| 1860 | } | 1932 | } |
| 1861 | } | 1933 | } |
| 1862 | return pt_write; | 1934 | return pt_write; |
| @@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
| 1865 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1937 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
| 1866 | { | 1938 | { |
| 1867 | int r; | 1939 | int r; |
| 1868 | int largepage = 0; | 1940 | int level; |
| 1869 | pfn_t pfn; | 1941 | pfn_t pfn; |
| 1870 | unsigned long mmu_seq; | 1942 | unsigned long mmu_seq; |
| 1871 | 1943 | ||
| 1872 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1944 | level = mapping_level(vcpu, gfn); |
| 1873 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1945 | |
| 1874 | largepage = 1; | 1946 | /* |
| 1875 | } | 1947 | * This path builds a PAE pagetable - so we can map 2mb pages at |
| 1948 | * maximum. Therefore check if the level is larger than that. | ||
| 1949 | */ | ||
| 1950 | if (level > PT_DIRECTORY_LEVEL) | ||
| 1951 | level = PT_DIRECTORY_LEVEL; | ||
| 1952 | |||
| 1953 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | ||
| 1876 | 1954 | ||
| 1877 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1955 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 1878 | smp_rmb(); | 1956 | smp_rmb(); |
| @@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
| 1888 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 1966 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
| 1889 | goto out_unlock; | 1967 | goto out_unlock; |
| 1890 | kvm_mmu_free_some_pages(vcpu); | 1968 | kvm_mmu_free_some_pages(vcpu); |
| 1891 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn); | 1969 | r = __direct_map(vcpu, v, write, level, gfn, pfn); |
| 1892 | spin_unlock(&vcpu->kvm->mmu_lock); | 1970 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 1893 | 1971 | ||
| 1894 | 1972 | ||
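The comment added in nonpaging_map() captures the constraint behind the clamp: a PAE (3-level) shadow table has a 2MB PDE as its largest leaf, so whatever mapping_level() reports must be reduced before the gfn is aligned to that mapping size. A short illustrative sketch of the clamp and alignment, assuming the usual x86 level numbering and 9 index bits per level; this is not the kernel code.

```c
#define PT_DIRECTORY_LEVEL 2	/* 2MB leaf, the largest a PAE table offers */

/*
 * Clamp the host-derived mapping level for a PAE shadow walk and align the
 * gfn to that mapping size (512^(level-1) small pages per large mapping).
 */
static unsigned long clamp_and_align(int *level, unsigned long gfn)
{
	if (*level > PT_DIRECTORY_LEVEL)
		*level = PT_DIRECTORY_LEVEL;
	return gfn & ~((1UL << (9 * (*level - 1))) - 1);
}
```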
| @@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
| 1954 | gfn_t root_gfn; | 2032 | gfn_t root_gfn; |
| 1955 | struct kvm_mmu_page *sp; | 2033 | struct kvm_mmu_page *sp; |
| 1956 | int direct = 0; | 2034 | int direct = 0; |
| 2035 | u64 pdptr; | ||
| 1957 | 2036 | ||
| 1958 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | 2037 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; |
| 1959 | 2038 | ||
| @@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
| 1981 | 2060 | ||
| 1982 | ASSERT(!VALID_PAGE(root)); | 2061 | ASSERT(!VALID_PAGE(root)); |
| 1983 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 2062 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
| 1984 | if (!is_present_pte(vcpu->arch.pdptrs[i])) { | 2063 | pdptr = kvm_pdptr_read(vcpu, i); |
| 2064 | if (!is_present_gpte(pdptr)) { | ||
| 1985 | vcpu->arch.mmu.pae_root[i] = 0; | 2065 | vcpu->arch.mmu.pae_root[i] = 0; |
| 1986 | continue; | 2066 | continue; |
| 1987 | } | 2067 | } |
| 1988 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | 2068 | root_gfn = pdptr >> PAGE_SHIFT; |
| 1989 | } else if (vcpu->arch.mmu.root_level == 0) | 2069 | } else if (vcpu->arch.mmu.root_level == 0) |
| 1990 | root_gfn = 0; | 2070 | root_gfn = 0; |
| 1991 | if (mmu_check_root(vcpu, root_gfn)) | 2071 | if (mmu_check_root(vcpu, root_gfn)) |
| @@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 2062 | { | 2142 | { |
| 2063 | pfn_t pfn; | 2143 | pfn_t pfn; |
| 2064 | int r; | 2144 | int r; |
| 2065 | int largepage = 0; | 2145 | int level; |
| 2066 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2146 | gfn_t gfn = gpa >> PAGE_SHIFT; |
| 2067 | unsigned long mmu_seq; | 2147 | unsigned long mmu_seq; |
| 2068 | 2148 | ||
| @@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 2073 | if (r) | 2153 | if (r) |
| 2074 | return r; | 2154 | return r; |
| 2075 | 2155 | ||
| 2076 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 2156 | level = mapping_level(vcpu, gfn); |
| 2077 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 2157 | |
| 2078 | largepage = 1; | 2158 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); |
| 2079 | } | 2159 | |
| 2080 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2160 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 2081 | smp_rmb(); | 2161 | smp_rmb(); |
| 2082 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2162 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| @@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
| 2089 | goto out_unlock; | 2169 | goto out_unlock; |
| 2090 | kvm_mmu_free_some_pages(vcpu); | 2170 | kvm_mmu_free_some_pages(vcpu); |
| 2091 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2171 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
| 2092 | largepage, gfn, pfn); | 2172 | level, gfn, pfn); |
| 2093 | spin_unlock(&vcpu->kvm->mmu_lock); | 2173 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 2094 | 2174 | ||
| 2095 | return r; | 2175 | return r; |
| @@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
| 2206 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | 2286 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | |
| 2207 | rsvd_bits(maxphyaddr, 51); | 2287 | rsvd_bits(maxphyaddr, 51); |
| 2208 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; | 2288 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; |
| 2209 | context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; | 2289 | context->rsvd_bits_mask[1][2] = exb_bit_rsvd | |
| 2290 | rsvd_bits(maxphyaddr, 51) | | ||
| 2291 | rsvd_bits(13, 29); | ||
| 2210 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | 2292 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | |
| 2211 | rsvd_bits(maxphyaddr, 51) | | 2293 | rsvd_bits(maxphyaddr, 51) | |
| 2212 | rsvd_bits(13, 20); /* large page */ | 2294 | rsvd_bits(13, 20); /* large page */ |
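The widened mask for rsvd_bits_mask[1][2] reflects the 1GB-page entry format: the frame address in a 1GB-page PDPTE starts at bit 30, bit 12 carries PAT, and bits 13-29 are reserved, hence rsvd_bits(13, 29) on top of the physical-address-width bits. A standalone sketch of the helper, assuming the usual definition of rsvd_bits() as an inclusive bit-range mask; the bit assignments above are stated as this editor reads the architecture, not taken from this diff.

```c
#include <stdio.h>

/* inclusive bit-range mask, bits s..e set -- the assumed rsvd_bits() shape */
static unsigned long long rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* bits 13..29: reserved in a 1GB-page PDPTE (frame starts at bit 30) */
	printf("rsvd_bits(13, 29) = %#llx\n", rsvd_bits(13, 29));
	/* bits 13..20: reserved in a 2MB-page PDE (frame starts at bit 21) */
	printf("rsvd_bits(13, 20) = %#llx\n", rsvd_bits(13, 20));
	return 0;
}
```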
| @@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
| 2357 | spin_unlock(&vcpu->kvm->mmu_lock); | 2439 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 2358 | if (r) | 2440 | if (r) |
| 2359 | goto out; | 2441 | goto out; |
| 2442 | /* set_cr3() should ensure TLB has been flushed */ | ||
| 2360 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2443 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
| 2361 | kvm_mmu_flush_tlb(vcpu); | ||
| 2362 | out: | 2444 | out: |
| 2363 | return r; | 2445 | return r; |
| 2364 | } | 2446 | } |
| @@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
| 2378 | 2460 | ||
| 2379 | pte = *spte; | 2461 | pte = *spte; |
| 2380 | if (is_shadow_present_pte(pte)) { | 2462 | if (is_shadow_present_pte(pte)) { |
| 2381 | if (sp->role.level == PT_PAGE_TABLE_LEVEL || | 2463 | if (is_last_spte(pte, sp->role.level)) |
| 2382 | is_large_pte(pte)) | ||
| 2383 | rmap_remove(vcpu->kvm, spte); | 2464 | rmap_remove(vcpu->kvm, spte); |
| 2384 | else { | 2465 | else { |
| 2385 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 2466 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
| 2386 | mmu_page_remove_parent_pte(child, spte); | 2467 | mmu_page_remove_parent_pte(child, spte); |
| 2387 | } | 2468 | } |
| 2388 | } | 2469 | } |
| 2389 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 2470 | __set_spte(spte, shadow_trap_nonpresent_pte); |
| 2390 | if (is_large_pte(pte)) | 2471 | if (is_large_pte(pte)) |
| 2391 | --vcpu->kvm->stat.lpages; | 2472 | --vcpu->kvm->stat.lpages; |
| 2392 | } | 2473 | } |
| @@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
| 2397 | const void *new) | 2478 | const void *new) |
| 2398 | { | 2479 | { |
| 2399 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | 2480 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { |
| 2400 | if (!vcpu->arch.update_pte.largepage || | 2481 | ++vcpu->kvm->stat.mmu_pde_zapped; |
| 2401 | sp->role.glevels == PT32_ROOT_LEVEL) { | 2482 | return; |
| 2402 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
| 2403 | return; | ||
| 2404 | } | ||
| 2405 | } | 2483 | } |
| 2406 | 2484 | ||
| 2407 | ++vcpu->kvm->stat.mmu_pte_updated; | 2485 | ++vcpu->kvm->stat.mmu_pte_updated; |
| @@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
| 2447 | u64 gpte = 0; | 2525 | u64 gpte = 0; |
| 2448 | pfn_t pfn; | 2526 | pfn_t pfn; |
| 2449 | 2527 | ||
| 2450 | vcpu->arch.update_pte.largepage = 0; | ||
| 2451 | |||
| 2452 | if (bytes != 4 && bytes != 8) | 2528 | if (bytes != 4 && bytes != 8) |
| 2453 | return; | 2529 | return; |
| 2454 | 2530 | ||
| @@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
| 2472 | if ((bytes == 4) && (gpa % 4 == 0)) | 2548 | if ((bytes == 4) && (gpa % 4 == 0)) |
| 2473 | memcpy((void *)&gpte, new, 4); | 2549 | memcpy((void *)&gpte, new, 4); |
| 2474 | } | 2550 | } |
| 2475 | if (!is_present_pte(gpte)) | 2551 | if (!is_present_gpte(gpte)) |
| 2476 | return; | 2552 | return; |
| 2477 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2553 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
| 2478 | 2554 | ||
| 2479 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { | ||
| 2480 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | ||
| 2481 | vcpu->arch.update_pte.largepage = 1; | ||
| 2482 | } | ||
| 2483 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2555 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 2484 | smp_rmb(); | 2556 | smp_rmb(); |
| 2485 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2557 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| @@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
| 2622 | gpa_t gpa; | 2694 | gpa_t gpa; |
| 2623 | int r; | 2695 | int r; |
| 2624 | 2696 | ||
| 2697 | if (tdp_enabled) | ||
| 2698 | return 0; | ||
| 2699 | |||
| 2625 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | 2700 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); |
| 2626 | 2701 | ||
| 2627 | spin_lock(&vcpu->kvm->mmu_lock); | 2702 | spin_lock(&vcpu->kvm->mmu_lock); |
| @@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
| 2633 | 2708 | ||
| 2634 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 2709 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
| 2635 | { | 2710 | { |
| 2636 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | 2711 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && |
| 2712 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
| 2637 | struct kvm_mmu_page *sp; | 2713 | struct kvm_mmu_page *sp; |
| 2638 | 2714 | ||
| 2639 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 2715 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
| @@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
| 2670 | ++vcpu->stat.mmio_exits; | 2746 | ++vcpu->stat.mmio_exits; |
| 2671 | return 0; | 2747 | return 0; |
| 2672 | case EMULATE_FAIL: | 2748 | case EMULATE_FAIL: |
| 2673 | kvm_report_emulation_failure(vcpu, "pagetable"); | 2749 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
| 2674 | return 1; | 2750 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
| 2751 | return 0; | ||
| 2675 | default: | 2752 | default: |
| 2676 | BUG(); | 2753 | BUG(); |
| 2677 | } | 2754 | } |
| @@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |||
| 2712 | 2789 | ||
| 2713 | ASSERT(vcpu); | 2790 | ASSERT(vcpu); |
| 2714 | 2791 | ||
| 2715 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
| 2716 | vcpu->kvm->arch.n_free_mmu_pages = | ||
| 2717 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
| 2718 | else | ||
| 2719 | vcpu->kvm->arch.n_free_mmu_pages = | ||
| 2720 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
| 2721 | /* | 2792 | /* |
| 2722 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | 2793 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. |
| 2723 | * Therefore we need to allocate shadow page tables in the first | 2794 | * Therefore we need to allocate shadow page tables in the first |
| @@ -3029,6 +3100,24 @@ out: | |||
| 3029 | return r; | 3100 | return r; |
| 3030 | } | 3101 | } |
| 3031 | 3102 | ||
| 3103 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | ||
| 3104 | { | ||
| 3105 | struct kvm_shadow_walk_iterator iterator; | ||
| 3106 | int nr_sptes = 0; | ||
| 3107 | |||
| 3108 | spin_lock(&vcpu->kvm->mmu_lock); | ||
| 3109 | for_each_shadow_entry(vcpu, addr, iterator) { | ||
| 3110 | sptes[iterator.level-1] = *iterator.sptep; | ||
| 3111 | nr_sptes++; | ||
| 3112 | if (!is_shadow_present_pte(*iterator.sptep)) | ||
| 3113 | break; | ||
| 3114 | } | ||
| 3115 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
| 3116 | |||
| 3117 | return nr_sptes; | ||
| 3118 | } | ||
| 3119 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | ||
| 3120 | |||
| 3032 | #ifdef AUDIT | 3121 | #ifdef AUDIT |
| 3033 | 3122 | ||
| 3034 | static const char *audit_msg; | 3123 | static const char *audit_msg; |
| @@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva) | |||
| 3041 | return gva; | 3130 | return gva; |
| 3042 | } | 3131 | } |
| 3043 | 3132 | ||
| 3133 | |||
| 3134 | typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, | ||
| 3135 | u64 *sptep); | ||
| 3136 | |||
| 3137 | static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
| 3138 | inspect_spte_fn fn) | ||
| 3139 | { | ||
| 3140 | int i; | ||
| 3141 | |||
| 3142 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 3143 | u64 ent = sp->spt[i]; | ||
| 3144 | |||
| 3145 | if (is_shadow_present_pte(ent)) { | ||
| 3146 | if (!is_last_spte(ent, sp->role.level)) { | ||
| 3147 | struct kvm_mmu_page *child; | ||
| 3148 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
| 3149 | __mmu_spte_walk(kvm, child, fn); | ||
| 3150 | } else | ||
| 3151 | fn(kvm, sp, &sp->spt[i]); | ||
| 3152 | } | ||
| 3153 | } | ||
| 3154 | } | ||
| 3155 | |||
| 3156 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
| 3157 | { | ||
| 3158 | int i; | ||
| 3159 | struct kvm_mmu_page *sp; | ||
| 3160 | |||
| 3161 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
| 3162 | return; | ||
| 3163 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 3164 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
| 3165 | sp = page_header(root); | ||
| 3166 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
| 3167 | return; | ||
| 3168 | } | ||
| 3169 | for (i = 0; i < 4; ++i) { | ||
| 3170 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
| 3171 | |||
| 3172 | if (root && VALID_PAGE(root)) { | ||
| 3173 | root &= PT64_BASE_ADDR_MASK; | ||
| 3174 | sp = page_header(root); | ||
| 3175 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
| 3176 | } | ||
| 3177 | } | ||
| 3178 | return; | ||
| 3179 | } | ||
| 3180 | |||
| 3044 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | 3181 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, |
| 3045 | gva_t va, int level) | 3182 | gva_t va, int level) |
| 3046 | { | 3183 | { |
| @@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
| 3055 | continue; | 3192 | continue; |
| 3056 | 3193 | ||
| 3057 | va = canonicalize(va); | 3194 | va = canonicalize(va); |
| 3058 | if (level > 1) { | 3195 | if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) |
| 3059 | if (ent == shadow_notrap_nonpresent_pte) | 3196 | audit_mappings_page(vcpu, ent, va, level - 1); |
| 3060 | printk(KERN_ERR "audit: (%s) nontrapping pte" | 3197 | else { |
| 3061 | " in nonleaf level: levels %d gva %lx" | ||
| 3062 | " level %d pte %llx\n", audit_msg, | ||
| 3063 | vcpu->arch.mmu.root_level, va, level, ent); | ||
| 3064 | else | ||
| 3065 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
| 3066 | } else { | ||
| 3067 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | 3198 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); |
| 3068 | gfn_t gfn = gpa >> PAGE_SHIFT; | 3199 | gfn_t gfn = gpa >> PAGE_SHIFT; |
| 3069 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | 3200 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); |
| 3070 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | 3201 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; |
| 3071 | 3202 | ||
| 3203 | if (is_error_pfn(pfn)) { | ||
| 3204 | kvm_release_pfn_clean(pfn); | ||
| 3205 | continue; | ||
| 3206 | } | ||
| 3207 | |||
| 3072 | if (is_shadow_present_pte(ent) | 3208 | if (is_shadow_present_pte(ent) |
| 3073 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | 3209 | && (ent & PT64_BASE_ADDR_MASK) != hpa) |
| 3074 | printk(KERN_ERR "xx audit error: (%s) levels %d" | 3210 | printk(KERN_ERR "xx audit error: (%s) levels %d" |
| @@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) | |||
| 3122 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 3258 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
| 3123 | while (d) { | 3259 | while (d) { |
| 3124 | for (k = 0; k < RMAP_EXT; ++k) | 3260 | for (k = 0; k < RMAP_EXT; ++k) |
| 3125 | if (d->shadow_ptes[k]) | 3261 | if (d->sptes[k]) |
| 3126 | ++nmaps; | 3262 | ++nmaps; |
| 3127 | else | 3263 | else |
| 3128 | break; | 3264 | break; |
| @@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu) | |||
| 3133 | return nmaps; | 3269 | return nmaps; |
| 3134 | } | 3270 | } |
| 3135 | 3271 | ||
| 3136 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | 3272 | void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) |
| 3273 | { | ||
| 3274 | unsigned long *rmapp; | ||
| 3275 | struct kvm_mmu_page *rev_sp; | ||
| 3276 | gfn_t gfn; | ||
| 3277 | |||
| 3278 | if (*sptep & PT_WRITABLE_MASK) { | ||
| 3279 | rev_sp = page_header(__pa(sptep)); | ||
| 3280 | gfn = rev_sp->gfns[sptep - rev_sp->spt]; | ||
| 3281 | |||
| 3282 | if (!gfn_to_memslot(kvm, gfn)) { | ||
| 3283 | if (!printk_ratelimit()) | ||
| 3284 | return; | ||
| 3285 | printk(KERN_ERR "%s: no memslot for gfn %ld\n", | ||
| 3286 | audit_msg, gfn); | ||
| 3287 | printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", | ||
| 3288 | audit_msg, sptep - rev_sp->spt, | ||
| 3289 | rev_sp->gfn); | ||
| 3290 | dump_stack(); | ||
| 3291 | return; | ||
| 3292 | } | ||
| 3293 | |||
| 3294 | rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], | ||
| 3295 | is_large_pte(*sptep)); | ||
| 3296 | if (!*rmapp) { | ||
| 3297 | if (!printk_ratelimit()) | ||
| 3298 | return; | ||
| 3299 | printk(KERN_ERR "%s: no rmap for writable spte %llx\n", | ||
| 3300 | audit_msg, *sptep); | ||
| 3301 | dump_stack(); | ||
| 3302 | } | ||
| 3303 | } | ||
| 3304 | |||
| 3305 | } | ||
| 3306 | |||
| 3307 | void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) | ||
| 3308 | { | ||
| 3309 | mmu_spte_walk(vcpu, inspect_spte_has_rmap); | ||
| 3310 | } | ||
| 3311 | |||
| 3312 | static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | ||
| 3137 | { | 3313 | { |
| 3138 | int nmaps = 0; | ||
| 3139 | struct kvm_mmu_page *sp; | 3314 | struct kvm_mmu_page *sp; |
| 3140 | int i; | 3315 | int i; |
| 3141 | 3316 | ||
| @@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) | |||
| 3152 | continue; | 3327 | continue; |
| 3153 | if (!(ent & PT_WRITABLE_MASK)) | 3328 | if (!(ent & PT_WRITABLE_MASK)) |
| 3154 | continue; | 3329 | continue; |
| 3155 | ++nmaps; | 3330 | inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); |
| 3156 | } | 3331 | } |
| 3157 | } | 3332 | } |
| 3158 | return nmaps; | 3333 | return; |
| 3159 | } | 3334 | } |
| 3160 | 3335 | ||
| 3161 | static void audit_rmap(struct kvm_vcpu *vcpu) | 3336 | static void audit_rmap(struct kvm_vcpu *vcpu) |
| 3162 | { | 3337 | { |
| 3163 | int n_rmap = count_rmaps(vcpu); | 3338 | check_writable_mappings_rmap(vcpu); |
| 3164 | int n_actual = count_writable_mappings(vcpu); | 3339 | count_rmaps(vcpu); |
| 3165 | |||
| 3166 | if (n_rmap != n_actual) | ||
| 3167 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
| 3168 | __func__, audit_msg, n_rmap, n_actual); | ||
| 3169 | } | 3340 | } |
| 3170 | 3341 | ||
| 3171 | static void audit_write_protection(struct kvm_vcpu *vcpu) | 3342 | static void audit_write_protection(struct kvm_vcpu *vcpu) |
| @@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) | |||
| 3173 | struct kvm_mmu_page *sp; | 3344 | struct kvm_mmu_page *sp; |
| 3174 | struct kvm_memory_slot *slot; | 3345 | struct kvm_memory_slot *slot; |
| 3175 | unsigned long *rmapp; | 3346 | unsigned long *rmapp; |
| 3347 | u64 *spte; | ||
| 3176 | gfn_t gfn; | 3348 | gfn_t gfn; |
| 3177 | 3349 | ||
| 3178 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | 3350 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { |
| 3179 | if (sp->role.direct) | 3351 | if (sp->role.direct) |
| 3180 | continue; | 3352 | continue; |
| 3353 | if (sp->unsync) | ||
| 3354 | continue; | ||
| 3181 | 3355 | ||
| 3182 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | 3356 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); |
| 3183 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); | 3357 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); |
| 3184 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | 3358 | rmapp = &slot->rmap[gfn - slot->base_gfn]; |
| 3185 | if (*rmapp) | 3359 | |
| 3186 | printk(KERN_ERR "%s: (%s) shadow page has writable" | 3360 | spte = rmap_next(vcpu->kvm, rmapp, NULL); |
| 3187 | " mappings: gfn %lx role %x\n", | 3361 | while (spte) { |
| 3362 | if (*spte & PT_WRITABLE_MASK) | ||
| 3363 | printk(KERN_ERR "%s: (%s) shadow page has " | ||
| 3364 | "writable mappings: gfn %lx role %x\n", | ||
| 3188 | __func__, audit_msg, sp->gfn, | 3365 | __func__, audit_msg, sp->gfn, |
| 3189 | sp->role.word); | 3366 | sp->role.word); |
| 3367 | spte = rmap_next(vcpu->kvm, rmapp, spte); | ||
| 3368 | } | ||
| 3190 | } | 3369 | } |
| 3191 | } | 3370 | } |
| 3192 | 3371 | ||
| @@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | |||
| 3198 | audit_msg = msg; | 3377 | audit_msg = msg; |
| 3199 | audit_rmap(vcpu); | 3378 | audit_rmap(vcpu); |
| 3200 | audit_write_protection(vcpu); | 3379 | audit_write_protection(vcpu); |
| 3201 | audit_mappings(vcpu); | 3380 | if (strcmp("pre pte write", audit_msg) != 0) |
| 3381 | audit_mappings(vcpu); | ||
| 3382 | audit_writable_sptes_have_rmaps(vcpu); | ||
| 3202 | dbg = olddbg; | 3383 | dbg = olddbg; |
| 3203 | } | 3384 | } |
| 3204 | 3385 | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3494a2fb136..61a1b3884b4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
| @@ -37,6 +37,8 @@ | |||
| 37 | #define PT32_ROOT_LEVEL 2 | 37 | #define PT32_ROOT_LEVEL 2 |
| 38 | #define PT32E_ROOT_LEVEL 3 | 38 | #define PT32E_ROOT_LEVEL 3 |
| 39 | 39 | ||
| 40 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | ||
| 41 | |||
| 40 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 42 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
| 41 | { | 43 | { |
| 42 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | 44 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) |
| @@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
| 75 | return vcpu->arch.cr0 & X86_CR0_PG; | 77 | return vcpu->arch.cr0 & X86_CR0_PG; |
| 76 | } | 78 | } |
| 77 | 79 | ||
| 78 | static inline int is_present_pte(unsigned long pte) | 80 | static inline int is_present_gpte(unsigned long pte) |
| 79 | { | 81 | { |
| 80 | return pte & PT_PRESENT_MASK; | 82 | return pte & PT_PRESENT_MASK; |
| 81 | } | 83 | } |
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h new file mode 100644 index 00000000000..3e4a5c6ca2a --- /dev/null +++ b/arch/x86/kvm/mmutrace.h | |||
| @@ -0,0 +1,220 @@ | |||
| 1 | #if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 2 | #define _TRACE_KVMMMU_H | ||
| 3 | |||
| 4 | #include <linux/tracepoint.h> | ||
| 5 | #include <linux/ftrace_event.h> | ||
| 6 | |||
| 7 | #undef TRACE_SYSTEM | ||
| 8 | #define TRACE_SYSTEM kvmmmu | ||
| 9 | #define TRACE_INCLUDE_PATH . | ||
| 10 | #define TRACE_INCLUDE_FILE mmutrace | ||
| 11 | |||
| 12 | #define KVM_MMU_PAGE_FIELDS \ | ||
| 13 | __field(__u64, gfn) \ | ||
| 14 | __field(__u32, role) \ | ||
| 15 | __field(__u32, root_count) \ | ||
| 16 | __field(__u32, unsync) | ||
| 17 | |||
| 18 | #define KVM_MMU_PAGE_ASSIGN(sp) \ | ||
| 19 | __entry->gfn = sp->gfn; \ | ||
| 20 | __entry->role = sp->role.word; \ | ||
| 21 | __entry->root_count = sp->root_count; \ | ||
| 22 | __entry->unsync = sp->unsync; | ||
| 23 | |||
| 24 | #define KVM_MMU_PAGE_PRINTK() ({ \ | ||
| 25 | const char *ret = p->buffer + p->len; \ | ||
| 26 | static const char *access_str[] = { \ | ||
| 27 | "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ | ||
| 28 | }; \ | ||
| 29 | union kvm_mmu_page_role role; \ | ||
| 30 | \ | ||
| 31 | role.word = __entry->role; \ | ||
| 32 | \ | ||
| 33 | trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ | ||
| 34 | " %snxe root %u %s%c", \ | ||
| 35 | __entry->gfn, role.level, role.glevels, \ | ||
| 36 | role.quadrant, \ | ||
| 37 | role.direct ? " direct" : "", \ | ||
| 38 | access_str[role.access], \ | ||
| 39 | role.invalid ? " invalid" : "", \ | ||
| 40 | role.cr4_pge ? "" : "!", \ | ||
| 41 | role.nxe ? "" : "!", \ | ||
| 42 | __entry->root_count, \ | ||
| 43 | __entry->unsync ? "unsync" : "sync", 0); \ | ||
| 44 | ret; \ | ||
| 45 | }) | ||
| 46 | |||
| 47 | #define kvm_mmu_trace_pferr_flags \ | ||
| 48 | { PFERR_PRESENT_MASK, "P" }, \ | ||
| 49 | { PFERR_WRITE_MASK, "W" }, \ | ||
| 50 | { PFERR_USER_MASK, "U" }, \ | ||
| 51 | { PFERR_RSVD_MASK, "RSVD" }, \ | ||
| 52 | { PFERR_FETCH_MASK, "F" } | ||
| 53 | |||
| 54 | /* | ||
| 55 | * A pagetable walk has started | ||
| 56 | */ | ||
| 57 | TRACE_EVENT( | ||
| 58 | kvm_mmu_pagetable_walk, | ||
| 59 | TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), | ||
| 60 | TP_ARGS(addr, write_fault, user_fault, fetch_fault), | ||
| 61 | |||
| 62 | TP_STRUCT__entry( | ||
| 63 | __field(__u64, addr) | ||
| 64 | __field(__u32, pferr) | ||
| 65 | ), | ||
| 66 | |||
| 67 | TP_fast_assign( | ||
| 68 | __entry->addr = addr; | ||
| 69 | __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) | ||
| 70 | | (!!fetch_fault << 4); | ||
| 71 | ), | ||
| 72 | |||
| 73 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, | ||
| 74 | __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) | ||
| 75 | ); | ||
| 76 | |||
| 77 | |||
| 78 | /* We just walked a paging element */ | ||
| 79 | TRACE_EVENT( | ||
| 80 | kvm_mmu_paging_element, | ||
| 81 | TP_PROTO(u64 pte, int level), | ||
| 82 | TP_ARGS(pte, level), | ||
| 83 | |||
| 84 | TP_STRUCT__entry( | ||
| 85 | __field(__u64, pte) | ||
| 86 | __field(__u32, level) | ||
| 87 | ), | ||
| 88 | |||
| 89 | TP_fast_assign( | ||
| 90 | __entry->pte = pte; | ||
| 91 | __entry->level = level; | ||
| 92 | ), | ||
| 93 | |||
| 94 | TP_printk("pte %llx level %u", __entry->pte, __entry->level) | ||
| 95 | ); | ||
| 96 | |||
| 97 | /* We set a pte accessed bit */ | ||
| 98 | TRACE_EVENT( | ||
| 99 | kvm_mmu_set_accessed_bit, | ||
| 100 | TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), | ||
| 101 | TP_ARGS(table_gfn, index, size), | ||
| 102 | |||
| 103 | TP_STRUCT__entry( | ||
| 104 | __field(__u64, gpa) | ||
| 105 | ), | ||
| 106 | |||
| 107 | TP_fast_assign( | ||
| 108 | __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) | ||
| 109 | + index * size; | ||
| 110 | ), | ||
| 111 | |||
| 112 | TP_printk("gpa %llx", __entry->gpa) | ||
| 113 | ); | ||
| 114 | |||
| 115 | /* We set a pte dirty bit */ | ||
| 116 | TRACE_EVENT( | ||
| 117 | kvm_mmu_set_dirty_bit, | ||
| 118 | TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), | ||
| 119 | TP_ARGS(table_gfn, index, size), | ||
| 120 | |||
| 121 | TP_STRUCT__entry( | ||
| 122 | __field(__u64, gpa) | ||
| 123 | ), | ||
| 124 | |||
| 125 | TP_fast_assign( | ||
| 126 | __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) | ||
| 127 | + index * size; | ||
| 128 | ), | ||
| 129 | |||
| 130 | TP_printk("gpa %llx", __entry->gpa) | ||
| 131 | ); | ||
| 132 | |||
| 133 | TRACE_EVENT( | ||
| 134 | kvm_mmu_walker_error, | ||
| 135 | TP_PROTO(u32 pferr), | ||
| 136 | TP_ARGS(pferr), | ||
| 137 | |||
| 138 | TP_STRUCT__entry( | ||
| 139 | __field(__u32, pferr) | ||
| 140 | ), | ||
| 141 | |||
| 142 | TP_fast_assign( | ||
| 143 | __entry->pferr = pferr; | ||
| 144 | ), | ||
| 145 | |||
| 146 | TP_printk("pferr %x %s", __entry->pferr, | ||
| 147 | __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) | ||
| 148 | ); | ||
| 149 | |||
| 150 | TRACE_EVENT( | ||
| 151 | kvm_mmu_get_page, | ||
| 152 | TP_PROTO(struct kvm_mmu_page *sp, bool created), | ||
| 153 | TP_ARGS(sp, created), | ||
| 154 | |||
| 155 | TP_STRUCT__entry( | ||
| 156 | KVM_MMU_PAGE_FIELDS | ||
| 157 | __field(bool, created) | ||
| 158 | ), | ||
| 159 | |||
| 160 | TP_fast_assign( | ||
| 161 | KVM_MMU_PAGE_ASSIGN(sp) | ||
| 162 | __entry->created = created; | ||
| 163 | ), | ||
| 164 | |||
| 165 | TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), | ||
| 166 | __entry->created ? "new" : "existing") | ||
| 167 | ); | ||
| 168 | |||
| 169 | TRACE_EVENT( | ||
| 170 | kvm_mmu_sync_page, | ||
| 171 | TP_PROTO(struct kvm_mmu_page *sp), | ||
| 172 | TP_ARGS(sp), | ||
| 173 | |||
| 174 | TP_STRUCT__entry( | ||
| 175 | KVM_MMU_PAGE_FIELDS | ||
| 176 | ), | ||
| 177 | |||
| 178 | TP_fast_assign( | ||
| 179 | KVM_MMU_PAGE_ASSIGN(sp) | ||
| 180 | ), | ||
| 181 | |||
| 182 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
| 183 | ); | ||
| 184 | |||
| 185 | TRACE_EVENT( | ||
| 186 | kvm_mmu_unsync_page, | ||
| 187 | TP_PROTO(struct kvm_mmu_page *sp), | ||
| 188 | TP_ARGS(sp), | ||
| 189 | |||
| 190 | TP_STRUCT__entry( | ||
| 191 | KVM_MMU_PAGE_FIELDS | ||
| 192 | ), | ||
| 193 | |||
| 194 | TP_fast_assign( | ||
| 195 | KVM_MMU_PAGE_ASSIGN(sp) | ||
| 196 | ), | ||
| 197 | |||
| 198 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
| 199 | ); | ||
| 200 | |||
| 201 | TRACE_EVENT( | ||
| 202 | kvm_mmu_zap_page, | ||
| 203 | TP_PROTO(struct kvm_mmu_page *sp), | ||
| 204 | TP_ARGS(sp), | ||
| 205 | |||
| 206 | TP_STRUCT__entry( | ||
| 207 | KVM_MMU_PAGE_FIELDS | ||
| 208 | ), | ||
| 209 | |||
| 210 | TP_fast_assign( | ||
| 211 | KVM_MMU_PAGE_ASSIGN(sp) | ||
| 212 | ), | ||
| 213 | |||
| 214 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
| 215 | ); | ||
| 216 | |||
| 217 | #endif /* _TRACE_KVMMMU_H */ | ||
| 218 | |||
| 219 | /* This part must be outside protection */ | ||
| 220 | #include <trace/define_trace.h> | ||
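The set_accessed_bit and set_dirty_bit events above fold their three arguments into a single guest-physical address at assign time. A minimal standalone sketch of that arithmetic (pte_gpa and PAGE_SHIFT here are illustrative helpers, not part of the patch):

#include <stdint.h>

#define PAGE_SHIFT 12   /* 4K guest page-table pages */

/*
 * Guest-physical address of the guest PTE being marked accessed or dirty:
 * the frame holding the guest page table, converted to a byte address,
 * plus the byte offset of entry 'index' of size 'size'.
 */
static inline uint64_t pte_gpa(uint64_t table_gfn, unsigned int index,
                               unsigned int size)
{
        return (table_gfn << PAGE_SHIFT) + (uint64_t)index * size;
}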
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f63539..d2fec9c12d2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
| @@ -27,7 +27,8 @@ | |||
| 27 | #define guest_walker guest_walker64 | 27 | #define guest_walker guest_walker64 |
| 28 | #define FNAME(name) paging##64_##name | 28 | #define FNAME(name) paging##64_##name |
| 29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | 29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK |
| 30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | 30 | #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) |
| 31 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) | ||
| 31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
| 32 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
| 33 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
| @@ -43,7 +44,8 @@ | |||
| 43 | #define guest_walker guest_walker32 | 44 | #define guest_walker guest_walker32 |
| 44 | #define FNAME(name) paging##32_##name | 45 | #define FNAME(name) paging##32_##name |
| 45 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | 46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK |
| 46 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | 47 | #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) |
| 48 | #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) | ||
| 47 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
| 48 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
| 49 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
| @@ -53,8 +55,8 @@ | |||
| 53 | #error Invalid PTTYPE value | 55 | #error Invalid PTTYPE value |
| 54 | #endif | 56 | #endif |
| 55 | 57 | ||
| 56 | #define gpte_to_gfn FNAME(gpte_to_gfn) | 58 | #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) |
| 57 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | 59 | #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) |
| 58 | 60 | ||
| 59 | /* | 61 | /* |
| 60 | * The guest_walker structure emulates the behavior of the hardware page | 62 | * The guest_walker structure emulates the behavior of the hardware page |
| @@ -71,14 +73,9 @@ struct guest_walker { | |||
| 71 | u32 error_code; | 73 | u32 error_code; |
| 72 | }; | 74 | }; |
| 73 | 75 | ||
| 74 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 76 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) |
| 75 | { | 77 | { |
| 76 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 78 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; |
| 77 | } | ||
| 78 | |||
| 79 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
| 80 | { | ||
| 81 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 82 | } | 79 | } |
| 83 | 80 | ||
| 84 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | 81 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, |
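gpte_to_gfn_lvl() above depends on PT_LVL_ADDR_MASK(lvl), whose definition lives outside this hunk. A hedged, self-contained sketch of what such a level-dependent mask typically looks like on x86-64 (lvl_addr_mask, LEVEL_BITS and the 52-bit physical-address limit are assumptions for illustration):

#include <stdint.h>

#define PAGE_SHIFT  12
#define LEVEL_BITS  9   /* 512 entries per table on x86-64 */

/*
 * Keep the physical-address bits of a gpte and clear everything below the
 * page size mapped at 'lvl': 4K at level 1, 2M at level 2, 1G at level 3.
 */
static inline uint64_t lvl_addr_mask(int lvl)
{
        uint64_t base = ((1ULL << 52) - 1) & ~((1ULL << PAGE_SHIFT) - 1);

        return base & ~((1ULL << (PAGE_SHIFT + (lvl - 1) * LEVEL_BITS)) - 1);
}

/* Level-aware gpte -> gfn, in the spirit of gpte_to_gfn_lvl() above. */
static inline uint64_t gpte_to_gfn_lvl(uint64_t gpte, int lvl)
{
        return (gpte & lvl_addr_mask(lvl)) >> PAGE_SHIFT;
}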
| @@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
| 125 | gpa_t pte_gpa; | 122 | gpa_t pte_gpa; |
| 126 | int rsvd_fault = 0; | 123 | int rsvd_fault = 0; |
| 127 | 124 | ||
| 128 | pgprintk("%s: addr %lx\n", __func__, addr); | 125 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
| 126 | fetch_fault); | ||
| 129 | walk: | 127 | walk: |
| 130 | walker->level = vcpu->arch.mmu.root_level; | 128 | walker->level = vcpu->arch.mmu.root_level; |
| 131 | pte = vcpu->arch.cr3; | 129 | pte = vcpu->arch.cr3; |
| 132 | #if PTTYPE == 64 | 130 | #if PTTYPE == 64 |
| 133 | if (!is_long_mode(vcpu)) { | 131 | if (!is_long_mode(vcpu)) { |
| 134 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | 132 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); |
| 135 | if (!is_present_pte(pte)) | 133 | trace_kvm_mmu_paging_element(pte, walker->level); |
| 134 | if (!is_present_gpte(pte)) | ||
| 136 | goto not_present; | 135 | goto not_present; |
| 137 | --walker->level; | 136 | --walker->level; |
| 138 | } | 137 | } |
| @@ -150,12 +149,11 @@ walk: | |||
| 150 | pte_gpa += index * sizeof(pt_element_t); | 149 | pte_gpa += index * sizeof(pt_element_t); |
| 151 | walker->table_gfn[walker->level - 1] = table_gfn; | 150 | walker->table_gfn[walker->level - 1] = table_gfn; |
| 152 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 151 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
| 153 | pgprintk("%s: table_gfn[%d] %lx\n", __func__, | ||
| 154 | walker->level - 1, table_gfn); | ||
| 155 | 152 | ||
| 156 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | 153 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); |
| 154 | trace_kvm_mmu_paging_element(pte, walker->level); | ||
| 157 | 155 | ||
| 158 | if (!is_present_pte(pte)) | 156 | if (!is_present_gpte(pte)) |
| 159 | goto not_present; | 157 | goto not_present; |
| 160 | 158 | ||
| 161 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); | 159 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); |
| @@ -175,6 +173,8 @@ walk: | |||
| 175 | #endif | 173 | #endif |
| 176 | 174 | ||
| 177 | if (!(pte & PT_ACCESSED_MASK)) { | 175 | if (!(pte & PT_ACCESSED_MASK)) { |
| 176 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | ||
| 177 | sizeof(pte)); | ||
| 178 | mark_page_dirty(vcpu->kvm, table_gfn); | 178 | mark_page_dirty(vcpu->kvm, table_gfn); |
| 179 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | 179 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, |
| 180 | index, pte, pte|PT_ACCESSED_MASK)) | 180 | index, pte, pte|PT_ACCESSED_MASK)) |
| @@ -186,18 +186,24 @@ walk: | |||
| 186 | 186 | ||
| 187 | walker->ptes[walker->level - 1] = pte; | 187 | walker->ptes[walker->level - 1] = pte; |
| 188 | 188 | ||
| 189 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | 189 | if ((walker->level == PT_PAGE_TABLE_LEVEL) || |
| 190 | walker->gfn = gpte_to_gfn(pte); | 190 | ((walker->level == PT_DIRECTORY_LEVEL) && |
| 191 | break; | 191 | (pte & PT_PAGE_SIZE_MASK) && |
| 192 | } | 192 | (PTTYPE == 64 || is_pse(vcpu))) || |
| 193 | 193 | ((walker->level == PT_PDPE_LEVEL) && | |
| 194 | if (walker->level == PT_DIRECTORY_LEVEL | 194 | (pte & PT_PAGE_SIZE_MASK) && |
| 195 | && (pte & PT_PAGE_SIZE_MASK) | 195 | is_long_mode(vcpu))) { |
| 196 | && (PTTYPE == 64 || is_pse(vcpu))) { | 196 | int lvl = walker->level; |
| 197 | walker->gfn = gpte_to_gfn_pde(pte); | 197 | |
| 198 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | 198 | walker->gfn = gpte_to_gfn_lvl(pte, lvl); |
| 199 | if (PTTYPE == 32 && is_cpuid_PSE36()) | 199 | walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) |
| 200 | >> PAGE_SHIFT; | ||
| 201 | |||
| 202 | if (PTTYPE == 32 && | ||
| 203 | walker->level == PT_DIRECTORY_LEVEL && | ||
| 204 | is_cpuid_PSE36()) | ||
| 200 | walker->gfn += pse36_gfn_delta(pte); | 205 | walker->gfn += pse36_gfn_delta(pte); |
| 206 | |||
| 201 | break; | 207 | break; |
| 202 | } | 208 | } |
| 203 | 209 | ||
| @@ -205,9 +211,10 @@ walk: | |||
| 205 | --walker->level; | 211 | --walker->level; |
| 206 | } | 212 | } |
| 207 | 213 | ||
| 208 | if (write_fault && !is_dirty_pte(pte)) { | 214 | if (write_fault && !is_dirty_gpte(pte)) { |
| 209 | bool ret; | 215 | bool ret; |
| 210 | 216 | ||
| 217 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | ||
| 211 | mark_page_dirty(vcpu->kvm, table_gfn); | 218 | mark_page_dirty(vcpu->kvm, table_gfn); |
| 212 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | 219 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, |
| 213 | pte|PT_DIRTY_MASK); | 220 | pte|PT_DIRTY_MASK); |
| @@ -239,6 +246,7 @@ err: | |||
| 239 | walker->error_code |= PFERR_FETCH_MASK; | 246 | walker->error_code |= PFERR_FETCH_MASK; |
| 240 | if (rsvd_fault) | 247 | if (rsvd_fault) |
| 241 | walker->error_code |= PFERR_RSVD_MASK; | 248 | walker->error_code |= PFERR_RSVD_MASK; |
| 249 | trace_kvm_mmu_walker_error(walker->error_code); | ||
| 242 | return 0; | 250 | return 0; |
| 243 | } | 251 | } |
| 244 | 252 | ||
| @@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
| 248 | pt_element_t gpte; | 256 | pt_element_t gpte; |
| 249 | unsigned pte_access; | 257 | unsigned pte_access; |
| 250 | pfn_t pfn; | 258 | pfn_t pfn; |
| 251 | int largepage = vcpu->arch.update_pte.largepage; | ||
| 252 | 259 | ||
| 253 | gpte = *(const pt_element_t *)pte; | 260 | gpte = *(const pt_element_t *)pte; |
| 254 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 261 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { |
| 255 | if (!is_present_pte(gpte)) | 262 | if (!is_present_gpte(gpte)) |
| 256 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | 263 | __set_spte(spte, shadow_notrap_nonpresent_pte); |
| 257 | return; | 264 | return; |
| 258 | } | 265 | } |
| 259 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 266 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
| @@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
| 267 | return; | 274 | return; |
| 268 | kvm_get_pfn(pfn); | 275 | kvm_get_pfn(pfn); |
| 269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 276 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
| 270 | gpte & PT_DIRTY_MASK, NULL, largepage, | 277 | gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, |
| 271 | gpte_to_gfn(gpte), pfn, true); | 278 | gpte_to_gfn(gpte), pfn, true); |
| 272 | } | 279 | } |
| 273 | 280 | ||
| @@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
| 276 | */ | 283 | */ |
| 277 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 284 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
| 278 | struct guest_walker *gw, | 285 | struct guest_walker *gw, |
| 279 | int user_fault, int write_fault, int largepage, | 286 | int user_fault, int write_fault, int hlevel, |
| 280 | int *ptwrite, pfn_t pfn) | 287 | int *ptwrite, pfn_t pfn) |
| 281 | { | 288 | { |
| 282 | unsigned access = gw->pt_access; | 289 | unsigned access = gw->pt_access; |
| @@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 289 | pt_element_t curr_pte; | 296 | pt_element_t curr_pte; |
| 290 | struct kvm_shadow_walk_iterator iterator; | 297 | struct kvm_shadow_walk_iterator iterator; |
| 291 | 298 | ||
| 292 | if (!is_present_pte(gw->ptes[gw->level - 1])) | 299 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
| 293 | return NULL; | 300 | return NULL; |
| 294 | 301 | ||
| 295 | for_each_shadow_entry(vcpu, addr, iterator) { | 302 | for_each_shadow_entry(vcpu, addr, iterator) { |
| 296 | level = iterator.level; | 303 | level = iterator.level; |
| 297 | sptep = iterator.sptep; | 304 | sptep = iterator.sptep; |
| 298 | if (level == PT_PAGE_TABLE_LEVEL | 305 | if (iterator.level == hlevel) { |
| 299 | || (largepage && level == PT_DIRECTORY_LEVEL)) { | ||
| 300 | mmu_set_spte(vcpu, sptep, access, | 306 | mmu_set_spte(vcpu, sptep, access, |
| 301 | gw->pte_access & access, | 307 | gw->pte_access & access, |
| 302 | user_fault, write_fault, | 308 | user_fault, write_fault, |
| 303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | 309 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, |
| 304 | ptwrite, largepage, | 310 | ptwrite, level, |
| 305 | gw->gfn, pfn, false); | 311 | gw->gfn, pfn, false); |
| 306 | break; | 312 | break; |
| 307 | } | 313 | } |
| @@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 311 | 317 | ||
| 312 | if (is_large_pte(*sptep)) { | 318 | if (is_large_pte(*sptep)) { |
| 313 | rmap_remove(vcpu->kvm, sptep); | 319 | rmap_remove(vcpu->kvm, sptep); |
| 314 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | 320 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
| 315 | kvm_flush_remote_tlbs(vcpu->kvm); | 321 | kvm_flush_remote_tlbs(vcpu->kvm); |
| 316 | } | 322 | } |
| 317 | 323 | ||
| 318 | if (level == PT_DIRECTORY_LEVEL | 324 | if (level <= gw->level) { |
| 319 | && gw->level == PT_DIRECTORY_LEVEL) { | 325 | int delta = level - gw->level + 1; |
| 320 | direct = 1; | 326 | direct = 1; |
| 321 | if (!is_dirty_pte(gw->ptes[level - 1])) | 327 | if (!is_dirty_gpte(gw->ptes[level - delta])) |
| 322 | access &= ~ACC_WRITE_MASK; | 328 | access &= ~ACC_WRITE_MASK; |
| 323 | table_gfn = gpte_to_gfn(gw->ptes[level - 1]); | 329 | table_gfn = gpte_to_gfn(gw->ptes[level - delta]); |
| 330 | /* advance table_gfn when emulating 1gb pages with 4k */ | ||
| 331 | if (delta == 0) | ||
| 332 | table_gfn += PT_INDEX(addr, level); | ||
| 324 | } else { | 333 | } else { |
| 325 | direct = 0; | 334 | direct = 0; |
| 326 | table_gfn = gw->table_gfn[level - 2]; | 335 | table_gfn = gw->table_gfn[level - 2]; |
| @@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 369 | int user_fault = error_code & PFERR_USER_MASK; | 378 | int user_fault = error_code & PFERR_USER_MASK; |
| 370 | int fetch_fault = error_code & PFERR_FETCH_MASK; | 379 | int fetch_fault = error_code & PFERR_FETCH_MASK; |
| 371 | struct guest_walker walker; | 380 | struct guest_walker walker; |
| 372 | u64 *shadow_pte; | 381 | u64 *sptep; |
| 373 | int write_pt = 0; | 382 | int write_pt = 0; |
| 374 | int r; | 383 | int r; |
| 375 | pfn_t pfn; | 384 | pfn_t pfn; |
| 376 | int largepage = 0; | 385 | int level = PT_PAGE_TABLE_LEVEL; |
| 377 | unsigned long mmu_seq; | 386 | unsigned long mmu_seq; |
| 378 | 387 | ||
| 379 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 388 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
| @@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 399 | return 0; | 408 | return 0; |
| 400 | } | 409 | } |
| 401 | 410 | ||
| 402 | if (walker.level == PT_DIRECTORY_LEVEL) { | 411 | if (walker.level >= PT_DIRECTORY_LEVEL) { |
| 403 | gfn_t large_gfn; | 412 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); |
| 404 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | 413 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); |
| 405 | if (is_largepage_backed(vcpu, large_gfn)) { | ||
| 406 | walker.gfn = large_gfn; | ||
| 407 | largepage = 1; | ||
| 408 | } | ||
| 409 | } | 414 | } |
| 415 | |||
| 410 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 416 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 411 | smp_rmb(); | 417 | smp_rmb(); |
| 412 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 418 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
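Before asking for the host pfn, the fault path above clamps the mapping level and rounds walker.gfn down to the start of that mapping. A simplified sketch of the alignment (pages_per_hpage stands in for KVM_PAGES_PER_HPAGE, assuming 9 bits per paging level):

#include <stdint.h>

/* Number of 4K frames covered by one mapping at 'level': 1, 512, 512*512, ... */
static inline uint64_t pages_per_hpage(int level)
{
        return 1ULL << ((level - 1) * 9);
}

/* Align a gfn down to the boundary of the huge page chosen for the mapping. */
static inline uint64_t align_gfn(uint64_t gfn, int level)
{
        return gfn & ~(pages_per_hpage(level) - 1);
}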
| @@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 422 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 428 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
| 423 | goto out_unlock; | 429 | goto out_unlock; |
| 424 | kvm_mmu_free_some_pages(vcpu); | 430 | kvm_mmu_free_some_pages(vcpu); |
| 425 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 431 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
| 426 | largepage, &write_pt, pfn); | 432 | level, &write_pt, pfn); |
| 427 | |||
| 428 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 433 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
| 429 | shadow_pte, *shadow_pte, write_pt); | 434 | sptep, *sptep, write_pt); |
| 430 | 435 | ||
| 431 | if (!write_pt) | 436 | if (!write_pt) |
| 432 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 437 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
| @@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
| 459 | sptep = iterator.sptep; | 464 | sptep = iterator.sptep; |
| 460 | 465 | ||
| 461 | /* FIXME: properly handle invlpg on large guest pages */ | 466 | /* FIXME: properly handle invlpg on large guest pages */ |
| 462 | if (level == PT_PAGE_TABLE_LEVEL || | 467 | if (level == PT_PAGE_TABLE_LEVEL || |
| 463 | ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { | 468 | ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || |
| 469 | ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { | ||
| 464 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 470 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
| 465 | 471 | ||
| 466 | pte_gpa = (sp->gfn << PAGE_SHIFT); | 472 | pte_gpa = (sp->gfn << PAGE_SHIFT); |
| @@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
| 472 | --vcpu->kvm->stat.lpages; | 478 | --vcpu->kvm->stat.lpages; |
| 473 | need_flush = 1; | 479 | need_flush = 1; |
| 474 | } | 480 | } |
| 475 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | 481 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
| 476 | break; | 482 | break; |
| 477 | } | 483 | } |
| 478 | 484 | ||
| @@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
| 489 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | 495 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, |
| 490 | sizeof(pt_element_t))) | 496 | sizeof(pt_element_t))) |
| 491 | return; | 497 | return; |
| 492 | if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { | 498 | if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { |
| 493 | if (mmu_topup_memory_caches(vcpu)) | 499 | if (mmu_topup_memory_caches(vcpu)) |
| 494 | return; | 500 | return; |
| 495 | kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, | 501 | kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, |
| @@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
| 536 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); | 542 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); |
| 537 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); | 543 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); |
| 538 | for (j = 0; j < ARRAY_SIZE(pt); ++j) | 544 | for (j = 0; j < ARRAY_SIZE(pt); ++j) |
| 539 | if (r || is_present_pte(pt[j])) | 545 | if (r || is_present_gpte(pt[j])) |
| 540 | sp->spt[i+j] = shadow_trap_nonpresent_pte; | 546 | sp->spt[i+j] = shadow_trap_nonpresent_pte; |
| 541 | else | 547 | else |
| 542 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; | 548 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; |
| @@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
| 574 | sizeof(pt_element_t))) | 580 | sizeof(pt_element_t))) |
| 575 | return -EINVAL; | 581 | return -EINVAL; |
| 576 | 582 | ||
| 577 | if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || | 583 | if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || |
| 578 | !(gpte & PT_ACCESSED_MASK)) { | 584 | !(gpte & PT_ACCESSED_MASK)) { |
| 579 | u64 nonpresent; | 585 | u64 nonpresent; |
| 580 | 586 | ||
| 581 | rmap_remove(vcpu->kvm, &sp->spt[i]); | 587 | rmap_remove(vcpu->kvm, &sp->spt[i]); |
| 582 | if (is_present_pte(gpte)) | 588 | if (is_present_gpte(gpte)) |
| 583 | nonpresent = shadow_trap_nonpresent_pte; | 589 | nonpresent = shadow_trap_nonpresent_pte; |
| 584 | else | 590 | else |
| 585 | nonpresent = shadow_notrap_nonpresent_pte; | 591 | nonpresent = shadow_notrap_nonpresent_pte; |
| 586 | set_shadow_pte(&sp->spt[i], nonpresent); | 592 | __set_spte(&sp->spt[i], nonpresent); |
| 587 | continue; | 593 | continue; |
| 588 | } | 594 | } |
| 589 | 595 | ||
| 590 | nr_present++; | 596 | nr_present++; |
| 591 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 597 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
| 592 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 598 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
| 593 | is_dirty_pte(gpte), 0, gfn, | 599 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, |
| 594 | spte_to_pfn(sp->spt[i]), true, false); | 600 | spte_to_pfn(sp->spt[i]), true, false); |
| 595 | } | 601 | } |
| 596 | 602 | ||
| @@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
| 603 | #undef PT_BASE_ADDR_MASK | 609 | #undef PT_BASE_ADDR_MASK |
| 604 | #undef PT_INDEX | 610 | #undef PT_INDEX |
| 605 | #undef PT_LEVEL_MASK | 611 | #undef PT_LEVEL_MASK |
| 606 | #undef PT_DIR_BASE_ADDR_MASK | 612 | #undef PT_LVL_ADDR_MASK |
| 613 | #undef PT_LVL_OFFSET_MASK | ||
| 607 | #undef PT_LEVEL_BITS | 614 | #undef PT_LEVEL_BITS |
| 608 | #undef PT_MAX_FULL_LEVELS | 615 | #undef PT_MAX_FULL_LEVELS |
| 609 | #undef gpte_to_gfn | 616 | #undef gpte_to_gfn |
| 610 | #undef gpte_to_gfn_pde | 617 | #undef gpte_to_gfn_lvl |
| 611 | #undef CMPXCHG | 618 | #undef CMPXCHG |
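The block of #undefs above exists because paging_tmpl.h is included twice with different PTTYPE values, producing paging32_*() and paging64_*() variants of every function through token pasting. A hypothetical minimal example of the same trick:

/* Compile the same body under a per-mode name via token pasting. */
#define FNAME(name) paging##64_##name

static unsigned long FNAME(page_offset)(unsigned long addr) /* paging64_page_offset */
{
        return addr & 0xfff;
}

#undef FNAME    /* undefine before the header is included again */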
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1f658ad2f0..944cc9c04b3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | */ | 15 | */ |
| 16 | #include <linux/kvm_host.h> | 16 | #include <linux/kvm_host.h> |
| 17 | 17 | ||
| 18 | #include "kvm_svm.h" | ||
| 19 | #include "irq.h" | 18 | #include "irq.h" |
| 20 | #include "mmu.h" | 19 | #include "mmu.h" |
| 21 | #include "kvm_cache_regs.h" | 20 | #include "kvm_cache_regs.h" |
| @@ -26,10 +25,12 @@ | |||
| 26 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
| 27 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
| 28 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
| 28 | #include <linux/ftrace_event.h> | ||
| 29 | 29 | ||
| 30 | #include <asm/desc.h> | 30 | #include <asm/desc.h> |
| 31 | 31 | ||
| 32 | #include <asm/virtext.h> | 32 | #include <asm/virtext.h> |
| 33 | #include "trace.h" | ||
| 33 | 34 | ||
| 34 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 35 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
| 35 | 36 | ||
| @@ -46,6 +47,10 @@ MODULE_LICENSE("GPL"); | |||
| 46 | #define SVM_FEATURE_LBRV (1 << 1) | 47 | #define SVM_FEATURE_LBRV (1 << 1) |
| 47 | #define SVM_FEATURE_SVML (1 << 2) | 48 | #define SVM_FEATURE_SVML (1 << 2) |
| 48 | 49 | ||
| 50 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ | ||
| 51 | #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ | ||
| 52 | #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ | ||
| 53 | |||
| 49 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 54 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
| 50 | 55 | ||
| 51 | /* Turn on to get debugging output*/ | 56 | /* Turn on to get debugging output*/ |
| @@ -57,6 +62,58 @@ MODULE_LICENSE("GPL"); | |||
| 57 | #define nsvm_printk(fmt, args...) do {} while(0) | 62 | #define nsvm_printk(fmt, args...) do {} while(0) |
| 58 | #endif | 63 | #endif |
| 59 | 64 | ||
| 65 | static const u32 host_save_user_msrs[] = { | ||
| 66 | #ifdef CONFIG_X86_64 | ||
| 67 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
| 68 | MSR_FS_BASE, | ||
| 69 | #endif | ||
| 70 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
| 71 | }; | ||
| 72 | |||
| 73 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
| 74 | |||
| 75 | struct kvm_vcpu; | ||
| 76 | |||
| 77 | struct nested_state { | ||
| 78 | struct vmcb *hsave; | ||
| 79 | u64 hsave_msr; | ||
| 80 | u64 vmcb; | ||
| 81 | |||
| 82 | /* These are the merged vectors */ | ||
| 83 | u32 *msrpm; | ||
| 84 | |||
| 85 | /* gpa pointers to the real vectors */ | ||
| 86 | u64 vmcb_msrpm; | ||
| 87 | |||
| 88 | /* cache for intercepts of the guest */ | ||
| 89 | u16 intercept_cr_read; | ||
| 90 | u16 intercept_cr_write; | ||
| 91 | u16 intercept_dr_read; | ||
| 92 | u16 intercept_dr_write; | ||
| 93 | u32 intercept_exceptions; | ||
| 94 | u64 intercept; | ||
| 95 | |||
| 96 | }; | ||
| 97 | |||
| 98 | struct vcpu_svm { | ||
| 99 | struct kvm_vcpu vcpu; | ||
| 100 | struct vmcb *vmcb; | ||
| 101 | unsigned long vmcb_pa; | ||
| 102 | struct svm_cpu_data *svm_data; | ||
| 103 | uint64_t asid_generation; | ||
| 104 | uint64_t sysenter_esp; | ||
| 105 | uint64_t sysenter_eip; | ||
| 106 | |||
| 107 | u64 next_rip; | ||
| 108 | |||
| 109 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
| 110 | u64 host_gs_base; | ||
| 111 | |||
| 112 | u32 *msrpm; | ||
| 113 | |||
| 114 | struct nested_state nested; | ||
| 115 | }; | ||
| 116 | |||
| 60 | /* enable NPT for AMD64 and X86 with PAE */ | 117 | /* enable NPT for AMD64 and X86 with PAE */ |
| 61 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 118 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
| 62 | static bool npt_enabled = true; | 119 | static bool npt_enabled = true; |
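struct vcpu_svm above embeds its struct kvm_vcpu, which lets the SVM code recover the containing structure from the generic vcpu pointer. A hedged sketch of the container_of pattern behind to_svm() (the struct bodies here are placeholders, not the real layouts):

#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kvm_vcpu { int placeholder; };

struct vcpu_svm {
        struct kvm_vcpu vcpu;   /* embedded generic vcpu */
        /* vmcb, msrpm, nested state, ... */
};

static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
        return container_of(vcpu, struct vcpu_svm, vcpu);
}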
| @@ -67,15 +124,14 @@ static int npt = 1; | |||
| 67 | 124 | ||
| 68 | module_param(npt, int, S_IRUGO); | 125 | module_param(npt, int, S_IRUGO); |
| 69 | 126 | ||
| 70 | static int nested = 0; | 127 | static int nested = 1; |
| 71 | module_param(nested, int, S_IRUGO); | 128 | module_param(nested, int, S_IRUGO); |
| 72 | 129 | ||
| 73 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); | 130 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); |
| 131 | static void svm_complete_interrupts(struct vcpu_svm *svm); | ||
| 74 | 132 | ||
| 75 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); | 133 | static int nested_svm_exit_handled(struct vcpu_svm *svm); |
| 76 | static int nested_svm_vmexit(struct vcpu_svm *svm); | 134 | static int nested_svm_vmexit(struct vcpu_svm *svm); |
| 77 | static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, | ||
| 78 | void *arg2, void *opaque); | ||
| 79 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 135 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
| 80 | bool has_error_code, u32 error_code); | 136 | bool has_error_code, u32 error_code); |
| 81 | 137 | ||
| @@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | |||
| 86 | 142 | ||
| 87 | static inline bool is_nested(struct vcpu_svm *svm) | 143 | static inline bool is_nested(struct vcpu_svm *svm) |
| 88 | { | 144 | { |
| 89 | return svm->nested_vmcb; | 145 | return svm->nested.vmcb; |
| 146 | } | ||
| 147 | |||
| 148 | static inline void enable_gif(struct vcpu_svm *svm) | ||
| 149 | { | ||
| 150 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline void disable_gif(struct vcpu_svm *svm) | ||
| 154 | { | ||
| 155 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | ||
| 156 | } | ||
| 157 | |||
| 158 | static inline bool gif_set(struct vcpu_svm *svm) | ||
| 159 | { | ||
| 160 | return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); | ||
| 90 | } | 161 | } |
| 91 | 162 | ||
| 92 | static unsigned long iopm_base; | 163 | static unsigned long iopm_base; |
| @@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid) | |||
| 147 | asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); | 218 | asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); |
| 148 | } | 219 | } |
| 149 | 220 | ||
| 150 | static inline unsigned long kvm_read_cr2(void) | ||
| 151 | { | ||
| 152 | unsigned long cr2; | ||
| 153 | |||
| 154 | asm volatile ("mov %%cr2, %0" : "=r" (cr2)); | ||
| 155 | return cr2; | ||
| 156 | } | ||
| 157 | |||
| 158 | static inline void kvm_write_cr2(unsigned long val) | ||
| 159 | { | ||
| 160 | asm volatile ("mov %0, %%cr2" :: "r" (val)); | ||
| 161 | } | ||
| 162 | |||
| 163 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | 221 | static inline void force_new_asid(struct kvm_vcpu *vcpu) |
| 164 | { | 222 | { |
| 165 | to_svm(vcpu)->asid_generation--; | 223 | to_svm(vcpu)->asid_generation--; |
| @@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage) | |||
| 263 | 321 | ||
| 264 | struct svm_cpu_data *svm_data; | 322 | struct svm_cpu_data *svm_data; |
| 265 | uint64_t efer; | 323 | uint64_t efer; |
| 266 | struct desc_ptr gdt_descr; | 324 | struct descriptor_table gdt_descr; |
| 267 | struct desc_struct *gdt; | 325 | struct desc_struct *gdt; |
| 268 | int me = raw_smp_processor_id(); | 326 | int me = raw_smp_processor_id(); |
| 269 | 327 | ||
| @@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage) | |||
| 283 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | 341 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; |
| 284 | svm_data->next_asid = svm_data->max_asid + 1; | 342 | svm_data->next_asid = svm_data->max_asid + 1; |
| 285 | 343 | ||
| 286 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); | 344 | kvm_get_gdt(&gdt_descr); |
| 287 | gdt = (struct desc_struct *)gdt_descr.address; | 345 | gdt = (struct desc_struct *)gdt_descr.base; |
| 288 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | 346 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
| 289 | 347 | ||
| 290 | rdmsrl(MSR_EFER, efer); | 348 | rdmsrl(MSR_EFER, efer); |
| @@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) | |||
| 367 | #endif | 425 | #endif |
| 368 | set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); | 426 | set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); |
| 369 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); | 427 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); |
| 370 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
| 371 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
| 372 | } | 428 | } |
| 373 | 429 | ||
| 374 | static void svm_enable_lbrv(struct vcpu_svm *svm) | 430 | static void svm_enable_lbrv(struct vcpu_svm *svm) |
| @@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
| 595 | } | 651 | } |
| 596 | force_new_asid(&svm->vcpu); | 652 | force_new_asid(&svm->vcpu); |
| 597 | 653 | ||
| 598 | svm->nested_vmcb = 0; | 654 | svm->nested.vmcb = 0; |
| 599 | svm->vcpu.arch.hflags = HF_GIF_MASK; | 655 | svm->vcpu.arch.hflags = 0; |
| 656 | |||
| 657 | enable_gif(svm); | ||
| 600 | } | 658 | } |
| 601 | 659 | ||
| 602 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | 660 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
| @@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
| 605 | 663 | ||
| 606 | init_vmcb(svm); | 664 | init_vmcb(svm); |
| 607 | 665 | ||
| 608 | if (vcpu->vcpu_id != 0) { | 666 | if (!kvm_vcpu_is_bsp(vcpu)) { |
| 609 | kvm_rip_write(vcpu, 0); | 667 | kvm_rip_write(vcpu, 0); |
| 610 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 668 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
| 611 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 669 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
| @@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 656 | hsave_page = alloc_page(GFP_KERNEL); | 714 | hsave_page = alloc_page(GFP_KERNEL); |
| 657 | if (!hsave_page) | 715 | if (!hsave_page) |
| 658 | goto uninit; | 716 | goto uninit; |
| 659 | svm->hsave = page_address(hsave_page); | 717 | svm->nested.hsave = page_address(hsave_page); |
| 660 | 718 | ||
| 661 | svm->nested_msrpm = page_address(nested_msrpm_pages); | 719 | svm->nested.msrpm = page_address(nested_msrpm_pages); |
| 662 | 720 | ||
| 663 | svm->vmcb = page_address(page); | 721 | svm->vmcb = page_address(page); |
| 664 | clear_page(svm->vmcb); | 722 | clear_page(svm->vmcb); |
| @@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 669 | fx_init(&svm->vcpu); | 727 | fx_init(&svm->vcpu); |
| 670 | svm->vcpu.fpu_active = 1; | 728 | svm->vcpu.fpu_active = 1; |
| 671 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 729 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
| 672 | if (svm->vcpu.vcpu_id == 0) | 730 | if (kvm_vcpu_is_bsp(&svm->vcpu)) |
| 673 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | 731 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
| 674 | 732 | ||
| 675 | return &svm->vcpu; | 733 | return &svm->vcpu; |
| @@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) | |||
| 688 | 746 | ||
| 689 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | 747 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); |
| 690 | __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); | 748 | __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); |
| 691 | __free_page(virt_to_page(svm->hsave)); | 749 | __free_page(virt_to_page(svm->nested.hsave)); |
| 692 | __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); | 750 | __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); |
| 693 | kvm_vcpu_uninit(vcpu); | 751 | kvm_vcpu_uninit(vcpu); |
| 694 | kmem_cache_free(kvm_vcpu_cache, svm); | 752 | kmem_cache_free(kvm_vcpu_cache, svm); |
| 695 | } | 753 | } |
| @@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
| 740 | to_svm(vcpu)->vmcb->save.rflags = rflags; | 798 | to_svm(vcpu)->vmcb->save.rflags = rflags; |
| 741 | } | 799 | } |
| 742 | 800 | ||
| 801 | static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | ||
| 802 | { | ||
| 803 | switch (reg) { | ||
| 804 | case VCPU_EXREG_PDPTR: | ||
| 805 | BUG_ON(!npt_enabled); | ||
| 806 | load_pdptrs(vcpu, vcpu->arch.cr3); | ||
| 807 | break; | ||
| 808 | default: | ||
| 809 | BUG(); | ||
| 810 | } | ||
| 811 | } | ||
| 812 | |||
| 743 | static void svm_set_vintr(struct vcpu_svm *svm) | 813 | static void svm_set_vintr(struct vcpu_svm *svm) |
| 744 | { | 814 | { |
| 745 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; | 815 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; |
| @@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | |||
| 1061 | val = 0; | 1131 | val = 0; |
| 1062 | } | 1132 | } |
| 1063 | 1133 | ||
| 1064 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | ||
| 1065 | return val; | 1134 | return val; |
| 1066 | } | 1135 | } |
| 1067 | 1136 | ||
| @@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
| 1070 | { | 1139 | { |
| 1071 | struct vcpu_svm *svm = to_svm(vcpu); | 1140 | struct vcpu_svm *svm = to_svm(vcpu); |
| 1072 | 1141 | ||
| 1073 | KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); | ||
| 1074 | |||
| 1075 | *exception = 0; | 1142 | *exception = 0; |
| 1076 | 1143 | ||
| 1077 | switch (dr) { | 1144 | switch (dr) { |
| @@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1119 | fault_address = svm->vmcb->control.exit_info_2; | 1186 | fault_address = svm->vmcb->control.exit_info_2; |
| 1120 | error_code = svm->vmcb->control.exit_info_1; | 1187 | error_code = svm->vmcb->control.exit_info_1; |
| 1121 | 1188 | ||
| 1122 | if (!npt_enabled) | 1189 | trace_kvm_page_fault(fault_address, error_code); |
| 1123 | KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, | 1190 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) |
| 1124 | (u32)fault_address, (u32)(fault_address >> 32), | 1191 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
| 1125 | handler); | ||
| 1126 | else | ||
| 1127 | KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, | ||
| 1128 | (u32)fault_address, (u32)(fault_address >> 32), | ||
| 1129 | handler); | ||
| 1130 | /* | ||
| 1131 | * FIXME: Tis shouldn't be necessary here, but there is a flush | ||
| 1132 | * missing in the MMU code. Until we find this bug, flush the | ||
| 1133 | * complete TLB here on an NPF | ||
| 1134 | */ | ||
| 1135 | if (npt_enabled) | ||
| 1136 | svm_flush_tlb(&svm->vcpu); | ||
| 1137 | else { | ||
| 1138 | if (kvm_event_needs_reinjection(&svm->vcpu)) | ||
| 1139 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | ||
| 1140 | } | ||
| 1141 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1192 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
| 1142 | } | 1193 | } |
| 1143 | 1194 | ||
| @@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1253 | 1304 | ||
| 1254 | static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1305 | static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1255 | { | 1306 | { |
| 1256 | KVMTRACE_0D(NMI, &svm->vcpu, handler); | ||
| 1257 | return 1; | 1307 | return 1; |
| 1258 | } | 1308 | } |
| 1259 | 1309 | ||
| 1260 | static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1310 | static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1261 | { | 1311 | { |
| 1262 | ++svm->vcpu.stat.irq_exits; | 1312 | ++svm->vcpu.stat.irq_exits; |
| 1263 | KVMTRACE_0D(INTR, &svm->vcpu, handler); | ||
| 1264 | return 1; | 1313 | return 1; |
| 1265 | } | 1314 | } |
| 1266 | 1315 | ||
| @@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) | |||
| 1303 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 1352 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
| 1304 | bool has_error_code, u32 error_code) | 1353 | bool has_error_code, u32 error_code) |
| 1305 | { | 1354 | { |
| 1306 | if (is_nested(svm)) { | 1355 | if (!is_nested(svm)) |
| 1307 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; | 1356 | return 0; |
| 1308 | svm->vmcb->control.exit_code_hi = 0; | ||
| 1309 | svm->vmcb->control.exit_info_1 = error_code; | ||
| 1310 | svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; | ||
| 1311 | if (nested_svm_exit_handled(svm, false)) { | ||
| 1312 | nsvm_printk("VMexit -> EXCP 0x%x\n", nr); | ||
| 1313 | |||
| 1314 | nested_svm_vmexit(svm); | ||
| 1315 | return 1; | ||
| 1316 | } | ||
| 1317 | } | ||
| 1318 | 1357 | ||
| 1319 | return 0; | 1358 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; |
| 1359 | svm->vmcb->control.exit_code_hi = 0; | ||
| 1360 | svm->vmcb->control.exit_info_1 = error_code; | ||
| 1361 | svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; | ||
| 1362 | |||
| 1363 | return nested_svm_exit_handled(svm); | ||
| 1320 | } | 1364 | } |
| 1321 | 1365 | ||
| 1322 | static inline int nested_svm_intr(struct vcpu_svm *svm) | 1366 | static inline int nested_svm_intr(struct vcpu_svm *svm) |
| 1323 | { | 1367 | { |
| 1324 | if (is_nested(svm)) { | 1368 | if (!is_nested(svm)) |
| 1325 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1369 | return 0; |
| 1326 | return 0; | ||
| 1327 | 1370 | ||
| 1328 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) | 1371 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
| 1329 | return 0; | 1372 | return 0; |
| 1330 | 1373 | ||
| 1331 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; | 1374 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) |
| 1375 | return 0; | ||
| 1332 | 1376 | ||
| 1333 | if (nested_svm_exit_handled(svm, false)) { | 1377 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; |
| 1334 | nsvm_printk("VMexit -> INTR\n"); | 1378 | |
| 1335 | nested_svm_vmexit(svm); | 1379 | if (nested_svm_exit_handled(svm)) { |
| 1336 | return 1; | 1380 | nsvm_printk("VMexit -> INTR\n"); |
| 1337 | } | 1381 | return 1; |
| 1338 | } | 1382 | } |
| 1339 | 1383 | ||
| 1340 | return 0; | 1384 | return 0; |
| 1341 | } | 1385 | } |
| 1342 | 1386 | ||
| 1343 | static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) | 1387 | static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) |
| 1344 | { | 1388 | { |
| 1345 | struct page *page; | 1389 | struct page *page; |
| 1346 | 1390 | ||
| @@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) | |||
| 1348 | page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); | 1392 | page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); |
| 1349 | up_read(¤t->mm->mmap_sem); | 1393 | up_read(¤t->mm->mmap_sem); |
| 1350 | 1394 | ||
| 1351 | if (is_error_page(page)) { | 1395 | if (is_error_page(page)) |
| 1352 | printk(KERN_INFO "%s: could not find page at 0x%llx\n", | 1396 | goto error; |
| 1353 | __func__, gpa); | 1397 | |
| 1354 | kvm_release_page_clean(page); | 1398 | return kmap_atomic(page, idx); |
| 1355 | kvm_inject_gp(&svm->vcpu, 0); | 1399 | |
| 1356 | return NULL; | 1400 | error: |
| 1357 | } | 1401 | kvm_release_page_clean(page); |
| 1358 | return page; | 1402 | kvm_inject_gp(&svm->vcpu, 0); |
| 1403 | |||
| 1404 | return NULL; | ||
| 1359 | } | 1405 | } |
| 1360 | 1406 | ||
| 1361 | static int nested_svm_do(struct vcpu_svm *svm, | 1407 | static void nested_svm_unmap(void *addr, enum km_type idx) |
| 1362 | u64 arg1_gpa, u64 arg2_gpa, void *opaque, | ||
| 1363 | int (*handler)(struct vcpu_svm *svm, | ||
| 1364 | void *arg1, | ||
| 1365 | void *arg2, | ||
| 1366 | void *opaque)) | ||
| 1367 | { | 1408 | { |
| 1368 | struct page *arg1_page; | 1409 | struct page *page; |
| 1369 | struct page *arg2_page = NULL; | ||
| 1370 | void *arg1; | ||
| 1371 | void *arg2 = NULL; | ||
| 1372 | int retval; | ||
| 1373 | 1410 | ||
| 1374 | arg1_page = nested_svm_get_page(svm, arg1_gpa); | 1411 | if (!addr) |
| 1375 | if(arg1_page == NULL) | 1412 | return; |
| 1376 | return 1; | ||
| 1377 | 1413 | ||
| 1378 | if (arg2_gpa) { | 1414 | page = kmap_atomic_to_page(addr); |
| 1379 | arg2_page = nested_svm_get_page(svm, arg2_gpa); | 1415 | |
| 1380 | if(arg2_page == NULL) { | 1416 | kunmap_atomic(addr, idx); |
| 1381 | kvm_release_page_clean(arg1_page); | 1417 | kvm_release_page_dirty(page); |
| 1382 | return 1; | 1418 | } |
| 1383 | } | 1419 | |
| 1384 | } | 1420 | static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) |
| 1421 | { | ||
| 1422 | u32 param = svm->vmcb->control.exit_info_1 & 1; | ||
| 1423 | u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
| 1424 | bool ret = false; | ||
| 1425 | u32 t0, t1; | ||
| 1426 | u8 *msrpm; | ||
| 1385 | 1427 | ||
| 1386 | arg1 = kmap_atomic(arg1_page, KM_USER0); | 1428 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) |
| 1387 | if (arg2_gpa) | 1429 | return false; |
| 1388 | arg2 = kmap_atomic(arg2_page, KM_USER1); | ||
| 1389 | 1430 | ||
| 1390 | retval = handler(svm, arg1, arg2, opaque); | 1431 | msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); |
| 1432 | |||
| 1433 | if (!msrpm) | ||
| 1434 | goto out; | ||
| 1435 | |||
| 1436 | switch (msr) { | ||
| 1437 | case 0 ... 0x1fff: | ||
| 1438 | t0 = (msr * 2) % 8; | ||
| 1439 | t1 = msr / 8; | ||
| 1440 | break; | ||
| 1441 | case 0xc0000000 ... 0xc0001fff: | ||
| 1442 | t0 = (8192 + msr - 0xc0000000) * 2; | ||
| 1443 | t1 = (t0 / 8); | ||
| 1444 | t0 %= 8; | ||
| 1445 | break; | ||
| 1446 | case 0xc0010000 ... 0xc0011fff: | ||
| 1447 | t0 = (16384 + msr - 0xc0010000) * 2; | ||
| 1448 | t1 = (t0 / 8); | ||
| 1449 | t0 %= 8; | ||
| 1450 | break; | ||
| 1451 | default: | ||
| 1452 | ret = true; | ||
| 1453 | goto out; | ||
| 1454 | } | ||
| 1391 | 1455 | ||
| 1392 | kunmap_atomic(arg1, KM_USER0); | 1456 | ret = msrpm[t1] & ((1 << param) << t0); |
| 1393 | if (arg2_gpa) | ||
| 1394 | kunmap_atomic(arg2, KM_USER1); | ||
| 1395 | 1457 | ||
| 1396 | kvm_release_page_dirty(arg1_page); | 1458 | out: |
| 1397 | if (arg2_gpa) | 1459 | nested_svm_unmap(msrpm, KM_USER0); |
| 1398 | kvm_release_page_dirty(arg2_page); | ||
| 1399 | 1460 | ||
| 1400 | return retval; | 1461 | return ret; |
| 1401 | } | 1462 | } |
| 1402 | 1463 | ||
| 1403 | static int nested_svm_exit_handled_real(struct vcpu_svm *svm, | 1464 | static int nested_svm_exit_special(struct vcpu_svm *svm) |
| 1404 | void *arg1, | ||
| 1405 | void *arg2, | ||
| 1406 | void *opaque) | ||
| 1407 | { | 1465 | { |
| 1408 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
| 1409 | bool kvm_overrides = *(bool *)opaque; | ||
| 1410 | u32 exit_code = svm->vmcb->control.exit_code; | 1466 | u32 exit_code = svm->vmcb->control.exit_code; |
| 1411 | 1467 | ||
| 1412 | if (kvm_overrides) { | 1468 | switch (exit_code) { |
| 1413 | switch (exit_code) { | 1469 | case SVM_EXIT_INTR: |
| 1414 | case SVM_EXIT_INTR: | 1470 | case SVM_EXIT_NMI: |
| 1415 | case SVM_EXIT_NMI: | 1471 | return NESTED_EXIT_HOST; |
| 1416 | return 0; | ||
| 1417 | /* For now we are always handling NPFs when using them */ | 1472 | /* For now we are always handling NPFs when using them */ |
| 1418 | case SVM_EXIT_NPF: | 1473 | case SVM_EXIT_NPF: |
| 1419 | if (npt_enabled) | 1474 | if (npt_enabled) |
| 1420 | return 0; | 1475 | return NESTED_EXIT_HOST; |
| 1421 | break; | 1476 | break; |
| 1422 | /* When we're shadowing, trap PFs */ | 1477 | /* When we're shadowing, trap PFs */ |
| 1423 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: | 1478 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: |
| 1424 | if (!npt_enabled) | 1479 | if (!npt_enabled) |
| 1425 | return 0; | 1480 | return NESTED_EXIT_HOST; |
| 1426 | break; | 1481 | break; |
| 1427 | default: | 1482 | default: |
| 1428 | break; | 1483 | break; |
| 1429 | } | ||
| 1430 | } | 1484 | } |
| 1431 | 1485 | ||
| 1486 | return NESTED_EXIT_CONTINUE; | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | /* | ||
| 1490 | * If this function returns true, this #vmexit was already handled | ||
| 1491 | */ | ||
| 1492 | static int nested_svm_exit_handled(struct vcpu_svm *svm) | ||
| 1493 | { | ||
| 1494 | u32 exit_code = svm->vmcb->control.exit_code; | ||
| 1495 | int vmexit = NESTED_EXIT_HOST; | ||
| 1496 | |||
| 1432 | switch (exit_code) { | 1497 | switch (exit_code) { |
| 1498 | case SVM_EXIT_MSR: | ||
| 1499 | vmexit = nested_svm_exit_handled_msr(svm); | ||
| 1500 | break; | ||
| 1433 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { | 1501 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { |
| 1434 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); | 1502 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); |
| 1435 | if (nested_vmcb->control.intercept_cr_read & cr_bits) | 1503 | if (svm->nested.intercept_cr_read & cr_bits) |
| 1436 | return 1; | 1504 | vmexit = NESTED_EXIT_DONE; |
| 1437 | break; | 1505 | break; |
| 1438 | } | 1506 | } |
| 1439 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { | 1507 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { |
| 1440 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); | 1508 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); |
| 1441 | if (nested_vmcb->control.intercept_cr_write & cr_bits) | 1509 | if (svm->nested.intercept_cr_write & cr_bits) |
| 1442 | return 1; | 1510 | vmexit = NESTED_EXIT_DONE; |
| 1443 | break; | 1511 | break; |
| 1444 | } | 1512 | } |
| 1445 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { | 1513 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { |
| 1446 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); | 1514 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); |
| 1447 | if (nested_vmcb->control.intercept_dr_read & dr_bits) | 1515 | if (svm->nested.intercept_dr_read & dr_bits) |
| 1448 | return 1; | 1516 | vmexit = NESTED_EXIT_DONE; |
| 1449 | break; | 1517 | break; |
| 1450 | } | 1518 | } |
| 1451 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { | 1519 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { |
| 1452 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); | 1520 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); |
| 1453 | if (nested_vmcb->control.intercept_dr_write & dr_bits) | 1521 | if (svm->nested.intercept_dr_write & dr_bits) |
| 1454 | return 1; | 1522 | vmexit = NESTED_EXIT_DONE; |
| 1455 | break; | 1523 | break; |
| 1456 | } | 1524 | } |
| 1457 | case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { | 1525 | case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { |
| 1458 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); | 1526 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); |
| 1459 | if (nested_vmcb->control.intercept_exceptions & excp_bits) | 1527 | if (svm->nested.intercept_exceptions & excp_bits) |
| 1460 | return 1; | 1528 | vmexit = NESTED_EXIT_DONE; |
| 1461 | break; | 1529 | break; |
| 1462 | } | 1530 | } |
| 1463 | default: { | 1531 | default: { |
| 1464 | u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); | 1532 | u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); |
| 1465 | nsvm_printk("exit code: 0x%x\n", exit_code); | 1533 | nsvm_printk("exit code: 0x%x\n", exit_code); |
| 1466 | if (nested_vmcb->control.intercept & exit_bits) | 1534 | if (svm->nested.intercept & exit_bits) |
| 1467 | return 1; | 1535 | vmexit = NESTED_EXIT_DONE; |
| 1468 | } | 1536 | } |
| 1469 | } | 1537 | } |
| 1470 | 1538 | ||
| 1471 | return 0; | 1539 | if (vmexit == NESTED_EXIT_DONE) { |
| 1472 | } | 1540 | nsvm_printk("#VMEXIT reason=%04x\n", exit_code); |
| 1473 | 1541 | nested_svm_vmexit(svm); | |
| 1474 | static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, | ||
| 1475 | void *arg1, void *arg2, | ||
| 1476 | void *opaque) | ||
| 1477 | { | ||
| 1478 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
| 1479 | u8 *msrpm = (u8 *)arg2; | ||
| 1480 | u32 t0, t1; | ||
| 1481 | u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
| 1482 | u32 param = svm->vmcb->control.exit_info_1 & 1; | ||
| 1483 | |||
| 1484 | if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) | ||
| 1485 | return 0; | ||
| 1486 | |||
| 1487 | switch(msr) { | ||
| 1488 | case 0 ... 0x1fff: | ||
| 1489 | t0 = (msr * 2) % 8; | ||
| 1490 | t1 = msr / 8; | ||
| 1491 | break; | ||
| 1492 | case 0xc0000000 ... 0xc0001fff: | ||
| 1493 | t0 = (8192 + msr - 0xc0000000) * 2; | ||
| 1494 | t1 = (t0 / 8); | ||
| 1495 | t0 %= 8; | ||
| 1496 | break; | ||
| 1497 | case 0xc0010000 ... 0xc0011fff: | ||
| 1498 | t0 = (16384 + msr - 0xc0010000) * 2; | ||
| 1499 | t1 = (t0 / 8); | ||
| 1500 | t0 %= 8; | ||
| 1501 | break; | ||
| 1502 | default: | ||
| 1503 | return 1; | ||
| 1504 | break; | ||
| 1505 | } | 1542 | } |
| 1506 | if (msrpm[t1] & ((1 << param) << t0)) | ||
| 1507 | return 1; | ||
| 1508 | 1543 | ||
| 1509 | return 0; | 1544 | return vmexit; |
| 1545 | } | ||
| 1546 | |||
| 1547 | static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) | ||
| 1548 | { | ||
| 1549 | struct vmcb_control_area *dst = &dst_vmcb->control; | ||
| 1550 | struct vmcb_control_area *from = &from_vmcb->control; | ||
| 1551 | |||
| 1552 | dst->intercept_cr_read = from->intercept_cr_read; | ||
| 1553 | dst->intercept_cr_write = from->intercept_cr_write; | ||
| 1554 | dst->intercept_dr_read = from->intercept_dr_read; | ||
| 1555 | dst->intercept_dr_write = from->intercept_dr_write; | ||
| 1556 | dst->intercept_exceptions = from->intercept_exceptions; | ||
| 1557 | dst->intercept = from->intercept; | ||
| 1558 | dst->iopm_base_pa = from->iopm_base_pa; | ||
| 1559 | dst->msrpm_base_pa = from->msrpm_base_pa; | ||
| 1560 | dst->tsc_offset = from->tsc_offset; | ||
| 1561 | dst->asid = from->asid; | ||
| 1562 | dst->tlb_ctl = from->tlb_ctl; | ||
| 1563 | dst->int_ctl = from->int_ctl; | ||
| 1564 | dst->int_vector = from->int_vector; | ||
| 1565 | dst->int_state = from->int_state; | ||
| 1566 | dst->exit_code = from->exit_code; | ||
| 1567 | dst->exit_code_hi = from->exit_code_hi; | ||
| 1568 | dst->exit_info_1 = from->exit_info_1; | ||
| 1569 | dst->exit_info_2 = from->exit_info_2; | ||
| 1570 | dst->exit_int_info = from->exit_int_info; | ||
| 1571 | dst->exit_int_info_err = from->exit_int_info_err; | ||
| 1572 | dst->nested_ctl = from->nested_ctl; | ||
| 1573 | dst->event_inj = from->event_inj; | ||
| 1574 | dst->event_inj_err = from->event_inj_err; | ||
| 1575 | dst->nested_cr3 = from->nested_cr3; | ||
| 1576 | dst->lbr_ctl = from->lbr_ctl; | ||
| 1510 | } | 1577 | } |
| 1511 | 1578 | ||
| 1512 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) | 1579 | static int nested_svm_vmexit(struct vcpu_svm *svm) |
| 1513 | { | 1580 | { |
| 1514 | bool k = kvm_override; | 1581 | struct vmcb *nested_vmcb; |
| 1515 | 1582 | struct vmcb *hsave = svm->nested.hsave; | |
| 1516 | switch (svm->vmcb->control.exit_code) { | 1583 | struct vmcb *vmcb = svm->vmcb; |
| 1517 | case SVM_EXIT_MSR: | ||
| 1518 | return nested_svm_do(svm, svm->nested_vmcb, | ||
| 1519 | svm->nested_vmcb_msrpm, NULL, | ||
| 1520 | nested_svm_exit_handled_msr); | ||
| 1521 | default: break; | ||
| 1522 | } | ||
| 1523 | 1584 | ||
| 1524 | return nested_svm_do(svm, svm->nested_vmcb, 0, &k, | 1585 | nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); |
| 1525 | nested_svm_exit_handled_real); | 1586 | if (!nested_vmcb) |
| 1526 | } | 1587 | return 1; |
| 1527 | |||
| 1528 | static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, | ||
| 1529 | void *arg2, void *opaque) | ||
| 1530 | { | ||
| 1531 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
| 1532 | struct vmcb *hsave = svm->hsave; | ||
| 1533 | u64 nested_save[] = { nested_vmcb->save.cr0, | ||
| 1534 | nested_vmcb->save.cr3, | ||
| 1535 | nested_vmcb->save.cr4, | ||
| 1536 | nested_vmcb->save.efer, | ||
| 1537 | nested_vmcb->control.intercept_cr_read, | ||
| 1538 | nested_vmcb->control.intercept_cr_write, | ||
| 1539 | nested_vmcb->control.intercept_dr_read, | ||
| 1540 | nested_vmcb->control.intercept_dr_write, | ||
| 1541 | nested_vmcb->control.intercept_exceptions, | ||
| 1542 | nested_vmcb->control.intercept, | ||
| 1543 | nested_vmcb->control.msrpm_base_pa, | ||
| 1544 | nested_vmcb->control.iopm_base_pa, | ||
| 1545 | nested_vmcb->control.tsc_offset }; | ||
| 1546 | 1588 | ||
| 1547 | /* Give the current vmcb to the guest */ | 1589 | /* Give the current vmcb to the guest */ |
| 1548 | memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); | 1590 | disable_gif(svm); |
| 1549 | nested_vmcb->save.cr0 = nested_save[0]; | 1591 | |
| 1550 | if (!npt_enabled) | 1592 | nested_vmcb->save.es = vmcb->save.es; |
| 1551 | nested_vmcb->save.cr3 = nested_save[1]; | 1593 | nested_vmcb->save.cs = vmcb->save.cs; |
| 1552 | nested_vmcb->save.cr4 = nested_save[2]; | 1594 | nested_vmcb->save.ss = vmcb->save.ss; |
| 1553 | nested_vmcb->save.efer = nested_save[3]; | 1595 | nested_vmcb->save.ds = vmcb->save.ds; |
| 1554 | nested_vmcb->control.intercept_cr_read = nested_save[4]; | 1596 | nested_vmcb->save.gdtr = vmcb->save.gdtr; |
| 1555 | nested_vmcb->control.intercept_cr_write = nested_save[5]; | 1597 | nested_vmcb->save.idtr = vmcb->save.idtr; |
| 1556 | nested_vmcb->control.intercept_dr_read = nested_save[6]; | 1598 | if (npt_enabled) |
| 1557 | nested_vmcb->control.intercept_dr_write = nested_save[7]; | 1599 | nested_vmcb->save.cr3 = vmcb->save.cr3; |
| 1558 | nested_vmcb->control.intercept_exceptions = nested_save[8]; | 1600 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
| 1559 | nested_vmcb->control.intercept = nested_save[9]; | 1601 | nested_vmcb->save.rflags = vmcb->save.rflags; |
| 1560 | nested_vmcb->control.msrpm_base_pa = nested_save[10]; | 1602 | nested_vmcb->save.rip = vmcb->save.rip; |
| 1561 | nested_vmcb->control.iopm_base_pa = nested_save[11]; | 1603 | nested_vmcb->save.rsp = vmcb->save.rsp; |
| 1562 | nested_vmcb->control.tsc_offset = nested_save[12]; | 1604 | nested_vmcb->save.rax = vmcb->save.rax; |
| 1605 | nested_vmcb->save.dr7 = vmcb->save.dr7; | ||
| 1606 | nested_vmcb->save.dr6 = vmcb->save.dr6; | ||
| 1607 | nested_vmcb->save.cpl = vmcb->save.cpl; | ||
| 1608 | |||
| 1609 | nested_vmcb->control.int_ctl = vmcb->control.int_ctl; | ||
| 1610 | nested_vmcb->control.int_vector = vmcb->control.int_vector; | ||
| 1611 | nested_vmcb->control.int_state = vmcb->control.int_state; | ||
| 1612 | nested_vmcb->control.exit_code = vmcb->control.exit_code; | ||
| 1613 | nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; | ||
| 1614 | nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; | ||
| 1615 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; | ||
| 1616 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; | ||
| 1617 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; | ||
| 1618 | nested_vmcb->control.tlb_ctl = 0; | ||
| 1619 | nested_vmcb->control.event_inj = 0; | ||
| 1620 | nested_vmcb->control.event_inj_err = 0; | ||
| 1563 | 1621 | ||
| 1564 | /* We always set V_INTR_MASKING and remember the old value in hflags */ | 1622 | /* We always set V_INTR_MASKING and remember the old value in hflags */ |
| 1565 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1623 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
| 1566 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; | 1624 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; |
| 1567 | 1625 | ||
| 1568 | if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && | ||
| 1569 | (nested_vmcb->control.int_vector)) { | ||
| 1570 | nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", | ||
| 1571 | nested_vmcb->control.int_vector); | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | /* Restore the original control entries */ | 1626 | /* Restore the original control entries */ |
| 1575 | svm->vmcb->control = hsave->control; | 1627 | copy_vmcb_control_area(vmcb, hsave); |
| 1576 | 1628 | ||
| 1577 | /* Kill any pending exceptions */ | 1629 | /* Kill any pending exceptions */ |
| 1578 | if (svm->vcpu.arch.exception.pending == true) | 1630 | if (svm->vcpu.arch.exception.pending == true) |
| 1579 | nsvm_printk("WARNING: Pending Exception\n"); | 1631 | nsvm_printk("WARNING: Pending Exception\n"); |
| 1580 | svm->vcpu.arch.exception.pending = false; | 1632 | |
| 1633 | kvm_clear_exception_queue(&svm->vcpu); | ||
| 1634 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
| 1581 | 1635 | ||
| 1582 | /* Restore selected save entries */ | 1636 | /* Restore selected save entries */ |
| 1583 | svm->vmcb->save.es = hsave->save.es; | 1637 | svm->vmcb->save.es = hsave->save.es; |
| @@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, | |||
| 1603 | svm->vmcb->save.cpl = 0; | 1657 | svm->vmcb->save.cpl = 0; |
| 1604 | svm->vmcb->control.exit_int_info = 0; | 1658 | svm->vmcb->control.exit_int_info = 0; |
| 1605 | 1659 | ||
| 1606 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | ||
| 1607 | /* Exit nested SVM mode */ | 1660 | /* Exit nested SVM mode */ |
| 1608 | svm->nested_vmcb = 0; | 1661 | svm->nested.vmcb = 0; |
| 1609 | 1662 | ||
| 1610 | return 0; | 1663 | nested_svm_unmap(nested_vmcb, KM_USER0); |
| 1611 | } | ||
| 1612 | |||
| 1613 | static int nested_svm_vmexit(struct vcpu_svm *svm) | ||
| 1614 | { | ||
| 1615 | nsvm_printk("VMexit\n"); | ||
| 1616 | if (nested_svm_do(svm, svm->nested_vmcb, 0, | ||
| 1617 | NULL, nested_svm_vmexit_real)) | ||
| 1618 | return 1; | ||
| 1619 | 1664 | ||
| 1620 | kvm_mmu_reset_context(&svm->vcpu); | 1665 | kvm_mmu_reset_context(&svm->vcpu); |
| 1621 | kvm_mmu_load(&svm->vcpu); | 1666 | kvm_mmu_load(&svm->vcpu); |
| @@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
| 1623 | return 0; | 1668 | return 0; |
| 1624 | } | 1669 | } |
| 1625 | 1670 | ||
| 1626 | static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, | 1671 | static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) |
| 1627 | void *arg2, void *opaque) | ||
| 1628 | { | 1672 | { |
| 1673 | u32 *nested_msrpm; | ||
| 1629 | int i; | 1674 | int i; |
| 1630 | u32 *nested_msrpm = (u32*)arg1; | 1675 | |
| 1676 | nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); | ||
| 1677 | if (!nested_msrpm) | ||
| 1678 | return false; | ||
| 1679 | |||
| 1631 | for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) | 1680 | for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) |
| 1632 | svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; | 1681 | svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; |
| 1633 | svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); | ||
| 1634 | 1682 | ||
| 1635 | return 0; | 1683 | svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); |
| 1684 | |||
| 1685 | nested_svm_unmap(nested_msrpm, KM_USER0); | ||
| 1686 | |||
| 1687 | return true; | ||
| 1636 | } | 1688 | } |
| 1637 | 1689 | ||
| 1638 | static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | 1690 | static bool nested_svm_vmrun(struct vcpu_svm *svm) |
| 1639 | void *arg2, void *opaque) | ||
| 1640 | { | 1691 | { |
| 1641 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | 1692 | struct vmcb *nested_vmcb; |
| 1642 | struct vmcb *hsave = svm->hsave; | 1693 | struct vmcb *hsave = svm->nested.hsave; |
| 1694 | struct vmcb *vmcb = svm->vmcb; | ||
| 1695 | |||
| 1696 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); | ||
| 1697 | if (!nested_vmcb) | ||
| 1698 | return false; | ||
| 1643 | 1699 | ||
| 1644 | /* nested_vmcb is our indicator if nested SVM is activated */ | 1700 | /* nested_vmcb is our indicator if nested SVM is activated */ |
| 1645 | svm->nested_vmcb = svm->vmcb->save.rax; | 1701 | svm->nested.vmcb = svm->vmcb->save.rax; |
| 1646 | 1702 | ||
| 1647 | /* Clear internal status */ | 1703 | /* Clear internal status */ |
| 1648 | svm->vcpu.arch.exception.pending = false; | 1704 | kvm_clear_exception_queue(&svm->vcpu); |
| 1705 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
| 1649 | 1706 | ||
| 1650 | /* Save the old vmcb, so we don't need to pick what we save, but | 1707 | /* Save the old vmcb, so we don't need to pick what we save, but |
| 1651 | can restore everything when a VMEXIT occurs */ | 1708 | can restore everything when a VMEXIT occurs */ |
| 1652 | memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); | 1709 | hsave->save.es = vmcb->save.es; |
| 1653 | /* We need to remember the original CR3 in the SPT case */ | 1710 | hsave->save.cs = vmcb->save.cs; |
| 1654 | if (!npt_enabled) | 1711 | hsave->save.ss = vmcb->save.ss; |
| 1655 | hsave->save.cr3 = svm->vcpu.arch.cr3; | 1712 | hsave->save.ds = vmcb->save.ds; |
| 1656 | hsave->save.cr4 = svm->vcpu.arch.cr4; | 1713 | hsave->save.gdtr = vmcb->save.gdtr; |
| 1657 | hsave->save.rip = svm->next_rip; | 1714 | hsave->save.idtr = vmcb->save.idtr; |
| 1715 | hsave->save.efer = svm->vcpu.arch.shadow_efer; | ||
| 1716 | hsave->save.cr0 = svm->vcpu.arch.cr0; | ||
| 1717 | hsave->save.cr4 = svm->vcpu.arch.cr4; | ||
| 1718 | hsave->save.rflags = vmcb->save.rflags; | ||
| 1719 | hsave->save.rip = svm->next_rip; | ||
| 1720 | hsave->save.rsp = vmcb->save.rsp; | ||
| 1721 | hsave->save.rax = vmcb->save.rax; | ||
| 1722 | if (npt_enabled) | ||
| 1723 | hsave->save.cr3 = vmcb->save.cr3; | ||
| 1724 | else | ||
| 1725 | hsave->save.cr3 = svm->vcpu.arch.cr3; | ||
| 1726 | |||
| 1727 | copy_vmcb_control_area(hsave, vmcb); | ||
| 1658 | 1728 | ||
| 1659 | if (svm->vmcb->save.rflags & X86_EFLAGS_IF) | 1729 | if (svm->vmcb->save.rflags & X86_EFLAGS_IF) |
| 1660 | svm->vcpu.arch.hflags |= HF_HIF_MASK; | 1730 | svm->vcpu.arch.hflags |= HF_HIF_MASK; |
| @@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
| 1679 | kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); | 1749 | kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); |
| 1680 | kvm_mmu_reset_context(&svm->vcpu); | 1750 | kvm_mmu_reset_context(&svm->vcpu); |
| 1681 | } | 1751 | } |
| 1682 | svm->vmcb->save.cr2 = nested_vmcb->save.cr2; | 1752 | svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; |
| 1683 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); | 1753 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); |
| 1684 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); | 1754 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); |
| 1685 | kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); | 1755 | kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); |
| @@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
| 1706 | 1776 | ||
| 1707 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; | 1777 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; |
| 1708 | 1778 | ||
| 1709 | svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; | 1779 | svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; |
| 1780 | |||
| 1781 | /* cache intercepts */ | ||
| 1782 | svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; | ||
| 1783 | svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; | ||
| 1784 | svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; | ||
| 1785 | svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; | ||
| 1786 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; | ||
| 1787 | svm->nested.intercept = nested_vmcb->control.intercept; | ||
| 1710 | 1788 | ||
| 1711 | force_new_asid(&svm->vcpu); | 1789 | force_new_asid(&svm->vcpu); |
| 1712 | svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; | 1790 | svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; |
| @@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
| 1734 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; | 1812 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; |
| 1735 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; | 1813 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; |
| 1736 | 1814 | ||
| 1737 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | 1815 | nested_svm_unmap(nested_vmcb, KM_USER0); |
| 1738 | 1816 | ||
| 1739 | return 0; | 1817 | enable_gif(svm); |
| 1818 | |||
| 1819 | return true; | ||
| 1740 | } | 1820 | } |
| 1741 | 1821 | ||
| 1742 | static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) | 1822 | static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) |
| 1743 | { | 1823 | { |
| 1744 | to_vmcb->save.fs = from_vmcb->save.fs; | 1824 | to_vmcb->save.fs = from_vmcb->save.fs; |
| 1745 | to_vmcb->save.gs = from_vmcb->save.gs; | 1825 | to_vmcb->save.gs = from_vmcb->save.gs; |
| @@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) | |||
| 1753 | to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; | 1833 | to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; |
| 1754 | to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; | 1834 | to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; |
| 1755 | to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; | 1835 | to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; |
| 1756 | |||
| 1757 | return 1; | ||
| 1758 | } | ||
| 1759 | |||
| 1760 | static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, | ||
| 1761 | void *arg2, void *opaque) | ||
| 1762 | { | ||
| 1763 | return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); | ||
| 1764 | } | ||
| 1765 | |||
| 1766 | static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, | ||
| 1767 | void *arg2, void *opaque) | ||
| 1768 | { | ||
| 1769 | return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); | ||
| 1770 | } | 1836 | } |
| 1771 | 1837 | ||
| 1772 | static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1838 | static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1773 | { | 1839 | { |
| 1840 | struct vmcb *nested_vmcb; | ||
| 1841 | |||
| 1774 | if (nested_svm_check_permissions(svm)) | 1842 | if (nested_svm_check_permissions(svm)) |
| 1775 | return 1; | 1843 | return 1; |
| 1776 | 1844 | ||
| 1777 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1845 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
| 1778 | skip_emulated_instruction(&svm->vcpu); | 1846 | skip_emulated_instruction(&svm->vcpu); |
| 1779 | 1847 | ||
| 1780 | nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); | 1848 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); |
| 1849 | if (!nested_vmcb) | ||
| 1850 | return 1; | ||
| 1851 | |||
| 1852 | nested_svm_vmloadsave(nested_vmcb, svm->vmcb); | ||
| 1853 | nested_svm_unmap(nested_vmcb, KM_USER0); | ||
| 1781 | 1854 | ||
| 1782 | return 1; | 1855 | return 1; |
| 1783 | } | 1856 | } |
| 1784 | 1857 | ||
| 1785 | static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1858 | static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1786 | { | 1859 | { |
| 1860 | struct vmcb *nested_vmcb; | ||
| 1861 | |||
| 1787 | if (nested_svm_check_permissions(svm)) | 1862 | if (nested_svm_check_permissions(svm)) |
| 1788 | return 1; | 1863 | return 1; |
| 1789 | 1864 | ||
| 1790 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1865 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
| 1791 | skip_emulated_instruction(&svm->vcpu); | 1866 | skip_emulated_instruction(&svm->vcpu); |
| 1792 | 1867 | ||
| 1793 | nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); | 1868 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); |
| 1869 | if (!nested_vmcb) | ||
| 1870 | return 1; | ||
| 1871 | |||
| 1872 | nested_svm_vmloadsave(svm->vmcb, nested_vmcb); | ||
| 1873 | nested_svm_unmap(nested_vmcb, KM_USER0); | ||
| 1794 | 1874 | ||
| 1795 | return 1; | 1875 | return 1; |
| 1796 | } | 1876 | } |
| @@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1798 | static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1878 | static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
| 1799 | { | 1879 | { |
| 1800 | nsvm_printk("VMrun\n"); | 1880 | nsvm_printk("VMrun\n"); |
| 1881 | |||
| 1801 | if (nested_svm_check_permissions(svm)) | 1882 | if (nested_svm_check_permissions(svm)) |
| 1802 | return 1; | 1883 | return 1; |
| 1803 | 1884 | ||
| 1804 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1885 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
| 1805 | skip_emulated_instruction(&svm->vcpu); | 1886 | skip_emulated_instruction(&svm->vcpu); |
| 1806 | 1887 | ||
| 1807 | if (nested_svm_do(svm, svm->vmcb->save.rax, 0, | 1888 | if (!nested_svm_vmrun(svm)) |
| 1808 | NULL, nested_svm_vmrun)) | ||
| 1809 | return 1; | 1889 | return 1; |
| 1810 | 1890 | ||
| 1811 | if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, | 1891 | if (!nested_svm_vmrun_msrpm(svm)) |
| 1812 | NULL, nested_svm_vmrun_msrpm)) | 1892 | goto failed; |
| 1813 | return 1; | 1893 | |
| 1894 | return 1; | ||
| 1895 | |||
| 1896 | failed: | ||
| 1897 | |||
| 1898 | svm->vmcb->control.exit_code = SVM_EXIT_ERR; | ||
| 1899 | svm->vmcb->control.exit_code_hi = 0; | ||
| 1900 | svm->vmcb->control.exit_info_1 = 0; | ||
| 1901 | svm->vmcb->control.exit_info_2 = 0; | ||
| 1902 | |||
| 1903 | nested_svm_vmexit(svm); | ||
| 1814 | 1904 | ||
| 1815 | return 1; | 1905 | return 1; |
| 1816 | } | 1906 | } |
| @@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1823 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1913 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
| 1824 | skip_emulated_instruction(&svm->vcpu); | 1914 | skip_emulated_instruction(&svm->vcpu); |
| 1825 | 1915 | ||
| 1826 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | 1916 | enable_gif(svm); |
| 1827 | 1917 | ||
| 1828 | return 1; | 1918 | return 1; |
| 1829 | } | 1919 | } |
| @@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1836 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1926 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
| 1837 | skip_emulated_instruction(&svm->vcpu); | 1927 | skip_emulated_instruction(&svm->vcpu); |
| 1838 | 1928 | ||
| 1839 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | 1929 | disable_gif(svm); |
| 1840 | 1930 | ||
| 1841 | /* After a CLGI no interrupts should come */ | 1931 | /* After a CLGI no interrupts should come */ |
| 1842 | svm_clear_vintr(svm); | 1932 | svm_clear_vintr(svm); |
| @@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 1845 | return 1; | 1935 | return 1; |
| 1846 | } | 1936 | } |
| 1847 | 1937 | ||
| 1938 | static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
| 1939 | { | ||
| 1940 | struct kvm_vcpu *vcpu = &svm->vcpu; | ||
| 1941 | nsvm_printk("INVLPGA\n"); | ||
| 1942 | |||
| 1943 | /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ | ||
| 1944 | kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); | ||
| 1945 | |||
| 1946 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
| 1947 | skip_emulated_instruction(&svm->vcpu); | ||
| 1948 | return 1; | ||
| 1949 | } | ||
| 1950 | |||
| 1848 | static int invalid_op_interception(struct vcpu_svm *svm, | 1951 | static int invalid_op_interception(struct vcpu_svm *svm, |
| 1849 | struct kvm_run *kvm_run) | 1952 | struct kvm_run *kvm_run) |
| 1850 | { | 1953 | { |
| @@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
| 1953 | struct vcpu_svm *svm = to_svm(vcpu); | 2056 | struct vcpu_svm *svm = to_svm(vcpu); |
| 1954 | 2057 | ||
| 1955 | switch (ecx) { | 2058 | switch (ecx) { |
| 1956 | case MSR_IA32_TIME_STAMP_COUNTER: { | 2059 | case MSR_IA32_TSC: { |
| 1957 | u64 tsc; | 2060 | u64 tsc; |
| 1958 | 2061 | ||
| 1959 | rdtscll(tsc); | 2062 | rdtscll(tsc); |
| @@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
| 1981 | *data = svm->vmcb->save.sysenter_cs; | 2084 | *data = svm->vmcb->save.sysenter_cs; |
| 1982 | break; | 2085 | break; |
| 1983 | case MSR_IA32_SYSENTER_EIP: | 2086 | case MSR_IA32_SYSENTER_EIP: |
| 1984 | *data = svm->vmcb->save.sysenter_eip; | 2087 | *data = svm->sysenter_eip; |
| 1985 | break; | 2088 | break; |
| 1986 | case MSR_IA32_SYSENTER_ESP: | 2089 | case MSR_IA32_SYSENTER_ESP: |
| 1987 | *data = svm->vmcb->save.sysenter_esp; | 2090 | *data = svm->sysenter_esp; |
| 1988 | break; | 2091 | break; |
| 1989 | /* Nobody will change the following 5 values in the VMCB so | 2092 | /* Nobody will change the following 5 values in the VMCB so |
| 1990 | we can safely return them on rdmsr. They will always be 0 | 2093 | we can safely return them on rdmsr. They will always be 0 |
| @@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
| 2005 | *data = svm->vmcb->save.last_excp_to; | 2108 | *data = svm->vmcb->save.last_excp_to; |
| 2006 | break; | 2109 | break; |
| 2007 | case MSR_VM_HSAVE_PA: | 2110 | case MSR_VM_HSAVE_PA: |
| 2008 | *data = svm->hsave_msr; | 2111 | *data = svm->nested.hsave_msr; |
| 2009 | break; | 2112 | break; |
| 2010 | case MSR_VM_CR: | 2113 | case MSR_VM_CR: |
| 2011 | *data = 0; | 2114 | *data = 0; |
| @@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 2027 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | 2130 | if (svm_get_msr(&svm->vcpu, ecx, &data)) |
| 2028 | kvm_inject_gp(&svm->vcpu, 0); | 2131 | kvm_inject_gp(&svm->vcpu, 0); |
| 2029 | else { | 2132 | else { |
| 2030 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, | 2133 | trace_kvm_msr_read(ecx, data); |
| 2031 | (u32)(data >> 32), handler); | ||
| 2032 | 2134 | ||
| 2033 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; | 2135 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; |
| 2034 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 2136 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
| @@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 2043 | struct vcpu_svm *svm = to_svm(vcpu); | 2145 | struct vcpu_svm *svm = to_svm(vcpu); |
| 2044 | 2146 | ||
| 2045 | switch (ecx) { | 2147 | switch (ecx) { |
| 2046 | case MSR_IA32_TIME_STAMP_COUNTER: { | 2148 | case MSR_IA32_TSC: { |
| 2047 | u64 tsc; | 2149 | u64 tsc; |
| 2048 | 2150 | ||
| 2049 | rdtscll(tsc); | 2151 | rdtscll(tsc); |
| @@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 2071 | svm->vmcb->save.sysenter_cs = data; | 2173 | svm->vmcb->save.sysenter_cs = data; |
| 2072 | break; | 2174 | break; |
| 2073 | case MSR_IA32_SYSENTER_EIP: | 2175 | case MSR_IA32_SYSENTER_EIP: |
| 2176 | svm->sysenter_eip = data; | ||
| 2074 | svm->vmcb->save.sysenter_eip = data; | 2177 | svm->vmcb->save.sysenter_eip = data; |
| 2075 | break; | 2178 | break; |
| 2076 | case MSR_IA32_SYSENTER_ESP: | 2179 | case MSR_IA32_SYSENTER_ESP: |
| 2180 | svm->sysenter_esp = data; | ||
| 2077 | svm->vmcb->save.sysenter_esp = data; | 2181 | svm->vmcb->save.sysenter_esp = data; |
| 2078 | break; | 2182 | break; |
| 2079 | case MSR_IA32_DEBUGCTLMSR: | 2183 | case MSR_IA32_DEBUGCTLMSR: |
| @@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 2091 | else | 2195 | else |
| 2092 | svm_disable_lbrv(svm); | 2196 | svm_disable_lbrv(svm); |
| 2093 | break; | 2197 | break; |
| 2094 | case MSR_K7_EVNTSEL0: | ||
| 2095 | case MSR_K7_EVNTSEL1: | ||
| 2096 | case MSR_K7_EVNTSEL2: | ||
| 2097 | case MSR_K7_EVNTSEL3: | ||
| 2098 | case MSR_K7_PERFCTR0: | ||
| 2099 | case MSR_K7_PERFCTR1: | ||
| 2100 | case MSR_K7_PERFCTR2: | ||
| 2101 | case MSR_K7_PERFCTR3: | ||
| 2102 | /* | ||
| 2103 | * Just discard all writes to the performance counters; this | ||
| 2104 | * should keep both older linux and windows 64-bit guests | ||
| 2105 | * happy | ||
| 2106 | */ | ||
| 2107 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); | ||
| 2108 | |||
| 2109 | break; | ||
| 2110 | case MSR_VM_HSAVE_PA: | 2198 | case MSR_VM_HSAVE_PA: |
| 2111 | svm->hsave_msr = data; | 2199 | svm->nested.hsave_msr = data; |
| 2200 | break; | ||
| 2201 | case MSR_VM_CR: | ||
| 2202 | case MSR_VM_IGNNE: | ||
| 2203 | pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); | ||
| 2112 | break; | 2204 | break; |
| 2113 | default: | 2205 | default: |
| 2114 | return kvm_set_msr_common(vcpu, ecx, data); | 2206 | return kvm_set_msr_common(vcpu, ecx, data); |
| @@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 2122 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | 2214 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
| 2123 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 2215 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
| 2124 | 2216 | ||
| 2125 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), | 2217 | trace_kvm_msr_write(ecx, data); |
| 2126 | handler); | ||
| 2127 | 2218 | ||
| 2128 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; | 2219 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
| 2129 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 2220 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
| @@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
| 2144 | static int interrupt_window_interception(struct vcpu_svm *svm, | 2235 | static int interrupt_window_interception(struct vcpu_svm *svm, |
| 2145 | struct kvm_run *kvm_run) | 2236 | struct kvm_run *kvm_run) |
| 2146 | { | 2237 | { |
| 2147 | KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); | ||
| 2148 | |||
| 2149 | svm_clear_vintr(svm); | 2238 | svm_clear_vintr(svm); |
| 2150 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2239 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
| 2151 | /* | 2240 | /* |
| @@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
| 2201 | [SVM_EXIT_INVD] = emulate_on_interception, | 2290 | [SVM_EXIT_INVD] = emulate_on_interception, |
| 2202 | [SVM_EXIT_HLT] = halt_interception, | 2291 | [SVM_EXIT_HLT] = halt_interception, |
| 2203 | [SVM_EXIT_INVLPG] = invlpg_interception, | 2292 | [SVM_EXIT_INVLPG] = invlpg_interception, |
| 2204 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | 2293 | [SVM_EXIT_INVLPGA] = invlpga_interception, |
| 2205 | [SVM_EXIT_IOIO] = io_interception, | 2294 | [SVM_EXIT_IOIO] = io_interception, |
| 2206 | [SVM_EXIT_MSR] = msr_interception, | 2295 | [SVM_EXIT_MSR] = msr_interception, |
| 2207 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, | 2296 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, |
| @@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 2224 | struct vcpu_svm *svm = to_svm(vcpu); | 2313 | struct vcpu_svm *svm = to_svm(vcpu); |
| 2225 | u32 exit_code = svm->vmcb->control.exit_code; | 2314 | u32 exit_code = svm->vmcb->control.exit_code; |
| 2226 | 2315 | ||
| 2227 | KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, | 2316 | trace_kvm_exit(exit_code, svm->vmcb->save.rip); |
| 2228 | (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); | ||
| 2229 | 2317 | ||
| 2230 | if (is_nested(svm)) { | 2318 | if (is_nested(svm)) { |
| 2319 | int vmexit; | ||
| 2320 | |||
| 2231 | nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", | 2321 | nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", |
| 2232 | exit_code, svm->vmcb->control.exit_info_1, | 2322 | exit_code, svm->vmcb->control.exit_info_1, |
| 2233 | svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); | 2323 | svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); |
| 2234 | if (nested_svm_exit_handled(svm, true)) { | 2324 | |
| 2235 | nested_svm_vmexit(svm); | 2325 | vmexit = nested_svm_exit_special(svm); |
| 2236 | nsvm_printk("-> #VMEXIT\n"); | 2326 | |
| 2327 | if (vmexit == NESTED_EXIT_CONTINUE) | ||
| 2328 | vmexit = nested_svm_exit_handled(svm); | ||
| 2329 | |||
| 2330 | if (vmexit == NESTED_EXIT_DONE) | ||
| 2237 | return 1; | 2331 | return 1; |
| 2238 | } | ||
| 2239 | } | 2332 | } |
| 2240 | 2333 | ||
| 2334 | svm_complete_interrupts(svm); | ||
| 2335 | |||
| 2241 | if (npt_enabled) { | 2336 | if (npt_enabled) { |
| 2242 | int mmu_reload = 0; | 2337 | int mmu_reload = 0; |
| 2243 | if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { | 2338 | if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { |
| @@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 2246 | } | 2341 | } |
| 2247 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | 2342 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
| 2248 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | 2343 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
| 2249 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 2250 | if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
| 2251 | kvm_inject_gp(vcpu, 0); | ||
| 2252 | return 1; | ||
| 2253 | } | ||
| 2254 | } | ||
| 2255 | if (mmu_reload) { | 2344 | if (mmu_reload) { |
| 2256 | kvm_mmu_reset_context(vcpu); | 2345 | kvm_mmu_reset_context(vcpu); |
| 2257 | kvm_mmu_load(vcpu); | 2346 | kvm_mmu_load(vcpu); |
| @@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
| 2319 | { | 2408 | { |
| 2320 | struct vmcb_control_area *control; | 2409 | struct vmcb_control_area *control; |
| 2321 | 2410 | ||
| 2322 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); | 2411 | trace_kvm_inj_virq(irq); |
| 2323 | 2412 | ||
| 2324 | ++svm->vcpu.stat.irq_injections; | 2413 | ++svm->vcpu.stat.irq_injections; |
| 2325 | control = &svm->vmcb->control; | 2414 | control = &svm->vmcb->control; |
| @@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
| 2329 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 2418 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
| 2330 | } | 2419 | } |
| 2331 | 2420 | ||
| 2332 | static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) | ||
| 2333 | { | ||
| 2334 | struct vcpu_svm *svm = to_svm(vcpu); | ||
| 2335 | |||
| 2336 | svm->vmcb->control.event_inj = nr | | ||
| 2337 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | ||
| 2338 | } | ||
| 2339 | |||
| 2340 | static void svm_set_irq(struct kvm_vcpu *vcpu) | 2421 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
| 2341 | { | 2422 | { |
| 2342 | struct vcpu_svm *svm = to_svm(vcpu); | 2423 | struct vcpu_svm *svm = to_svm(vcpu); |
| 2343 | 2424 | ||
| 2344 | nested_svm_intr(svm); | 2425 | BUG_ON(!(gif_set(svm))); |
| 2345 | 2426 | ||
| 2346 | svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); | 2427 | svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | |
| 2428 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | ||
| 2347 | } | 2429 | } |
| 2348 | 2430 | ||
| 2349 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | 2431 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) |
| @@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
| 2371 | struct vmcb *vmcb = svm->vmcb; | 2453 | struct vmcb *vmcb = svm->vmcb; |
| 2372 | return (vmcb->save.rflags & X86_EFLAGS_IF) && | 2454 | return (vmcb->save.rflags & X86_EFLAGS_IF) && |
| 2373 | !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && | 2455 | !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && |
| 2374 | (svm->vcpu.arch.hflags & HF_GIF_MASK); | 2456 | gif_set(svm) && |
| 2457 | !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); | ||
| 2375 | } | 2458 | } |
| 2376 | 2459 | ||
| 2377 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 2460 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
| 2378 | { | 2461 | { |
| 2379 | svm_set_vintr(to_svm(vcpu)); | 2462 | struct vcpu_svm *svm = to_svm(vcpu); |
| 2380 | svm_inject_irq(to_svm(vcpu), 0x0); | 2463 | nsvm_printk("Trying to open IRQ window\n"); |
| 2464 | |||
| 2465 | nested_svm_intr(svm); | ||
| 2466 | |||
| 2467 | /* In case GIF=0 we can't rely on the CPU to tell us when | ||
| 2468 | * GIF becomes 1, because that's a separate STGI/VMRUN intercept. | ||
| 2469 | * The next time we get that intercept, this function will be | ||
| 2470 | * called again though and we'll get the vintr intercept. */ | ||
| 2471 | if (gif_set(svm)) { | ||
| 2472 | svm_set_vintr(svm); | ||
| 2473 | svm_inject_irq(svm, 0x0); | ||
| 2474 | } | ||
| 2381 | } | 2475 | } |
| 2382 | 2476 | ||
| 2383 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | 2477 | static void enable_nmi_window(struct kvm_vcpu *vcpu) |
| @@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
| 2456 | case SVM_EXITINTINFO_TYPE_EXEPT: | 2550 | case SVM_EXITINTINFO_TYPE_EXEPT: |
| 2457 | /* In case of software exception do not reinject an exception | 2551 | /* In case of software exception do not reinject an exception |
| 2458 | vector, but re-execute the instruction instead */ | 2552 | vector, but re-execute the instruction instead */ |
| 2553 | if (is_nested(svm)) | ||
| 2554 | break; | ||
| 2459 | if (kvm_exception_is_soft(vector)) | 2555 | if (kvm_exception_is_soft(vector)) |
| 2460 | break; | 2556 | break; |
| 2461 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { | 2557 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { |
| @@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2498 | fs_selector = kvm_read_fs(); | 2594 | fs_selector = kvm_read_fs(); |
| 2499 | gs_selector = kvm_read_gs(); | 2595 | gs_selector = kvm_read_gs(); |
| 2500 | ldt_selector = kvm_read_ldt(); | 2596 | ldt_selector = kvm_read_ldt(); |
| 2501 | svm->host_cr2 = kvm_read_cr2(); | 2597 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
| 2502 | if (!is_nested(svm)) | ||
| 2503 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | ||
| 2504 | /* required for live migration with NPT */ | 2598 | /* required for live migration with NPT */ |
| 2505 | if (npt_enabled) | 2599 | if (npt_enabled) |
| 2506 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | 2600 | svm->vmcb->save.cr3 = vcpu->arch.cr3; |
| @@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2585 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | 2679 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
| 2586 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | 2680 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; |
| 2587 | 2681 | ||
| 2588 | kvm_write_cr2(svm->host_cr2); | ||
| 2589 | |||
| 2590 | kvm_load_fs(fs_selector); | 2682 | kvm_load_fs(fs_selector); |
| 2591 | kvm_load_gs(gs_selector); | 2683 | kvm_load_gs(gs_selector); |
| 2592 | kvm_load_ldt(ldt_selector); | 2684 | kvm_load_ldt(ldt_selector); |
| @@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2602 | 2694 | ||
| 2603 | svm->next_rip = 0; | 2695 | svm->next_rip = 0; |
| 2604 | 2696 | ||
| 2605 | svm_complete_interrupts(svm); | 2697 | if (npt_enabled) { |
| 2698 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); | ||
| 2699 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); | ||
| 2700 | } | ||
| 2606 | } | 2701 | } |
| 2607 | 2702 | ||
| 2608 | #undef R | 2703 | #undef R |
| @@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | |||
| 2673 | return 0; | 2768 | return 0; |
| 2674 | } | 2769 | } |
| 2675 | 2770 | ||
| 2771 | static const struct trace_print_flags svm_exit_reasons_str[] = { | ||
| 2772 | { SVM_EXIT_READ_CR0, "read_cr0" }, | ||
| 2773 | { SVM_EXIT_READ_CR3, "read_cr3" }, | ||
| 2774 | { SVM_EXIT_READ_CR4, "read_cr4" }, | ||
| 2775 | { SVM_EXIT_READ_CR8, "read_cr8" }, | ||
| 2776 | { SVM_EXIT_WRITE_CR0, "write_cr0" }, | ||
| 2777 | { SVM_EXIT_WRITE_CR3, "write_cr3" }, | ||
| 2778 | { SVM_EXIT_WRITE_CR4, "write_cr4" }, | ||
| 2779 | { SVM_EXIT_WRITE_CR8, "write_cr8" }, | ||
| 2780 | { SVM_EXIT_READ_DR0, "read_dr0" }, | ||
| 2781 | { SVM_EXIT_READ_DR1, "read_dr1" }, | ||
| 2782 | { SVM_EXIT_READ_DR2, "read_dr2" }, | ||
| 2783 | { SVM_EXIT_READ_DR3, "read_dr3" }, | ||
| 2784 | { SVM_EXIT_WRITE_DR0, "write_dr0" }, | ||
| 2785 | { SVM_EXIT_WRITE_DR1, "write_dr1" }, | ||
| 2786 | { SVM_EXIT_WRITE_DR2, "write_dr2" }, | ||
| 2787 | { SVM_EXIT_WRITE_DR3, "write_dr3" }, | ||
| 2788 | { SVM_EXIT_WRITE_DR5, "write_dr5" }, | ||
| 2789 | { SVM_EXIT_WRITE_DR7, "write_dr7" }, | ||
| 2790 | { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, | ||
| 2791 | { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, | ||
| 2792 | { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, | ||
| 2793 | { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, | ||
| 2794 | { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, | ||
| 2795 | { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, | ||
| 2796 | { SVM_EXIT_INTR, "interrupt" }, | ||
| 2797 | { SVM_EXIT_NMI, "nmi" }, | ||
| 2798 | { SVM_EXIT_SMI, "smi" }, | ||
| 2799 | { SVM_EXIT_INIT, "init" }, | ||
| 2800 | { SVM_EXIT_VINTR, "vintr" }, | ||
| 2801 | { SVM_EXIT_CPUID, "cpuid" }, | ||
| 2802 | { SVM_EXIT_INVD, "invd" }, | ||
| 2803 | { SVM_EXIT_HLT, "hlt" }, | ||
| 2804 | { SVM_EXIT_INVLPG, "invlpg" }, | ||
| 2805 | { SVM_EXIT_INVLPGA, "invlpga" }, | ||
| 2806 | { SVM_EXIT_IOIO, "io" }, | ||
| 2807 | { SVM_EXIT_MSR, "msr" }, | ||
| 2808 | { SVM_EXIT_TASK_SWITCH, "task_switch" }, | ||
| 2809 | { SVM_EXIT_SHUTDOWN, "shutdown" }, | ||
| 2810 | { SVM_EXIT_VMRUN, "vmrun" }, | ||
| 2811 | { SVM_EXIT_VMMCALL, "hypercall" }, | ||
| 2812 | { SVM_EXIT_VMLOAD, "vmload" }, | ||
| 2813 | { SVM_EXIT_VMSAVE, "vmsave" }, | ||
| 2814 | { SVM_EXIT_STGI, "stgi" }, | ||
| 2815 | { SVM_EXIT_CLGI, "clgi" }, | ||
| 2816 | { SVM_EXIT_SKINIT, "skinit" }, | ||
| 2817 | { SVM_EXIT_WBINVD, "wbinvd" }, | ||
| 2818 | { SVM_EXIT_MONITOR, "monitor" }, | ||
| 2819 | { SVM_EXIT_MWAIT, "mwait" }, | ||
| 2820 | { SVM_EXIT_NPF, "npf" }, | ||
| 2821 | { -1, NULL } | ||
| 2822 | }; | ||
| 2823 | |||
| 2824 | static bool svm_gb_page_enable(void) | ||
| 2825 | { | ||
| 2826 | return true; | ||
| 2827 | } | ||
| 2828 | |||
| 2676 | static struct kvm_x86_ops svm_x86_ops = { | 2829 | static struct kvm_x86_ops svm_x86_ops = { |
| 2677 | .cpu_has_kvm_support = has_svm, | 2830 | .cpu_has_kvm_support = has_svm, |
| 2678 | .disabled_by_bios = is_disabled, | 2831 | .disabled_by_bios = is_disabled, |
| @@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 2710 | .set_gdt = svm_set_gdt, | 2863 | .set_gdt = svm_set_gdt, |
| 2711 | .get_dr = svm_get_dr, | 2864 | .get_dr = svm_get_dr, |
| 2712 | .set_dr = svm_set_dr, | 2865 | .set_dr = svm_set_dr, |
| 2866 | .cache_reg = svm_cache_reg, | ||
| 2713 | .get_rflags = svm_get_rflags, | 2867 | .get_rflags = svm_get_rflags, |
| 2714 | .set_rflags = svm_set_rflags, | 2868 | .set_rflags = svm_set_rflags, |
| 2715 | 2869 | ||
| @@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 2733 | .set_tss_addr = svm_set_tss_addr, | 2887 | .set_tss_addr = svm_set_tss_addr, |
| 2734 | .get_tdp_level = get_npt_level, | 2888 | .get_tdp_level = get_npt_level, |
| 2735 | .get_mt_mask = svm_get_mt_mask, | 2889 | .get_mt_mask = svm_get_mt_mask, |
| 2890 | |||
| 2891 | .exit_reasons_str = svm_exit_reasons_str, | ||
| 2892 | .gb_page_enable = svm_gb_page_enable, | ||
| 2736 | }; | 2893 | }; |
| 2737 | 2894 | ||
| 2738 | static int __init svm_init(void) | 2895 | static int __init svm_init(void) |
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 86dbac072d0..eea40439066 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
| @@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | |||
| 9 | int restart_timer = 0; | 9 | int restart_timer = 0; |
| 10 | wait_queue_head_t *q = &vcpu->wq; | 10 | wait_queue_head_t *q = &vcpu->wq; |
| 11 | 11 | ||
| 12 | /* FIXME: this code should not know anything about vcpus */ | 12 | /* |
| 13 | if (!atomic_inc_and_test(&ktimer->pending)) | 13 | * There is a race window between reading and incrementing, but we do |
| 14 | * not care about potentially losing timer events in the !reinject | ||
| 15 | * case anyway. | ||
| 16 | */ | ||
| 17 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | ||
| 18 | atomic_inc(&ktimer->pending); | ||
| 19 | /* FIXME: this code should not know anything about vcpus */ | ||
| 14 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | 20 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); |
| 15 | 21 | } | |
| 16 | if (!ktimer->reinject) | ||
| 17 | atomic_set(&ktimer->pending, 1); | ||
| 18 | 22 | ||
| 19 | if (waitqueue_active(q)) | 23 | if (waitqueue_active(q)) |
| 20 | wake_up_interruptible(q); | 24 | wake_up_interruptible(q); |
| @@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) | |||
| 33 | struct kvm_vcpu *vcpu; | 37 | struct kvm_vcpu *vcpu; |
| 34 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | 38 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); |
| 35 | 39 | ||
| 36 | vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; | 40 | vcpu = ktimer->vcpu; |
| 37 | if (!vcpu) | 41 | if (!vcpu) |
| 38 | return HRTIMER_NORESTART; | 42 | return HRTIMER_NORESTART; |
| 39 | 43 | ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 00000000000..0d480e77eac --- /dev/null +++ b/arch/x86/kvm/trace.h | |||
| @@ -0,0 +1,355 @@ | |||
| 1 | #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 2 | #define _TRACE_KVM_H | ||
| 3 | |||
| 4 | #include <linux/tracepoint.h> | ||
| 5 | |||
| 6 | #undef TRACE_SYSTEM | ||
| 7 | #define TRACE_SYSTEM kvm | ||
| 8 | #define TRACE_INCLUDE_PATH arch/x86/kvm | ||
| 9 | #define TRACE_INCLUDE_FILE trace | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Tracepoint for guest mode entry. | ||
| 13 | */ | ||
| 14 | TRACE_EVENT(kvm_entry, | ||
| 15 | TP_PROTO(unsigned int vcpu_id), | ||
| 16 | TP_ARGS(vcpu_id), | ||
| 17 | |||
| 18 | TP_STRUCT__entry( | ||
| 19 | __field( unsigned int, vcpu_id ) | ||
| 20 | ), | ||
| 21 | |||
| 22 | TP_fast_assign( | ||
| 23 | __entry->vcpu_id = vcpu_id; | ||
| 24 | ), | ||
| 25 | |||
| 26 | TP_printk("vcpu %u", __entry->vcpu_id) | ||
| 27 | ); | ||
| 28 | |||
| 29 | /* | ||
| 30 | * Tracepoint for hypercall. | ||
| 31 | */ | ||
| 32 | TRACE_EVENT(kvm_hypercall, | ||
| 33 | TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, | ||
| 34 | unsigned long a2, unsigned long a3), | ||
| 35 | TP_ARGS(nr, a0, a1, a2, a3), | ||
| 36 | |||
| 37 | TP_STRUCT__entry( | ||
| 38 | __field( unsigned long, nr ) | ||
| 39 | __field( unsigned long, a0 ) | ||
| 40 | __field( unsigned long, a1 ) | ||
| 41 | __field( unsigned long, a2 ) | ||
| 42 | __field( unsigned long, a3 ) | ||
| 43 | ), | ||
| 44 | |||
| 45 | TP_fast_assign( | ||
| 46 | __entry->nr = nr; | ||
| 47 | __entry->a0 = a0; | ||
| 48 | __entry->a1 = a1; | ||
| 49 | __entry->a2 = a2; | ||
| 50 | __entry->a3 = a3; | ||
| 51 | ), | ||
| 52 | |||
| 53 | TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", | ||
| 54 | __entry->nr, __entry->a0, __entry->a1, __entry->a2, | ||
| 55 | __entry->a3) | ||
| 56 | ); | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Tracepoint for PIO. | ||
| 60 | */ | ||
| 61 | TRACE_EVENT(kvm_pio, | ||
| 62 | TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, | ||
| 63 | unsigned int count), | ||
| 64 | TP_ARGS(rw, port, size, count), | ||
| 65 | |||
| 66 | TP_STRUCT__entry( | ||
| 67 | __field( unsigned int, rw ) | ||
| 68 | __field( unsigned int, port ) | ||
| 69 | __field( unsigned int, size ) | ||
| 70 | __field( unsigned int, count ) | ||
| 71 | ), | ||
| 72 | |||
| 73 | TP_fast_assign( | ||
| 74 | __entry->rw = rw; | ||
| 75 | __entry->port = port; | ||
| 76 | __entry->size = size; | ||
| 77 | __entry->count = count; | ||
| 78 | ), | ||
| 79 | |||
| 80 | TP_printk("pio_%s at 0x%x size %d count %d", | ||
| 81 | __entry->rw ? "write" : "read", | ||
| 82 | __entry->port, __entry->size, __entry->count) | ||
| 83 | ); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Tracepoint for cpuid. | ||
| 87 | */ | ||
| 88 | TRACE_EVENT(kvm_cpuid, | ||
| 89 | TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, | ||
| 90 | unsigned long rcx, unsigned long rdx), | ||
| 91 | TP_ARGS(function, rax, rbx, rcx, rdx), | ||
| 92 | |||
| 93 | TP_STRUCT__entry( | ||
| 94 | __field( unsigned int, function ) | ||
| 95 | __field( unsigned long, rax ) | ||
| 96 | __field( unsigned long, rbx ) | ||
| 97 | __field( unsigned long, rcx ) | ||
| 98 | __field( unsigned long, rdx ) | ||
| 99 | ), | ||
| 100 | |||
| 101 | TP_fast_assign( | ||
| 102 | __entry->function = function; | ||
| 103 | __entry->rax = rax; | ||
| 104 | __entry->rbx = rbx; | ||
| 105 | __entry->rcx = rcx; | ||
| 106 | __entry->rdx = rdx; | ||
| 107 | ), | ||
| 108 | |||
| 109 | TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", | ||
| 110 | __entry->function, __entry->rax, | ||
| 111 | __entry->rbx, __entry->rcx, __entry->rdx) | ||
| 112 | ); | ||
| 113 | |||
| 114 | #define AREG(x) { APIC_##x, "APIC_" #x } | ||
| 115 | |||
| 116 | #define kvm_trace_symbol_apic \ | ||
| 117 | AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ | ||
| 118 | AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ | ||
| 119 | AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ | ||
| 120 | AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ | ||
| 121 | AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ | ||
| 122 | AREG(ECTRL) | ||
| 123 | /* | ||
| 124 | * Tracepoint for apic access. | ||
| 125 | */ | ||
| 126 | TRACE_EVENT(kvm_apic, | ||
| 127 | TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), | ||
| 128 | TP_ARGS(rw, reg, val), | ||
| 129 | |||
| 130 | TP_STRUCT__entry( | ||
| 131 | __field( unsigned int, rw ) | ||
| 132 | __field( unsigned int, reg ) | ||
| 133 | __field( unsigned int, val ) | ||
| 134 | ), | ||
| 135 | |||
| 136 | TP_fast_assign( | ||
| 137 | __entry->rw = rw; | ||
| 138 | __entry->reg = reg; | ||
| 139 | __entry->val = val; | ||
| 140 | ), | ||
| 141 | |||
| 142 | TP_printk("apic_%s %s = 0x%x", | ||
| 143 | __entry->rw ? "write" : "read", | ||
| 144 | __print_symbolic(__entry->reg, kvm_trace_symbol_apic), | ||
| 145 | __entry->val) | ||
| 146 | ); | ||
| 147 | |||
| 148 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) | ||
| 149 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) | ||
| 150 | |||
| 151 | /* | ||
| 152 | * Tracepoint for kvm guest exit: | ||
| 153 | */ | ||
| 154 | TRACE_EVENT(kvm_exit, | ||
| 155 | TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), | ||
| 156 | TP_ARGS(exit_reason, guest_rip), | ||
| 157 | |||
| 158 | TP_STRUCT__entry( | ||
| 159 | __field( unsigned int, exit_reason ) | ||
| 160 | __field( unsigned long, guest_rip ) | ||
| 161 | ), | ||
| 162 | |||
| 163 | TP_fast_assign( | ||
| 164 | __entry->exit_reason = exit_reason; | ||
| 165 | __entry->guest_rip = guest_rip; | ||
| 166 | ), | ||
| 167 | |||
| 168 | TP_printk("reason %s rip 0x%lx", | ||
| 169 | ftrace_print_symbols_seq(p, __entry->exit_reason, | ||
| 170 | kvm_x86_ops->exit_reasons_str), | ||
| 171 | __entry->guest_rip) | ||
| 172 | ); | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Tracepoint for kvm interrupt injection: | ||
| 176 | */ | ||
| 177 | TRACE_EVENT(kvm_inj_virq, | ||
| 178 | TP_PROTO(unsigned int irq), | ||
| 179 | TP_ARGS(irq), | ||
| 180 | |||
| 181 | TP_STRUCT__entry( | ||
| 182 | __field( unsigned int, irq ) | ||
| 183 | ), | ||
| 184 | |||
| 185 | TP_fast_assign( | ||
| 186 | __entry->irq = irq; | ||
| 187 | ), | ||
| 188 | |||
| 189 | TP_printk("irq %u", __entry->irq) | ||
| 190 | ); | ||
| 191 | |||
| 192 | /* | ||
| 193 | * Tracepoint for page fault. | ||
| 194 | */ | ||
| 195 | TRACE_EVENT(kvm_page_fault, | ||
| 196 | TP_PROTO(unsigned long fault_address, unsigned int error_code), | ||
| 197 | TP_ARGS(fault_address, error_code), | ||
| 198 | |||
| 199 | TP_STRUCT__entry( | ||
| 200 | __field( unsigned long, fault_address ) | ||
| 201 | __field( unsigned int, error_code ) | ||
| 202 | ), | ||
| 203 | |||
| 204 | TP_fast_assign( | ||
| 205 | __entry->fault_address = fault_address; | ||
| 206 | __entry->error_code = error_code; | ||
| 207 | ), | ||
| 208 | |||
| 209 | TP_printk("address %lx error_code %x", | ||
| 210 | __entry->fault_address, __entry->error_code) | ||
| 211 | ); | ||
| 212 | |||
| 213 | /* | ||
| 214 | * Tracepoint for guest MSR access. | ||
| 215 | */ | ||
| 216 | TRACE_EVENT(kvm_msr, | ||
| 217 | TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), | ||
| 218 | TP_ARGS(rw, ecx, data), | ||
| 219 | |||
| 220 | TP_STRUCT__entry( | ||
| 221 | __field( unsigned int, rw ) | ||
| 222 | __field( unsigned int, ecx ) | ||
| 223 | __field( unsigned long, data ) | ||
| 224 | ), | ||
| 225 | |||
| 226 | TP_fast_assign( | ||
| 227 | __entry->rw = rw; | ||
| 228 | __entry->ecx = ecx; | ||
| 229 | __entry->data = data; | ||
| 230 | ), | ||
| 231 | |||
| 232 | TP_printk("msr_%s %x = 0x%lx", | ||
| 233 | __entry->rw ? "write" : "read", | ||
| 234 | __entry->ecx, __entry->data) | ||
| 235 | ); | ||
| 236 | |||
| 237 | #define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) | ||
| 238 | #define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) | ||
| 239 | |||
| 240 | /* | ||
| 241 | * Tracepoint for guest CR access. | ||
| 242 | */ | ||
| 243 | TRACE_EVENT(kvm_cr, | ||
| 244 | TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), | ||
| 245 | TP_ARGS(rw, cr, val), | ||
| 246 | |||
| 247 | TP_STRUCT__entry( | ||
| 248 | __field( unsigned int, rw ) | ||
| 249 | __field( unsigned int, cr ) | ||
| 250 | __field( unsigned long, val ) | ||
| 251 | ), | ||
| 252 | |||
| 253 | TP_fast_assign( | ||
| 254 | __entry->rw = rw; | ||
| 255 | __entry->cr = cr; | ||
| 256 | __entry->val = val; | ||
| 257 | ), | ||
| 258 | |||
| 259 | TP_printk("cr_%s %x = 0x%lx", | ||
| 260 | __entry->rw ? "write" : "read", | ||
| 261 | __entry->cr, __entry->val) | ||
| 262 | ); | ||
| 263 | |||
| 264 | #define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) | ||
| 265 | #define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) | ||
| 266 | |||
| 267 | TRACE_EVENT(kvm_pic_set_irq, | ||
| 268 | TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), | ||
| 269 | TP_ARGS(chip, pin, elcr, imr, coalesced), | ||
| 270 | |||
| 271 | TP_STRUCT__entry( | ||
| 272 | __field( __u8, chip ) | ||
| 273 | __field( __u8, pin ) | ||
| 274 | __field( __u8, elcr ) | ||
| 275 | __field( __u8, imr ) | ||
| 276 | __field( bool, coalesced ) | ||
| 277 | ), | ||
| 278 | |||
| 279 | TP_fast_assign( | ||
| 280 | __entry->chip = chip; | ||
| 281 | __entry->pin = pin; | ||
| 282 | __entry->elcr = elcr; | ||
| 283 | __entry->imr = imr; | ||
| 284 | __entry->coalesced = coalesced; | ||
| 285 | ), | ||
| 286 | |||
| 287 | TP_printk("chip %u pin %u (%s%s)%s", | ||
| 288 | __entry->chip, __entry->pin, | ||
| 289 | (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", | ||
| 290 | (__entry->imr & (1 << __entry->pin)) ? "|masked":"", | ||
| 291 | __entry->coalesced ? " (coalesced)" : "") | ||
| 292 | ); | ||
| 293 | |||
| 294 | #define kvm_apic_dst_shorthand \ | ||
| 295 | {0x0, "dst"}, \ | ||
| 296 | {0x1, "self"}, \ | ||
| 297 | {0x2, "all"}, \ | ||
| 298 | {0x3, "all-but-self"} | ||
| 299 | |||
| 300 | TRACE_EVENT(kvm_apic_ipi, | ||
| 301 | TP_PROTO(__u32 icr_low, __u32 dest_id), | ||
| 302 | TP_ARGS(icr_low, dest_id), | ||
| 303 | |||
| 304 | TP_STRUCT__entry( | ||
| 305 | __field( __u32, icr_low ) | ||
| 306 | __field( __u32, dest_id ) | ||
| 307 | ), | ||
| 308 | |||
| 309 | TP_fast_assign( | ||
| 310 | __entry->icr_low = icr_low; | ||
| 311 | __entry->dest_id = dest_id; | ||
| 312 | ), | ||
| 313 | |||
| 314 | TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", | ||
| 315 | __entry->dest_id, (u8)__entry->icr_low, | ||
| 316 | __print_symbolic((__entry->icr_low >> 8 & 0x7), | ||
| 317 | kvm_deliver_mode), | ||
| 318 | (__entry->icr_low & (1<<11)) ? "logical" : "physical", | ||
| 319 | (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", | ||
| 320 | (__entry->icr_low & (1<<15)) ? "level" : "edge", | ||
| 321 | __print_symbolic((__entry->icr_low >> 18 & 0x3), | ||
| 322 | kvm_apic_dst_shorthand)) | ||
| 323 | ); | ||
| 324 | |||
| 325 | TRACE_EVENT(kvm_apic_accept_irq, | ||
| 326 | TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), | ||
| 327 | TP_ARGS(apicid, dm, tm, vec, coalesced), | ||
| 328 | |||
| 329 | TP_STRUCT__entry( | ||
| 330 | __field( __u32, apicid ) | ||
| 331 | __field( __u16, dm ) | ||
| 332 | __field( __u8, tm ) | ||
| 333 | __field( __u8, vec ) | ||
| 334 | __field( bool, coalesced ) | ||
| 335 | ), | ||
| 336 | |||
| 337 | TP_fast_assign( | ||
| 338 | __entry->apicid = apicid; | ||
| 339 | __entry->dm = dm; | ||
| 340 | __entry->tm = tm; | ||
| 341 | __entry->vec = vec; | ||
| 342 | __entry->coalesced = coalesced; | ||
| 343 | ), | ||
| 344 | |||
| 345 | TP_printk("apicid %x vec %u (%s|%s)%s", | ||
| 346 | __entry->apicid, __entry->vec, | ||
| 347 | __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), | ||
| 348 | __entry->tm ? "level" : "edge", | ||
| 349 | __entry->coalesced ? " (coalesced)" : "") | ||
| 350 | ); | ||
| 351 | |||
| 352 | #endif /* _TRACE_KVM_H */ | ||
| 353 | |||
| 354 | /* This part must be outside protection */ | ||
| 355 | #include <trace/define_trace.h> | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f912927a5..f3812014bd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/highmem.h> | 25 | #include <linux/highmem.h> |
| 26 | #include <linux/sched.h> | 26 | #include <linux/sched.h> |
| 27 | #include <linux/moduleparam.h> | 27 | #include <linux/moduleparam.h> |
| 28 | #include <linux/ftrace_event.h> | ||
| 28 | #include "kvm_cache_regs.h" | 29 | #include "kvm_cache_regs.h" |
| 29 | #include "x86.h" | 30 | #include "x86.h" |
| 30 | 31 | ||
| @@ -34,6 +35,8 @@ | |||
| 34 | #include <asm/virtext.h> | 35 | #include <asm/virtext.h> |
| 35 | #include <asm/mce.h> | 36 | #include <asm/mce.h> |
| 36 | 37 | ||
| 38 | #include "trace.h" | ||
| 39 | |||
| 37 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 40 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
| 38 | 41 | ||
| 39 | MODULE_AUTHOR("Qumranet"); | 42 | MODULE_AUTHOR("Qumranet"); |
| @@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); | |||
| 51 | static int __read_mostly enable_ept = 1; | 54 | static int __read_mostly enable_ept = 1; |
| 52 | module_param_named(ept, enable_ept, bool, S_IRUGO); | 55 | module_param_named(ept, enable_ept, bool, S_IRUGO); |
| 53 | 56 | ||
| 57 | static int __read_mostly enable_unrestricted_guest = 1; | ||
| 58 | module_param_named(unrestricted_guest, | ||
| 59 | enable_unrestricted_guest, bool, S_IRUGO); | ||
| 60 | |||
| 54 | static int __read_mostly emulate_invalid_guest_state = 0; | 61 | static int __read_mostly emulate_invalid_guest_state = 0; |
| 55 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 62 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
| 56 | 63 | ||
| @@ -84,6 +91,14 @@ struct vcpu_vmx { | |||
| 84 | int guest_efer_loaded; | 91 | int guest_efer_loaded; |
| 85 | } host_state; | 92 | } host_state; |
| 86 | struct { | 93 | struct { |
| 94 | int vm86_active; | ||
| 95 | u8 save_iopl; | ||
| 96 | struct kvm_save_segment { | ||
| 97 | u16 selector; | ||
| 98 | unsigned long base; | ||
| 99 | u32 limit; | ||
| 100 | u32 ar; | ||
| 101 | } tr, es, ds, fs, gs; | ||
| 87 | struct { | 102 | struct { |
| 88 | bool pending; | 103 | bool pending; |
| 89 | u8 vector; | 104 | u8 vector; |
| @@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field { | |||
| 161 | VMX_SEGMENT_FIELD(LDTR), | 176 | VMX_SEGMENT_FIELD(LDTR), |
| 162 | }; | 177 | }; |
| 163 | 178 | ||
| 179 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu); | ||
| 180 | |||
| 164 | /* | 181 | /* |
| 165 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it | 182 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it |
| 166 | * away by decrementing the array size. | 183 | * away by decrementing the array size. |
| @@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void) | |||
| 256 | cpu_has_vmx_virtualize_apic_accesses(); | 273 | cpu_has_vmx_virtualize_apic_accesses(); |
| 257 | } | 274 | } |
| 258 | 275 | ||
| 276 | static inline bool cpu_has_vmx_ept_execute_only(void) | ||
| 277 | { | ||
| 278 | return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); | ||
| 279 | } | ||
| 280 | |||
| 281 | static inline bool cpu_has_vmx_eptp_uncacheable(void) | ||
| 282 | { | ||
| 283 | return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); | ||
| 284 | } | ||
| 285 | |||
| 286 | static inline bool cpu_has_vmx_eptp_writeback(void) | ||
| 287 | { | ||
| 288 | return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); | ||
| 289 | } | ||
| 290 | |||
| 291 | static inline bool cpu_has_vmx_ept_2m_page(void) | ||
| 292 | { | ||
| 293 | return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); | ||
| 294 | } | ||
| 295 | |||
| 259 | static inline int cpu_has_vmx_invept_individual_addr(void) | 296 | static inline int cpu_has_vmx_invept_individual_addr(void) |
| 260 | { | 297 | { |
| 261 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); | 298 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); |
| @@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void) | |||
| 277 | SECONDARY_EXEC_ENABLE_EPT; | 314 | SECONDARY_EXEC_ENABLE_EPT; |
| 278 | } | 315 | } |
| 279 | 316 | ||
| 317 | static inline int cpu_has_vmx_unrestricted_guest(void) | ||
| 318 | { | ||
| 319 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 320 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 321 | } | ||
| 322 | |||
| 280 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | 323 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) |
| 281 | { | 324 | { |
| 282 | return flexpriority_enabled && | 325 | return flexpriority_enabled && |
| @@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
| 497 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); | 540 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); |
| 498 | if (!vcpu->fpu_active) | 541 | if (!vcpu->fpu_active) |
| 499 | eb |= 1u << NM_VECTOR; | 542 | eb |= 1u << NM_VECTOR; |
| 543 | /* | ||
| 544 | * Unconditionally intercept #DB so we can maintain dr6 without | ||
| 545 | * reading it every exit. | ||
| 546 | */ | ||
| 547 | eb |= 1u << DB_VECTOR; | ||
| 500 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 548 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
| 501 | if (vcpu->guest_debug & | ||
| 502 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
| 503 | eb |= 1u << DB_VECTOR; | ||
| 504 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 549 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
| 505 | eb |= 1u << BP_VECTOR; | 550 | eb |= 1u << BP_VECTOR; |
| 506 | } | 551 | } |
| 507 | if (vcpu->arch.rmode.vm86_active) | 552 | if (to_vmx(vcpu)->rmode.vm86_active) |
| 508 | eb = ~0; | 553 | eb = ~0; |
| 509 | if (enable_ept) | 554 | if (enable_ept) |
| 510 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 555 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
| @@ -528,12 +573,15 @@ static void reload_tss(void) | |||
| 528 | static void load_transition_efer(struct vcpu_vmx *vmx) | 573 | static void load_transition_efer(struct vcpu_vmx *vmx) |
| 529 | { | 574 | { |
| 530 | int efer_offset = vmx->msr_offset_efer; | 575 | int efer_offset = vmx->msr_offset_efer; |
| 531 | u64 host_efer = vmx->host_msrs[efer_offset].data; | 576 | u64 host_efer; |
| 532 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | 577 | u64 guest_efer; |
| 533 | u64 ignore_bits; | 578 | u64 ignore_bits; |
| 534 | 579 | ||
| 535 | if (efer_offset < 0) | 580 | if (efer_offset < 0) |
| 536 | return; | 581 | return; |
| 582 | host_efer = vmx->host_msrs[efer_offset].data; | ||
| 583 | guest_efer = vmx->guest_msrs[efer_offset].data; | ||
| 584 | |||
| 537 | /* | 585 | /* |
| 538 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless | 586 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless |
| 539 | * outside long mode | 587 | * outside long mode |
| @@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
| 735 | 783 | ||
| 736 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | 784 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) |
| 737 | { | 785 | { |
| 738 | return vmcs_readl(GUEST_RFLAGS); | 786 | unsigned long rflags; |
| 787 | |||
| 788 | rflags = vmcs_readl(GUEST_RFLAGS); | ||
| 789 | if (to_vmx(vcpu)->rmode.vm86_active) | ||
| 790 | rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | ||
| 791 | return rflags; | ||
| 739 | } | 792 | } |
| 740 | 793 | ||
| 741 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 794 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
| 742 | { | 795 | { |
| 743 | if (vcpu->arch.rmode.vm86_active) | 796 | if (to_vmx(vcpu)->rmode.vm86_active) |
| 744 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 797 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
| 745 | vmcs_writel(GUEST_RFLAGS, rflags); | 798 | vmcs_writel(GUEST_RFLAGS, rflags); |
| 746 | } | 799 | } |
| @@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
| 797 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 850 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
| 798 | } | 851 | } |
| 799 | 852 | ||
| 800 | if (vcpu->arch.rmode.vm86_active) { | 853 | if (vmx->rmode.vm86_active) { |
| 801 | vmx->rmode.irq.pending = true; | 854 | vmx->rmode.irq.pending = true; |
| 802 | vmx->rmode.irq.vector = nr; | 855 | vmx->rmode.irq.vector = nr; |
| 803 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 856 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
| 804 | if (nr == BP_VECTOR || nr == OF_VECTOR) | 857 | if (kvm_exception_is_soft(nr)) |
| 805 | vmx->rmode.irq.rip++; | 858 | vmx->rmode.irq.rip += |
| 859 | vmx->vcpu.arch.event_exit_inst_len; | ||
| 806 | intr_info |= INTR_TYPE_SOFT_INTR; | 860 | intr_info |= INTR_TYPE_SOFT_INTR; |
| 807 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 861 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
| 808 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 862 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
| @@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
| 940 | case MSR_EFER: | 994 | case MSR_EFER: |
| 941 | return kvm_get_msr_common(vcpu, msr_index, pdata); | 995 | return kvm_get_msr_common(vcpu, msr_index, pdata); |
| 942 | #endif | 996 | #endif |
| 943 | case MSR_IA32_TIME_STAMP_COUNTER: | 997 | case MSR_IA32_TSC: |
| 944 | data = guest_read_tsc(); | 998 | data = guest_read_tsc(); |
| 945 | break; | 999 | break; |
| 946 | case MSR_IA32_SYSENTER_CS: | 1000 | case MSR_IA32_SYSENTER_CS: |
| @@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
| 953 | data = vmcs_readl(GUEST_SYSENTER_ESP); | 1007 | data = vmcs_readl(GUEST_SYSENTER_ESP); |
| 954 | break; | 1008 | break; |
| 955 | default: | 1009 | default: |
| 956 | vmx_load_host_state(to_vmx(vcpu)); | ||
| 957 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 1010 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
| 958 | if (msr) { | 1011 | if (msr) { |
| 1012 | vmx_load_host_state(to_vmx(vcpu)); | ||
| 959 | data = msr->data; | 1013 | data = msr->data; |
| 960 | break; | 1014 | break; |
| 961 | } | 1015 | } |
| @@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
| 1000 | case MSR_IA32_SYSENTER_ESP: | 1054 | case MSR_IA32_SYSENTER_ESP: |
| 1001 | vmcs_writel(GUEST_SYSENTER_ESP, data); | 1055 | vmcs_writel(GUEST_SYSENTER_ESP, data); |
| 1002 | break; | 1056 | break; |
| 1003 | case MSR_IA32_TIME_STAMP_COUNTER: | 1057 | case MSR_IA32_TSC: |
| 1004 | rdtscll(host_tsc); | 1058 | rdtscll(host_tsc); |
| 1005 | guest_write_tsc(data, host_tsc); | 1059 | guest_write_tsc(data, host_tsc); |
| 1006 | break; | 1060 | break; |
| 1007 | case MSR_P6_PERFCTR0: | ||
| 1008 | case MSR_P6_PERFCTR1: | ||
| 1009 | case MSR_P6_EVNTSEL0: | ||
| 1010 | case MSR_P6_EVNTSEL1: | ||
| 1011 | /* | ||
| 1012 | * Just discard all writes to the performance counters; this | ||
| 1013 | * should keep both older linux and windows 64-bit guests | ||
| 1014 | * happy | ||
| 1015 | */ | ||
| 1016 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); | ||
| 1017 | |||
| 1018 | break; | ||
| 1019 | case MSR_IA32_CR_PAT: | 1061 | case MSR_IA32_CR_PAT: |
| 1020 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 1062 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
| 1021 | vmcs_write64(GUEST_IA32_PAT, data); | 1063 | vmcs_write64(GUEST_IA32_PAT, data); |
| @@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
| 1024 | } | 1066 | } |
| 1025 | /* Otherwise falls through to kvm_set_msr_common */ | 1067 | /* Otherwise falls through to kvm_set_msr_common */ |
| 1026 | default: | 1068 | default: |
| 1027 | vmx_load_host_state(vmx); | ||
| 1028 | msr = find_msr_entry(vmx, msr_index); | 1069 | msr = find_msr_entry(vmx, msr_index); |
| 1029 | if (msr) { | 1070 | if (msr) { |
| 1071 | vmx_load_host_state(vmx); | ||
| 1030 | msr->data = data; | 1072 | msr->data = data; |
| 1031 | break; | 1073 | break; |
| 1032 | } | 1074 | } |
| @@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
| 1046 | case VCPU_REGS_RIP: | 1088 | case VCPU_REGS_RIP: |
| 1047 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | 1089 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); |
| 1048 | break; | 1090 | break; |
| 1091 | case VCPU_EXREG_PDPTR: | ||
| 1092 | if (enable_ept) | ||
| 1093 | ept_save_pdptrs(vcpu); | ||
| 1094 | break; | ||
| 1049 | default: | 1095 | default: |
| 1050 | break; | 1096 | break; |
| 1051 | } | 1097 | } |
| @@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 1203 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 1249 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
| 1204 | SECONDARY_EXEC_WBINVD_EXITING | | 1250 | SECONDARY_EXEC_WBINVD_EXITING | |
| 1205 | SECONDARY_EXEC_ENABLE_VPID | | 1251 | SECONDARY_EXEC_ENABLE_VPID | |
| 1206 | SECONDARY_EXEC_ENABLE_EPT; | 1252 | SECONDARY_EXEC_ENABLE_EPT | |
| 1253 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 1207 | if (adjust_vmx_controls(min2, opt2, | 1254 | if (adjust_vmx_controls(min2, opt2, |
| 1208 | MSR_IA32_VMX_PROCBASED_CTLS2, | 1255 | MSR_IA32_VMX_PROCBASED_CTLS2, |
| 1209 | &_cpu_based_2nd_exec_control) < 0) | 1256 | &_cpu_based_2nd_exec_control) < 0) |
| @@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 1217 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 1264 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
| 1218 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT is | 1265 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT is |
| 1219 | enabled */ | 1266 | enabled */ |
| 1220 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | | 1267 | _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | |
| 1221 | CPU_BASED_CR3_STORE_EXITING | | 1268 | CPU_BASED_CR3_STORE_EXITING | |
| 1222 | CPU_BASED_INVLPG_EXITING); | 1269 | CPU_BASED_INVLPG_EXITING); |
| 1223 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
| 1224 | &_cpu_based_exec_control) < 0) | ||
| 1225 | return -EIO; | ||
| 1226 | rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, | 1270 | rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, |
| 1227 | vmx_capability.ept, vmx_capability.vpid); | 1271 | vmx_capability.ept, vmx_capability.vpid); |
| 1228 | } | 1272 | } |
| @@ -1333,8 +1377,13 @@ static __init int hardware_setup(void) | |||
| 1333 | if (!cpu_has_vmx_vpid()) | 1377 | if (!cpu_has_vmx_vpid()) |
| 1334 | enable_vpid = 0; | 1378 | enable_vpid = 0; |
| 1335 | 1379 | ||
| 1336 | if (!cpu_has_vmx_ept()) | 1380 | if (!cpu_has_vmx_ept()) { |
| 1337 | enable_ept = 0; | 1381 | enable_ept = 0; |
| 1382 | enable_unrestricted_guest = 0; | ||
| 1383 | } | ||
| 1384 | |||
| 1385 | if (!cpu_has_vmx_unrestricted_guest()) | ||
| 1386 | enable_unrestricted_guest = 0; | ||
| 1338 | 1387 | ||
| 1339 | if (!cpu_has_vmx_flexpriority()) | 1388 | if (!cpu_has_vmx_flexpriority()) |
| 1340 | flexpriority_enabled = 0; | 1389 | flexpriority_enabled = 0; |
| @@ -1342,6 +1391,9 @@ static __init int hardware_setup(void) | |||
| 1342 | if (!cpu_has_vmx_tpr_shadow()) | 1391 | if (!cpu_has_vmx_tpr_shadow()) |
| 1343 | kvm_x86_ops->update_cr8_intercept = NULL; | 1392 | kvm_x86_ops->update_cr8_intercept = NULL; |
| 1344 | 1393 | ||
| 1394 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) | ||
| 1395 | kvm_disable_largepages(); | ||
| 1396 | |||
| 1345 | return alloc_kvm_area(); | 1397 | return alloc_kvm_area(); |
| 1346 | } | 1398 | } |
| 1347 | 1399 | ||
| @@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
| 1372 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1424 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1373 | 1425 | ||
| 1374 | vmx->emulation_required = 1; | 1426 | vmx->emulation_required = 1; |
| 1375 | vcpu->arch.rmode.vm86_active = 0; | 1427 | vmx->rmode.vm86_active = 0; |
| 1376 | 1428 | ||
| 1377 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1429 | vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); |
| 1378 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); | 1430 | vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); |
| 1379 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); | 1431 | vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); |
| 1380 | 1432 | ||
| 1381 | flags = vmcs_readl(GUEST_RFLAGS); | 1433 | flags = vmcs_readl(GUEST_RFLAGS); |
| 1382 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | 1434 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); |
| 1383 | flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); | 1435 | flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); |
| 1384 | vmcs_writel(GUEST_RFLAGS, flags); | 1436 | vmcs_writel(GUEST_RFLAGS, flags); |
| 1385 | 1437 | ||
| 1386 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | 1438 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | |
| @@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
| 1391 | if (emulate_invalid_guest_state) | 1443 | if (emulate_invalid_guest_state) |
| 1392 | return; | 1444 | return; |
| 1393 | 1445 | ||
| 1394 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1446 | fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); |
| 1395 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1447 | fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); |
| 1396 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1448 | fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); |
| 1397 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1449 | fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); |
| 1398 | 1450 | ||
| 1399 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 1451 | vmcs_write16(GUEST_SS_SELECTOR, 0); |
| 1400 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 1452 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); |
| @@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 1433 | unsigned long flags; | 1485 | unsigned long flags; |
| 1434 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1486 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1435 | 1487 | ||
| 1488 | if (enable_unrestricted_guest) | ||
| 1489 | return; | ||
| 1490 | |||
| 1436 | vmx->emulation_required = 1; | 1491 | vmx->emulation_required = 1; |
| 1437 | vcpu->arch.rmode.vm86_active = 1; | 1492 | vmx->rmode.vm86_active = 1; |
| 1438 | 1493 | ||
| 1439 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1494 | vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
| 1440 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1495 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
| 1441 | 1496 | ||
| 1442 | vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | 1497 | vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); |
| 1443 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | 1498 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
| 1444 | 1499 | ||
| 1445 | vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1500 | vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); |
| 1446 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | 1501 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
| 1447 | 1502 | ||
| 1448 | flags = vmcs_readl(GUEST_RFLAGS); | 1503 | flags = vmcs_readl(GUEST_RFLAGS); |
| 1449 | vcpu->arch.rmode.save_iopl | 1504 | vmx->rmode.save_iopl |
| 1450 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1505 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
| 1451 | 1506 | ||
| 1452 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 1507 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
| @@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 1468 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | 1523 | vmcs_writel(GUEST_CS_BASE, 0xf0000); |
| 1469 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | 1524 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); |
| 1470 | 1525 | ||
| 1471 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1526 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); |
| 1472 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1527 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); |
| 1473 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1528 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); |
| 1474 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1529 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); |
| 1475 | 1530 | ||
| 1476 | continue_rmode: | 1531 | continue_rmode: |
| 1477 | kvm_mmu_reset_context(vcpu); | 1532 | kvm_mmu_reset_context(vcpu); |
| @@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | |||
| 1545 | 1600 | ||
| 1546 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | 1601 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) |
| 1547 | { | 1602 | { |
| 1603 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
| 1604 | (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 1605 | return; | ||
| 1606 | |||
| 1548 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 1607 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
| 1549 | if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
| 1550 | printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); | ||
| 1551 | return; | ||
| 1552 | } | ||
| 1553 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); | 1608 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); |
| 1554 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); | 1609 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); |
| 1555 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); | 1610 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); |
| @@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | |||
| 1557 | } | 1612 | } |
| 1558 | } | 1613 | } |
| 1559 | 1614 | ||
| 1615 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | ||
| 1616 | { | ||
| 1617 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 1618 | vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | ||
| 1619 | vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | ||
| 1620 | vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | ||
| 1621 | vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | ||
| 1622 | } | ||
| 1623 | |||
| 1624 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 1625 | (unsigned long *)&vcpu->arch.regs_avail); | ||
| 1626 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 1627 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
| 1628 | } | ||
| 1629 | |||
| 1560 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 1630 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
| 1561 | 1631 | ||
| 1562 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | 1632 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, |
| @@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
| 1571 | CPU_BASED_CR3_STORE_EXITING)); | 1641 | CPU_BASED_CR3_STORE_EXITING)); |
| 1572 | vcpu->arch.cr0 = cr0; | 1642 | vcpu->arch.cr0 = cr0; |
| 1573 | vmx_set_cr4(vcpu, vcpu->arch.cr4); | 1643 | vmx_set_cr4(vcpu, vcpu->arch.cr4); |
| 1574 | *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; | ||
| 1575 | *hw_cr0 &= ~X86_CR0_WP; | ||
| 1576 | } else if (!is_paging(vcpu)) { | 1644 | } else if (!is_paging(vcpu)) { |
| 1577 | /* From nonpaging to paging */ | 1645 | /* From nonpaging to paging */ |
| 1578 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1646 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
| @@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
| 1581 | CPU_BASED_CR3_STORE_EXITING)); | 1649 | CPU_BASED_CR3_STORE_EXITING)); |
| 1582 | vcpu->arch.cr0 = cr0; | 1650 | vcpu->arch.cr0 = cr0; |
| 1583 | vmx_set_cr4(vcpu, vcpu->arch.cr4); | 1651 | vmx_set_cr4(vcpu, vcpu->arch.cr4); |
| 1584 | if (!(vcpu->arch.cr0 & X86_CR0_WP)) | ||
| 1585 | *hw_cr0 &= ~X86_CR0_WP; | ||
| 1586 | } | 1652 | } |
| 1653 | |||
| 1654 | if (!(cr0 & X86_CR0_WP)) | ||
| 1655 | *hw_cr0 &= ~X86_CR0_WP; | ||
| 1587 | } | 1656 | } |
| 1588 | 1657 | ||
| 1589 | static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, | 1658 | static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, |
| @@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, | |||
| 1598 | 1667 | ||
| 1599 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 1668 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| 1600 | { | 1669 | { |
| 1601 | unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | | 1670 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 1602 | KVM_VM_CR0_ALWAYS_ON; | 1671 | unsigned long hw_cr0; |
| 1672 | |||
| 1673 | if (enable_unrestricted_guest) | ||
| 1674 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) | ||
| 1675 | | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
| 1676 | else | ||
| 1677 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; | ||
| 1603 | 1678 | ||
| 1604 | vmx_fpu_deactivate(vcpu); | 1679 | vmx_fpu_deactivate(vcpu); |
| 1605 | 1680 | ||
| 1606 | if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) | 1681 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) |
| 1607 | enter_pmode(vcpu); | 1682 | enter_pmode(vcpu); |
| 1608 | 1683 | ||
| 1609 | if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) | 1684 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) |
| 1610 | enter_rmode(vcpu); | 1685 | enter_rmode(vcpu); |
| 1611 | 1686 | ||
| 1612 | #ifdef CONFIG_X86_64 | 1687 | #ifdef CONFIG_X86_64 |
| @@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 1650 | if (enable_ept) { | 1725 | if (enable_ept) { |
| 1651 | eptp = construct_eptp(cr3); | 1726 | eptp = construct_eptp(cr3); |
| 1652 | vmcs_write64(EPT_POINTER, eptp); | 1727 | vmcs_write64(EPT_POINTER, eptp); |
| 1653 | ept_sync_context(eptp); | ||
| 1654 | ept_load_pdptrs(vcpu); | ||
| 1655 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : | 1728 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : |
| 1656 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; | 1729 | vcpu->kvm->arch.ept_identity_map_addr; |
| 1657 | } | 1730 | } |
| 1658 | 1731 | ||
| 1659 | vmx_flush_tlb(vcpu); | 1732 | vmx_flush_tlb(vcpu); |
| @@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 1664 | 1737 | ||
| 1665 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1738 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| 1666 | { | 1739 | { |
| 1667 | unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? | 1740 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? |
| 1668 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 1741 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
| 1669 | 1742 | ||
| 1670 | vcpu->arch.cr4 = cr4; | 1743 | vcpu->arch.cr4 = cr4; |
| @@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
| 1707 | 1780 | ||
| 1708 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | 1781 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) |
| 1709 | { | 1782 | { |
| 1710 | struct kvm_segment kvm_seg; | ||
| 1711 | |||
| 1712 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ | 1783 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ |
| 1713 | return 0; | 1784 | return 0; |
| 1714 | 1785 | ||
| 1715 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ | 1786 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ |
| 1716 | return 3; | 1787 | return 3; |
| 1717 | 1788 | ||
| 1718 | vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); | 1789 | return vmcs_read16(GUEST_CS_SELECTOR) & 3; |
| 1719 | return kvm_seg.selector & 3; | ||
| 1720 | } | 1790 | } |
| 1721 | 1791 | ||
| 1722 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | 1792 | static u32 vmx_segment_access_rights(struct kvm_segment *var) |
| @@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
| 1744 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | 1814 | static void vmx_set_segment(struct kvm_vcpu *vcpu, |
| 1745 | struct kvm_segment *var, int seg) | 1815 | struct kvm_segment *var, int seg) |
| 1746 | { | 1816 | { |
| 1817 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1747 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1818 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
| 1748 | u32 ar; | 1819 | u32 ar; |
| 1749 | 1820 | ||
| 1750 | if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { | 1821 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { |
| 1751 | vcpu->arch.rmode.tr.selector = var->selector; | 1822 | vmx->rmode.tr.selector = var->selector; |
| 1752 | vcpu->arch.rmode.tr.base = var->base; | 1823 | vmx->rmode.tr.base = var->base; |
| 1753 | vcpu->arch.rmode.tr.limit = var->limit; | 1824 | vmx->rmode.tr.limit = var->limit; |
| 1754 | vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); | 1825 | vmx->rmode.tr.ar = vmx_segment_access_rights(var); |
| 1755 | return; | 1826 | return; |
| 1756 | } | 1827 | } |
| 1757 | vmcs_writel(sf->base, var->base); | 1828 | vmcs_writel(sf->base, var->base); |
| 1758 | vmcs_write32(sf->limit, var->limit); | 1829 | vmcs_write32(sf->limit, var->limit); |
| 1759 | vmcs_write16(sf->selector, var->selector); | 1830 | vmcs_write16(sf->selector, var->selector); |
| 1760 | if (vcpu->arch.rmode.vm86_active && var->s) { | 1831 | if (vmx->rmode.vm86_active && var->s) { |
| 1761 | /* | 1832 | /* |
| 1762 | * Hack real-mode segments into vm86 compatibility. | 1833 | * Hack real-mode segments into vm86 compatibility. |
| 1763 | */ | 1834 | */ |
| @@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
| 1766 | ar = 0xf3; | 1837 | ar = 0xf3; |
| 1767 | } else | 1838 | } else |
| 1768 | ar = vmx_segment_access_rights(var); | 1839 | ar = vmx_segment_access_rights(var); |
| 1840 | |||
| 1841 | /* | ||
| 1842 | * Fix the "Accessed" bit in AR field of segment registers for older | ||
| 1843 | * qemu binaries. | ||
| 1844 | * IA32 arch specifies that at the time of processor reset the | ||
| 1845 | * "Accessed" bit in the AR field of segment registers is 1. And qemu | ||
| 1846 | * is setting it to 0 in the userland code. This causes invalid guest | ||
| 1847 | * state vmexit when "unrestricted guest" mode is turned on. | ||
| 1848 | * Fix for this setup issue in cpu_reset is being pushed in the qemu | ||
| 1849 | * tree. Newer qemu binaries with that qemu fix would not need this | ||
| 1850 | * kvm hack. | ||
| 1851 | */ | ||
| 1852 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) | ||
| 1853 | ar |= 0x1; /* Accessed */ | ||
| 1854 | |||
| 1769 | vmcs_write32(sf->ar_bytes, ar); | 1855 | vmcs_write32(sf->ar_bytes, ar); |
| 1770 | } | 1856 | } |
| 1771 | 1857 | ||
| @@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm) | |||
| 2040 | if (likely(kvm->arch.ept_identity_pagetable_done)) | 2126 | if (likely(kvm->arch.ept_identity_pagetable_done)) |
| 2041 | return 1; | 2127 | return 1; |
| 2042 | ret = 0; | 2128 | ret = 0; |
| 2043 | identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; | 2129 | identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; |
| 2044 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); | 2130 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); |
| 2045 | if (r < 0) | 2131 | if (r < 0) |
| 2046 | goto out; | 2132 | goto out; |
| @@ -2062,11 +2148,19 @@ out: | |||
| 2062 | static void seg_setup(int seg) | 2148 | static void seg_setup(int seg) |
| 2063 | { | 2149 | { |
| 2064 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2150 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
| 2151 | unsigned int ar; | ||
| 2065 | 2152 | ||
| 2066 | vmcs_write16(sf->selector, 0); | 2153 | vmcs_write16(sf->selector, 0); |
| 2067 | vmcs_writel(sf->base, 0); | 2154 | vmcs_writel(sf->base, 0); |
| 2068 | vmcs_write32(sf->limit, 0xffff); | 2155 | vmcs_write32(sf->limit, 0xffff); |
| 2069 | vmcs_write32(sf->ar_bytes, 0xf3); | 2156 | if (enable_unrestricted_guest) { |
| 2157 | ar = 0x93; | ||
| 2158 | if (seg == VCPU_SREG_CS) | ||
| 2159 | ar |= 0x08; /* code segment */ | ||
| 2160 | } else | ||
| 2161 | ar = 0xf3; | ||
| 2162 | |||
| 2163 | vmcs_write32(sf->ar_bytes, ar); | ||
| 2070 | } | 2164 | } |
| 2071 | 2165 | ||
| 2072 | static int alloc_apic_access_page(struct kvm *kvm) | 2166 | static int alloc_apic_access_page(struct kvm *kvm) |
| @@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
| 2101 | goto out; | 2195 | goto out; |
| 2102 | kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; | 2196 | kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; |
| 2103 | kvm_userspace_mem.flags = 0; | 2197 | kvm_userspace_mem.flags = 0; |
| 2104 | kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; | 2198 | kvm_userspace_mem.guest_phys_addr = |
| 2199 | kvm->arch.ept_identity_map_addr; | ||
| 2105 | kvm_userspace_mem.memory_size = PAGE_SIZE; | 2200 | kvm_userspace_mem.memory_size = PAGE_SIZE; |
| 2106 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | 2201 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); |
| 2107 | if (r) | 2202 | if (r) |
| 2108 | goto out; | 2203 | goto out; |
| 2109 | 2204 | ||
| 2110 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, | 2205 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, |
| 2111 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); | 2206 | kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); |
| 2112 | out: | 2207 | out: |
| 2113 | up_write(&kvm->slots_lock); | 2208 | up_write(&kvm->slots_lock); |
| 2114 | return r; | 2209 | return r; |
| @@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
| 2209 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | 2304 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; |
| 2210 | if (!enable_ept) | 2305 | if (!enable_ept) |
| 2211 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 2306 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
| 2307 | if (!enable_unrestricted_guest) | ||
| 2308 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 2212 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 2309 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
| 2213 | } | 2310 | } |
| 2214 | 2311 | ||
| @@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
| 2326 | goto out; | 2423 | goto out; |
| 2327 | } | 2424 | } |
| 2328 | 2425 | ||
| 2329 | vmx->vcpu.arch.rmode.vm86_active = 0; | 2426 | vmx->rmode.vm86_active = 0; |
| 2330 | 2427 | ||
| 2331 | vmx->soft_vnmi_blocked = 0; | 2428 | vmx->soft_vnmi_blocked = 0; |
| 2332 | 2429 | ||
| 2333 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | 2430 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
| 2334 | kvm_set_cr8(&vmx->vcpu, 0); | 2431 | kvm_set_cr8(&vmx->vcpu, 0); |
| 2335 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 2432 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
| 2336 | if (vmx->vcpu.vcpu_id == 0) | 2433 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) |
| 2337 | msr |= MSR_IA32_APICBASE_BSP; | 2434 | msr |= MSR_IA32_APICBASE_BSP; |
| 2338 | kvm_set_apic_base(&vmx->vcpu, msr); | 2435 | kvm_set_apic_base(&vmx->vcpu, msr); |
| 2339 | 2436 | ||
| @@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
| 2344 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2441 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
| 2345 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | 2442 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
| 2346 | */ | 2443 | */ |
| 2347 | if (vmx->vcpu.vcpu_id == 0) { | 2444 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) { |
| 2348 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | 2445 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); |
| 2349 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | 2446 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); |
| 2350 | } else { | 2447 | } else { |
| @@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
| 2373 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | 2470 | vmcs_writel(GUEST_SYSENTER_EIP, 0); |
| 2374 | 2471 | ||
| 2375 | vmcs_writel(GUEST_RFLAGS, 0x02); | 2472 | vmcs_writel(GUEST_RFLAGS, 0x02); |
| 2376 | if (vmx->vcpu.vcpu_id == 0) | 2473 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) |
| 2377 | kvm_rip_write(vcpu, 0xfff0); | 2474 | kvm_rip_write(vcpu, 0xfff0); |
| 2378 | else | 2475 | else |
| 2379 | kvm_rip_write(vcpu, 0); | 2476 | kvm_rip_write(vcpu, 0); |
| @@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
| 2461 | uint32_t intr; | 2558 | uint32_t intr; |
| 2462 | int irq = vcpu->arch.interrupt.nr; | 2559 | int irq = vcpu->arch.interrupt.nr; |
| 2463 | 2560 | ||
| 2464 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2561 | trace_kvm_inj_virq(irq); |
| 2465 | 2562 | ||
| 2466 | ++vcpu->stat.irq_injections; | 2563 | ++vcpu->stat.irq_injections; |
| 2467 | if (vcpu->arch.rmode.vm86_active) { | 2564 | if (vmx->rmode.vm86_active) { |
| 2468 | vmx->rmode.irq.pending = true; | 2565 | vmx->rmode.irq.pending = true; |
| 2469 | vmx->rmode.irq.vector = irq; | 2566 | vmx->rmode.irq.vector = irq; |
| 2470 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2567 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
| 2568 | if (vcpu->arch.interrupt.soft) | ||
| 2569 | vmx->rmode.irq.rip += | ||
| 2570 | vmx->vcpu.arch.event_exit_inst_len; | ||
| 2471 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2571 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
| 2472 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | 2572 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); |
| 2473 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 2573 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
| @@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
| 2502 | } | 2602 | } |
| 2503 | 2603 | ||
| 2504 | ++vcpu->stat.nmi_injections; | 2604 | ++vcpu->stat.nmi_injections; |
| 2505 | if (vcpu->arch.rmode.vm86_active) { | 2605 | if (vmx->rmode.vm86_active) { |
| 2506 | vmx->rmode.irq.pending = true; | 2606 | vmx->rmode.irq.pending = true; |
| 2507 | vmx->rmode.irq.vector = NMI_VECTOR; | 2607 | vmx->rmode.irq.vector = NMI_VECTOR; |
| 2508 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2608 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
| @@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2659 | if (enable_ept) | 2759 | if (enable_ept) |
| 2660 | BUG(); | 2760 | BUG(); |
| 2661 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2761 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
| 2662 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2762 | trace_kvm_page_fault(cr2, error_code); |
| 2663 | (u32)((u64)cr2 >> 32), handler); | 2763 | |
| 2664 | if (kvm_event_needs_reinjection(vcpu)) | 2764 | if (kvm_event_needs_reinjection(vcpu)) |
| 2665 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2765 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
| 2666 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2766 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
| 2667 | } | 2767 | } |
| 2668 | 2768 | ||
| 2669 | if (vcpu->arch.rmode.vm86_active && | 2769 | if (vmx->rmode.vm86_active && |
| 2670 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | 2770 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
| 2671 | error_code)) { | 2771 | error_code)) { |
| 2672 | if (vcpu->arch.halt_request) { | 2772 | if (vcpu->arch.halt_request) { |
| @@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, | |||
| 2707 | struct kvm_run *kvm_run) | 2807 | struct kvm_run *kvm_run) |
| 2708 | { | 2808 | { |
| 2709 | ++vcpu->stat.irq_exits; | 2809 | ++vcpu->stat.irq_exits; |
| 2710 | KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); | ||
| 2711 | return 1; | 2810 | return 1; |
| 2712 | } | 2811 | } |
| 2713 | 2812 | ||
| @@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
| 2755 | 2854 | ||
| 2756 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2855 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 2757 | { | 2856 | { |
| 2758 | unsigned long exit_qualification; | 2857 | unsigned long exit_qualification, val; |
| 2759 | int cr; | 2858 | int cr; |
| 2760 | int reg; | 2859 | int reg; |
| 2761 | 2860 | ||
| @@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2764 | reg = (exit_qualification >> 8) & 15; | 2863 | reg = (exit_qualification >> 8) & 15; |
| 2765 | switch ((exit_qualification >> 4) & 3) { | 2864 | switch ((exit_qualification >> 4) & 3) { |
| 2766 | case 0: /* mov to cr */ | 2865 | case 0: /* mov to cr */ |
| 2767 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, | 2866 | val = kvm_register_read(vcpu, reg); |
| 2768 | (u32)kvm_register_read(vcpu, reg), | 2867 | trace_kvm_cr_write(cr, val); |
| 2769 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
| 2770 | handler); | ||
| 2771 | switch (cr) { | 2868 | switch (cr) { |
| 2772 | case 0: | 2869 | case 0: |
| 2773 | kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); | 2870 | kvm_set_cr0(vcpu, val); |
| 2774 | skip_emulated_instruction(vcpu); | 2871 | skip_emulated_instruction(vcpu); |
| 2775 | return 1; | 2872 | return 1; |
| 2776 | case 3: | 2873 | case 3: |
| 2777 | kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); | 2874 | kvm_set_cr3(vcpu, val); |
| 2778 | skip_emulated_instruction(vcpu); | 2875 | skip_emulated_instruction(vcpu); |
| 2779 | return 1; | 2876 | return 1; |
| 2780 | case 4: | 2877 | case 4: |
| 2781 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); | 2878 | kvm_set_cr4(vcpu, val); |
| 2782 | skip_emulated_instruction(vcpu); | 2879 | skip_emulated_instruction(vcpu); |
| 2783 | return 1; | 2880 | return 1; |
| 2784 | case 8: { | 2881 | case 8: { |
| @@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2800 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2897 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
| 2801 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2898 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
| 2802 | vmx_fpu_activate(vcpu); | 2899 | vmx_fpu_activate(vcpu); |
| 2803 | KVMTRACE_0D(CLTS, vcpu, handler); | ||
| 2804 | skip_emulated_instruction(vcpu); | 2900 | skip_emulated_instruction(vcpu); |
| 2805 | return 1; | 2901 | return 1; |
| 2806 | case 1: /*mov from cr*/ | 2902 | case 1: /*mov from cr*/ |
| 2807 | switch (cr) { | 2903 | switch (cr) { |
| 2808 | case 3: | 2904 | case 3: |
| 2809 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); | 2905 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); |
| 2810 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | 2906 | trace_kvm_cr_read(cr, vcpu->arch.cr3); |
| 2811 | (u32)kvm_register_read(vcpu, reg), | ||
| 2812 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
| 2813 | handler); | ||
| 2814 | skip_emulated_instruction(vcpu); | 2907 | skip_emulated_instruction(vcpu); |
| 2815 | return 1; | 2908 | return 1; |
| 2816 | case 8: | 2909 | case 8: |
| 2817 | kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); | 2910 | val = kvm_get_cr8(vcpu); |
| 2818 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | 2911 | kvm_register_write(vcpu, reg, val); |
| 2819 | (u32)kvm_register_read(vcpu, reg), handler); | 2912 | trace_kvm_cr_read(cr, val); |
| 2820 | skip_emulated_instruction(vcpu); | 2913 | skip_emulated_instruction(vcpu); |
| 2821 | return 1; | 2914 | return 1; |
| 2822 | } | 2915 | } |
| @@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2841 | unsigned long val; | 2934 | unsigned long val; |
| 2842 | int dr, reg; | 2935 | int dr, reg; |
| 2843 | 2936 | ||
| 2937 | if (!kvm_require_cpl(vcpu, 0)) | ||
| 2938 | return 1; | ||
| 2844 | dr = vmcs_readl(GUEST_DR7); | 2939 | dr = vmcs_readl(GUEST_DR7); |
| 2845 | if (dr & DR7_GD) { | 2940 | if (dr & DR7_GD) { |
| 2846 | /* | 2941 | /* |
| @@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2884 | val = 0; | 2979 | val = 0; |
| 2885 | } | 2980 | } |
| 2886 | kvm_register_write(vcpu, reg, val); | 2981 | kvm_register_write(vcpu, reg, val); |
| 2887 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | ||
| 2888 | } else { | 2982 | } else { |
| 2889 | val = vcpu->arch.regs[reg]; | 2983 | val = vcpu->arch.regs[reg]; |
| 2890 | switch (dr) { | 2984 | switch (dr) { |
| @@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2917 | } | 3011 | } |
| 2918 | break; | 3012 | break; |
| 2919 | } | 3013 | } |
| 2920 | KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); | ||
| 2921 | } | 3014 | } |
| 2922 | skip_emulated_instruction(vcpu); | 3015 | skip_emulated_instruction(vcpu); |
| 2923 | return 1; | 3016 | return 1; |
| @@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2939 | return 1; | 3032 | return 1; |
| 2940 | } | 3033 | } |
| 2941 | 3034 | ||
| 2942 | KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), | 3035 | trace_kvm_msr_read(ecx, data); |
| 2943 | handler); | ||
| 2944 | 3036 | ||
| 2945 | /* FIXME: handling of bits 32:63 of rax, rdx */ | 3037 | /* FIXME: handling of bits 32:63 of rax, rdx */ |
| 2946 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; | 3038 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; |
| @@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 2955 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | 3047 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
| 2956 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 3048 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
| 2957 | 3049 | ||
| 2958 | KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), | 3050 | trace_kvm_msr_write(ecx, data); |
| 2959 | handler); | ||
| 2960 | 3051 | ||
| 2961 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 3052 | if (vmx_set_msr(vcpu, ecx, data) != 0) { |
| 2962 | kvm_inject_gp(vcpu, 0); | 3053 | kvm_inject_gp(vcpu, 0); |
| @@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
| 2983 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | 3074 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
| 2984 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3075 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
| 2985 | 3076 | ||
| 2986 | KVMTRACE_0D(PEND_INTR, vcpu, handler); | ||
| 2987 | ++vcpu->stat.irq_window_exits; | 3077 | ++vcpu->stat.irq_window_exits; |
| 2988 | 3078 | ||
| 2989 | /* | 3079 | /* |
| @@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3049 | printk(KERN_ERR | 3139 | printk(KERN_ERR |
| 3050 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | 3140 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", |
| 3051 | offset); | 3141 | offset); |
| 3052 | return -ENOTSUPP; | 3142 | return -ENOEXEC; |
| 3053 | } | 3143 | } |
| 3054 | return 1; | 3144 | return 1; |
| 3055 | } | 3145 | } |
| @@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3118 | 3208 | ||
| 3119 | if (exit_qualification & (1 << 6)) { | 3209 | if (exit_qualification & (1 << 6)) { |
| 3120 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); | 3210 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); |
| 3121 | return -ENOTSUPP; | 3211 | return -EINVAL; |
| 3122 | } | 3212 | } |
| 3123 | 3213 | ||
| 3124 | gla_validity = (exit_qualification >> 7) & 0x3; | 3214 | gla_validity = (exit_qualification >> 7) & 0x3; |
| @@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3130 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", | 3220 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", |
| 3131 | (long unsigned int)exit_qualification); | 3221 | (long unsigned int)exit_qualification); |
| 3132 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 3222 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
| 3133 | kvm_run->hw.hardware_exit_reason = 0; | 3223 | kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; |
| 3134 | return -ENOTSUPP; | 3224 | return 0; |
| 3135 | } | 3225 | } |
| 3136 | 3226 | ||
| 3137 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 3227 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
| 3228 | trace_kvm_page_fault(gpa, exit_qualification); | ||
| 3138 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); | 3229 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); |
| 3139 | } | 3230 | } |
| 3140 | 3231 | ||
| 3232 | static u64 ept_rsvd_mask(u64 spte, int level) | ||
| 3233 | { | ||
| 3234 | int i; | ||
| 3235 | u64 mask = 0; | ||
| 3236 | |||
| 3237 | for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) | ||
| 3238 | mask |= (1ULL << i); | ||
| 3239 | |||
| 3240 | if (level > 2) | ||
| 3241 | /* bits 7:3 reserved */ | ||
| 3242 | mask |= 0xf8; | ||
| 3243 | else if (level == 2) { | ||
| 3244 | if (spte & (1ULL << 7)) | ||
| 3245 | /* 2MB ref, bits 20:12 reserved */ | ||
| 3246 | mask |= 0x1ff000; | ||
| 3247 | else | ||
| 3248 | /* bits 6:3 reserved */ | ||
| 3249 | mask |= 0x78; | ||
| 3250 | } | ||
| 3251 | |||
| 3252 | return mask; | ||
| 3253 | } | ||
| 3254 | |||
| 3255 | static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, | ||
| 3256 | int level) | ||
| 3257 | { | ||
| 3258 | printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); | ||
| 3259 | |||
| 3260 | /* 010b (write-only) */ | ||
| 3261 | WARN_ON((spte & 0x7) == 0x2); | ||
| 3262 | |||
| 3263 | /* 110b (write/execute) */ | ||
| 3264 | WARN_ON((spte & 0x7) == 0x6); | ||
| 3265 | |||
| 3266 | /* 100b (execute-only) and value not supported by logical processor */ | ||
| 3267 | if (!cpu_has_vmx_ept_execute_only()) | ||
| 3268 | WARN_ON((spte & 0x7) == 0x4); | ||
| 3269 | |||
| 3270 | /* not 000b */ | ||
| 3271 | if ((spte & 0x7)) { | ||
| 3272 | u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); | ||
| 3273 | |||
| 3274 | if (rsvd_bits != 0) { | ||
| 3275 | printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", | ||
| 3276 | __func__, rsvd_bits); | ||
| 3277 | WARN_ON(1); | ||
| 3278 | } | ||
| 3279 | |||
| 3280 | if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { | ||
| 3281 | u64 ept_mem_type = (spte & 0x38) >> 3; | ||
| 3282 | |||
| 3283 | if (ept_mem_type == 2 || ept_mem_type == 3 || | ||
| 3284 | ept_mem_type == 7) { | ||
| 3285 | printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", | ||
| 3286 | __func__, ept_mem_type); | ||
| 3287 | WARN_ON(1); | ||
| 3288 | } | ||
| 3289 | } | ||
| 3290 | } | ||
| 3291 | } | ||
| 3292 | |||
| 3293 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
| 3294 | { | ||
| 3295 | u64 sptes[4]; | ||
| 3296 | int nr_sptes, i; | ||
| 3297 | gpa_t gpa; | ||
| 3298 | |||
| 3299 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 3300 | |||
| 3301 | printk(KERN_ERR "EPT: Misconfiguration.\n"); | ||
| 3302 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); | ||
| 3303 | |||
| 3304 | nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); | ||
| 3305 | |||
| 3306 | for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) | ||
| 3307 | ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); | ||
| 3308 | |||
| 3309 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
| 3310 | kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; | ||
| 3311 | |||
| 3312 | return 0; | ||
| 3313 | } | ||
| 3314 | |||
| 3141 | static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3315 | static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 3142 | { | 3316 | { |
| 3143 | u32 cpu_based_vm_exec_control; | 3317 | u32 cpu_based_vm_exec_control; |
| @@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
| 3217 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 3391 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
| 3218 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 3392 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
| 3219 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | 3393 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, |
| 3220 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
| 3221 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | 3394 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, |
| 3395 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
| 3396 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | ||
| 3222 | }; | 3397 | }; |
| 3223 | 3398 | ||
| 3224 | static const int kvm_vmx_max_exit_handlers = | 3399 | static const int kvm_vmx_max_exit_handlers = |
| @@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 3234 | u32 exit_reason = vmx->exit_reason; | 3409 | u32 exit_reason = vmx->exit_reason; |
| 3235 | u32 vectoring_info = vmx->idt_vectoring_info; | 3410 | u32 vectoring_info = vmx->idt_vectoring_info; |
| 3236 | 3411 | ||
| 3237 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), | 3412 | trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); |
| 3238 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); | ||
| 3239 | 3413 | ||
| 3240 | /* If we need to emulate an MMIO from handle_invalid_guest_state | 3414 | /* If we need to emulate an MMIO from handle_invalid_guest_state |
| 3241 | * we just return 0 */ | 3415 | * we just return 0 */ |
| @@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 3247 | 3421 | ||
| 3248 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need | 3422 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need |
| 3249 | * to sync with the guest's real CR3. */ | 3423 | * to sync with the guest's real CR3. */ |
| 3250 | if (enable_ept && is_paging(vcpu)) { | 3424 | if (enable_ept && is_paging(vcpu)) |
| 3251 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | 3425 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
| 3252 | ept_load_pdptrs(vcpu); | ||
| 3253 | } | ||
| 3254 | 3426 | ||
| 3255 | if (unlikely(vmx->fail)) { | 3427 | if (unlikely(vmx->fail)) { |
| 3256 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3428 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
| @@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
| 3326 | 3498 | ||
| 3327 | /* We need to handle NMIs before interrupts are enabled */ | 3499 | /* We need to handle NMIs before interrupts are enabled */ |
| 3328 | if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && | 3500 | if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && |
| 3329 | (exit_intr_info & INTR_INFO_VALID_MASK)) { | 3501 | (exit_intr_info & INTR_INFO_VALID_MASK)) |
| 3330 | KVMTRACE_0D(NMI, &vmx->vcpu, handler); | ||
| 3331 | asm("int $2"); | 3502 | asm("int $2"); |
| 3332 | } | ||
| 3333 | 3503 | ||
| 3334 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 3504 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; |
| 3335 | 3505 | ||
| @@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3434 | { | 3604 | { |
| 3435 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3605 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 3436 | 3606 | ||
| 3607 | if (enable_ept && is_paging(vcpu)) { | ||
| 3608 | vmcs_writel(GUEST_CR3, vcpu->arch.cr3); | ||
| 3609 | ept_load_pdptrs(vcpu); | ||
| 3610 | } | ||
| 3437 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 3611 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
| 3438 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 3612 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
| 3439 | vmx->entry_time = ktime_get(); | 3613 | vmx->entry_time = ktime_get(); |
| @@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3449 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | 3623 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) |
| 3450 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | 3624 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); |
| 3451 | 3625 | ||
| 3626 | /* When single-stepping over STI and MOV SS, we must clear the | ||
| 3627 | * corresponding interruptibility bits in the guest state. Otherwise | ||
| 3628 | * vmentry fails as it then expects bit 14 (BS) of the pending debug | ||
| 3629 | * exceptions to be set, but that's not correct for the guest debugging | ||
| 3630 | * case. */ | ||
| 3631 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
| 3632 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 3633 | |||
| 3452 | /* | 3634 | /* |
| 3453 | * Loading guest fpu may have cleared host cr0.ts | 3635 | * Loading guest fpu may have cleared host cr0.ts |
| 3454 | */ | 3636 | */ |
| 3455 | vmcs_writel(HOST_CR0, read_cr0()); | 3637 | vmcs_writel(HOST_CR0, read_cr0()); |
| 3456 | 3638 | ||
| 3457 | set_debugreg(vcpu->arch.dr6, 6); | 3639 | if (vcpu->arch.switch_db_regs) |
| 3640 | set_debugreg(vcpu->arch.dr6, 6); | ||
| 3458 | 3641 | ||
| 3459 | asm( | 3642 | asm( |
| 3460 | /* Store host registers */ | 3643 | /* Store host registers */ |
| @@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3465 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" | 3648 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" |
| 3466 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 3649 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
| 3467 | "1: \n\t" | 3650 | "1: \n\t" |
| 3651 | /* Reload cr2 if changed */ | ||
| 3652 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
| 3653 | "mov %%cr2, %%"R"dx \n\t" | ||
| 3654 | "cmp %%"R"ax, %%"R"dx \n\t" | ||
| 3655 | "je 2f \n\t" | ||
| 3656 | "mov %%"R"ax, %%cr2 \n\t" | ||
| 3657 | "2: \n\t" | ||
| 3468 | /* Check if vmlaunch or vmresume is needed */ | 3658 | /* Check if vmlaunch or vmresume is needed */ |
| 3469 | "cmpl $0, %c[launched](%0) \n\t" | 3659 | "cmpl $0, %c[launched](%0) \n\t" |
| 3470 | /* Load guest registers. Don't clobber flags. */ | 3660 | /* Load guest registers. Don't clobber flags. */ |
| 3471 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
| 3472 | "mov %%"R"ax, %%cr2 \n\t" | ||
| 3473 | "mov %c[rax](%0), %%"R"ax \n\t" | 3661 | "mov %c[rax](%0), %%"R"ax \n\t" |
| 3474 | "mov %c[rbx](%0), %%"R"bx \n\t" | 3662 | "mov %c[rbx](%0), %%"R"bx \n\t" |
| 3475 | "mov %c[rdx](%0), %%"R"dx \n\t" | 3663 | "mov %c[rdx](%0), %%"R"dx \n\t" |
| @@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3547 | #endif | 3735 | #endif |
| 3548 | ); | 3736 | ); |
| 3549 | 3737 | ||
| 3550 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | 3738 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
| 3739 | | (1 << VCPU_EXREG_PDPTR)); | ||
| 3551 | vcpu->arch.regs_dirty = 0; | 3740 | vcpu->arch.regs_dirty = 0; |
| 3552 | 3741 | ||
| 3553 | get_debugreg(vcpu->arch.dr6, 6); | 3742 | if (vcpu->arch.switch_db_regs) |
| 3743 | get_debugreg(vcpu->arch.dr6, 6); | ||
| 3554 | 3744 | ||
| 3555 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 3745 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
| 3556 | if (vmx->rmode.irq.pending) | 3746 | if (vmx->rmode.irq.pending) |
| @@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 3633 | if (alloc_apic_access_page(kvm) != 0) | 3823 | if (alloc_apic_access_page(kvm) != 0) |
| 3634 | goto free_vmcs; | 3824 | goto free_vmcs; |
| 3635 | 3825 | ||
| 3636 | if (enable_ept) | 3826 | if (enable_ept) { |
| 3827 | if (!kvm->arch.ept_identity_map_addr) | ||
| 3828 | kvm->arch.ept_identity_map_addr = | ||
| 3829 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; | ||
| 3637 | if (alloc_identity_pagetable(kvm) != 0) | 3830 | if (alloc_identity_pagetable(kvm) != 0) |
| 3638 | goto free_vmcs; | 3831 | goto free_vmcs; |
| 3832 | } | ||
| 3639 | 3833 | ||
| 3640 | return &vmx->vcpu; | 3834 | return &vmx->vcpu; |
| 3641 | 3835 | ||
| @@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | |||
| 3699 | return ret; | 3893 | return ret; |
| 3700 | } | 3894 | } |
| 3701 | 3895 | ||
| 3896 | static const struct trace_print_flags vmx_exit_reasons_str[] = { | ||
| 3897 | { EXIT_REASON_EXCEPTION_NMI, "exception" }, | ||
| 3898 | { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, | ||
| 3899 | { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, | ||
| 3900 | { EXIT_REASON_NMI_WINDOW, "nmi_window" }, | ||
| 3901 | { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, | ||
| 3902 | { EXIT_REASON_CR_ACCESS, "cr_access" }, | ||
| 3903 | { EXIT_REASON_DR_ACCESS, "dr_access" }, | ||
| 3904 | { EXIT_REASON_CPUID, "cpuid" }, | ||
| 3905 | { EXIT_REASON_MSR_READ, "rdmsr" }, | ||
| 3906 | { EXIT_REASON_MSR_WRITE, "wrmsr" }, | ||
| 3907 | { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, | ||
| 3908 | { EXIT_REASON_HLT, "halt" }, | ||
| 3909 | { EXIT_REASON_INVLPG, "invlpg" }, | ||
| 3910 | { EXIT_REASON_VMCALL, "hypercall" }, | ||
| 3911 | { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, | ||
| 3912 | { EXIT_REASON_APIC_ACCESS, "apic_access" }, | ||
| 3913 | { EXIT_REASON_WBINVD, "wbinvd" }, | ||
| 3914 | { EXIT_REASON_TASK_SWITCH, "task_switch" }, | ||
| 3915 | { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, | ||
| 3916 | { -1, NULL } | ||
| 3917 | }; | ||
| 3918 | |||
| 3919 | static bool vmx_gb_page_enable(void) | ||
| 3920 | { | ||
| 3921 | return false; | ||
| 3922 | } | ||
| 3923 | |||
| 3702 | static struct kvm_x86_ops vmx_x86_ops = { | 3924 | static struct kvm_x86_ops vmx_x86_ops = { |
| 3703 | .cpu_has_kvm_support = cpu_has_kvm_support, | 3925 | .cpu_has_kvm_support = cpu_has_kvm_support, |
| 3704 | .disabled_by_bios = vmx_disabled_by_bios, | 3926 | .disabled_by_bios = vmx_disabled_by_bios, |
| @@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
| 3758 | .set_tss_addr = vmx_set_tss_addr, | 3980 | .set_tss_addr = vmx_set_tss_addr, |
| 3759 | .get_tdp_level = get_ept_level, | 3981 | .get_tdp_level = get_ept_level, |
| 3760 | .get_mt_mask = vmx_get_mt_mask, | 3982 | .get_mt_mask = vmx_get_mt_mask, |
| 3983 | |||
| 3984 | .exit_reasons_str = vmx_exit_reasons_str, | ||
| 3985 | .gb_page_enable = vmx_gb_page_enable, | ||
| 3761 | }; | 3986 | }; |
| 3762 | 3987 | ||
| 3763 | static int __init vmx_init(void) | 3988 | static int __init vmx_init(void) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 633ccc7400a..be451ee4424 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
| @@ -37,11 +37,16 @@ | |||
| 37 | #include <linux/iommu.h> | 37 | #include <linux/iommu.h> |
| 38 | #include <linux/intel-iommu.h> | 38 | #include <linux/intel-iommu.h> |
| 39 | #include <linux/cpufreq.h> | 39 | #include <linux/cpufreq.h> |
| 40 | #include <trace/events/kvm.h> | ||
| 41 | #undef TRACE_INCLUDE_FILE | ||
| 42 | #define CREATE_TRACE_POINTS | ||
| 43 | #include "trace.h" | ||
| 40 | 44 | ||
| 41 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
| 42 | #include <asm/msr.h> | 46 | #include <asm/msr.h> |
| 43 | #include <asm/desc.h> | 47 | #include <asm/desc.h> |
| 44 | #include <asm/mtrr.h> | 48 | #include <asm/mtrr.h> |
| 49 | #include <asm/mce.h> | ||
| 45 | 50 | ||
| 46 | #define MAX_IO_MSRS 256 | 51 | #define MAX_IO_MSRS 256 |
| 47 | #define CR0_RESERVED_BITS \ | 52 | #define CR0_RESERVED_BITS \ |
| @@ -55,6 +60,10 @@ | |||
| 55 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 60 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
| 56 | 61 | ||
| 57 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 62 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
| 63 | |||
| 64 | #define KVM_MAX_MCE_BANKS 32 | ||
| 65 | #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P | ||
| 66 | |||
| 58 | /* EFER defaults: | 67 | /* EFER defaults: |
| 59 | * - enable syscall per default because its emulated by KVM | 68 | * - enable syscall per default because its emulated by KVM |
| 60 | * - enable LME and LMA per default on 64 bit KVM | 69 | * - enable LME and LMA per default on 64 bit KVM |
| @@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; | |||
| 68 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | 77 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
| 69 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 78 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
| 70 | 79 | ||
| 80 | static void update_cr8_intercept(struct kvm_vcpu *vcpu); | ||
| 71 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | 81 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, |
| 72 | struct kvm_cpuid_entry2 __user *entries); | 82 | struct kvm_cpuid_entry2 __user *entries); |
| 73 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
| 74 | u32 function, u32 index); | ||
| 75 | 83 | ||
| 76 | struct kvm_x86_ops *kvm_x86_ops; | 84 | struct kvm_x86_ops *kvm_x86_ops; |
| 77 | EXPORT_SYMBOL_GPL(kvm_x86_ops); | 85 | EXPORT_SYMBOL_GPL(kvm_x86_ops); |
| 78 | 86 | ||
| 87 | int ignore_msrs = 0; | ||
| 88 | module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); | ||
| 89 | |||
| 79 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 90 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
| 80 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | 91 | { "pf_fixed", VCPU_STAT(pf_fixed) }, |
| 81 | { "pf_guest", VCPU_STAT(pf_guest) }, | 92 | { "pf_guest", VCPU_STAT(pf_guest) }, |
| @@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector) | |||
| 122 | if (selector == 0) | 133 | if (selector == 0) |
| 123 | return 0; | 134 | return 0; |
| 124 | 135 | ||
| 125 | asm("sgdt %0" : "=m"(gdt)); | 136 | kvm_get_gdt(&gdt); |
| 126 | table_base = gdt.base; | 137 | table_base = gdt.base; |
| 127 | 138 | ||
| 128 | if (selector & 4) { /* from ldt */ | 139 | if (selector & 4) { /* from ldt */ |
| 129 | u16 ldt_selector; | 140 | u16 ldt_selector = kvm_read_ldt(); |
| 130 | 141 | ||
| 131 | asm("sldt %0" : "=g"(ldt_selector)); | ||
| 132 | table_base = segment_base(ldt_selector); | 142 | table_base = segment_base(ldt_selector); |
| 133 | } | 143 | } |
| 134 | d = (struct desc_struct *)(table_base + (selector & ~7)); | 144 | d = (struct desc_struct *)(table_base + (selector & ~7)); |
| 135 | v = d->base0 | ((unsigned long)d->base1 << 16) | | 145 | v = get_desc_base(d); |
| 136 | ((unsigned long)d->base2 << 24); | ||
| 137 | #ifdef CONFIG_X86_64 | 146 | #ifdef CONFIG_X86_64 |
| 138 | if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) | 147 | if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) |
| 139 | v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; | 148 | v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; |
| @@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | |||
| 176 | ++vcpu->stat.pf_guest; | 185 | ++vcpu->stat.pf_guest; |
| 177 | 186 | ||
| 178 | if (vcpu->arch.exception.pending) { | 187 | if (vcpu->arch.exception.pending) { |
| 179 | if (vcpu->arch.exception.nr == PF_VECTOR) { | 188 | switch(vcpu->arch.exception.nr) { |
| 180 | printk(KERN_DEBUG "kvm: inject_page_fault:" | 189 | case DF_VECTOR: |
| 181 | " double fault 0x%lx\n", addr); | ||
| 182 | vcpu->arch.exception.nr = DF_VECTOR; | ||
| 183 | vcpu->arch.exception.error_code = 0; | ||
| 184 | } else if (vcpu->arch.exception.nr == DF_VECTOR) { | ||
| 185 | /* triple fault -> shutdown */ | 190 | /* triple fault -> shutdown */ |
| 186 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | 191 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); |
| 192 | return; | ||
| 193 | case PF_VECTOR: | ||
| 194 | vcpu->arch.exception.nr = DF_VECTOR; | ||
| 195 | vcpu->arch.exception.error_code = 0; | ||
| 196 | return; | ||
| 197 | default: | ||
| 198 | /* replace previous exception with a new one in a hope | ||
| 199 | that instruction re-execution will regenerate lost | ||
| 200 | exception */ | ||
| 201 | vcpu->arch.exception.pending = false; | ||
| 202 | break; | ||
| 187 | } | 203 | } |
| 188 | return; | ||
| 189 | } | 204 | } |
| 190 | vcpu->arch.cr2 = addr; | 205 | vcpu->arch.cr2 = addr; |
| 191 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 206 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); |
| @@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) | |||
| 207 | } | 222 | } |
| 208 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | 223 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); |
| 209 | 224 | ||
| 210 | static void __queue_exception(struct kvm_vcpu *vcpu) | 225 | /* |
| 226 | * Checks if cpl <= required_cpl; if true, return true. Otherwise queue | ||
| 227 | * a #GP and return false. | ||
| 228 | */ | ||
| 229 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) | ||
| 211 | { | 230 | { |
| 212 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, | 231 | if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) |
| 213 | vcpu->arch.exception.has_error_code, | 232 | return true; |
| 214 | vcpu->arch.exception.error_code); | 233 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
| 234 | return false; | ||
| 215 | } | 235 | } |
| 236 | EXPORT_SYMBOL_GPL(kvm_require_cpl); | ||
| 216 | 237 | ||
| 217 | /* | 238 | /* |
| 218 | * Load the pae pdptrs. Return true if they are all valid. | 239 | * Load the pae pdptrs. Return true if they are all valid. |
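kvm_require_cpl() is the new one-stop privilege check for emulation paths: it queues the #GP itself and lets the caller simply bail out. A hedged sketch of the intended call pattern; the handler name below is hypothetical:

/* Hypothetical caller: refuse debug register emulation from guest user mode. */
static int handle_dr_access(struct kvm_vcpu *vcpu)
{
	if (!kvm_require_cpl(vcpu, 0))
		return 1;        /* #GP already queued, just re-enter the guest */
	/* ... emulate the MOV to/from DR here ... */
	return 1;
}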
| @@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 232 | goto out; | 253 | goto out; |
| 233 | } | 254 | } |
| 234 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | 255 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
| 235 | if (is_present_pte(pdpte[i]) && | 256 | if (is_present_gpte(pdpte[i]) && |
| 236 | (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { | 257 | (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { |
| 237 | ret = 0; | 258 | ret = 0; |
| 238 | goto out; | 259 | goto out; |
| @@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 241 | ret = 1; | 262 | ret = 1; |
| 242 | 263 | ||
| 243 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | 264 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); |
| 265 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 266 | (unsigned long *)&vcpu->arch.regs_avail); | ||
| 267 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 268 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
| 244 | out: | 269 | out: |
| 245 | 270 | ||
| 246 | return ret; | 271 | return ret; |
| @@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
| 256 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | 281 | if (is_long_mode(vcpu) || !is_pae(vcpu)) |
| 257 | return false; | 282 | return false; |
| 258 | 283 | ||
| 284 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
| 285 | (unsigned long *)&vcpu->arch.regs_avail)) | ||
| 286 | return true; | ||
| 287 | |||
| 259 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | 288 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); |
| 260 | if (r < 0) | 289 | if (r < 0) |
| 261 | goto out; | 290 | goto out; |
| @@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); | |||
| 328 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 357 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
| 329 | { | 358 | { |
| 330 | kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); | 359 | kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); |
| 331 | KVMTRACE_1D(LMSW, vcpu, | ||
| 332 | (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), | ||
| 333 | handler); | ||
| 334 | } | 360 | } |
| 335 | EXPORT_SYMBOL_GPL(kvm_lmsw); | 361 | EXPORT_SYMBOL_GPL(kvm_lmsw); |
| 336 | 362 | ||
| @@ -466,7 +492,7 @@ static u32 msrs_to_save[] = { | |||
| 466 | #ifdef CONFIG_X86_64 | 492 | #ifdef CONFIG_X86_64 |
| 467 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 493 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
| 468 | #endif | 494 | #endif |
| 469 | MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 495 | MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
| 470 | MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA | 496 | MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA |
| 471 | }; | 497 | }; |
| 472 | 498 | ||
| @@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
| 644 | 670 | ||
| 645 | /* Keep irq disabled to prevent changes to the clock */ | 671 | /* Keep irq disabled to prevent changes to the clock */ |
| 646 | local_irq_save(flags); | 672 | local_irq_save(flags); |
| 647 | kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, | 673 | kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); |
| 648 | &vcpu->hv_clock.tsc_timestamp); | ||
| 649 | ktime_get_ts(&ts); | 674 | ktime_get_ts(&ts); |
| 650 | local_irq_restore(flags); | 675 | local_irq_restore(flags); |
| 651 | 676 | ||
| @@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 778 | return 0; | 803 | return 0; |
| 779 | } | 804 | } |
| 780 | 805 | ||
| 806 | static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
| 807 | { | ||
| 808 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
| 809 | unsigned bank_num = mcg_cap & 0xff; | ||
| 810 | |||
| 811 | switch (msr) { | ||
| 812 | case MSR_IA32_MCG_STATUS: | ||
| 813 | vcpu->arch.mcg_status = data; | ||
| 814 | break; | ||
| 815 | case MSR_IA32_MCG_CTL: | ||
| 816 | if (!(mcg_cap & MCG_CTL_P)) | ||
| 817 | return 1; | ||
| 818 | if (data != 0 && data != ~(u64)0) | ||
| 819 | return -1; | ||
| 820 | vcpu->arch.mcg_ctl = data; | ||
| 821 | break; | ||
| 822 | default: | ||
| 823 | if (msr >= MSR_IA32_MC0_CTL && | ||
| 824 | msr < MSR_IA32_MC0_CTL + 4 * bank_num) { | ||
| 825 | u32 offset = msr - MSR_IA32_MC0_CTL; | ||
| 826 | /* only 0 or all 1s can be written to IA32_MCi_CTL */ | ||
| 827 | if ((offset & 0x3) == 0 && | ||
| 828 | data != 0 && data != ~(u64)0) | ||
| 829 | return -1; | ||
| 830 | vcpu->arch.mce_banks[offset] = data; | ||
| 831 | break; | ||
| 832 | } | ||
| 833 | return 1; | ||
| 834 | } | ||
| 835 | return 0; | ||
| 836 | } | ||
| 837 | |||
| 781 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 838 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
| 782 | { | 839 | { |
| 783 | switch (msr) { | 840 | switch (msr) { |
| 784 | case MSR_EFER: | 841 | case MSR_EFER: |
| 785 | set_efer(vcpu, data); | 842 | set_efer(vcpu, data); |
| 786 | break; | 843 | break; |
| 787 | case MSR_IA32_MC0_STATUS: | 844 | case MSR_K7_HWCR: |
| 788 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | 845 | data &= ~(u64)0x40; /* ignore flush filter disable */ |
| 789 | __func__, data); | 846 | if (data != 0) { |
| 847 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | ||
| 848 | data); | ||
| 849 | return 1; | ||
| 850 | } | ||
| 790 | break; | 851 | break; |
| 791 | case MSR_IA32_MCG_STATUS: | 852 | case MSR_FAM10H_MMIO_CONF_BASE: |
| 792 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | 853 | if (data != 0) { |
| 793 | __func__, data); | 854 | pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " |
| 855 | "0x%llx\n", data); | ||
| 856 | return 1; | ||
| 857 | } | ||
| 794 | break; | 858 | break; |
| 795 | case MSR_IA32_MCG_CTL: | 859 | case MSR_AMD64_NB_CFG: |
| 796 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | ||
| 797 | __func__, data); | ||
| 798 | break; | 860 | break; |
| 799 | case MSR_IA32_DEBUGCTLMSR: | 861 | case MSR_IA32_DEBUGCTLMSR: |
| 800 | if (!data) { | 862 | if (!data) { |
| @@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 811 | case MSR_IA32_UCODE_REV: | 873 | case MSR_IA32_UCODE_REV: |
| 812 | case MSR_IA32_UCODE_WRITE: | 874 | case MSR_IA32_UCODE_WRITE: |
| 813 | case MSR_VM_HSAVE_PA: | 875 | case MSR_VM_HSAVE_PA: |
| 876 | case MSR_AMD64_PATCH_LOADER: | ||
| 814 | break; | 877 | break; |
| 815 | case 0x200 ... 0x2ff: | 878 | case 0x200 ... 0x2ff: |
| 816 | return set_msr_mtrr(vcpu, msr, data); | 879 | return set_msr_mtrr(vcpu, msr, data); |
| 817 | case MSR_IA32_APICBASE: | 880 | case MSR_IA32_APICBASE: |
| 818 | kvm_set_apic_base(vcpu, data); | 881 | kvm_set_apic_base(vcpu, data); |
| 819 | break; | 882 | break; |
| 883 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: | ||
| 884 | return kvm_x2apic_msr_write(vcpu, msr, data); | ||
| 820 | case MSR_IA32_MISC_ENABLE: | 885 | case MSR_IA32_MISC_ENABLE: |
| 821 | vcpu->arch.ia32_misc_enable_msr = data; | 886 | vcpu->arch.ia32_misc_enable_msr = data; |
| 822 | break; | 887 | break; |
| @@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 850 | kvm_request_guest_time_update(vcpu); | 915 | kvm_request_guest_time_update(vcpu); |
| 851 | break; | 916 | break; |
| 852 | } | 917 | } |
| 918 | case MSR_IA32_MCG_CTL: | ||
| 919 | case MSR_IA32_MCG_STATUS: | ||
| 920 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | ||
| 921 | return set_msr_mce(vcpu, msr, data); | ||
| 922 | |||
| 923 | /* Performance counters are not protected by a CPUID bit, | ||
| 924 | * so we should check all of them in the generic path for the sake of | ||
| 925 | * cross vendor migration. | ||
| 926 | * Writing a zero into the event select MSRs disables them, | ||
| 927 | * which we perfectly emulate ;-). Any other value should be at least | ||
| 928 | * reported, some guests depend on them. | ||
| 929 | */ | ||
| 930 | case MSR_P6_EVNTSEL0: | ||
| 931 | case MSR_P6_EVNTSEL1: | ||
| 932 | case MSR_K7_EVNTSEL0: | ||
| 933 | case MSR_K7_EVNTSEL1: | ||
| 934 | case MSR_K7_EVNTSEL2: | ||
| 935 | case MSR_K7_EVNTSEL3: | ||
| 936 | if (data != 0) | ||
| 937 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | ||
| 938 | "0x%x data 0x%llx\n", msr, data); | ||
| 939 | break; | ||
| 940 | /* at least RHEL 4 unconditionally writes to the perfctr registers, | ||
| 941 | * so we ignore writes to make it happy. | ||
| 942 | */ | ||
| 943 | case MSR_P6_PERFCTR0: | ||
| 944 | case MSR_P6_PERFCTR1: | ||
| 945 | case MSR_K7_PERFCTR0: | ||
| 946 | case MSR_K7_PERFCTR1: | ||
| 947 | case MSR_K7_PERFCTR2: | ||
| 948 | case MSR_K7_PERFCTR3: | ||
| 949 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | ||
| 950 | "0x%x data 0x%llx\n", msr, data); | ||
| 951 | break; | ||
| 853 | default: | 952 | default: |
| 854 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); | 953 | if (!ignore_msrs) { |
| 855 | return 1; | 954 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", |
| 955 | msr, data); | ||
| 956 | return 1; | ||
| 957 | } else { | ||
| 958 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", | ||
| 959 | msr, data); | ||
| 960 | break; | ||
| 961 | } | ||
| 856 | } | 962 | } |
| 857 | return 0; | 963 | return 0; |
| 858 | } | 964 | } |
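set_msr_mce() treats the machine-check bank MSRs as a flat array with four registers per bank (CTL, STATUS, ADDR, MISC), which is why (offset & 0x3) == 0 picks out the CTL register. A small indexing sketch; the enum and helper are invented for illustration:

/* Illustration only: indexing the flat mce_banks[] array used above. */
enum mce_bank_reg { MCE_CTL = 0, MCE_STATUS = 1, MCE_ADDR = 2, MCE_MISC = 3 };

static unsigned int mce_bank_index(unsigned int bank, enum mce_bank_reg reg)
{
	/* e.g. bank 2 STATUS -> MSR_IA32_MC0_CTL + 9, stored in mce_banks[9] */
	return bank * 4 + reg;
}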
| @@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 905 | return 0; | 1011 | return 0; |
| 906 | } | 1012 | } |
| 907 | 1013 | ||
| 908 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | 1014 | static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) |
| 909 | { | 1015 | { |
| 910 | u64 data; | 1016 | u64 data; |
| 1017 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
| 1018 | unsigned bank_num = mcg_cap & 0xff; | ||
| 911 | 1019 | ||
| 912 | switch (msr) { | 1020 | switch (msr) { |
| 913 | case 0xc0010010: /* SYSCFG */ | ||
| 914 | case 0xc0010015: /* HWCR */ | ||
| 915 | case MSR_IA32_PLATFORM_ID: | ||
| 916 | case MSR_IA32_P5_MC_ADDR: | 1021 | case MSR_IA32_P5_MC_ADDR: |
| 917 | case MSR_IA32_P5_MC_TYPE: | 1022 | case MSR_IA32_P5_MC_TYPE: |
| 918 | case MSR_IA32_MC0_CTL: | 1023 | data = 0; |
| 919 | case MSR_IA32_MCG_STATUS: | 1024 | break; |
| 920 | case MSR_IA32_MCG_CAP: | 1025 | case MSR_IA32_MCG_CAP: |
| 1026 | data = vcpu->arch.mcg_cap; | ||
| 1027 | break; | ||
| 921 | case MSR_IA32_MCG_CTL: | 1028 | case MSR_IA32_MCG_CTL: |
| 922 | case MSR_IA32_MC0_MISC: | 1029 | if (!(mcg_cap & MCG_CTL_P)) |
| 923 | case MSR_IA32_MC0_MISC+4: | 1030 | return 1; |
| 924 | case MSR_IA32_MC0_MISC+8: | 1031 | data = vcpu->arch.mcg_ctl; |
| 925 | case MSR_IA32_MC0_MISC+12: | 1032 | break; |
| 926 | case MSR_IA32_MC0_MISC+16: | 1033 | case MSR_IA32_MCG_STATUS: |
| 927 | case MSR_IA32_MC0_MISC+20: | 1034 | data = vcpu->arch.mcg_status; |
| 1035 | break; | ||
| 1036 | default: | ||
| 1037 | if (msr >= MSR_IA32_MC0_CTL && | ||
| 1038 | msr < MSR_IA32_MC0_CTL + 4 * bank_num) { | ||
| 1039 | u32 offset = msr - MSR_IA32_MC0_CTL; | ||
| 1040 | data = vcpu->arch.mce_banks[offset]; | ||
| 1041 | break; | ||
| 1042 | } | ||
| 1043 | return 1; | ||
| 1044 | } | ||
| 1045 | *pdata = data; | ||
| 1046 | return 0; | ||
| 1047 | } | ||
| 1048 | |||
| 1049 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
| 1050 | { | ||
| 1051 | u64 data; | ||
| 1052 | |||
| 1053 | switch (msr) { | ||
| 1054 | case MSR_IA32_PLATFORM_ID: | ||
| 928 | case MSR_IA32_UCODE_REV: | 1055 | case MSR_IA32_UCODE_REV: |
| 929 | case MSR_IA32_EBL_CR_POWERON: | 1056 | case MSR_IA32_EBL_CR_POWERON: |
| 930 | case MSR_IA32_DEBUGCTLMSR: | 1057 | case MSR_IA32_DEBUGCTLMSR: |
| @@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 932 | case MSR_IA32_LASTBRANCHTOIP: | 1059 | case MSR_IA32_LASTBRANCHTOIP: |
| 933 | case MSR_IA32_LASTINTFROMIP: | 1060 | case MSR_IA32_LASTINTFROMIP: |
| 934 | case MSR_IA32_LASTINTTOIP: | 1061 | case MSR_IA32_LASTINTTOIP: |
| 1062 | case MSR_K8_SYSCFG: | ||
| 1063 | case MSR_K7_HWCR: | ||
| 935 | case MSR_VM_HSAVE_PA: | 1064 | case MSR_VM_HSAVE_PA: |
| 1065 | case MSR_P6_PERFCTR0: | ||
| 1066 | case MSR_P6_PERFCTR1: | ||
| 936 | case MSR_P6_EVNTSEL0: | 1067 | case MSR_P6_EVNTSEL0: |
| 937 | case MSR_P6_EVNTSEL1: | 1068 | case MSR_P6_EVNTSEL1: |
| 938 | case MSR_K7_EVNTSEL0: | 1069 | case MSR_K7_EVNTSEL0: |
| 1070 | case MSR_K7_PERFCTR0: | ||
| 1071 | case MSR_K8_INT_PENDING_MSG: | ||
| 1072 | case MSR_AMD64_NB_CFG: | ||
| 1073 | case MSR_FAM10H_MMIO_CONF_BASE: | ||
| 939 | data = 0; | 1074 | data = 0; |
| 940 | break; | 1075 | break; |
| 941 | case MSR_MTRRcap: | 1076 | case MSR_MTRRcap: |
| @@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 949 | case MSR_IA32_APICBASE: | 1084 | case MSR_IA32_APICBASE: |
| 950 | data = kvm_get_apic_base(vcpu); | 1085 | data = kvm_get_apic_base(vcpu); |
| 951 | break; | 1086 | break; |
| 1087 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: | ||
| 1088 | return kvm_x2apic_msr_read(vcpu, msr, pdata); | ||
| 1089 | break; | ||
| 952 | case MSR_IA32_MISC_ENABLE: | 1090 | case MSR_IA32_MISC_ENABLE: |
| 953 | data = vcpu->arch.ia32_misc_enable_msr; | 1091 | data = vcpu->arch.ia32_misc_enable_msr; |
| 954 | break; | 1092 | break; |
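The APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff range is the x2APIC MSR space (0x800-0xbff); kvm_x2apic_msr_read()/write() translate each MSR to the corresponding xAPIC register. A tiny sketch of that mapping, helper name invented here:

/* Illustration: x2APIC MSR index -> xAPIC register offset ((msr - 0x800) << 4). */
static unsigned int x2apic_msr_to_reg(unsigned int msr)
{
	return (msr - 0x800) << 4;   /* e.g. MSR 0x802 (APIC ID) -> offset 0x20 */
}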
| @@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 967 | case MSR_KVM_SYSTEM_TIME: | 1105 | case MSR_KVM_SYSTEM_TIME: |
| 968 | data = vcpu->arch.time; | 1106 | data = vcpu->arch.time; |
| 969 | break; | 1107 | break; |
| 1108 | case MSR_IA32_P5_MC_ADDR: | ||
| 1109 | case MSR_IA32_P5_MC_TYPE: | ||
| 1110 | case MSR_IA32_MCG_CAP: | ||
| 1111 | case MSR_IA32_MCG_CTL: | ||
| 1112 | case MSR_IA32_MCG_STATUS: | ||
| 1113 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | ||
| 1114 | return get_msr_mce(vcpu, msr, pdata); | ||
| 970 | default: | 1115 | default: |
| 971 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 1116 | if (!ignore_msrs) { |
| 972 | return 1; | 1117 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
| 1118 | return 1; | ||
| 1119 | } else { | ||
| 1120 | pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); | ||
| 1121 | data = 0; | ||
| 1122 | } | ||
| 1123 | break; | ||
| 973 | } | 1124 | } |
| 974 | *pdata = data; | 1125 | *pdata = data; |
| 975 | return 0; | 1126 | return 0; |
| @@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
| 1068 | case KVM_CAP_REINJECT_CONTROL: | 1219 | case KVM_CAP_REINJECT_CONTROL: |
| 1069 | case KVM_CAP_IRQ_INJECT_STATUS: | 1220 | case KVM_CAP_IRQ_INJECT_STATUS: |
| 1070 | case KVM_CAP_ASSIGN_DEV_IRQ: | 1221 | case KVM_CAP_ASSIGN_DEV_IRQ: |
| 1222 | case KVM_CAP_IRQFD: | ||
| 1223 | case KVM_CAP_IOEVENTFD: | ||
| 1224 | case KVM_CAP_PIT2: | ||
| 1225 | case KVM_CAP_PIT_STATE2: | ||
| 1226 | case KVM_CAP_SET_IDENTITY_MAP_ADDR: | ||
| 1071 | r = 1; | 1227 | r = 1; |
| 1072 | break; | 1228 | break; |
| 1073 | case KVM_CAP_COALESCED_MMIO: | 1229 | case KVM_CAP_COALESCED_MMIO: |
| @@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
| 1088 | case KVM_CAP_IOMMU: | 1244 | case KVM_CAP_IOMMU: |
| 1089 | r = iommu_found(); | 1245 | r = iommu_found(); |
| 1090 | break; | 1246 | break; |
| 1247 | case KVM_CAP_MCE: | ||
| 1248 | r = KVM_MAX_MCE_BANKS; | ||
| 1249 | break; | ||
| 1091 | default: | 1250 | default: |
| 1092 | r = 0; | 1251 | r = 0; |
| 1093 | break; | 1252 | break; |
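kvm_dev_ioctl_check_extension() now advertises the new irqfd/ioeventfd, PIT2 and identity-map capabilities, and for KVM_CAP_MCE returns the number of supported banks rather than a plain 0/1. A minimal userspace probe, assuming headers from a kernel carrying this series:

/* Userspace sketch, not part of the commit: probe MCE bank support. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int banks;

	if (kvm < 0)
		return 1;
	banks = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
	printf("KVM_CAP_MCE: %d bank(s)\n", banks > 0 ? banks : 0);
	close(kvm);
	return 0;
}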
| @@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
| 1147 | r = 0; | 1306 | r = 0; |
| 1148 | break; | 1307 | break; |
| 1149 | } | 1308 | } |
| 1309 | case KVM_X86_GET_MCE_CAP_SUPPORTED: { | ||
| 1310 | u64 mce_cap; | ||
| 1311 | |||
| 1312 | mce_cap = KVM_MCE_CAP_SUPPORTED; | ||
| 1313 | r = -EFAULT; | ||
| 1314 | if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) | ||
| 1315 | goto out; | ||
| 1316 | r = 0; | ||
| 1317 | break; | ||
| 1318 | } | ||
| 1150 | default: | 1319 | default: |
| 1151 | r = -EINVAL; | 1320 | r = -EINVAL; |
| 1152 | } | 1321 | } |
| @@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | |||
| 1227 | vcpu->arch.cpuid_nent = cpuid->nent; | 1396 | vcpu->arch.cpuid_nent = cpuid->nent; |
| 1228 | cpuid_fix_nx_cap(vcpu); | 1397 | cpuid_fix_nx_cap(vcpu); |
| 1229 | r = 0; | 1398 | r = 0; |
| 1399 | kvm_apic_set_version(vcpu); | ||
| 1230 | 1400 | ||
| 1231 | out_free: | 1401 | out_free: |
| 1232 | vfree(cpuid_entries); | 1402 | vfree(cpuid_entries); |
| @@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | |||
| 1248 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | 1418 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) |
| 1249 | goto out; | 1419 | goto out; |
| 1250 | vcpu->arch.cpuid_nent = cpuid->nent; | 1420 | vcpu->arch.cpuid_nent = cpuid->nent; |
| 1421 | kvm_apic_set_version(vcpu); | ||
| 1251 | return 0; | 1422 | return 0; |
| 1252 | 1423 | ||
| 1253 | out: | 1424 | out: |
| @@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 1290 | u32 index, int *nent, int maxnent) | 1461 | u32 index, int *nent, int maxnent) |
| 1291 | { | 1462 | { |
| 1292 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; | 1463 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; |
| 1464 | unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; | ||
| 1293 | #ifdef CONFIG_X86_64 | 1465 | #ifdef CONFIG_X86_64 |
| 1294 | unsigned f_lm = F(LM); | 1466 | unsigned f_lm = F(LM); |
| 1295 | #else | 1467 | #else |
| @@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 1314 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | 1486 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | |
| 1315 | F(PAT) | F(PSE36) | 0 /* Reserved */ | | 1487 | F(PAT) | F(PSE36) | 0 /* Reserved */ | |
| 1316 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | | 1488 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | |
| 1317 | F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | | 1489 | F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | |
| 1318 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | 1490 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); |
| 1319 | /* cpuid 1.ecx */ | 1491 | /* cpuid 1.ecx */ |
| 1320 | const u32 kvm_supported_word4_x86_features = | 1492 | const u32 kvm_supported_word4_x86_features = |
| @@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 1323 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | 1495 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | |
| 1324 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | 1496 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | |
| 1325 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 1497 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
| 1326 | F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | | 1498 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
| 1327 | 0 /* Reserved, XSAVE, OSXSAVE */; | 1499 | 0 /* Reserved, XSAVE, OSXSAVE */; |
| 1328 | /* cpuid 0x80000001.ecx */ | 1500 | /* cpuid 0x80000001.ecx */ |
| 1329 | const u32 kvm_supported_word6_x86_features = | 1501 | const u32 kvm_supported_word6_x86_features = |
| @@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 1344 | case 1: | 1516 | case 1: |
| 1345 | entry->edx &= kvm_supported_word0_x86_features; | 1517 | entry->edx &= kvm_supported_word0_x86_features; |
| 1346 | entry->ecx &= kvm_supported_word4_x86_features; | 1518 | entry->ecx &= kvm_supported_word4_x86_features; |
| 1519 | /* we support x2apic emulation even if host does not support | ||
| 1520 | * it since we emulate x2apic in software */ | ||
| 1521 | entry->ecx |= F(X2APIC); | ||
| 1347 | break; | 1522 | break; |
| 1348 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | 1523 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands |
| 1349 | * may return different values. This forces us to get_cpu() before | 1524 | * may return different values. This forces us to get_cpu() before |
| @@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
| 1435 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | 1610 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) |
| 1436 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | 1611 | do_cpuid_ent(&cpuid_entries[nent], func, 0, |
| 1437 | &nent, cpuid->nent); | 1612 | &nent, cpuid->nent); |
| 1613 | r = -E2BIG; | ||
| 1614 | if (nent >= cpuid->nent) | ||
| 1615 | goto out_free; | ||
| 1616 | |||
| 1438 | r = -EFAULT; | 1617 | r = -EFAULT; |
| 1439 | if (copy_to_user(entries, cpuid_entries, | 1618 | if (copy_to_user(entries, cpuid_entries, |
| 1440 | nent * sizeof(struct kvm_cpuid_entry2))) | 1619 | nent * sizeof(struct kvm_cpuid_entry2))) |
| @@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | |||
| 1464 | vcpu_load(vcpu); | 1643 | vcpu_load(vcpu); |
| 1465 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | 1644 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); |
| 1466 | kvm_apic_post_state_restore(vcpu); | 1645 | kvm_apic_post_state_restore(vcpu); |
| 1646 | update_cr8_intercept(vcpu); | ||
| 1467 | vcpu_put(vcpu); | 1647 | vcpu_put(vcpu); |
| 1468 | 1648 | ||
| 1469 | return 0; | 1649 | return 0; |
| @@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, | |||
| 1503 | return 0; | 1683 | return 0; |
| 1504 | } | 1684 | } |
| 1505 | 1685 | ||
| 1686 | static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, | ||
| 1687 | u64 mcg_cap) | ||
| 1688 | { | ||
| 1689 | int r; | ||
| 1690 | unsigned bank_num = mcg_cap & 0xff, bank; | ||
| 1691 | |||
| 1692 | r = -EINVAL; | ||
| 1693 | if (!bank_num) | ||
| 1694 | goto out; | ||
| 1695 | if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) | ||
| 1696 | goto out; | ||
| 1697 | r = 0; | ||
| 1698 | vcpu->arch.mcg_cap = mcg_cap; | ||
| 1699 | /* Init IA32_MCG_CTL to all 1s */ | ||
| 1700 | if (mcg_cap & MCG_CTL_P) | ||
| 1701 | vcpu->arch.mcg_ctl = ~(u64)0; | ||
| 1702 | /* Init IA32_MCi_CTL to all 1s */ | ||
| 1703 | for (bank = 0; bank < bank_num; bank++) | ||
| 1704 | vcpu->arch.mce_banks[bank*4] = ~(u64)0; | ||
| 1705 | out: | ||
| 1706 | return r; | ||
| 1707 | } | ||
| 1708 | |||
| 1709 | static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, | ||
| 1710 | struct kvm_x86_mce *mce) | ||
| 1711 | { | ||
| 1712 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
| 1713 | unsigned bank_num = mcg_cap & 0xff; | ||
| 1714 | u64 *banks = vcpu->arch.mce_banks; | ||
| 1715 | |||
| 1716 | if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) | ||
| 1717 | return -EINVAL; | ||
| 1718 | /* | ||
| 1719 | * if IA32_MCG_CTL is not all 1s, the uncorrected error | ||
| 1720 | * reporting is disabled | ||
| 1721 | */ | ||
| 1722 | if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && | ||
| 1723 | vcpu->arch.mcg_ctl != ~(u64)0) | ||
| 1724 | return 0; | ||
| 1725 | banks += 4 * mce->bank; | ||
| 1726 | /* | ||
| 1727 | * if IA32_MCi_CTL is not all 1s, the uncorrected error | ||
| 1728 | * reporting is disabled for the bank | ||
| 1729 | */ | ||
| 1730 | if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) | ||
| 1731 | return 0; | ||
| 1732 | if (mce->status & MCI_STATUS_UC) { | ||
| 1733 | if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || | ||
| 1734 | !(vcpu->arch.cr4 & X86_CR4_MCE)) { | ||
| 1735 | printk(KERN_DEBUG "kvm: set_mce: " | ||
| 1736 | "injects mce exception while " | ||
| 1737 | "previous one is in progress!\n"); | ||
| 1738 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
| 1739 | return 0; | ||
| 1740 | } | ||
| 1741 | if (banks[1] & MCI_STATUS_VAL) | ||
| 1742 | mce->status |= MCI_STATUS_OVER; | ||
| 1743 | banks[2] = mce->addr; | ||
| 1744 | banks[3] = mce->misc; | ||
| 1745 | vcpu->arch.mcg_status = mce->mcg_status; | ||
| 1746 | banks[1] = mce->status; | ||
| 1747 | kvm_queue_exception(vcpu, MC_VECTOR); | ||
| 1748 | } else if (!(banks[1] & MCI_STATUS_VAL) | ||
| 1749 | || !(banks[1] & MCI_STATUS_UC)) { | ||
| 1750 | if (banks[1] & MCI_STATUS_VAL) | ||
| 1751 | mce->status |= MCI_STATUS_OVER; | ||
| 1752 | banks[2] = mce->addr; | ||
| 1753 | banks[3] = mce->misc; | ||
| 1754 | banks[1] = mce->status; | ||
| 1755 | } else | ||
| 1756 | banks[1] |= MCI_STATUS_OVER; | ||
| 1757 | return 0; | ||
| 1758 | } | ||
| 1759 | |||
| 1506 | long kvm_arch_vcpu_ioctl(struct file *filp, | 1760 | long kvm_arch_vcpu_ioctl(struct file *filp, |
| 1507 | unsigned int ioctl, unsigned long arg) | 1761 | unsigned int ioctl, unsigned long arg) |
| 1508 | { | 1762 | { |
| @@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
| 1636 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); | 1890 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); |
| 1637 | break; | 1891 | break; |
| 1638 | } | 1892 | } |
| 1893 | case KVM_X86_SETUP_MCE: { | ||
| 1894 | u64 mcg_cap; | ||
| 1895 | |||
| 1896 | r = -EFAULT; | ||
| 1897 | if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) | ||
| 1898 | goto out; | ||
| 1899 | r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); | ||
| 1900 | break; | ||
| 1901 | } | ||
| 1902 | case KVM_X86_SET_MCE: { | ||
| 1903 | struct kvm_x86_mce mce; | ||
| 1904 | |||
| 1905 | r = -EFAULT; | ||
| 1906 | if (copy_from_user(&mce, argp, sizeof mce)) | ||
| 1907 | goto out; | ||
| 1908 | r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); | ||
| 1909 | break; | ||
| 1910 | } | ||
| 1639 | default: | 1911 | default: |
| 1640 | r = -EINVAL; | 1912 | r = -EINVAL; |
| 1641 | } | 1913 | } |
| @@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | |||
| 1654 | return ret; | 1926 | return ret; |
| 1655 | } | 1927 | } |
| 1656 | 1928 | ||
| 1929 | static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, | ||
| 1930 | u64 ident_addr) | ||
| 1931 | { | ||
| 1932 | kvm->arch.ept_identity_map_addr = ident_addr; | ||
| 1933 | return 0; | ||
| 1934 | } | ||
| 1935 | |||
| 1657 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | 1936 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, |
| 1658 | u32 kvm_nr_mmu_pages) | 1937 | u32 kvm_nr_mmu_pages) |
| 1659 | { | 1938 | { |
| @@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
| 1775 | r = 0; | 2054 | r = 0; |
| 1776 | switch (chip->chip_id) { | 2055 | switch (chip->chip_id) { |
| 1777 | case KVM_IRQCHIP_PIC_MASTER: | 2056 | case KVM_IRQCHIP_PIC_MASTER: |
| 2057 | spin_lock(&pic_irqchip(kvm)->lock); | ||
| 1778 | memcpy(&pic_irqchip(kvm)->pics[0], | 2058 | memcpy(&pic_irqchip(kvm)->pics[0], |
| 1779 | &chip->chip.pic, | 2059 | &chip->chip.pic, |
| 1780 | sizeof(struct kvm_pic_state)); | 2060 | sizeof(struct kvm_pic_state)); |
| 2061 | spin_unlock(&pic_irqchip(kvm)->lock); | ||
| 1781 | break; | 2062 | break; |
| 1782 | case KVM_IRQCHIP_PIC_SLAVE: | 2063 | case KVM_IRQCHIP_PIC_SLAVE: |
| 2064 | spin_lock(&pic_irqchip(kvm)->lock); | ||
| 1783 | memcpy(&pic_irqchip(kvm)->pics[1], | 2065 | memcpy(&pic_irqchip(kvm)->pics[1], |
| 1784 | &chip->chip.pic, | 2066 | &chip->chip.pic, |
| 1785 | sizeof(struct kvm_pic_state)); | 2067 | sizeof(struct kvm_pic_state)); |
| 2068 | spin_unlock(&pic_irqchip(kvm)->lock); | ||
| 1786 | break; | 2069 | break; |
| 1787 | case KVM_IRQCHIP_IOAPIC: | 2070 | case KVM_IRQCHIP_IOAPIC: |
| 2071 | mutex_lock(&kvm->irq_lock); | ||
| 1788 | memcpy(ioapic_irqchip(kvm), | 2072 | memcpy(ioapic_irqchip(kvm), |
| 1789 | &chip->chip.ioapic, | 2073 | &chip->chip.ioapic, |
| 1790 | sizeof(struct kvm_ioapic_state)); | 2074 | sizeof(struct kvm_ioapic_state)); |
| 2075 | mutex_unlock(&kvm->irq_lock); | ||
| 1791 | break; | 2076 | break; |
| 1792 | default: | 2077 | default: |
| 1793 | r = -EINVAL; | 2078 | r = -EINVAL; |
| @@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) | |||
| 1801 | { | 2086 | { |
| 1802 | int r = 0; | 2087 | int r = 0; |
| 1803 | 2088 | ||
| 2089 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
| 1804 | memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); | 2090 | memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); |
| 2091 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 1805 | return r; | 2092 | return r; |
| 1806 | } | 2093 | } |
| 1807 | 2094 | ||
| @@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) | |||
| 1809 | { | 2096 | { |
| 1810 | int r = 0; | 2097 | int r = 0; |
| 1811 | 2098 | ||
| 2099 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
| 1812 | memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); | 2100 | memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); |
| 1813 | kvm_pit_load_count(kvm, 0, ps->channels[0].count); | 2101 | kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); |
| 2102 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 2103 | return r; | ||
| 2104 | } | ||
| 2105 | |||
| 2106 | static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) | ||
| 2107 | { | ||
| 2108 | int r = 0; | ||
| 2109 | |||
| 2110 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
| 2111 | memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, | ||
| 2112 | sizeof(ps->channels)); | ||
| 2113 | ps->flags = kvm->arch.vpit->pit_state.flags; | ||
| 2114 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 2115 | return r; | ||
| 2116 | } | ||
| 2117 | |||
| 2118 | static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) | ||
| 2119 | { | ||
| 2120 | int r = 0, start = 0; | ||
| 2121 | u32 prev_legacy, cur_legacy; | ||
| 2122 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
| 2123 | prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; | ||
| 2124 | cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; | ||
| 2125 | if (!prev_legacy && cur_legacy) | ||
| 2126 | start = 1; | ||
| 2127 | memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, | ||
| 2128 | sizeof(kvm->arch.vpit->pit_state.channels)); | ||
| 2129 | kvm->arch.vpit->pit_state.flags = ps->flags; | ||
| 2130 | kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); | ||
| 2131 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 1814 | return r; | 2132 | return r; |
| 1815 | } | 2133 | } |
| 1816 | 2134 | ||
| @@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
| 1819 | { | 2137 | { |
| 1820 | if (!kvm->arch.vpit) | 2138 | if (!kvm->arch.vpit) |
| 1821 | return -ENXIO; | 2139 | return -ENXIO; |
| 2140 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
| 1822 | kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; | 2141 | kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; |
| 2142 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
| 1823 | return 0; | 2143 | return 0; |
| 1824 | } | 2144 | } |
| 1825 | 2145 | ||
| @@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
| 1845 | spin_lock(&kvm->mmu_lock); | 2165 | spin_lock(&kvm->mmu_lock); |
| 1846 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 2166 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
| 1847 | spin_unlock(&kvm->mmu_lock); | 2167 | spin_unlock(&kvm->mmu_lock); |
| 1848 | kvm_flush_remote_tlbs(kvm); | ||
| 1849 | memslot = &kvm->memslots[log->slot]; | 2168 | memslot = &kvm->memslots[log->slot]; |
| 1850 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | 2169 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; |
| 1851 | memset(memslot->dirty_bitmap, 0, n); | 2170 | memset(memslot->dirty_bitmap, 0, n); |
| @@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1869 | */ | 2188 | */ |
| 1870 | union { | 2189 | union { |
| 1871 | struct kvm_pit_state ps; | 2190 | struct kvm_pit_state ps; |
| 2191 | struct kvm_pit_state2 ps2; | ||
| 1872 | struct kvm_memory_alias alias; | 2192 | struct kvm_memory_alias alias; |
| 2193 | struct kvm_pit_config pit_config; | ||
| 1873 | } u; | 2194 | } u; |
| 1874 | 2195 | ||
| 1875 | switch (ioctl) { | 2196 | switch (ioctl) { |
| @@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1878 | if (r < 0) | 2199 | if (r < 0) |
| 1879 | goto out; | 2200 | goto out; |
| 1880 | break; | 2201 | break; |
| 2202 | case KVM_SET_IDENTITY_MAP_ADDR: { | ||
| 2203 | u64 ident_addr; | ||
| 2204 | |||
| 2205 | r = -EFAULT; | ||
| 2206 | if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) | ||
| 2207 | goto out; | ||
| 2208 | r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); | ||
| 2209 | if (r < 0) | ||
| 2210 | goto out; | ||
| 2211 | break; | ||
| 2212 | } | ||
| 1881 | case KVM_SET_MEMORY_REGION: { | 2213 | case KVM_SET_MEMORY_REGION: { |
| 1882 | struct kvm_memory_region kvm_mem; | 2214 | struct kvm_memory_region kvm_mem; |
| 1883 | struct kvm_userspace_memory_region kvm_userspace_mem; | 2215 | struct kvm_userspace_memory_region kvm_userspace_mem; |
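KVM_SET_IDENTITY_MAP_ADDR, paired with the KVM_CAP_SET_IDENTITY_MAP_ADDR capability above, simply records the guest physical address at which the EPT identity page table will be placed. A minimal userspace sketch; the address is only an example, vm_fd is assumed open, and the call is typically made before the first vcpu is created:

/* Userspace sketch: relocate the EPT identity map. */
__u64 ident_addr = 0xfeffc000;	/* example GPA below the BIOS area */

if (ioctl(vm_fd, KVM_SET_IDENTITY_MAP_ADDR, &ident_addr) < 0)
	perror("KVM_SET_IDENTITY_MAP_ADDR");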
| @@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1930 | } | 2262 | } |
| 1931 | break; | 2263 | break; |
| 1932 | case KVM_CREATE_PIT: | 2264 | case KVM_CREATE_PIT: |
| 1933 | mutex_lock(&kvm->lock); | 2265 | u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; |
| 2266 | goto create_pit; | ||
| 2267 | case KVM_CREATE_PIT2: | ||
| 2268 | r = -EFAULT; | ||
| 2269 | if (copy_from_user(&u.pit_config, argp, | ||
| 2270 | sizeof(struct kvm_pit_config))) | ||
| 2271 | goto out; | ||
| 2272 | create_pit: | ||
| 2273 | down_write(&kvm->slots_lock); | ||
| 1934 | r = -EEXIST; | 2274 | r = -EEXIST; |
| 1935 | if (kvm->arch.vpit) | 2275 | if (kvm->arch.vpit) |
| 1936 | goto create_pit_unlock; | 2276 | goto create_pit_unlock; |
| 1937 | r = -ENOMEM; | 2277 | r = -ENOMEM; |
| 1938 | kvm->arch.vpit = kvm_create_pit(kvm); | 2278 | kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); |
| 1939 | if (kvm->arch.vpit) | 2279 | if (kvm->arch.vpit) |
| 1940 | r = 0; | 2280 | r = 0; |
| 1941 | create_pit_unlock: | 2281 | create_pit_unlock: |
| 1942 | mutex_unlock(&kvm->lock); | 2282 | up_write(&kvm->slots_lock); |
| 1943 | break; | 2283 | break; |
| 1944 | case KVM_IRQ_LINE_STATUS: | 2284 | case KVM_IRQ_LINE_STATUS: |
| 1945 | case KVM_IRQ_LINE: { | 2285 | case KVM_IRQ_LINE: { |
| @@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1950 | goto out; | 2290 | goto out; |
| 1951 | if (irqchip_in_kernel(kvm)) { | 2291 | if (irqchip_in_kernel(kvm)) { |
| 1952 | __s32 status; | 2292 | __s32 status; |
| 1953 | mutex_lock(&kvm->lock); | 2293 | mutex_lock(&kvm->irq_lock); |
| 1954 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | 2294 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, |
| 1955 | irq_event.irq, irq_event.level); | 2295 | irq_event.irq, irq_event.level); |
| 1956 | mutex_unlock(&kvm->lock); | 2296 | mutex_unlock(&kvm->irq_lock); |
| 1957 | if (ioctl == KVM_IRQ_LINE_STATUS) { | 2297 | if (ioctl == KVM_IRQ_LINE_STATUS) { |
| 1958 | irq_event.status = status; | 2298 | irq_event.status = status; |
| 1959 | if (copy_to_user(argp, &irq_event, | 2299 | if (copy_to_user(argp, &irq_event, |
| @@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 2042 | r = 0; | 2382 | r = 0; |
| 2043 | break; | 2383 | break; |
| 2044 | } | 2384 | } |
| 2385 | case KVM_GET_PIT2: { | ||
| 2386 | r = -ENXIO; | ||
| 2387 | if (!kvm->arch.vpit) | ||
| 2388 | goto out; | ||
| 2389 | r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); | ||
| 2390 | if (r) | ||
| 2391 | goto out; | ||
| 2392 | r = -EFAULT; | ||
| 2393 | if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) | ||
| 2394 | goto out; | ||
| 2395 | r = 0; | ||
| 2396 | break; | ||
| 2397 | } | ||
| 2398 | case KVM_SET_PIT2: { | ||
| 2399 | r = -EFAULT; | ||
| 2400 | if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) | ||
| 2401 | goto out; | ||
| 2402 | r = -ENXIO; | ||
| 2403 | if (!kvm->arch.vpit) | ||
| 2404 | goto out; | ||
| 2405 | r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); | ||
| 2406 | if (r) | ||
| 2407 | goto out; | ||
| 2408 | r = 0; | ||
| 2409 | break; | ||
| 2410 | } | ||
| 2045 | case KVM_REINJECT_CONTROL: { | 2411 | case KVM_REINJECT_CONTROL: { |
| 2046 | struct kvm_reinject_control control; | 2412 | struct kvm_reinject_control control; |
| 2047 | r = -EFAULT; | 2413 | r = -EFAULT; |
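KVM_CREATE_PIT2 takes a struct kvm_pit_config (flags only for now, e.g. the speaker dummy), while KVM_GET_PIT2/KVM_SET_PIT2 add a flags word, currently just KVM_PIT_FLAGS_HPET_LEGACY, on top of the old channel state. A hedged round trip on an existing vm_fd:

/* Userspace sketch, not from the commit: create the PIT, then flip HPET legacy mode. */
struct kvm_pit_config cfg = { .flags = 0 };
struct kvm_pit_state2 ps2;

if (ioctl(vm_fd, KVM_CREATE_PIT2, &cfg) < 0)
	perror("KVM_CREATE_PIT2");
if (ioctl(vm_fd, KVM_GET_PIT2, &ps2) == 0) {
	ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;	/* re-arms channel 0 on this transition */
	if (ioctl(vm_fd, KVM_SET_PIT2, &ps2) < 0)
		perror("KVM_SET_PIT2");
}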
| @@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void) | |||
| 2075 | num_msrs_to_save = j; | 2441 | num_msrs_to_save = j; |
| 2076 | } | 2442 | } |
| 2077 | 2443 | ||
| 2078 | /* | 2444 | static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, |
| 2079 | * Only apic need an MMIO device hook, so shortcut now.. | 2445 | const void *v) |
| 2080 | */ | ||
| 2081 | static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | ||
| 2082 | gpa_t addr, int len, | ||
| 2083 | int is_write) | ||
| 2084 | { | 2446 | { |
| 2085 | struct kvm_io_device *dev; | 2447 | if (vcpu->arch.apic && |
| 2448 | !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) | ||
| 2449 | return 0; | ||
| 2086 | 2450 | ||
| 2087 | if (vcpu->arch.apic) { | 2451 | return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); |
| 2088 | dev = &vcpu->arch.apic->dev; | ||
| 2089 | if (dev->in_range(dev, addr, len, is_write)) | ||
| 2090 | return dev; | ||
| 2091 | } | ||
| 2092 | return NULL; | ||
| 2093 | } | 2452 | } |
| 2094 | 2453 | ||
| 2095 | 2454 | static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) | |
| 2096 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
| 2097 | gpa_t addr, int len, | ||
| 2098 | int is_write) | ||
| 2099 | { | 2455 | { |
| 2100 | struct kvm_io_device *dev; | 2456 | if (vcpu->arch.apic && |
| 2457 | !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) | ||
| 2458 | return 0; | ||
| 2101 | 2459 | ||
| 2102 | dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); | 2460 | return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); |
| 2103 | if (dev == NULL) | ||
| 2104 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, | ||
| 2105 | is_write); | ||
| 2106 | return dev; | ||
| 2107 | } | 2461 | } |
| 2108 | 2462 | ||
| 2109 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, | 2463 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, |
| @@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr, | |||
| 2172 | unsigned int bytes, | 2526 | unsigned int bytes, |
| 2173 | struct kvm_vcpu *vcpu) | 2527 | struct kvm_vcpu *vcpu) |
| 2174 | { | 2528 | { |
| 2175 | struct kvm_io_device *mmio_dev; | ||
| 2176 | gpa_t gpa; | 2529 | gpa_t gpa; |
| 2177 | 2530 | ||
| 2178 | if (vcpu->mmio_read_completed) { | 2531 | if (vcpu->mmio_read_completed) { |
| 2179 | memcpy(val, vcpu->mmio_data, bytes); | 2532 | memcpy(val, vcpu->mmio_data, bytes); |
| 2533 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, | ||
| 2534 | vcpu->mmio_phys_addr, *(u64 *)val); | ||
| 2180 | vcpu->mmio_read_completed = 0; | 2535 | vcpu->mmio_read_completed = 0; |
| 2181 | return X86EMUL_CONTINUE; | 2536 | return X86EMUL_CONTINUE; |
| 2182 | } | 2537 | } |
| @@ -2197,14 +2552,12 @@ mmio: | |||
| 2197 | /* | 2552 | /* |
| 2198 | * Is this MMIO handled locally? | 2553 | * Is this MMIO handled locally? |
| 2199 | */ | 2554 | */ |
| 2200 | mutex_lock(&vcpu->kvm->lock); | 2555 | if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { |
| 2201 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); | 2556 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); |
| 2202 | if (mmio_dev) { | ||
| 2203 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | ||
| 2204 | mutex_unlock(&vcpu->kvm->lock); | ||
| 2205 | return X86EMUL_CONTINUE; | 2557 | return X86EMUL_CONTINUE; |
| 2206 | } | 2558 | } |
| 2207 | mutex_unlock(&vcpu->kvm->lock); | 2559 | |
| 2560 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); | ||
| 2208 | 2561 | ||
| 2209 | vcpu->mmio_needed = 1; | 2562 | vcpu->mmio_needed = 1; |
| 2210 | vcpu->mmio_phys_addr = gpa; | 2563 | vcpu->mmio_phys_addr = gpa; |
| @@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
| 2231 | unsigned int bytes, | 2584 | unsigned int bytes, |
| 2232 | struct kvm_vcpu *vcpu) | 2585 | struct kvm_vcpu *vcpu) |
| 2233 | { | 2586 | { |
| 2234 | struct kvm_io_device *mmio_dev; | ||
| 2235 | gpa_t gpa; | 2587 | gpa_t gpa; |
| 2236 | 2588 | ||
| 2237 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 2589 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
| @@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
| 2249 | return X86EMUL_CONTINUE; | 2601 | return X86EMUL_CONTINUE; |
| 2250 | 2602 | ||
| 2251 | mmio: | 2603 | mmio: |
| 2604 | trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); | ||
| 2252 | /* | 2605 | /* |
| 2253 | * Is this MMIO handled locally? | 2606 | * Is this MMIO handled locally? |
| 2254 | */ | 2607 | */ |
| 2255 | mutex_lock(&vcpu->kvm->lock); | 2608 | if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) |
| 2256 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); | ||
| 2257 | if (mmio_dev) { | ||
| 2258 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | ||
| 2259 | mutex_unlock(&vcpu->kvm->lock); | ||
| 2260 | return X86EMUL_CONTINUE; | 2609 | return X86EMUL_CONTINUE; |
| 2261 | } | ||
| 2262 | mutex_unlock(&vcpu->kvm->lock); | ||
| 2263 | 2610 | ||
| 2264 | vcpu->mmio_needed = 1; | 2611 | vcpu->mmio_needed = 1; |
| 2265 | vcpu->mmio_phys_addr = gpa; | 2612 | vcpu->mmio_phys_addr = gpa; |
| @@ -2343,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | |||
| 2343 | 2690 | ||
| 2344 | int emulate_clts(struct kvm_vcpu *vcpu) | 2691 | int emulate_clts(struct kvm_vcpu *vcpu) |
| 2345 | { | 2692 | { |
| 2346 | KVMTRACE_0D(CLTS, vcpu, handler); | ||
| 2347 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); | 2693 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); |
| 2348 | return X86EMUL_CONTINUE; | 2694 | return X86EMUL_CONTINUE; |
| 2349 | } | 2695 | } |
| @@ -2420,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
| 2420 | kvm_clear_exception_queue(vcpu); | 2766 | kvm_clear_exception_queue(vcpu); |
| 2421 | vcpu->arch.mmio_fault_cr2 = cr2; | 2767 | vcpu->arch.mmio_fault_cr2 = cr2; |
| 2422 | /* | 2768 | /* |
| 2423 | * TODO: fix x86_emulate.c to use guest_read/write_register | 2769 | * TODO: fix emulate.c to use guest_read/write_register |
| 2424 | * instead of direct ->regs accesses, can save hundred cycles | 2770 | * instead of direct ->regs accesses, can save hundred cycles |
| 2425 | * on Intel for instructions that don't read/change RSP, for | 2771 | * on Intel for instructions that don't read/change RSP, for |
| 2426 | * for example. | 2772 | * for example. |
| @@ -2444,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
| 2444 | 2790 | ||
| 2445 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 2791 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
| 2446 | 2792 | ||
| 2447 | /* Reject the instructions other than VMCALL/VMMCALL when | 2793 | /* Only allow emulation of specific instructions on #UD |
| 2448 | * try to emulate invalid opcode */ | 2794 | * (namely VMMCALL, sysenter, sysexit, syscall)*/ |
| 2449 | c = &vcpu->arch.emulate_ctxt.decode; | 2795 | c = &vcpu->arch.emulate_ctxt.decode; |
| 2450 | if ((emulation_type & EMULTYPE_TRAP_UD) && | 2796 | if (emulation_type & EMULTYPE_TRAP_UD) { |
| 2451 | (!(c->twobyte && c->b == 0x01 && | 2797 | if (!c->twobyte) |
| 2452 | (c->modrm_reg == 0 || c->modrm_reg == 3) && | 2798 | return EMULATE_FAIL; |
| 2453 | c->modrm_mod == 3 && c->modrm_rm == 1))) | 2799 | switch (c->b) { |
| 2454 | return EMULATE_FAIL; | 2800 | case 0x01: /* VMMCALL */ |
| 2801 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
| 2802 | return EMULATE_FAIL; | ||
| 2803 | break; | ||
| 2804 | case 0x34: /* sysenter */ | ||
| 2805 | case 0x35: /* sysexit */ | ||
| 2806 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
| 2807 | return EMULATE_FAIL; | ||
| 2808 | break; | ||
| 2809 | case 0x05: /* syscall */ | ||
| 2810 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
| 2811 | return EMULATE_FAIL; | ||
| 2812 | break; | ||
| 2813 | default: | ||
| 2814 | return EMULATE_FAIL; | ||
| 2815 | } | ||
| 2816 | |||
| 2817 | if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) | ||
| 2818 | return EMULATE_FAIL; | ||
| 2819 | } | ||
| 2455 | 2820 | ||
| 2456 | ++vcpu->stat.insn_emulation; | 2821 | ++vcpu->stat.insn_emulation; |
| 2457 | if (r) { | 2822 | if (r) { |
| @@ -2571,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
| 2571 | return 0; | 2936 | return 0; |
| 2572 | } | 2937 | } |
| 2573 | 2938 | ||
| 2574 | static void kernel_pio(struct kvm_io_device *pio_dev, | 2939 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
| 2575 | struct kvm_vcpu *vcpu, | ||
| 2576 | void *pd) | ||
| 2577 | { | 2940 | { |
| 2578 | /* TODO: String I/O for in kernel device */ | 2941 | /* TODO: String I/O for in kernel device */ |
| 2942 | int r; | ||
| 2579 | 2943 | ||
| 2580 | mutex_lock(&vcpu->kvm->lock); | ||
| 2581 | if (vcpu->arch.pio.in) | 2944 | if (vcpu->arch.pio.in) |
| 2582 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, | 2945 | r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, |
| 2583 | vcpu->arch.pio.size, | 2946 | vcpu->arch.pio.size, pd); |
| 2584 | pd); | ||
| 2585 | else | 2947 | else |
| 2586 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, | 2948 | r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, |
| 2587 | vcpu->arch.pio.size, | 2949 | vcpu->arch.pio.size, pd); |
| 2588 | pd); | 2950 | return r; |
| 2589 | mutex_unlock(&vcpu->kvm->lock); | ||
| 2590 | } | 2951 | } |
| 2591 | 2952 | ||
| 2592 | static void pio_string_write(struct kvm_io_device *pio_dev, | 2953 | static int pio_string_write(struct kvm_vcpu *vcpu) |
| 2593 | struct kvm_vcpu *vcpu) | ||
| 2594 | { | 2954 | { |
| 2595 | struct kvm_pio_request *io = &vcpu->arch.pio; | 2955 | struct kvm_pio_request *io = &vcpu->arch.pio; |
| 2596 | void *pd = vcpu->arch.pio_data; | 2956 | void *pd = vcpu->arch.pio_data; |
| 2597 | int i; | 2957 | int i, r = 0; |
| 2598 | 2958 | ||
| 2599 | mutex_lock(&vcpu->kvm->lock); | ||
| 2600 | for (i = 0; i < io->cur_count; i++) { | 2959 | for (i = 0; i < io->cur_count; i++) { |
| 2601 | kvm_iodevice_write(pio_dev, io->port, | 2960 | if (kvm_io_bus_write(&vcpu->kvm->pio_bus, |
| 2602 | io->size, | 2961 | io->port, io->size, pd)) { |
| 2603 | pd); | 2962 | r = -EOPNOTSUPP; |
| 2963 | break; | ||
| 2964 | } | ||
| 2604 | pd += io->size; | 2965 | pd += io->size; |
| 2605 | } | 2966 | } |
| 2606 | mutex_unlock(&vcpu->kvm->lock); | 2967 | return r; |
| 2607 | } | ||
| 2608 | |||
| 2609 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | ||
| 2610 | gpa_t addr, int len, | ||
| 2611 | int is_write) | ||
| 2612 | { | ||
| 2613 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); | ||
| 2614 | } | 2968 | } |
| 2615 | 2969 | ||
| 2616 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 2970 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
| 2617 | int size, unsigned port) | 2971 | int size, unsigned port) |
| 2618 | { | 2972 | { |
| 2619 | struct kvm_io_device *pio_dev; | ||
| 2620 | unsigned long val; | 2973 | unsigned long val; |
| 2621 | 2974 | ||
| 2622 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2975 | vcpu->run->exit_reason = KVM_EXIT_IO; |
| @@ -2630,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 2630 | vcpu->arch.pio.down = 0; | 2983 | vcpu->arch.pio.down = 0; |
| 2631 | vcpu->arch.pio.rep = 0; | 2984 | vcpu->arch.pio.rep = 0; |
| 2632 | 2985 | ||
| 2633 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | 2986 | trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, |
| 2634 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | 2987 | size, 1); |
| 2635 | handler); | ||
| 2636 | else | ||
| 2637 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
| 2638 | handler); | ||
| 2639 | 2988 | ||
| 2640 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); | 2989 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
| 2641 | memcpy(vcpu->arch.pio_data, &val, 4); | 2990 | memcpy(vcpu->arch.pio_data, &val, 4); |
| 2642 | 2991 | ||
| 2643 | pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); | 2992 | if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { |
| 2644 | if (pio_dev) { | ||
| 2645 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); | ||
| 2646 | complete_pio(vcpu); | 2993 | complete_pio(vcpu); |
| 2647 | return 1; | 2994 | return 1; |
| 2648 | } | 2995 | } |
| @@ -2656,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 2656 | { | 3003 | { |
| 2657 | unsigned now, in_page; | 3004 | unsigned now, in_page; |
| 2658 | int ret = 0; | 3005 | int ret = 0; |
| 2659 | struct kvm_io_device *pio_dev; | ||
| 2660 | 3006 | ||
| 2661 | vcpu->run->exit_reason = KVM_EXIT_IO; | 3007 | vcpu->run->exit_reason = KVM_EXIT_IO; |
| 2662 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 3008 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
| @@ -2669,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 2669 | vcpu->arch.pio.down = down; | 3015 | vcpu->arch.pio.down = down; |
| 2670 | vcpu->arch.pio.rep = rep; | 3016 | vcpu->arch.pio.rep = rep; |
| 2671 | 3017 | ||
| 2672 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | 3018 | trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, |
| 2673 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | 3019 | size, count); |
| 2674 | handler); | ||
| 2675 | else | ||
| 2676 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
| 2677 | handler); | ||
| 2678 | 3020 | ||
| 2679 | if (!count) { | 3021 | if (!count) { |
| 2680 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3022 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| @@ -2704,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 2704 | 3046 | ||
| 2705 | vcpu->arch.pio.guest_gva = address; | 3047 | vcpu->arch.pio.guest_gva = address; |
| 2706 | 3048 | ||
| 2707 | pio_dev = vcpu_find_pio_dev(vcpu, port, | ||
| 2708 | vcpu->arch.pio.cur_count, | ||
| 2709 | !vcpu->arch.pio.in); | ||
| 2710 | if (!vcpu->arch.pio.in) { | 3049 | if (!vcpu->arch.pio.in) { |
| 2711 | /* string PIO write */ | 3050 | /* string PIO write */ |
| 2712 | ret = pio_copy_data(vcpu); | 3051 | ret = pio_copy_data(vcpu); |
| @@ -2714,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
| 2714 | kvm_inject_gp(vcpu, 0); | 3053 | kvm_inject_gp(vcpu, 0); |
| 2715 | return 1; | 3054 | return 1; |
| 2716 | } | 3055 | } |
| 2717 | if (ret == 0 && pio_dev) { | 3056 | if (ret == 0 && !pio_string_write(vcpu)) { |
| 2718 | pio_string_write(pio_dev, vcpu); | ||
| 2719 | complete_pio(vcpu); | 3057 | complete_pio(vcpu); |
| 2720 | if (vcpu->arch.pio.count == 0) | 3058 | if (vcpu->arch.pio.count == 0) |
| 2721 | ret = 1; | 3059 | ret = 1; |
| 2722 | } | 3060 | } |
| 2723 | } else if (pio_dev) | 3061 | } |
| 2724 | pr_unimpl(vcpu, "no string pio read support yet, " | 3062 | /* no string PIO read support yet */ |
| 2725 | "port %x size %d count %ld\n", | ||
| 2726 | port, size, count); | ||
| 2727 | 3063 | ||
| 2728 | return ret; | 3064 | return ret; |
| 2729 | } | 3065 | } |
| @@ -2756,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
| 2756 | 3092 | ||
| 2757 | spin_lock(&kvm_lock); | 3093 | spin_lock(&kvm_lock); |
| 2758 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3094 | list_for_each_entry(kvm, &vm_list, vm_list) { |
| 2759 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 3095 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 2760 | vcpu = kvm->vcpus[i]; | ||
| 2761 | if (!vcpu) | ||
| 2762 | continue; | ||
| 2763 | if (vcpu->cpu != freq->cpu) | 3096 | if (vcpu->cpu != freq->cpu) |
| 2764 | continue; | 3097 | continue; |
| 2765 | if (!kvm_request_guest_time_update(vcpu)) | 3098 | if (!kvm_request_guest_time_update(vcpu)) |
| @@ -2852,7 +3185,6 @@ void kvm_arch_exit(void) | |||
| 2852 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | 3185 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) |
| 2853 | { | 3186 | { |
| 2854 | ++vcpu->stat.halt_exits; | 3187 | ++vcpu->stat.halt_exits; |
| 2855 | KVMTRACE_0D(HLT, vcpu, handler); | ||
| 2856 | if (irqchip_in_kernel(vcpu->kvm)) { | 3188 | if (irqchip_in_kernel(vcpu->kvm)) { |
| 2857 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | 3189 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
| 2858 | return 1; | 3190 | return 1; |
| @@ -2883,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
| 2883 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); | 3215 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); |
| 2884 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); | 3216 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); |
| 2885 | 3217 | ||
| 2886 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | 3218 | trace_kvm_hypercall(nr, a0, a1, a2, a3); |
| 2887 | 3219 | ||
| 2888 | if (!is_long_mode(vcpu)) { | 3220 | if (!is_long_mode(vcpu)) { |
| 2889 | nr &= 0xFFFFFFFF; | 3221 | nr &= 0xFFFFFFFF; |
| @@ -2893,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
| 2893 | a3 &= 0xFFFFFFFF; | 3225 | a3 &= 0xFFFFFFFF; |
| 2894 | } | 3226 | } |
| 2895 | 3227 | ||
| 3228 | if (kvm_x86_ops->get_cpl(vcpu) != 0) { | ||
| 3229 | ret = -KVM_EPERM; | ||
| 3230 | goto out; | ||
| 3231 | } | ||
| 3232 | |||
| 2896 | switch (nr) { | 3233 | switch (nr) { |
| 2897 | case KVM_HC_VAPIC_POLL_IRQ: | 3234 | case KVM_HC_VAPIC_POLL_IRQ: |
| 2898 | ret = 0; | 3235 | ret = 0; |
| @@ -2904,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
| 2904 | ret = -KVM_ENOSYS; | 3241 | ret = -KVM_ENOSYS; |
| 2905 | break; | 3242 | break; |
| 2906 | } | 3243 | } |
| 3244 | out: | ||
| 2907 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); | 3245 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); |
| 2908 | ++vcpu->stat.hypercalls; | 3246 | ++vcpu->stat.hypercalls; |
| 2909 | return r; | 3247 | return r; |
| @@ -2983,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | |||
| 2983 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 3321 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
| 2984 | return 0; | 3322 | return 0; |
| 2985 | } | 3323 | } |
| 2986 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, | ||
| 2987 | (u32)((u64)value >> 32), handler); | ||
| 2988 | 3324 | ||
| 2989 | return value; | 3325 | return value; |
| 2990 | } | 3326 | } |
| @@ -2992,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | |||
| 2992 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | 3328 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, |
| 2993 | unsigned long *rflags) | 3329 | unsigned long *rflags) |
| 2994 | { | 3330 | { |
| 2995 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, | ||
| 2996 | (u32)((u64)val >> 32), handler); | ||
| 2997 | |||
| 2998 | switch (cr) { | 3331 | switch (cr) { |
| 2999 | case 0: | 3332 | case 0: |
| 3000 | kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | 3333 | kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); |
| @@ -3104,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
| 3104 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | 3437 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); |
| 3105 | } | 3438 | } |
| 3106 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3439 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| 3107 | KVMTRACE_5D(CPUID, vcpu, function, | 3440 | trace_kvm_cpuid(function, |
| 3108 | (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), | 3441 | kvm_register_read(vcpu, VCPU_REGS_RAX), |
| 3109 | (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), | 3442 | kvm_register_read(vcpu, VCPU_REGS_RBX), |
| 3110 | (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), | 3443 | kvm_register_read(vcpu, VCPU_REGS_RCX), |
| 3111 | (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); | 3444 | kvm_register_read(vcpu, VCPU_REGS_RDX)); |
| 3112 | } | 3445 | } |
| 3113 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 3446 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
| 3114 | 3447 | ||
| @@ -3174,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) | |||
| 3174 | if (!kvm_x86_ops->update_cr8_intercept) | 3507 | if (!kvm_x86_ops->update_cr8_intercept) |
| 3175 | return; | 3508 | return; |
| 3176 | 3509 | ||
| 3510 | if (!vcpu->arch.apic) | ||
| 3511 | return; | ||
| 3512 | |||
| 3177 | if (!vcpu->arch.apic->vapic_addr) | 3513 | if (!vcpu->arch.apic->vapic_addr) |
| 3178 | max_irr = kvm_lapic_find_highest_irr(vcpu); | 3514 | max_irr = kvm_lapic_find_highest_irr(vcpu); |
| 3179 | else | 3515 | else |
| @@ -3187,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) | |||
| 3187 | kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); | 3523 | kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); |
| 3188 | } | 3524 | } |
| 3189 | 3525 | ||
| 3190 | static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3526 | static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 3191 | { | 3527 | { |
| 3192 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
| 3193 | kvm_x86_ops->set_interrupt_shadow(vcpu, 0); | ||
| 3194 | |||
| 3195 | /* try to reinject previous events if any */ | 3528 | /* try to reinject previous events if any */ |
| 3529 | if (vcpu->arch.exception.pending) { | ||
| 3530 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, | ||
| 3531 | vcpu->arch.exception.has_error_code, | ||
| 3532 | vcpu->arch.exception.error_code); | ||
| 3533 | return; | ||
| 3534 | } | ||
| 3535 | |||
| 3196 | if (vcpu->arch.nmi_injected) { | 3536 | if (vcpu->arch.nmi_injected) { |
| 3197 | kvm_x86_ops->set_nmi(vcpu); | 3537 | kvm_x86_ops->set_nmi(vcpu); |
| 3198 | return; | 3538 | return; |
| @@ -3266,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3266 | smp_mb__after_clear_bit(); | 3606 | smp_mb__after_clear_bit(); |
| 3267 | 3607 | ||
| 3268 | if (vcpu->requests || need_resched() || signal_pending(current)) { | 3608 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
| 3609 | set_bit(KVM_REQ_KICK, &vcpu->requests); | ||
| 3269 | local_irq_enable(); | 3610 | local_irq_enable(); |
| 3270 | preempt_enable(); | 3611 | preempt_enable(); |
| 3271 | r = 1; | 3612 | r = 1; |
| 3272 | goto out; | 3613 | goto out; |
| 3273 | } | 3614 | } |
| 3274 | 3615 | ||
| 3275 | if (vcpu->arch.exception.pending) | 3616 | inject_pending_event(vcpu, kvm_run); |
| 3276 | __queue_exception(vcpu); | ||
| 3277 | else | ||
| 3278 | inject_pending_irq(vcpu, kvm_run); | ||
| 3279 | 3617 | ||
| 3280 | /* enable NMI/IRQ window open exits if needed */ | 3618 | /* enable NMI/IRQ window open exits if needed */ |
| 3281 | if (vcpu->arch.nmi_pending) | 3619 | if (vcpu->arch.nmi_pending) |
| @@ -3292,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3292 | 3630 | ||
| 3293 | kvm_guest_enter(); | 3631 | kvm_guest_enter(); |
| 3294 | 3632 | ||
| 3295 | get_debugreg(vcpu->arch.host_dr6, 6); | ||
| 3296 | get_debugreg(vcpu->arch.host_dr7, 7); | ||
| 3297 | if (unlikely(vcpu->arch.switch_db_regs)) { | 3633 | if (unlikely(vcpu->arch.switch_db_regs)) { |
| 3298 | get_debugreg(vcpu->arch.host_db[0], 0); | ||
| 3299 | get_debugreg(vcpu->arch.host_db[1], 1); | ||
| 3300 | get_debugreg(vcpu->arch.host_db[2], 2); | ||
| 3301 | get_debugreg(vcpu->arch.host_db[3], 3); | ||
| 3302 | |||
| 3303 | set_debugreg(0, 7); | 3634 | set_debugreg(0, 7); |
| 3304 | set_debugreg(vcpu->arch.eff_db[0], 0); | 3635 | set_debugreg(vcpu->arch.eff_db[0], 0); |
| 3305 | set_debugreg(vcpu->arch.eff_db[1], 1); | 3636 | set_debugreg(vcpu->arch.eff_db[1], 1); |
| @@ -3307,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 3307 | set_debugreg(vcpu->arch.eff_db[3], 3); | 3638 | set_debugreg(vcpu->arch.eff_db[3], 3); |
| 3308 | } | 3639 | } |
| 3309 | 3640 | ||
| 3310 | KVMTRACE_0D(VMENTRY, vcpu, entryexit); | 3641 | trace_kvm_entry(vcpu->vcpu_id); |
| 3311 | kvm_x86_ops->run(vcpu, kvm_run); | 3642 | kvm_x86_ops->run(vcpu, kvm_run); |
| 3312 | 3643 | ||
| 3313 | if (unlikely(vcpu->arch.switch_db_regs)) { | 3644 | if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { |
| 3314 | set_debugreg(0, 7); | 3645 | set_debugreg(current->thread.debugreg0, 0); |
| 3315 | set_debugreg(vcpu->arch.host_db[0], 0); | 3646 | set_debugreg(current->thread.debugreg1, 1); |
| 3316 | set_debugreg(vcpu->arch.host_db[1], 1); | 3647 | set_debugreg(current->thread.debugreg2, 2); |
| 3317 | set_debugreg(vcpu->arch.host_db[2], 2); | 3648 | set_debugreg(current->thread.debugreg3, 3); |
| 3318 | set_debugreg(vcpu->arch.host_db[3], 3); | 3649 | set_debugreg(current->thread.debugreg6, 6); |
| 3650 | set_debugreg(current->thread.debugreg7, 7); | ||
| 3319 | } | 3651 | } |
| 3320 | set_debugreg(vcpu->arch.host_dr6, 6); | ||
| 3321 | set_debugreg(vcpu->arch.host_dr7, 7); | ||
| 3322 | 3652 | ||
| 3323 | set_bit(KVM_REQ_KICK, &vcpu->requests); | 3653 | set_bit(KVM_REQ_KICK, &vcpu->requests); |
| 3324 | local_irq_enable(); | 3654 | local_irq_enable(); |
| @@ -3648,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, | |||
| 3648 | static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, | 3978 | static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, |
| 3649 | struct kvm_segment *kvm_desct) | 3979 | struct kvm_segment *kvm_desct) |
| 3650 | { | 3980 | { |
| 3651 | kvm_desct->base = seg_desc->base0; | 3981 | kvm_desct->base = get_desc_base(seg_desc); |
| 3652 | kvm_desct->base |= seg_desc->base1 << 16; | 3982 | kvm_desct->limit = get_desc_limit(seg_desc); |
| 3653 | kvm_desct->base |= seg_desc->base2 << 24; | ||
| 3654 | kvm_desct->limit = seg_desc->limit0; | ||
| 3655 | kvm_desct->limit |= seg_desc->limit << 16; | ||
| 3656 | if (seg_desc->g) { | 3983 | if (seg_desc->g) { |
| 3657 | kvm_desct->limit <<= 12; | 3984 | kvm_desct->limit <<= 12; |
| 3658 | kvm_desct->limit |= 0xfff; | 3985 | kvm_desct->limit |= 0xfff; |
| @@ -3696,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, | |||
| 3696 | static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4023 | static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
| 3697 | struct desc_struct *seg_desc) | 4024 | struct desc_struct *seg_desc) |
| 3698 | { | 4025 | { |
| 3699 | gpa_t gpa; | ||
| 3700 | struct descriptor_table dtable; | 4026 | struct descriptor_table dtable; |
| 3701 | u16 index = selector >> 3; | 4027 | u16 index = selector >> 3; |
| 3702 | 4028 | ||
| @@ -3706,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
| 3706 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); | 4032 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); |
| 3707 | return 1; | 4033 | return 1; |
| 3708 | } | 4034 | } |
| 3709 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); | 4035 | return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); |
| 3710 | gpa += index * 8; | ||
| 3711 | return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); | ||
| 3712 | } | 4036 | } |
| 3713 | 4037 | ||
| 3714 | /* allowed just for 8 bytes segments */ | 4038 | /* allowed just for 8 bytes segments */ |
| 3715 | static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4039 | static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
| 3716 | struct desc_struct *seg_desc) | 4040 | struct desc_struct *seg_desc) |
| 3717 | { | 4041 | { |
| 3718 | gpa_t gpa; | ||
| 3719 | struct descriptor_table dtable; | 4042 | struct descriptor_table dtable; |
| 3720 | u16 index = selector >> 3; | 4043 | u16 index = selector >> 3; |
| 3721 | 4044 | ||
| @@ -3723,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
| 3723 | 4046 | ||
| 3724 | if (dtable.limit < index * 8 + 7) | 4047 | if (dtable.limit < index * 8 + 7) |
| 3725 | return 1; | 4048 | return 1; |
| 3726 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); | 4049 | return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); |
| 3727 | gpa += index * 8; | ||
| 3728 | return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); | ||
| 3729 | } | 4050 | } |
| 3730 | 4051 | ||
| 3731 | static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, | 4052 | static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, |
| 3732 | struct desc_struct *seg_desc) | 4053 | struct desc_struct *seg_desc) |
| 3733 | { | 4054 | { |
| 3734 | u32 base_addr; | 4055 | u32 base_addr = get_desc_base(seg_desc); |
| 3735 | |||
| 3736 | base_addr = seg_desc->base0; | ||
| 3737 | base_addr |= (seg_desc->base1 << 16); | ||
| 3738 | base_addr |= (seg_desc->base2 << 24); | ||
| 3739 | 4056 | ||
| 3740 | return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); | 4057 | return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); |
| 3741 | } | 4058 | } |
| @@ -3780,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se | |||
| 3780 | return 0; | 4097 | return 0; |
| 3781 | } | 4098 | } |
| 3782 | 4099 | ||
| 4100 | static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) | ||
| 4101 | { | ||
| 4102 | return (seg != VCPU_SREG_LDTR) && | ||
| 4103 | (seg != VCPU_SREG_TR) && | ||
| 4104 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); | ||
| 4105 | } | ||
| 4106 | |||
| 3783 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4107 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
| 3784 | int type_bits, int seg) | 4108 | int type_bits, int seg) |
| 3785 | { | 4109 | { |
| 3786 | struct kvm_segment kvm_seg; | 4110 | struct kvm_segment kvm_seg; |
| 3787 | 4111 | ||
| 3788 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) | 4112 | if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) |
| 3789 | return kvm_load_realmode_segment(vcpu, selector, seg); | 4113 | return kvm_load_realmode_segment(vcpu, selector, seg); |
| 3790 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | 4114 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) |
| 3791 | return 1; | 4115 | return 1; |
| @@ -4024,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
| 4024 | } | 4348 | } |
| 4025 | } | 4349 | } |
| 4026 | 4350 | ||
| 4027 | if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { | 4351 | if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { |
| 4028 | kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); | 4352 | kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); |
| 4029 | return 1; | 4353 | return 1; |
| 4030 | } | 4354 | } |
| @@ -4094,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 4094 | 4418 | ||
| 4095 | vcpu->arch.cr2 = sregs->cr2; | 4419 | vcpu->arch.cr2 = sregs->cr2; |
| 4096 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 4420 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
| 4097 | 4421 | vcpu->arch.cr3 = sregs->cr3; | |
| 4098 | down_read(&vcpu->kvm->slots_lock); | ||
| 4099 | if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) | ||
| 4100 | vcpu->arch.cr3 = sregs->cr3; | ||
| 4101 | else | ||
| 4102 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
| 4103 | up_read(&vcpu->kvm->slots_lock); | ||
| 4104 | 4422 | ||
| 4105 | kvm_set_cr8(vcpu, sregs->cr8); | 4423 | kvm_set_cr8(vcpu, sregs->cr8); |
| 4106 | 4424 | ||
| @@ -4142,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
| 4142 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | 4460 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); |
| 4143 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | 4461 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); |
| 4144 | 4462 | ||
| 4463 | update_cr8_intercept(vcpu); | ||
| 4464 | |||
| 4145 | /* Older userspace won't unhalt the vcpu on reset. */ | 4465 | /* Older userspace won't unhalt the vcpu on reset. */ |
| 4146 | if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && | 4466 | if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && |
| 4147 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && | 4467 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && |
| 4148 | !(vcpu->arch.cr0 & X86_CR0_PE)) | 4468 | !(vcpu->arch.cr0 & X86_CR0_PE)) |
| 4149 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 4469 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
| @@ -4414,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
| 4414 | kvm = vcpu->kvm; | 4734 | kvm = vcpu->kvm; |
| 4415 | 4735 | ||
| 4416 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 4736 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
| 4417 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) | 4737 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
| 4418 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 4738 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
| 4419 | else | 4739 | else |
| 4420 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; | 4740 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; |
| @@ -4436,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
| 4436 | goto fail_mmu_destroy; | 4756 | goto fail_mmu_destroy; |
| 4437 | } | 4757 | } |
| 4438 | 4758 | ||
| 4759 | vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, | ||
| 4760 | GFP_KERNEL); | ||
| 4761 | if (!vcpu->arch.mce_banks) { | ||
| 4762 | r = -ENOMEM; | ||
| 4763 | goto fail_mmu_destroy; | ||
| 4764 | } | ||
| 4765 | vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; | ||
| 4766 | |||
| 4439 | return 0; | 4767 | return 0; |
| 4440 | 4768 | ||
| 4441 | fail_mmu_destroy: | 4769 | fail_mmu_destroy: |
| @@ -4483,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | |||
| 4483 | static void kvm_free_vcpus(struct kvm *kvm) | 4811 | static void kvm_free_vcpus(struct kvm *kvm) |
| 4484 | { | 4812 | { |
| 4485 | unsigned int i; | 4813 | unsigned int i; |
| 4814 | struct kvm_vcpu *vcpu; | ||
| 4486 | 4815 | ||
| 4487 | /* | 4816 | /* |
| 4488 | * Unpin any mmu pages first. | 4817 | * Unpin any mmu pages first. |
| 4489 | */ | 4818 | */ |
| 4490 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 4819 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 4491 | if (kvm->vcpus[i]) | 4820 | kvm_unload_vcpu_mmu(vcpu); |
| 4492 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | 4821 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 4493 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 4822 | kvm_arch_vcpu_free(vcpu); |
| 4494 | if (kvm->vcpus[i]) { | 4823 | |
| 4495 | kvm_arch_vcpu_free(kvm->vcpus[i]); | 4824 | mutex_lock(&kvm->lock); |
| 4496 | kvm->vcpus[i] = NULL; | 4825 | for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) |
| 4497 | } | 4826 | kvm->vcpus[i] = NULL; |
| 4498 | } | ||
| 4499 | 4827 | ||
| 4828 | atomic_set(&kvm->online_vcpus, 0); | ||
| 4829 | mutex_unlock(&kvm->lock); | ||
| 4500 | } | 4830 | } |
| 4501 | 4831 | ||
| 4502 | void kvm_arch_sync_events(struct kvm *kvm) | 4832 | void kvm_arch_sync_events(struct kvm *kvm) |
| @@ -4573,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
| 4573 | 4903 | ||
| 4574 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 4904 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
| 4575 | spin_unlock(&kvm->mmu_lock); | 4905 | spin_unlock(&kvm->mmu_lock); |
| 4576 | kvm_flush_remote_tlbs(kvm); | ||
| 4577 | 4906 | ||
| 4578 | return 0; | 4907 | return 0; |
| 4579 | } | 4908 | } |
| @@ -4587,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
| 4587 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 4916 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
| 4588 | { | 4917 | { |
| 4589 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 4918 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE |
| 4590 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED | 4919 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
| 4591 | || vcpu->arch.nmi_pending; | 4920 | || vcpu->arch.nmi_pending || |
| 4921 | (kvm_arch_interrupt_allowed(vcpu) && | ||
| 4922 | kvm_cpu_has_interrupt(vcpu)); | ||
| 4592 | } | 4923 | } |
| 4593 | 4924 | ||
| 4594 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | 4925 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
| @@ -4612,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
| 4612 | { | 4943 | { |
| 4613 | return kvm_x86_ops->interrupt_allowed(vcpu); | 4944 | return kvm_x86_ops->interrupt_allowed(vcpu); |
| 4614 | } | 4945 | } |
| 4946 | |||
| 4947 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | ||
| 4948 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | ||
| 4949 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | ||
| 4950 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); | ||
| 4951 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); | ||
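Among the x86.c changes above, the segment-handling hunks drop the open-coded reassembly of a legacy descriptor's base and limit (base0 | base1 << 16 | base2 << 24, limit0 | limit << 16) in favour of get_desc_base()/get_desc_limit(), while KVM keeps applying the granularity shift itself. The following is a hedged, user-space sketch of that arithmetic only; the struct is an illustrative stand-in, not the kernel's desc_struct, and nothing here is kernel code.

/*
 * Sketch of legacy segment-descriptor base/limit reassembly, mirroring the
 * math the hunk above replaces with get_desc_base()/get_desc_limit().
 * "fake_desc" is a stand-in layout for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_desc {
	uint16_t limit0;	/* limit bits 0..15  */
	uint16_t base0;		/* base  bits 0..15  */
	uint8_t  base1;		/* base  bits 16..23 */
	uint8_t  access;	/* type/S/DPL/P (unused here) */
	uint8_t  limit1_flags;	/* limit bits 16..19, flags, G in bit 7 */
	uint8_t  base2;		/* base  bits 24..31 */
};

static uint32_t fake_desc_base(const struct fake_desc *d)
{
	return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

static uint32_t fake_desc_limit(const struct fake_desc *d)
{
	uint32_t limit = d->limit0 | ((uint32_t)(d->limit1_flags & 0x0f) << 16);

	if (d->limit1_flags & 0x80)	/* G bit: 4 KiB granularity, as in seg_desct_to_kvm_desct() */
		limit = (limit << 12) | 0xfff;
	return limit;
}

int main(void)
{
	/* flat 4 GiB segment: base 0, limit 0xffffffff */
	struct fake_desc d = {
		.limit0 = 0xffff, .base0 = 0,
		.base1 = 0, .limit1_flags = 0x8f, .base2 = 0,
	};

	printf("base=%#x limit=%#x\n",
	       (unsigned)fake_desc_base(&d), (unsigned)fake_desc_limit(&d));
	return 0;
}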
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e..5eadea585d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
| @@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) | |||
| 31 | { | 31 | { |
| 32 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); | 32 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); |
| 33 | } | 33 | } |
| 34 | |||
| 35 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
| 36 | u32 function, u32 index); | ||
| 37 | |||
| 34 | #endif | 38 | #endif |
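The x86.h hunk above exports kvm_find_cpuid_entry() so other KVM source files can look up guest CPUID leaves directly. A hedged usage sketch follows; the helper name and the feature bit chosen are illustrative only, and the snippet assumes it lives in a KVM .c file that already includes the usual KVM headers.

/*
 * Sketch: testing a guest CPUID feature bit via kvm_find_cpuid_entry().
 * guest_has_x2apic() is a hypothetical caller, not an existing function.
 */
static bool guest_has_x2apic(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);	/* leaf 1, index 0 */
	if (!best)
		return false;
	return (best->ecx >> 21) & 1;			/* CPUID.01H:ECX bit 21 */
}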
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 1617958a380..63a6ba66cbe 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
| @@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); | |||
| 104 | EXPORT_SYMBOL(kmap_atomic); | 104 | EXPORT_SYMBOL(kmap_atomic); |
| 105 | EXPORT_SYMBOL(kunmap_atomic); | 105 | EXPORT_SYMBOL(kunmap_atomic); |
| 106 | EXPORT_SYMBOL(kmap_atomic_prot); | 106 | EXPORT_SYMBOL(kmap_atomic_prot); |
| 107 | EXPORT_SYMBOL(kmap_atomic_to_page); | ||
| 107 | 108 | ||
| 108 | void __init set_highmem_pages_init(void) | 109 | void __init set_highmem_pages_init(void) |
| 109 | { | 110 | { |
diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 290910e4ede..96d7c9804dc 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm | |||
| @@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ | |||
| 3 | header-y += kvm.h | 3 | header-y += kvm.h |
| 4 | endif | 4 | endif |
| 5 | 5 | ||
| 6 | ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ | ||
| 7 | $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) | ||
| 8 | header-y += kvm_para.h | ||
| 9 | endif | ||
| 10 | |||
| 6 | ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ | 11 | ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ |
| 7 | $(srctree)/include/asm-$(SRCARCH)/a.out.h),) | 12 | $(srctree)/include/asm-$(SRCARCH)/a.out.h),) |
| 8 | unifdef-y += a.out.h | 13 | unifdef-y += a.out.h |
diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 334a3593cdf..cff4a101f26 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild | |||
| @@ -268,6 +268,10 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ | |||
| 268 | $(srctree)/include/asm-$(SRCARCH)/kvm.h),) | 268 | $(srctree)/include/asm-$(SRCARCH)/kvm.h),) |
| 269 | unifdef-y += kvm.h | 269 | unifdef-y += kvm.h |
| 270 | endif | 270 | endif |
| 271 | ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ | ||
| 272 | $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) | ||
| 273 | unifdef-y += kvm_para.h | ||
| 274 | endif | ||
| 271 | unifdef-y += llc.h | 275 | unifdef-y += llc.h |
| 272 | unifdef-y += loop.h | 276 | unifdef-y += loop.h |
| 273 | unifdef-y += lp.h | 277 | unifdef-y += lp.h |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 3db5d8d3748..f8f8900fc5e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | 14 | ||
| 15 | #define KVM_API_VERSION 12 | 15 | #define KVM_API_VERSION 12 |
| 16 | 16 | ||
| 17 | /* for KVM_TRACE_ENABLE */ | 17 | /* for KVM_TRACE_ENABLE, deprecated */ |
| 18 | struct kvm_user_trace_setup { | 18 | struct kvm_user_trace_setup { |
| 19 | __u32 buf_size; /* sub_buffer size of each per-cpu */ | 19 | __u32 buf_size; /* sub_buffer size of each per-cpu */ |
| 20 | __u32 buf_nr; /* the number of sub_buffers of each per-cpu */ | 20 | __u32 buf_nr; /* the number of sub_buffers of each per-cpu */ |
| @@ -70,6 +70,14 @@ struct kvm_irqchip { | |||
| 70 | } chip; | 70 | } chip; |
| 71 | }; | 71 | }; |
| 72 | 72 | ||
| 73 | /* for KVM_CREATE_PIT2 */ | ||
| 74 | struct kvm_pit_config { | ||
| 75 | __u32 flags; | ||
| 76 | __u32 pad[15]; | ||
| 77 | }; | ||
| 78 | |||
| 79 | #define KVM_PIT_SPEAKER_DUMMY 1 | ||
| 80 | |||
| 73 | #define KVM_EXIT_UNKNOWN 0 | 81 | #define KVM_EXIT_UNKNOWN 0 |
| 74 | #define KVM_EXIT_EXCEPTION 1 | 82 | #define KVM_EXIT_EXCEPTION 1 |
| 75 | #define KVM_EXIT_IO 2 | 83 | #define KVM_EXIT_IO 2 |
| @@ -87,6 +95,10 @@ struct kvm_irqchip { | |||
| 87 | #define KVM_EXIT_S390_RESET 14 | 95 | #define KVM_EXIT_S390_RESET 14 |
| 88 | #define KVM_EXIT_DCR 15 | 96 | #define KVM_EXIT_DCR 15 |
| 89 | #define KVM_EXIT_NMI 16 | 97 | #define KVM_EXIT_NMI 16 |
| 98 | #define KVM_EXIT_INTERNAL_ERROR 17 | ||
| 99 | |||
| 100 | /* For KVM_EXIT_INTERNAL_ERROR */ | ||
| 101 | #define KVM_INTERNAL_ERROR_EMULATION 1 | ||
| 90 | 102 | ||
| 91 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ | 103 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ |
| 92 | struct kvm_run { | 104 | struct kvm_run { |
| @@ -173,6 +185,9 @@ struct kvm_run { | |||
| 173 | __u32 data; | 185 | __u32 data; |
| 174 | __u8 is_write; | 186 | __u8 is_write; |
| 175 | } dcr; | 187 | } dcr; |
| 188 | struct { | ||
| 189 | __u32 suberror; | ||
| 190 | } internal; | ||
| 176 | /* Fix the size of the union. */ | 191 | /* Fix the size of the union. */ |
| 177 | char padding[256]; | 192 | char padding[256]; |
| 178 | }; | 193 | }; |
| @@ -292,6 +307,28 @@ struct kvm_guest_debug { | |||
| 292 | struct kvm_guest_debug_arch arch; | 307 | struct kvm_guest_debug_arch arch; |
| 293 | }; | 308 | }; |
| 294 | 309 | ||
| 310 | enum { | ||
| 311 | kvm_ioeventfd_flag_nr_datamatch, | ||
| 312 | kvm_ioeventfd_flag_nr_pio, | ||
| 313 | kvm_ioeventfd_flag_nr_deassign, | ||
| 314 | kvm_ioeventfd_flag_nr_max, | ||
| 315 | }; | ||
| 316 | |||
| 317 | #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) | ||
| 318 | #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) | ||
| 319 | #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) | ||
| 320 | |||
| 321 | #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) | ||
| 322 | |||
| 323 | struct kvm_ioeventfd { | ||
| 324 | __u64 datamatch; | ||
| 325 | __u64 addr; /* legal pio/mmio address */ | ||
| 326 | __u32 len; /* 1, 2, 4, or 8 bytes */ | ||
| 327 | __s32 fd; | ||
| 328 | __u32 flags; | ||
| 329 | __u8 pad[36]; | ||
| 330 | }; | ||
| 331 | |||
| 295 | #define KVM_TRC_SHIFT 16 | 332 | #define KVM_TRC_SHIFT 16 |
| 296 | /* | 333 | /* |
| 297 | * kvm trace categories | 334 | * kvm trace categories |
| @@ -310,35 +347,6 @@ struct kvm_guest_debug { | |||
| 310 | #define KVM_TRC_CYCLE_SIZE 8 | 347 | #define KVM_TRC_CYCLE_SIZE 8 |
| 311 | #define KVM_TRC_EXTRA_MAX 7 | 348 | #define KVM_TRC_EXTRA_MAX 7 |
| 312 | 349 | ||
| 313 | /* This structure represents a single trace buffer record. */ | ||
| 314 | struct kvm_trace_rec { | ||
| 315 | /* variable rec_val | ||
| 316 | * is split into: | ||
| 317 | * bits 0 - 27 -> event id | ||
| 318 | * bits 28 -30 -> number of extra data args of size u32 | ||
| 319 | * bits 31 -> binary indicator for if tsc is in record | ||
| 320 | */ | ||
| 321 | __u32 rec_val; | ||
| 322 | __u32 pid; | ||
| 323 | __u32 vcpu_id; | ||
| 324 | union { | ||
| 325 | struct { | ||
| 326 | __u64 timestamp; | ||
| 327 | __u32 extra_u32[KVM_TRC_EXTRA_MAX]; | ||
| 328 | } __attribute__((packed)) timestamp; | ||
| 329 | struct { | ||
| 330 | __u32 extra_u32[KVM_TRC_EXTRA_MAX]; | ||
| 331 | } notimestamp; | ||
| 332 | } u; | ||
| 333 | }; | ||
| 334 | |||
| 335 | #define TRACE_REC_EVENT_ID(val) \ | ||
| 336 | (0x0fffffff & (val)) | ||
| 337 | #define TRACE_REC_NUM_DATA_ARGS(val) \ | ||
| 338 | (0x70000000 & ((val) << 28)) | ||
| 339 | #define TRACE_REC_TCS(val) \ | ||
| 340 | (0x80000000 & ((val) << 31)) | ||
| 341 | |||
| 342 | #define KVMIO 0xAE | 350 | #define KVMIO 0xAE |
| 343 | 351 | ||
| 344 | /* | 352 | /* |
| @@ -415,6 +423,19 @@ struct kvm_trace_rec { | |||
| 415 | #define KVM_CAP_ASSIGN_DEV_IRQ 29 | 423 | #define KVM_CAP_ASSIGN_DEV_IRQ 29 |
| 416 | /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ | 424 | /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ |
| 417 | #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 | 425 | #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 |
| 426 | #ifdef __KVM_HAVE_MCE | ||
| 427 | #define KVM_CAP_MCE 31 | ||
| 428 | #endif | ||
| 429 | #define KVM_CAP_IRQFD 32 | ||
| 430 | #ifdef __KVM_HAVE_PIT | ||
| 431 | #define KVM_CAP_PIT2 33 | ||
| 432 | #endif | ||
| 433 | #define KVM_CAP_SET_BOOT_CPU_ID 34 | ||
| 434 | #ifdef __KVM_HAVE_PIT_STATE2 | ||
| 435 | #define KVM_CAP_PIT_STATE2 35 | ||
| 436 | #endif | ||
| 437 | #define KVM_CAP_IOEVENTFD 36 | ||
| 438 | #define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 | ||
| 418 | 439 | ||
| 419 | #ifdef KVM_CAP_IRQ_ROUTING | 440 | #ifdef KVM_CAP_IRQ_ROUTING |
| 420 | 441 | ||
| @@ -454,15 +475,32 @@ struct kvm_irq_routing { | |||
| 454 | 475 | ||
| 455 | #endif | 476 | #endif |
| 456 | 477 | ||
| 478 | #ifdef KVM_CAP_MCE | ||
| 479 | /* x86 MCE */ | ||
| 480 | struct kvm_x86_mce { | ||
| 481 | __u64 status; | ||
| 482 | __u64 addr; | ||
| 483 | __u64 misc; | ||
| 484 | __u64 mcg_status; | ||
| 485 | __u8 bank; | ||
| 486 | __u8 pad1[7]; | ||
| 487 | __u64 pad2[3]; | ||
| 488 | }; | ||
| 489 | #endif | ||
| 490 | |||
| 491 | #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) | ||
| 492 | |||
| 493 | struct kvm_irqfd { | ||
| 494 | __u32 fd; | ||
| 495 | __u32 gsi; | ||
| 496 | __u32 flags; | ||
| 497 | __u8 pad[20]; | ||
| 498 | }; | ||
| 499 | |||
| 457 | /* | 500 | /* |
| 458 | * ioctls for VM fds | 501 | * ioctls for VM fds |
| 459 | */ | 502 | */ |
| 460 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) | 503 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) |
| 461 | #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) | ||
| 462 | #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) | ||
| 463 | #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ | ||
| 464 | struct kvm_userspace_memory_region) | ||
| 465 | #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) | ||
| 466 | /* | 504 | /* |
| 467 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns | 505 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns |
| 468 | * a vcpu fd. | 506 | * a vcpu fd. |
| @@ -470,6 +508,12 @@ struct kvm_irq_routing { | |||
| 470 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) | 508 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) |
| 471 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) | 509 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) |
| 472 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) | 510 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) |
| 511 | #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) | ||
| 512 | #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) | ||
| 513 | #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ | ||
| 514 | struct kvm_userspace_memory_region) | ||
| 515 | #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) | ||
| 516 | #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) | ||
| 473 | /* Device model IOC */ | 517 | /* Device model IOC */ |
| 474 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) | 518 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) |
| 475 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) | 519 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) |
| @@ -498,6 +542,10 @@ struct kvm_irq_routing { | |||
| 498 | #define KVM_ASSIGN_SET_MSIX_ENTRY \ | 542 | #define KVM_ASSIGN_SET_MSIX_ENTRY \ |
| 499 | _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) | 543 | _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) |
| 500 | #define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) | 544 | #define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) |
| 545 | #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) | ||
| 546 | #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) | ||
| 547 | #define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) | ||
| 548 | #define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) | ||
| 501 | 549 | ||
| 502 | /* | 550 | /* |
| 503 | * ioctls for vcpu fds | 551 | * ioctls for vcpu fds |
| @@ -541,6 +589,10 @@ struct kvm_irq_routing { | |||
| 541 | #define KVM_NMI _IO(KVMIO, 0x9a) | 589 | #define KVM_NMI _IO(KVMIO, 0x9a) |
| 542 | /* Available with KVM_CAP_SET_GUEST_DEBUG */ | 590 | /* Available with KVM_CAP_SET_GUEST_DEBUG */ |
| 543 | #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) | 591 | #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) |
| 592 | /* MCE for x86 */ | ||
| 593 | #define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) | ||
| 594 | #define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) | ||
| 595 | #define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) | ||
| 544 | 596 | ||
| 545 | /* | 597 | /* |
| 546 | * Deprecated interfaces | 598 | * Deprecated interfaces |
| @@ -563,6 +615,9 @@ struct kvm_debug_guest { | |||
| 563 | #define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) | 615 | #define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) |
| 564 | #define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) | 616 | #define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) |
| 565 | 617 | ||
| 618 | #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) | ||
| 619 | #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) | ||
| 620 | |||
| 566 | #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) | 621 | #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) |
| 567 | #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) | 622 | #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) |
| 568 | #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) | 623 | #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) |
| @@ -633,7 +688,7 @@ struct kvm_assigned_msix_nr { | |||
| 633 | __u16 padding; | 688 | __u16 padding; |
| 634 | }; | 689 | }; |
| 635 | 690 | ||
| 636 | #define KVM_MAX_MSIX_PER_DEV 512 | 691 | #define KVM_MAX_MSIX_PER_DEV 256 |
| 637 | struct kvm_assigned_msix_entry { | 692 | struct kvm_assigned_msix_entry { |
| 638 | __u32 assigned_dev_id; | 693 | __u32 assigned_dev_id; |
| 639 | __u32 gsi; | 694 | __u32 gsi; |
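The kvm.h additions above define both the data structures (struct kvm_ioeventfd, struct kvm_irqfd) and the VM ioctls (KVM_IOEVENTFD, KVM_IRQFD) that a user-space VMM uses to wire eventfds to guest I/O. Below is a hedged user-space sketch of the call sequence; vm_fd is assumed to be an already created VM file descriptor, the port and GSI values are arbitrary examples, and error handling is minimal.

/*
 * Sketch: register an ioeventfd on a PIO port and an irqfd on a GSI using
 * the ioctls added above.  Illustrative only.
 */
#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <string.h>

static int wire_pio_ioeventfd(int vm_fd, __u16 port)
{
	struct kvm_ioeventfd args;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.addr  = port;			/* PIO address */
	args.len   = 2;				/* 16-bit accesses */
	args.fd    = efd;
	args.flags = KVM_IOEVENTFD_FLAG_PIO;	/* no DATAMATCH: any value signals */

	return ioctl(vm_fd, KVM_IOEVENTFD, &args) ? -1 : efd;
}

static int wire_irqfd(int vm_fd, __u32 gsi)
{
	struct kvm_irqfd args;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.fd  = efd;
	args.gsi = gsi;				/* signalling efd injects this GSI */

	return ioctl(vm_fd, KVM_IRQFD, &args) ? -1 : efd;
}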
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3060bdc35ff..4af56036a6b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
| @@ -42,6 +42,7 @@ | |||
| 42 | 42 | ||
| 43 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 | 43 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 |
| 44 | 44 | ||
| 45 | struct kvm; | ||
| 45 | struct kvm_vcpu; | 46 | struct kvm_vcpu; |
| 46 | extern struct kmem_cache *kvm_vcpu_cache; | 47 | extern struct kmem_cache *kvm_vcpu_cache; |
| 47 | 48 | ||
| @@ -59,10 +60,18 @@ struct kvm_io_bus { | |||
| 59 | 60 | ||
| 60 | void kvm_io_bus_init(struct kvm_io_bus *bus); | 61 | void kvm_io_bus_init(struct kvm_io_bus *bus); |
| 61 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | 62 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); |
| 62 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, | 63 | int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len, |
| 63 | gpa_t addr, int len, int is_write); | 64 | const void *val); |
| 64 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | 65 | int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, |
| 65 | struct kvm_io_device *dev); | 66 | void *val); |
| 67 | int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
| 68 | struct kvm_io_device *dev); | ||
| 69 | int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, | ||
| 70 | struct kvm_io_device *dev); | ||
| 71 | void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, | ||
| 72 | struct kvm_io_device *dev); | ||
| 73 | void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus, | ||
| 74 | struct kvm_io_device *dev); | ||
| 66 | 75 | ||
| 67 | struct kvm_vcpu { | 76 | struct kvm_vcpu { |
| 68 | struct kvm *kvm; | 77 | struct kvm *kvm; |
| @@ -103,7 +112,7 @@ struct kvm_memory_slot { | |||
| 103 | struct { | 112 | struct { |
| 104 | unsigned long rmap_pde; | 113 | unsigned long rmap_pde; |
| 105 | int write_count; | 114 | int write_count; |
| 106 | } *lpage_info; | 115 | } *lpage_info[KVM_NR_PAGE_SIZES - 1]; |
| 107 | unsigned long userspace_addr; | 116 | unsigned long userspace_addr; |
| 108 | int user_alloc; | 117 | int user_alloc; |
| 109 | }; | 118 | }; |
| @@ -124,7 +133,6 @@ struct kvm_kernel_irq_routing_entry { | |||
| 124 | }; | 133 | }; |
| 125 | 134 | ||
| 126 | struct kvm { | 135 | struct kvm { |
| 127 | struct mutex lock; /* protects the vcpus array and APIC accesses */ | ||
| 128 | spinlock_t mmu_lock; | 136 | spinlock_t mmu_lock; |
| 129 | spinlock_t requests_lock; | 137 | spinlock_t requests_lock; |
| 130 | struct rw_semaphore slots_lock; | 138 | struct rw_semaphore slots_lock; |
| @@ -132,10 +140,23 @@ struct kvm { | |||
| 132 | int nmemslots; | 140 | int nmemslots; |
| 133 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | 141 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + |
| 134 | KVM_PRIVATE_MEM_SLOTS]; | 142 | KVM_PRIVATE_MEM_SLOTS]; |
| 143 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
| 144 | u32 bsp_vcpu_id; | ||
| 145 | struct kvm_vcpu *bsp_vcpu; | ||
| 146 | #endif | ||
| 135 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | 147 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; |
| 148 | atomic_t online_vcpus; | ||
| 136 | struct list_head vm_list; | 149 | struct list_head vm_list; |
| 150 | struct mutex lock; | ||
| 137 | struct kvm_io_bus mmio_bus; | 151 | struct kvm_io_bus mmio_bus; |
| 138 | struct kvm_io_bus pio_bus; | 152 | struct kvm_io_bus pio_bus; |
| 153 | #ifdef CONFIG_HAVE_KVM_EVENTFD | ||
| 154 | struct { | ||
| 155 | spinlock_t lock; | ||
| 156 | struct list_head items; | ||
| 157 | } irqfds; | ||
| 158 | struct list_head ioeventfds; | ||
| 159 | #endif | ||
| 139 | struct kvm_vm_stat stat; | 160 | struct kvm_vm_stat stat; |
| 140 | struct kvm_arch arch; | 161 | struct kvm_arch arch; |
| 141 | atomic_t users_count; | 162 | atomic_t users_count; |
| @@ -144,6 +165,7 @@ struct kvm { | |||
| 144 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; | 165 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; |
| 145 | #endif | 166 | #endif |
| 146 | 167 | ||
| 168 | struct mutex irq_lock; | ||
| 147 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | 169 | #ifdef CONFIG_HAVE_KVM_IRQCHIP |
| 148 | struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ | 170 | struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ |
| 149 | struct hlist_head mask_notifier_list; | 171 | struct hlist_head mask_notifier_list; |
| @@ -167,6 +189,17 @@ struct kvm { | |||
| 167 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | 189 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) |
| 168 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | 190 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) |
| 169 | 191 | ||
| 192 | static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) | ||
| 193 | { | ||
| 194 | smp_rmb(); | ||
| 195 | return kvm->vcpus[i]; | ||
| 196 | } | ||
| 197 | |||
| 198 | #define kvm_for_each_vcpu(idx, vcpup, kvm) \ | ||
| 199 | for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ | ||
| 200 | idx < atomic_read(&kvm->online_vcpus) && vcpup; \ | ||
| 201 | vcpup = kvm_get_vcpu(kvm, ++idx)) | ||
| 202 | |||
| 170 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | 203 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); |
| 171 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | 204 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); |
| 172 | 205 | ||
| @@ -201,6 +234,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
| 201 | struct kvm_userspace_memory_region *mem, | 234 | struct kvm_userspace_memory_region *mem, |
| 202 | struct kvm_memory_slot old, | 235 | struct kvm_memory_slot old, |
| 203 | int user_alloc); | 236 | int user_alloc); |
| 237 | void kvm_disable_largepages(void); | ||
| 204 | void kvm_arch_flush_shadow(struct kvm *kvm); | 238 | void kvm_arch_flush_shadow(struct kvm *kvm); |
| 205 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); | 239 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); |
| 206 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | 240 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); |
| @@ -243,8 +277,6 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
| 243 | unsigned int ioctl, unsigned long arg); | 277 | unsigned int ioctl, unsigned long arg); |
| 244 | long kvm_arch_vcpu_ioctl(struct file *filp, | 278 | long kvm_arch_vcpu_ioctl(struct file *filp, |
| 245 | unsigned int ioctl, unsigned long arg); | 279 | unsigned int ioctl, unsigned long arg); |
| 246 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
| 247 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
| 248 | 280 | ||
| 249 | int kvm_dev_ioctl_check_extension(long ext); | 281 | int kvm_dev_ioctl_check_extension(long ext); |
| 250 | 282 | ||
| @@ -300,7 +332,6 @@ int kvm_arch_hardware_setup(void); | |||
| 300 | void kvm_arch_hardware_unsetup(void); | 332 | void kvm_arch_hardware_unsetup(void); |
| 301 | void kvm_arch_check_processor_compat(void *rtn); | 333 | void kvm_arch_check_processor_compat(void *rtn); |
| 302 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); | 334 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); |
| 303 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); | ||
| 304 | 335 | ||
| 305 | void kvm_free_physmem(struct kvm *kvm); | 336 | void kvm_free_physmem(struct kvm *kvm); |
| 306 | 337 | ||
| @@ -309,8 +340,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); | |||
| 309 | void kvm_free_all_assigned_devices(struct kvm *kvm); | 340 | void kvm_free_all_assigned_devices(struct kvm *kvm); |
| 310 | void kvm_arch_sync_events(struct kvm *kvm); | 341 | void kvm_arch_sync_events(struct kvm *kvm); |
| 311 | 342 | ||
| 312 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
| 313 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
| 314 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); | 343 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); |
| 315 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | 344 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); |
| 316 | 345 | ||
| @@ -366,7 +395,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); | |||
| 366 | void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); | 395 | void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); |
| 367 | void kvm_register_irq_ack_notifier(struct kvm *kvm, | 396 | void kvm_register_irq_ack_notifier(struct kvm *kvm, |
| 368 | struct kvm_irq_ack_notifier *kian); | 397 | struct kvm_irq_ack_notifier *kian); |
| 369 | void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); | 398 | void kvm_unregister_irq_ack_notifier(struct kvm *kvm, |
| 399 | struct kvm_irq_ack_notifier *kian); | ||
| 370 | int kvm_request_irq_source_id(struct kvm *kvm); | 400 | int kvm_request_irq_source_id(struct kvm *kvm); |
| 371 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); | 401 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); |
| 372 | 402 | ||
| @@ -459,37 +489,6 @@ struct kvm_stats_debugfs_item { | |||
| 459 | extern struct kvm_stats_debugfs_item debugfs_entries[]; | 489 | extern struct kvm_stats_debugfs_item debugfs_entries[]; |
| 460 | extern struct dentry *kvm_debugfs_dir; | 490 | extern struct dentry *kvm_debugfs_dir; |
| 461 | 491 | ||
| 462 | #define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \ | ||
| 463 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 464 | vcpu, 5, d1, d2, d3, d4, d5) | ||
| 465 | #define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \ | ||
| 466 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 467 | vcpu, 4, d1, d2, d3, d4, 0) | ||
| 468 | #define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \ | ||
| 469 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 470 | vcpu, 3, d1, d2, d3, 0, 0) | ||
| 471 | #define KVMTRACE_2D(evt, vcpu, d1, d2, name) \ | ||
| 472 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 473 | vcpu, 2, d1, d2, 0, 0, 0) | ||
| 474 | #define KVMTRACE_1D(evt, vcpu, d1, name) \ | ||
| 475 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 476 | vcpu, 1, d1, 0, 0, 0, 0) | ||
| 477 | #define KVMTRACE_0D(evt, vcpu, name) \ | ||
| 478 | trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ | ||
| 479 | vcpu, 0, 0, 0, 0, 0, 0) | ||
| 480 | |||
| 481 | #ifdef CONFIG_KVM_TRACE | ||
| 482 | int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg); | ||
| 483 | void kvm_trace_cleanup(void); | ||
| 484 | #else | ||
| 485 | static inline | ||
| 486 | int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) | ||
| 487 | { | ||
| 488 | return -EINVAL; | ||
| 489 | } | ||
| 490 | #define kvm_trace_cleanup() ((void)0) | ||
| 491 | #endif | ||
| 492 | |||
| 493 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | 492 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER |
| 494 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) | 493 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) |
| 495 | { | 494 | { |
| @@ -525,4 +524,33 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} | |||
| 525 | 524 | ||
| 526 | #endif | 525 | #endif |
| 527 | 526 | ||
| 527 | #ifdef CONFIG_HAVE_KVM_EVENTFD | ||
| 528 | |||
| 529 | void kvm_eventfd_init(struct kvm *kvm); | ||
| 530 | int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); | ||
| 531 | void kvm_irqfd_release(struct kvm *kvm); | ||
| 532 | int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); | ||
| 533 | |||
| 534 | #else | ||
| 535 | |||
| 536 | static inline void kvm_eventfd_init(struct kvm *kvm) {} | ||
| 537 | static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) | ||
| 538 | { | ||
| 539 | return -EINVAL; | ||
| 540 | } | ||
| 541 | |||
| 542 | static inline void kvm_irqfd_release(struct kvm *kvm) {} | ||
| 543 | static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) | ||
| 544 | { | ||
| 545 | return -ENOSYS; | ||
| 546 | } | ||
| 547 | |||
| 548 | #endif /* CONFIG_HAVE_KVM_EVENTFD */ | ||
| 549 | |||
| 550 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
| 551 | static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) | ||
| 552 | { | ||
| 553 | return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; | ||
| 554 | } | ||
| 555 | #endif | ||
| 528 | #endif | 556 | #endif |
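The kvm_host.h hunks above add kvm_get_vcpu() (with its smp_rmb() pairing against vcpu creation) and the kvm_for_each_vcpu() iterator, which replace open-coded scans of all KVM_MAX_VCPUS slots, as already seen in the kvmclock cpufreq-notifier and kvm_free_vcpus() hunks earlier. A hedged sketch of the pattern; the function name is illustrative, not an existing kernel symbol.

/*
 * Sketch: walk only the online vcpus instead of scanning every slot and
 * testing for NULL.  kick_all_vcpus() is a hypothetical caller.
 */
static void kick_all_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);	/* declared above in kvm_host.h */
}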
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3ddce03766c..d73109243fd 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #define KVM_ENOSYS 1000 | 13 | #define KVM_ENOSYS 1000 |
| 14 | #define KVM_EFAULT EFAULT | 14 | #define KVM_EFAULT EFAULT |
| 15 | #define KVM_E2BIG E2BIG | 15 | #define KVM_E2BIG E2BIG |
| 16 | #define KVM_EPERM EPERM | ||
| 16 | 17 | ||
| 17 | #define KVM_HC_VAPIC_POLL_IRQ 1 | 18 | #define KVM_HC_VAPIC_POLL_IRQ 1 |
| 18 | #define KVM_HC_MMU_OP 2 | 19 | #define KVM_HC_MMU_OP 2 |
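KVM_EPERM joins the hypercall error codes here to back the CPL check added to kvm_emulate_hypercall() earlier in this diff: the host writes the negated code into the guest's RAX. The decoder below is a hedged, illustrative sketch of interpreting that value on the guest side, not an existing API.

/*
 * Sketch: map a hypercall's RAX result back to an errno-style value.
 * Assumes linux/kvm_para.h and linux/errno.h are available; illustrative only.
 */
static inline int kvm_hypercall_errno(long rax)
{
	if (rax >= 0)
		return 0;		/* success; rax holds the result */
	if (rax == -KVM_ENOSYS)
		return -ENOSYS;		/* unknown hypercall number */
	if (rax == -KVM_EPERM)
		return -EPERM;		/* rejected: issued from guest CPL > 0 */
	if (rax == -KVM_EFAULT)
		return -EFAULT;
	return (int)rax;
}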
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h new file mode 100644 index 00000000000..dbe10845527 --- /dev/null +++ b/include/trace/events/kvm.h | |||
| @@ -0,0 +1,151 @@ | |||
| 1 | #if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 2 | #define _TRACE_KVM_MAIN_H | ||
| 3 | |||
| 4 | #include <linux/tracepoint.h> | ||
| 5 | |||
| 6 | #undef TRACE_SYSTEM | ||
| 7 | #define TRACE_SYSTEM kvm | ||
| 8 | #define TRACE_INCLUDE_FILE kvm | ||
| 9 | |||
| 10 | #if defined(__KVM_HAVE_IOAPIC) | ||
| 11 | TRACE_EVENT(kvm_set_irq, | ||
| 12 | TP_PROTO(unsigned int gsi, int level, int irq_source_id), | ||
| 13 | TP_ARGS(gsi, level, irq_source_id), | ||
| 14 | |||
| 15 | TP_STRUCT__entry( | ||
| 16 | __field( unsigned int, gsi ) | ||
| 17 | __field( int, level ) | ||
| 18 | __field( int, irq_source_id ) | ||
| 19 | ), | ||
| 20 | |||
| 21 | TP_fast_assign( | ||
| 22 | __entry->gsi = gsi; | ||
| 23 | __entry->level = level; | ||
| 24 | __entry->irq_source_id = irq_source_id; | ||
| 25 | ), | ||
| 26 | |||
| 27 | TP_printk("gsi %u level %d source %d", | ||
| 28 | __entry->gsi, __entry->level, __entry->irq_source_id) | ||
| 29 | ); | ||
| 30 | |||
| 31 | #define kvm_deliver_mode \ | ||
| 32 | {0x0, "Fixed"}, \ | ||
| 33 | {0x1, "LowPrio"}, \ | ||
| 34 | {0x2, "SMI"}, \ | ||
| 35 | {0x3, "Res3"}, \ | ||
| 36 | {0x4, "NMI"}, \ | ||
| 37 | {0x5, "INIT"}, \ | ||
| 38 | {0x6, "SIPI"}, \ | ||
| 39 | {0x7, "ExtINT"} | ||
| 40 | |||
| 41 | TRACE_EVENT(kvm_ioapic_set_irq, | ||
| 42 | TP_PROTO(__u64 e, int pin, bool coalesced), | ||
| 43 | TP_ARGS(e, pin, coalesced), | ||
| 44 | |||
| 45 | TP_STRUCT__entry( | ||
| 46 | __field( __u64, e ) | ||
| 47 | __field( int, pin ) | ||
| 48 | __field( bool, coalesced ) | ||
| 49 | ), | ||
| 50 | |||
| 51 | TP_fast_assign( | ||
| 52 | __entry->e = e; | ||
| 53 | __entry->pin = pin; | ||
| 54 | __entry->coalesced = coalesced; | ||
| 55 | ), | ||
| 56 | |||
| 57 | TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s", | ||
| 58 | __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, | ||
| 59 | __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), | ||
| 60 | (__entry->e & (1<<11)) ? "logical" : "physical", | ||
| 61 | (__entry->e & (1<<15)) ? "level" : "edge", | ||
| 62 | (__entry->e & (1<<16)) ? "|masked" : "", | ||
| 63 | __entry->coalesced ? " (coalesced)" : "") | ||
| 64 | ); | ||
| 65 | |||
| 66 | TRACE_EVENT(kvm_msi_set_irq, | ||
| 67 | TP_PROTO(__u64 address, __u64 data), | ||
| 68 | TP_ARGS(address, data), | ||
| 69 | |||
| 70 | TP_STRUCT__entry( | ||
| 71 | __field( __u64, address ) | ||
| 72 | __field( __u64, data ) | ||
| 73 | ), | ||
| 74 | |||
| 75 | TP_fast_assign( | ||
| 76 | __entry->address = address; | ||
| 77 | __entry->data = data; | ||
| 78 | ), | ||
| 79 | |||
| 80 | TP_printk("dst %u vec %x (%s|%s|%s%s)", | ||
| 81 | (u8)(__entry->address >> 12), (u8)__entry->data, | ||
| 82 | __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), | ||
| 83 | (__entry->address & (1<<2)) ? "logical" : "physical", | ||
| 84 | (__entry->data & (1<<15)) ? "level" : "edge", | ||
| 85 | (__entry->address & (1<<3)) ? "|rh" : "") | ||
| 86 | ); | ||
| 87 | |||
| 88 | #define kvm_irqchips \ | ||
| 89 | {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ | ||
| 90 | {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ | ||
| 91 | {KVM_IRQCHIP_IOAPIC, "IOAPIC"} | ||
| 92 | |||
| 93 | TRACE_EVENT(kvm_ack_irq, | ||
| 94 | TP_PROTO(unsigned int irqchip, unsigned int pin), | ||
| 95 | TP_ARGS(irqchip, pin), | ||
| 96 | |||
| 97 | TP_STRUCT__entry( | ||
| 98 | __field( unsigned int, irqchip ) | ||
| 99 | __field( unsigned int, pin ) | ||
| 100 | ), | ||
| 101 | |||
| 102 | TP_fast_assign( | ||
| 103 | __entry->irqchip = irqchip; | ||
| 104 | __entry->pin = pin; | ||
| 105 | ), | ||
| 106 | |||
| 107 | TP_printk("irqchip %s pin %u", | ||
| 108 | __print_symbolic(__entry->irqchip, kvm_irqchips), | ||
| 109 | __entry->pin) | ||
| 110 | ); | ||
| 111 | |||
| 112 | |||
| 113 | |||
| 114 | #endif /* defined(__KVM_HAVE_IOAPIC) */ | ||
| 115 | |||
| 116 | #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 | ||
| 117 | #define KVM_TRACE_MMIO_READ 1 | ||
| 118 | #define KVM_TRACE_MMIO_WRITE 2 | ||
| 119 | |||
| 120 | #define kvm_trace_symbol_mmio \ | ||
| 121 | { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ | ||
| 122 | { KVM_TRACE_MMIO_READ, "read" }, \ | ||
| 123 | { KVM_TRACE_MMIO_WRITE, "write" } | ||
| 124 | |||
| 125 | TRACE_EVENT(kvm_mmio, | ||
| 126 | TP_PROTO(int type, int len, u64 gpa, u64 val), | ||
| 127 | TP_ARGS(type, len, gpa, val), | ||
| 128 | |||
| 129 | TP_STRUCT__entry( | ||
| 130 | __field( u32, type ) | ||
| 131 | __field( u32, len ) | ||
| 132 | __field( u64, gpa ) | ||
| 133 | __field( u64, val ) | ||
| 134 | ), | ||
| 135 | |||
| 136 | TP_fast_assign( | ||
| 137 | __entry->type = type; | ||
| 138 | __entry->len = len; | ||
| 139 | __entry->gpa = gpa; | ||
| 140 | __entry->val = val; | ||
| 141 | ), | ||
| 142 | |||
| 143 | TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", | ||
| 144 | __print_symbolic(__entry->type, kvm_trace_symbol_mmio), | ||
| 145 | __entry->len, __entry->gpa, __entry->val) | ||
| 146 | ); | ||
| 147 | |||
| 148 | #endif /* _TRACE_KVM_MAIN_H */ | ||
| 149 | |||
| 150 | /* This part must be outside protection */ | ||
| 151 | #include <trace/define_trace.h> | ||
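Each TRACE_EVENT in the new header above generates a trace_<name>() call with the signature given by its TP_PROTO. A hedged sketch of emitting the kvm_mmio event from an MMIO write path follows; the caller name is a stand-in, and exactly one .c file must define CREATE_TRACE_POINTS before including the header, per the usual tracepoint convention.

/*
 * Sketch: firing trace_kvm_mmio() as defined above.  emulate_mmio_write()
 * is an illustrative stand-in for the real caller.
 */
#include <linux/types.h>
#include <linux/string.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

static void emulate_mmio_write(u64 gpa, const void *data, int len)
{
	u64 val = 0;

	memcpy(&val, data, len > 8 ? 8 : len);
	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, gpa, val);
	/* ... perform the actual device write here ... */
}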
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cafdcee154e..b16d6363477 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |||
| 234 | 234 | ||
| 235 | return 1UL << (hstate->order + PAGE_SHIFT); | 235 | return 1UL << (hstate->order + PAGE_SHIFT); |
| 236 | } | 236 | } |
| 237 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | ||
| 237 | 238 | ||
| 238 | /* | 239 | /* |
| 239 | * Return the page size being used by the MMU to back a VMA. In the majority | 240 | * Return the page size being used by the MMU to back a VMA. In the majority |
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig new file mode 100644 index 00000000000..daece36c0a5 --- /dev/null +++ b/virt/kvm/Kconfig | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | # KVM common configuration items and defaults | ||
| 2 | |||
| 3 | config HAVE_KVM | ||
| 4 | bool | ||
| 5 | |||
| 6 | config HAVE_KVM_IRQCHIP | ||
| 7 | bool | ||
| 8 | |||
| 9 | config HAVE_KVM_EVENTFD | ||
| 10 | bool | ||
| 11 | select EVENTFD | ||
| 12 | |||
| 13 | config KVM_APIC_ARCHITECTURE | ||
| 14 | bool | ||
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 5ae620d32fa..04d69cd7049 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c | |||
| @@ -14,32 +14,28 @@ | |||
| 14 | 14 | ||
| 15 | #include "coalesced_mmio.h" | 15 | #include "coalesced_mmio.h" |
| 16 | 16 | ||
| 17 | static int coalesced_mmio_in_range(struct kvm_io_device *this, | 17 | static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) |
| 18 | gpa_t addr, int len, int is_write) | 18 | { |
| 19 | return container_of(dev, struct kvm_coalesced_mmio_dev, dev); | ||
| 20 | } | ||
| 21 | |||
| 22 | static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, | ||
| 23 | gpa_t addr, int len) | ||
| 19 | { | 24 | { |
| 20 | struct kvm_coalesced_mmio_dev *dev = | ||
| 21 | (struct kvm_coalesced_mmio_dev*)this->private; | ||
| 22 | struct kvm_coalesced_mmio_zone *zone; | 25 | struct kvm_coalesced_mmio_zone *zone; |
| 23 | int next; | 26 | struct kvm_coalesced_mmio_ring *ring; |
| 27 | unsigned avail; | ||
| 24 | int i; | 28 | int i; |
| 25 | 29 | ||
| 26 | if (!is_write) | ||
| 27 | return 0; | ||
| 28 | |||
| 29 | /* kvm->lock is taken by the caller and must be not released before | ||
| 30 | * dev.read/write | ||
| 31 | */ | ||
| 32 | |||
| 33 | /* Are we able to batch it ? */ | 30 | /* Are we able to batch it ? */ |
| 34 | 31 | ||
| 35 | /* last is the first free entry | 32 | /* last is the first free entry |
| 36 | * check if we don't meet the first used entry | 33 | * check if we don't meet the first used entry |
| 37 | * there is always one unused entry in the buffer | 34 | * there is always one unused entry in the buffer |
| 38 | */ | 35 | */ |
| 39 | 36 | ring = dev->kvm->coalesced_mmio_ring; | |
| 40 | next = (dev->kvm->coalesced_mmio_ring->last + 1) % | 37 | avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; |
| 41 | KVM_COALESCED_MMIO_MAX; | 38 | if (avail < KVM_MAX_VCPUS) { |
| 42 | if (next == dev->kvm->coalesced_mmio_ring->first) { | ||
| 43 | /* full */ | 39 | /* full */ |
| 44 | return 0; | 40 | return 0; |
| 45 | } | 41 | } |
| @@ -60,14 +56,15 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, | |||
| 60 | return 0; | 56 | return 0; |
| 61 | } | 57 | } |
| 62 | 58 | ||
| 63 | static void coalesced_mmio_write(struct kvm_io_device *this, | 59 | static int coalesced_mmio_write(struct kvm_io_device *this, |
| 64 | gpa_t addr, int len, const void *val) | 60 | gpa_t addr, int len, const void *val) |
| 65 | { | 61 | { |
| 66 | struct kvm_coalesced_mmio_dev *dev = | 62 | struct kvm_coalesced_mmio_dev *dev = to_mmio(this); |
| 67 | (struct kvm_coalesced_mmio_dev*)this->private; | ||
| 68 | struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; | 63 | struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; |
| 64 | if (!coalesced_mmio_in_range(dev, addr, len)) | ||
| 65 | return -EOPNOTSUPP; | ||
| 69 | 66 | ||
| 70 | /* kvm->lock must be taken by caller before call to in_range()*/ | 67 | spin_lock(&dev->lock); |
| 71 | 68 | ||
| 72 | /* copy data in first free entry of the ring */ | 69 | /* copy data in first free entry of the ring */ |
| 73 | 70 | ||
| @@ -76,29 +73,40 @@ static void coalesced_mmio_write(struct kvm_io_device *this, | |||
| 76 | memcpy(ring->coalesced_mmio[ring->last].data, val, len); | 73 | memcpy(ring->coalesced_mmio[ring->last].data, val, len); |
| 77 | smp_wmb(); | 74 | smp_wmb(); |
| 78 | ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; | 75 | ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; |
| 76 | spin_unlock(&dev->lock); | ||
| 77 | return 0; | ||
| 79 | } | 78 | } |
| 80 | 79 | ||
| 81 | static void coalesced_mmio_destructor(struct kvm_io_device *this) | 80 | static void coalesced_mmio_destructor(struct kvm_io_device *this) |
| 82 | { | 81 | { |
| 83 | kfree(this); | 82 | struct kvm_coalesced_mmio_dev *dev = to_mmio(this); |
| 83 | |||
| 84 | kfree(dev); | ||
| 84 | } | 85 | } |
| 85 | 86 | ||
| 87 | static const struct kvm_io_device_ops coalesced_mmio_ops = { | ||
| 88 | .write = coalesced_mmio_write, | ||
| 89 | .destructor = coalesced_mmio_destructor, | ||
| 90 | }; | ||
| 91 | |||
| 86 | int kvm_coalesced_mmio_init(struct kvm *kvm) | 92 | int kvm_coalesced_mmio_init(struct kvm *kvm) |
| 87 | { | 93 | { |
| 88 | struct kvm_coalesced_mmio_dev *dev; | 94 | struct kvm_coalesced_mmio_dev *dev; |
| 95 | int ret; | ||
| 89 | 96 | ||
| 90 | dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); | 97 | dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); |
| 91 | if (!dev) | 98 | if (!dev) |
| 92 | return -ENOMEM; | 99 | return -ENOMEM; |
| 93 | dev->dev.write = coalesced_mmio_write; | 100 | spin_lock_init(&dev->lock); |
| 94 | dev->dev.in_range = coalesced_mmio_in_range; | 101 | kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); |
| 95 | dev->dev.destructor = coalesced_mmio_destructor; | ||
| 96 | dev->dev.private = dev; | ||
| 97 | dev->kvm = kvm; | 102 | dev->kvm = kvm; |
| 98 | kvm->coalesced_mmio_dev = dev; | 103 | kvm->coalesced_mmio_dev = dev; |
| 99 | kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); | ||
| 100 | 104 | ||
| 101 | return 0; | 105 | ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); |
| 106 | if (ret < 0) | ||
| 107 | kfree(dev); | ||
| 108 | |||
| 109 | return ret; | ||
| 102 | } | 110 | } |
| 103 | 111 | ||
| 104 | int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, | 112 | int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, |
| @@ -109,16 +117,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, | |||
| 109 | if (dev == NULL) | 117 | if (dev == NULL) |
| 110 | return -EINVAL; | 118 | return -EINVAL; |
| 111 | 119 | ||
| 112 | mutex_lock(&kvm->lock); | 120 | down_write(&kvm->slots_lock); |
| 113 | if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { | 121 | if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { |
| 114 | mutex_unlock(&kvm->lock); | 122 | up_write(&kvm->slots_lock); |
| 115 | return -ENOBUFS; | 123 | return -ENOBUFS; |
| 116 | } | 124 | } |
| 117 | 125 | ||
| 118 | dev->zone[dev->nb_zones] = *zone; | 126 | dev->zone[dev->nb_zones] = *zone; |
| 119 | dev->nb_zones++; | 127 | dev->nb_zones++; |
| 120 | 128 | ||
| 121 | mutex_unlock(&kvm->lock); | 129 | up_write(&kvm->slots_lock); |
| 122 | return 0; | 130 | return 0; |
| 123 | } | 131 | } |
| 124 | 132 | ||
| @@ -132,7 +140,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, | |||
| 132 | if (dev == NULL) | 140 | if (dev == NULL) |
| 133 | return -EINVAL; | 141 | return -EINVAL; |
| 134 | 142 | ||
| 135 | mutex_lock(&kvm->lock); | 143 | down_write(&kvm->slots_lock); |
| 136 | 144 | ||
| 137 | i = dev->nb_zones; | 145 | i = dev->nb_zones; |
| 138 | while(i) { | 146 | while(i) { |
| @@ -150,7 +158,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, | |||
| 150 | i--; | 158 | i--; |
| 151 | } | 159 | } |
| 152 | 160 | ||
| 153 | mutex_unlock(&kvm->lock); | 161 | up_write(&kvm->slots_lock); |
| 154 | 162 | ||
| 155 | return 0; | 163 | return 0; |
| 156 | } | 164 | } |
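
The reworked coalesced_mmio_in_range() above keeps one slot permanently unused (so a full ring can be told apart from an empty one) and now also refuses to batch unless at least KVM_MAX_VCPUS entries remain free, presumably so that every vcpu that passes the check before taking dev->lock still finds room. A minimal, self-contained sketch of the free-slot arithmetic, in plain C with illustrative names (RING_MAX stands in for KVM_COALESCED_MMIO_MAX):

	#include <stdio.h>

	#define RING_MAX 64u

	static unsigned int ring_avail(unsigned int first, unsigned int last)
	{
		/* 'last' is the next free entry, 'first' the oldest used one;
		 * one slot is always left unused, and the unsigned wrap-around
		 * when the ring is empty is intentional. */
		return (first - last - 1) % RING_MAX;
	}

	int main(void)
	{
		printf("%u\n", ring_avail(0, 0));	/* empty ring: 63 free slots */
		printf("%u\n", ring_avail(5, 4));	/* full ring: 0 free slots */
		return 0;
	}
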
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 5ac0ec62846..4b49f27fa31 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | struct kvm_coalesced_mmio_dev { | 12 | struct kvm_coalesced_mmio_dev { |
| 13 | struct kvm_io_device dev; | 13 | struct kvm_io_device dev; |
| 14 | struct kvm *kvm; | 14 | struct kvm *kvm; |
| 15 | spinlock_t lock; | ||
| 15 | int nb_zones; | 16 | int nb_zones; |
| 16 | struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; | 17 | struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; |
| 17 | }; | 18 | }; |
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c new file mode 100644 index 00000000000..bb4ebd89b9f --- /dev/null +++ b/virt/kvm/eventfd.c | |||
| @@ -0,0 +1,578 @@ | |||
| 1 | /* | ||
| 2 | * kvm eventfd support - use eventfd objects to signal various KVM events | ||
| 3 | * | ||
| 4 | * Copyright 2009 Novell. All Rights Reserved. | ||
| 5 | * | ||
| 6 | * Author: | ||
| 7 | * Gregory Haskins <ghaskins@novell.com> | ||
| 8 | * | ||
| 9 | * This file is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of version 2 of the GNU General Public License | ||
| 11 | * as published by the Free Software Foundation. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, write to the Free Software Foundation, | ||
| 20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
| 21 | */ | ||
| 22 | |||
| 23 | #include <linux/kvm_host.h> | ||
| 24 | #include <linux/kvm.h> | ||
| 25 | #include <linux/workqueue.h> | ||
| 26 | #include <linux/syscalls.h> | ||
| 27 | #include <linux/wait.h> | ||
| 28 | #include <linux/poll.h> | ||
| 29 | #include <linux/file.h> | ||
| 30 | #include <linux/list.h> | ||
| 31 | #include <linux/eventfd.h> | ||
| 32 | #include <linux/kernel.h> | ||
| 33 | |||
| 34 | #include "iodev.h" | ||
| 35 | |||
| 36 | /* | ||
| 37 | * -------------------------------------------------------------------- | ||
| 38 | * irqfd: Allows an fd to be used to inject an interrupt to the guest | ||
| 39 | * | ||
| 40 | * Credit goes to Avi Kivity for the original idea. | ||
| 41 | * -------------------------------------------------------------------- | ||
| 42 | */ | ||
| 43 | |||
| 44 | struct _irqfd { | ||
| 45 | struct kvm *kvm; | ||
| 46 | struct eventfd_ctx *eventfd; | ||
| 47 | int gsi; | ||
| 48 | struct list_head list; | ||
| 49 | poll_table pt; | ||
| 50 | wait_queue_head_t *wqh; | ||
| 51 | wait_queue_t wait; | ||
| 52 | struct work_struct inject; | ||
| 53 | struct work_struct shutdown; | ||
| 54 | }; | ||
| 55 | |||
| 56 | static struct workqueue_struct *irqfd_cleanup_wq; | ||
| 57 | |||
| 58 | static void | ||
| 59 | irqfd_inject(struct work_struct *work) | ||
| 60 | { | ||
| 61 | struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); | ||
| 62 | struct kvm *kvm = irqfd->kvm; | ||
| 63 | |||
| 64 | mutex_lock(&kvm->irq_lock); | ||
| 65 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); | ||
| 66 | kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); | ||
| 67 | mutex_unlock(&kvm->irq_lock); | ||
| 68 | } | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Race-free decouple logic (ordering is critical) | ||
| 72 | */ | ||
| 73 | static void | ||
| 74 | irqfd_shutdown(struct work_struct *work) | ||
| 75 | { | ||
| 76 | struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Synchronize with the wait-queue and unhook ourselves to prevent | ||
| 80 | * further events. | ||
| 81 | */ | ||
| 82 | remove_wait_queue(irqfd->wqh, &irqfd->wait); | ||
| 83 | |||
| 84 | /* | ||
| 85 | * We know no new events will be scheduled at this point, so block | ||
| 86 | * until all previously outstanding events have completed | ||
| 87 | */ | ||
| 88 | flush_work(&irqfd->inject); | ||
| 89 | |||
| 90 | /* | ||
| 91 | * It is now safe to release the object's resources | ||
| 92 | */ | ||
| 93 | eventfd_ctx_put(irqfd->eventfd); | ||
| 94 | kfree(irqfd); | ||
| 95 | } | ||
| 96 | |||
| 97 | |||
| 98 | /* assumes kvm->irqfds.lock is held */ | ||
| 99 | static bool | ||
| 100 | irqfd_is_active(struct _irqfd *irqfd) | ||
| 101 | { | ||
| 102 | return list_empty(&irqfd->list) ? false : true; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Mark the irqfd as inactive and schedule it for removal | ||
| 107 | * | ||
| 108 | * assumes kvm->irqfds.lock is held | ||
| 109 | */ | ||
| 110 | static void | ||
| 111 | irqfd_deactivate(struct _irqfd *irqfd) | ||
| 112 | { | ||
| 113 | BUG_ON(!irqfd_is_active(irqfd)); | ||
| 114 | |||
| 115 | list_del_init(&irqfd->list); | ||
| 116 | |||
| 117 | queue_work(irqfd_cleanup_wq, &irqfd->shutdown); | ||
| 118 | } | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Called with wqh->lock held and interrupts disabled | ||
| 122 | */ | ||
| 123 | static int | ||
| 124 | irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
| 125 | { | ||
| 126 | struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); | ||
| 127 | unsigned long flags = (unsigned long)key; | ||
| 128 | |||
| 129 | if (flags & POLLIN) | ||
| 130 | /* An event has been signaled, inject an interrupt */ | ||
| 131 | schedule_work(&irqfd->inject); | ||
| 132 | |||
| 133 | if (flags & POLLHUP) { | ||
| 134 | /* The eventfd is closing, detach from KVM */ | ||
| 135 | struct kvm *kvm = irqfd->kvm; | ||
| 136 | unsigned long flags; | ||
| 137 | |||
| 138 | spin_lock_irqsave(&kvm->irqfds.lock, flags); | ||
| 139 | |||
| 140 | /* | ||
| 141 | * We must check if someone deactivated the irqfd before | ||
| 142 | * we could acquire the irqfds.lock since the item is | ||
| 143 | * deactivated from the KVM side before it is unhooked from | ||
| 144 | * the wait-queue. If it is already deactivated, we can | ||
| 145 | * simply return knowing the other side will cleanup for us. | ||
| 146 | * We cannot race against the irqfd going away since the | ||
| 147 | * other side is required to acquire wqh->lock, which we hold | ||
| 148 | */ | ||
| 149 | if (irqfd_is_active(irqfd)) | ||
| 150 | irqfd_deactivate(irqfd); | ||
| 151 | |||
| 152 | spin_unlock_irqrestore(&kvm->irqfds.lock, flags); | ||
| 153 | } | ||
| 154 | |||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | |||
| 158 | static void | ||
| 159 | irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, | ||
| 160 | poll_table *pt) | ||
| 161 | { | ||
| 162 | struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); | ||
| 163 | |||
| 164 | irqfd->wqh = wqh; | ||
| 165 | add_wait_queue(wqh, &irqfd->wait); | ||
| 166 | } | ||
| 167 | |||
| 168 | static int | ||
| 169 | kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) | ||
| 170 | { | ||
| 171 | struct _irqfd *irqfd; | ||
| 172 | struct file *file = NULL; | ||
| 173 | struct eventfd_ctx *eventfd = NULL; | ||
| 174 | int ret; | ||
| 175 | unsigned int events; | ||
| 176 | |||
| 177 | irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); | ||
| 178 | if (!irqfd) | ||
| 179 | return -ENOMEM; | ||
| 180 | |||
| 181 | irqfd->kvm = kvm; | ||
| 182 | irqfd->gsi = gsi; | ||
| 183 | INIT_LIST_HEAD(&irqfd->list); | ||
| 184 | INIT_WORK(&irqfd->inject, irqfd_inject); | ||
| 185 | INIT_WORK(&irqfd->shutdown, irqfd_shutdown); | ||
| 186 | |||
| 187 | file = eventfd_fget(fd); | ||
| 188 | if (IS_ERR(file)) { | ||
| 189 | ret = PTR_ERR(file); | ||
| 190 | goto fail; | ||
| 191 | } | ||
| 192 | |||
| 193 | eventfd = eventfd_ctx_fileget(file); | ||
| 194 | if (IS_ERR(eventfd)) { | ||
| 195 | ret = PTR_ERR(eventfd); | ||
| 196 | goto fail; | ||
| 197 | } | ||
| 198 | |||
| 199 | irqfd->eventfd = eventfd; | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Install our own custom wake-up handling so we are notified via | ||
| 203 | * a callback whenever someone signals the underlying eventfd | ||
| 204 | */ | ||
| 205 | init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); | ||
| 206 | init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); | ||
| 207 | |||
| 208 | events = file->f_op->poll(file, &irqfd->pt); | ||
| 209 | |||
| 210 | spin_lock_irq(&kvm->irqfds.lock); | ||
| 211 | list_add_tail(&irqfd->list, &kvm->irqfds.items); | ||
| 212 | spin_unlock_irq(&kvm->irqfds.lock); | ||
| 213 | |||
| 214 | /* | ||
| 215 | * Check if there was an event already pending on the eventfd | ||
| 216 | * before we registered, and trigger it as if we didn't miss it. | ||
| 217 | */ | ||
| 218 | if (events & POLLIN) | ||
| 219 | schedule_work(&irqfd->inject); | ||
| 220 | |||
| 221 | /* | ||
| 222 | * do not drop the file until the irqfd is fully initialized, otherwise | ||
| 223 | * we might race against the POLLHUP | ||
| 224 | */ | ||
| 225 | fput(file); | ||
| 226 | |||
| 227 | return 0; | ||
| 228 | |||
| 229 | fail: | ||
| 230 | if (eventfd && !IS_ERR(eventfd)) | ||
| 231 | eventfd_ctx_put(eventfd); | ||
| 232 | |||
| 233 | if (!IS_ERR(file)) | ||
| 234 | fput(file); | ||
| 235 | |||
| 236 | kfree(irqfd); | ||
| 237 | return ret; | ||
| 238 | } | ||
| 239 | |||
| 240 | void | ||
| 241 | kvm_eventfd_init(struct kvm *kvm) | ||
| 242 | { | ||
| 243 | spin_lock_init(&kvm->irqfds.lock); | ||
| 244 | INIT_LIST_HEAD(&kvm->irqfds.items); | ||
| 245 | INIT_LIST_HEAD(&kvm->ioeventfds); | ||
| 246 | } | ||
| 247 | |||
| 248 | /* | ||
| 249 | * shut down any irqfds that match fd+gsi | ||
| 250 | */ | ||
| 251 | static int | ||
| 252 | kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) | ||
| 253 | { | ||
| 254 | struct _irqfd *irqfd, *tmp; | ||
| 255 | struct eventfd_ctx *eventfd; | ||
| 256 | |||
| 257 | eventfd = eventfd_ctx_fdget(fd); | ||
| 258 | if (IS_ERR(eventfd)) | ||
| 259 | return PTR_ERR(eventfd); | ||
| 260 | |||
| 261 | spin_lock_irq(&kvm->irqfds.lock); | ||
| 262 | |||
| 263 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { | ||
| 264 | if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) | ||
| 265 | irqfd_deactivate(irqfd); | ||
| 266 | } | ||
| 267 | |||
| 268 | spin_unlock_irq(&kvm->irqfds.lock); | ||
| 269 | eventfd_ctx_put(eventfd); | ||
| 270 | |||
| 271 | /* | ||
| 272 | * Block until we know all outstanding shutdown jobs have completed | ||
| 273 | * so that we guarantee there will not be any more interrupts on this | ||
| 274 | * gsi once this deassign function returns. | ||
| 275 | */ | ||
| 276 | flush_workqueue(irqfd_cleanup_wq); | ||
| 277 | |||
| 278 | return 0; | ||
| 279 | } | ||
| 280 | |||
| 281 | int | ||
| 282 | kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) | ||
| 283 | { | ||
| 284 | if (flags & KVM_IRQFD_FLAG_DEASSIGN) | ||
| 285 | return kvm_irqfd_deassign(kvm, fd, gsi); | ||
| 286 | |||
| 287 | return kvm_irqfd_assign(kvm, fd, gsi); | ||
| 288 | } | ||
| 289 | |||
| 290 | /* | ||
| 291 | * This function is called as the kvm VM fd is being released. Shutdown all | ||
| 292 | * irqfds that still remain open | ||
| 293 | */ | ||
| 294 | void | ||
| 295 | kvm_irqfd_release(struct kvm *kvm) | ||
| 296 | { | ||
| 297 | struct _irqfd *irqfd, *tmp; | ||
| 298 | |||
| 299 | spin_lock_irq(&kvm->irqfds.lock); | ||
| 300 | |||
| 301 | list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) | ||
| 302 | irqfd_deactivate(irqfd); | ||
| 303 | |||
| 304 | spin_unlock_irq(&kvm->irqfds.lock); | ||
| 305 | |||
| 306 | /* | ||
| 307 | * Block until we know all outstanding shutdown jobs have completed | ||
| 308 | * since we do not take a kvm* reference. | ||
| 309 | */ | ||
| 310 | flush_workqueue(irqfd_cleanup_wq); | ||
| 311 | |||
| 312 | } | ||
| 313 | |||
| 314 | /* | ||
| 315 | * create a host-wide workqueue for issuing deferred shutdown requests | ||
| 316 | * aggregated from all vm* instances. We need our own isolated single-thread | ||
| 317 | * queue to prevent deadlock against flushing the normal work-queue. | ||
| 318 | */ | ||
| 319 | static int __init irqfd_module_init(void) | ||
| 320 | { | ||
| 321 | irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); | ||
| 322 | if (!irqfd_cleanup_wq) | ||
| 323 | return -ENOMEM; | ||
| 324 | |||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | |||
| 328 | static void __exit irqfd_module_exit(void) | ||
| 329 | { | ||
| 330 | destroy_workqueue(irqfd_cleanup_wq); | ||
| 331 | } | ||
| 332 | |||
| 333 | module_init(irqfd_module_init); | ||
| 334 | module_exit(irqfd_module_exit); | ||
| 335 | |||
| 336 | /* | ||
| 337 | * -------------------------------------------------------------------- | ||
| 338 | * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. | ||
| 339 | * | ||
| 340 | * userspace can register a PIO/MMIO address with an eventfd for receiving | ||
| 341 | * notification when the memory has been touched. | ||
| 342 | * -------------------------------------------------------------------- | ||
| 343 | */ | ||
| 344 | |||
| 345 | struct _ioeventfd { | ||
| 346 | struct list_head list; | ||
| 347 | u64 addr; | ||
| 348 | int length; | ||
| 349 | struct eventfd_ctx *eventfd; | ||
| 350 | u64 datamatch; | ||
| 351 | struct kvm_io_device dev; | ||
| 352 | bool wildcard; | ||
| 353 | }; | ||
| 354 | |||
| 355 | static inline struct _ioeventfd * | ||
| 356 | to_ioeventfd(struct kvm_io_device *dev) | ||
| 357 | { | ||
| 358 | return container_of(dev, struct _ioeventfd, dev); | ||
| 359 | } | ||
| 360 | |||
| 361 | static void | ||
| 362 | ioeventfd_release(struct _ioeventfd *p) | ||
| 363 | { | ||
| 364 | eventfd_ctx_put(p->eventfd); | ||
| 365 | list_del(&p->list); | ||
| 366 | kfree(p); | ||
| 367 | } | ||
| 368 | |||
| 369 | static bool | ||
| 370 | ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) | ||
| 371 | { | ||
| 372 | u64 _val; | ||
| 373 | |||
| 374 | if (!(addr == p->addr && len == p->length)) | ||
| 375 | /* address-range must be precise for a hit */ | ||
| 376 | return false; | ||
| 377 | |||
| 378 | if (p->wildcard) | ||
| 379 | /* all else equal, wildcard is always a hit */ | ||
| 380 | return true; | ||
| 381 | |||
| 382 | /* otherwise, we have to actually compare the data */ | ||
| 383 | |||
| 384 | BUG_ON(!IS_ALIGNED((unsigned long)val, len)); | ||
| 385 | |||
| 386 | switch (len) { | ||
| 387 | case 1: | ||
| 388 | _val = *(u8 *)val; | ||
| 389 | break; | ||
| 390 | case 2: | ||
| 391 | _val = *(u16 *)val; | ||
| 392 | break; | ||
| 393 | case 4: | ||
| 394 | _val = *(u32 *)val; | ||
| 395 | break; | ||
| 396 | case 8: | ||
| 397 | _val = *(u64 *)val; | ||
| 398 | break; | ||
| 399 | default: | ||
| 400 | return false; | ||
| 401 | } | ||
| 402 | |||
| 403 | return _val == p->datamatch ? true : false; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* MMIO/PIO writes trigger an event if the addr/val match */ | ||
| 407 | static int | ||
| 408 | ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, | ||
| 409 | const void *val) | ||
| 410 | { | ||
| 411 | struct _ioeventfd *p = to_ioeventfd(this); | ||
| 412 | |||
| 413 | if (!ioeventfd_in_range(p, addr, len, val)) | ||
| 414 | return -EOPNOTSUPP; | ||
| 415 | |||
| 416 | eventfd_signal(p->eventfd, 1); | ||
| 417 | return 0; | ||
| 418 | } | ||
| 419 | |||
| 420 | /* | ||
| 421 | * This function is called as KVM is completely shutting down. We do not | ||
| 422 | * need to worry about locking; just nuke anything we have as quickly as possible | ||
| 423 | */ | ||
| 424 | static void | ||
| 425 | ioeventfd_destructor(struct kvm_io_device *this) | ||
| 426 | { | ||
| 427 | struct _ioeventfd *p = to_ioeventfd(this); | ||
| 428 | |||
| 429 | ioeventfd_release(p); | ||
| 430 | } | ||
| 431 | |||
| 432 | static const struct kvm_io_device_ops ioeventfd_ops = { | ||
| 433 | .write = ioeventfd_write, | ||
| 434 | .destructor = ioeventfd_destructor, | ||
| 435 | }; | ||
| 436 | |||
| 437 | /* assumes kvm->slots_lock held */ | ||
| 438 | static bool | ||
| 439 | ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) | ||
| 440 | { | ||
| 441 | struct _ioeventfd *_p; | ||
| 442 | |||
| 443 | list_for_each_entry(_p, &kvm->ioeventfds, list) | ||
| 444 | if (_p->addr == p->addr && _p->length == p->length && | ||
| 445 | (_p->wildcard || p->wildcard || | ||
| 446 | _p->datamatch == p->datamatch)) | ||
| 447 | return true; | ||
| 448 | |||
| 449 | return false; | ||
| 450 | } | ||
| 451 | |||
| 452 | static int | ||
| 453 | kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) | ||
| 454 | { | ||
| 455 | int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; | ||
| 456 | struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; | ||
| 457 | struct _ioeventfd *p; | ||
| 458 | struct eventfd_ctx *eventfd; | ||
| 459 | int ret; | ||
| 460 | |||
| 461 | /* must be natural-word sized */ | ||
| 462 | switch (args->len) { | ||
| 463 | case 1: | ||
| 464 | case 2: | ||
| 465 | case 4: | ||
| 466 | case 8: | ||
| 467 | break; | ||
| 468 | default: | ||
| 469 | return -EINVAL; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* check for range overflow */ | ||
| 473 | if (args->addr + args->len < args->addr) | ||
| 474 | return -EINVAL; | ||
| 475 | |||
| 476 | /* check for extra flags that we don't understand */ | ||
| 477 | if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) | ||
| 478 | return -EINVAL; | ||
| 479 | |||
| 480 | eventfd = eventfd_ctx_fdget(args->fd); | ||
| 481 | if (IS_ERR(eventfd)) | ||
| 482 | return PTR_ERR(eventfd); | ||
| 483 | |||
| 484 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
| 485 | if (!p) { | ||
| 486 | ret = -ENOMEM; | ||
| 487 | goto fail; | ||
| 488 | } | ||
| 489 | |||
| 490 | INIT_LIST_HEAD(&p->list); | ||
| 491 | p->addr = args->addr; | ||
| 492 | p->length = args->len; | ||
| 493 | p->eventfd = eventfd; | ||
| 494 | |||
| 495 | /* The datamatch feature is optional, otherwise this is a wildcard */ | ||
| 496 | if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) | ||
| 497 | p->datamatch = args->datamatch; | ||
| 498 | else | ||
| 499 | p->wildcard = true; | ||
| 500 | |||
| 501 | down_write(&kvm->slots_lock); | ||
| 502 | |||
| 503 | /* Verify that there isn't a match already */ | ||
| 504 | if (ioeventfd_check_collision(kvm, p)) { | ||
| 505 | ret = -EEXIST; | ||
| 506 | goto unlock_fail; | ||
| 507 | } | ||
| 508 | |||
| 509 | kvm_iodevice_init(&p->dev, &ioeventfd_ops); | ||
| 510 | |||
| 511 | ret = __kvm_io_bus_register_dev(bus, &p->dev); | ||
| 512 | if (ret < 0) | ||
| 513 | goto unlock_fail; | ||
| 514 | |||
| 515 | list_add_tail(&p->list, &kvm->ioeventfds); | ||
| 516 | |||
| 517 | up_write(&kvm->slots_lock); | ||
| 518 | |||
| 519 | return 0; | ||
| 520 | |||
| 521 | unlock_fail: | ||
| 522 | up_write(&kvm->slots_lock); | ||
| 523 | |||
| 524 | fail: | ||
| 525 | kfree(p); | ||
| 526 | eventfd_ctx_put(eventfd); | ||
| 527 | |||
| 528 | return ret; | ||
| 529 | } | ||
| 530 | |||
| 531 | static int | ||
| 532 | kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) | ||
| 533 | { | ||
| 534 | int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; | ||
| 535 | struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; | ||
| 536 | struct _ioeventfd *p, *tmp; | ||
| 537 | struct eventfd_ctx *eventfd; | ||
| 538 | int ret = -ENOENT; | ||
| 539 | |||
| 540 | eventfd = eventfd_ctx_fdget(args->fd); | ||
| 541 | if (IS_ERR(eventfd)) | ||
| 542 | return PTR_ERR(eventfd); | ||
| 543 | |||
| 544 | down_write(&kvm->slots_lock); | ||
| 545 | |||
| 546 | list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { | ||
| 547 | bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); | ||
| 548 | |||
| 549 | if (p->eventfd != eventfd || | ||
| 550 | p->addr != args->addr || | ||
| 551 | p->length != args->len || | ||
| 552 | p->wildcard != wildcard) | ||
| 553 | continue; | ||
| 554 | |||
| 555 | if (!p->wildcard && p->datamatch != args->datamatch) | ||
| 556 | continue; | ||
| 557 | |||
| 558 | __kvm_io_bus_unregister_dev(bus, &p->dev); | ||
| 559 | ioeventfd_release(p); | ||
| 560 | ret = 0; | ||
| 561 | break; | ||
| 562 | } | ||
| 563 | |||
| 564 | up_write(&kvm->slots_lock); | ||
| 565 | |||
| 566 | eventfd_ctx_put(eventfd); | ||
| 567 | |||
| 568 | return ret; | ||
| 569 | } | ||
| 570 | |||
| 571 | int | ||
| 572 | kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) | ||
| 573 | { | ||
| 574 | if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) | ||
| 575 | return kvm_deassign_ioeventfd(kvm, args); | ||
| 576 | |||
| 577 | return kvm_assign_ioeventfd(kvm, args); | ||
| 578 | } | ||
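
For context, a hedged userspace sketch of the two ABIs this new file backs: KVM_IRQFD attaches an eventfd to a guest GSI, and KVM_IOEVENTFD registers an eventfd that is signalled when the guest writes a matching value to an MMIO address. The struct and flag names are those exposed by linux/kvm.h in this series; 'vm_fd' and the helper names are illustrative, and error paths deliberately leak the eventfd for brevity.

	#include <stdint.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int wire_irqfd(int vm_fd, int gsi)
	{
		struct kvm_irqfd irqfd;
		int efd = eventfd(0, 0);

		if (efd < 0)
			return -1;
		memset(&irqfd, 0, sizeof(irqfd));
		irqfd.fd  = efd;
		irqfd.gsi = gsi;
		if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0)
			return -1;
		return efd;	/* writing to efd now injects an edge on 'gsi' */
	}

	static int wire_ioeventfd(int vm_fd, uint64_t mmio_addr, uint32_t match)
	{
		struct kvm_ioeventfd ioev;
		int efd = eventfd(0, 0);

		if (efd < 0)
			return -1;
		memset(&ioev, 0, sizeof(ioev));
		ioev.addr      = mmio_addr;
		ioev.len       = 4;
		ioev.fd        = efd;
		ioev.datamatch = match;
		ioev.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH;
		if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0)
			return -1;
		return efd;	/* signalled when the guest writes 'match' to mmio_addr */
	}
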
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1150c6d5c7b..9fe140bb38e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <asm/processor.h> | 36 | #include <asm/processor.h> |
| 37 | #include <asm/page.h> | 37 | #include <asm/page.h> |
| 38 | #include <asm/current.h> | 38 | #include <asm/current.h> |
| 39 | #include <trace/events/kvm.h> | ||
| 39 | 40 | ||
| 40 | #include "ioapic.h" | 41 | #include "ioapic.h" |
| 41 | #include "lapic.h" | 42 | #include "lapic.h" |
| @@ -103,6 +104,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
| 103 | { | 104 | { |
| 104 | unsigned index; | 105 | unsigned index; |
| 105 | bool mask_before, mask_after; | 106 | bool mask_before, mask_after; |
| 107 | union kvm_ioapic_redirect_entry *e; | ||
| 106 | 108 | ||
| 107 | switch (ioapic->ioregsel) { | 109 | switch (ioapic->ioregsel) { |
| 108 | case IOAPIC_REG_VERSION: | 110 | case IOAPIC_REG_VERSION: |
| @@ -122,19 +124,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
| 122 | ioapic_debug("change redir index %x val %x\n", index, val); | 124 | ioapic_debug("change redir index %x val %x\n", index, val); |
| 123 | if (index >= IOAPIC_NUM_PINS) | 125 | if (index >= IOAPIC_NUM_PINS) |
| 124 | return; | 126 | return; |
| 125 | mask_before = ioapic->redirtbl[index].fields.mask; | 127 | e = &ioapic->redirtbl[index]; |
| 128 | mask_before = e->fields.mask; | ||
| 126 | if (ioapic->ioregsel & 1) { | 129 | if (ioapic->ioregsel & 1) { |
| 127 | ioapic->redirtbl[index].bits &= 0xffffffff; | 130 | e->bits &= 0xffffffff; |
| 128 | ioapic->redirtbl[index].bits |= (u64) val << 32; | 131 | e->bits |= (u64) val << 32; |
| 129 | } else { | 132 | } else { |
| 130 | ioapic->redirtbl[index].bits &= ~0xffffffffULL; | 133 | e->bits &= ~0xffffffffULL; |
| 131 | ioapic->redirtbl[index].bits |= (u32) val; | 134 | e->bits |= (u32) val; |
| 132 | ioapic->redirtbl[index].fields.remote_irr = 0; | 135 | e->fields.remote_irr = 0; |
| 133 | } | 136 | } |
| 134 | mask_after = ioapic->redirtbl[index].fields.mask; | 137 | mask_after = e->fields.mask; |
| 135 | if (mask_before != mask_after) | 138 | if (mask_before != mask_after) |
| 136 | kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); | 139 | kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); |
| 137 | if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG | 140 | if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG |
| 138 | && ioapic->irr & (1 << index)) | 141 | && ioapic->irr & (1 << index)) |
| 139 | ioapic_service(ioapic, index); | 142 | ioapic_service(ioapic, index); |
| 140 | break; | 143 | break; |
| @@ -164,7 +167,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | |||
| 164 | /* Always deliver PIT interrupt to vcpu 0 */ | 167 | /* Always deliver PIT interrupt to vcpu 0 */ |
| 165 | if (irq == 0) { | 168 | if (irq == 0) { |
| 166 | irqe.dest_mode = 0; /* Physical mode. */ | 169 | irqe.dest_mode = 0; /* Physical mode. */ |
| 167 | irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; | 170 | /* need to read apic_id from the apic register since |
| 171 | * it can be rewritten */ | ||
| 172 | irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; | ||
| 168 | } | 173 | } |
| 169 | #endif | 174 | #endif |
| 170 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); | 175 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); |
| @@ -188,7 +193,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | |||
| 188 | if ((edge && old_irr != ioapic->irr) || | 193 | if ((edge && old_irr != ioapic->irr) || |
| 189 | (!edge && !entry.fields.remote_irr)) | 194 | (!edge && !entry.fields.remote_irr)) |
| 190 | ret = ioapic_service(ioapic, irq); | 195 | ret = ioapic_service(ioapic, irq); |
| 196 | else | ||
| 197 | ret = 0; /* report coalesced interrupt */ | ||
| 191 | } | 198 | } |
| 199 | trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); | ||
| 192 | } | 200 | } |
| 193 | return ret; | 201 | return ret; |
| 194 | } | 202 | } |
| @@ -220,24 +228,29 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) | |||
| 220 | __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); | 228 | __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); |
| 221 | } | 229 | } |
| 222 | 230 | ||
| 223 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, | 231 | static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) |
| 224 | int len, int is_write) | ||
| 225 | { | 232 | { |
| 226 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 233 | return container_of(dev, struct kvm_ioapic, dev); |
| 234 | } | ||
| 227 | 235 | ||
| 236 | static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) | ||
| 237 | { | ||
| 228 | return ((addr >= ioapic->base_address && | 238 | return ((addr >= ioapic->base_address && |
| 229 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); | 239 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); |
| 230 | } | 240 | } |
| 231 | 241 | ||
| 232 | static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | 242 | static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, |
| 233 | void *val) | 243 | void *val) |
| 234 | { | 244 | { |
| 235 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 245 | struct kvm_ioapic *ioapic = to_ioapic(this); |
| 236 | u32 result; | 246 | u32 result; |
| 247 | if (!ioapic_in_range(ioapic, addr)) | ||
| 248 | return -EOPNOTSUPP; | ||
| 237 | 249 | ||
| 238 | ioapic_debug("addr %lx\n", (unsigned long)addr); | 250 | ioapic_debug("addr %lx\n", (unsigned long)addr); |
| 239 | ASSERT(!(addr & 0xf)); /* check alignment */ | 251 | ASSERT(!(addr & 0xf)); /* check alignment */ |
| 240 | 252 | ||
| 253 | mutex_lock(&ioapic->kvm->irq_lock); | ||
| 241 | addr &= 0xff; | 254 | addr &= 0xff; |
| 242 | switch (addr) { | 255 | switch (addr) { |
| 243 | case IOAPIC_REG_SELECT: | 256 | case IOAPIC_REG_SELECT: |
| @@ -264,22 +277,28 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | |||
| 264 | default: | 277 | default: |
| 265 | printk(KERN_WARNING "ioapic: wrong length %d\n", len); | 278 | printk(KERN_WARNING "ioapic: wrong length %d\n", len); |
| 266 | } | 279 | } |
| 280 | mutex_unlock(&ioapic->kvm->irq_lock); | ||
| 281 | return 0; | ||
| 267 | } | 282 | } |
| 268 | 283 | ||
| 269 | static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | 284 | static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, |
| 270 | const void *val) | 285 | const void *val) |
| 271 | { | 286 | { |
| 272 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 287 | struct kvm_ioapic *ioapic = to_ioapic(this); |
| 273 | u32 data; | 288 | u32 data; |
| 289 | if (!ioapic_in_range(ioapic, addr)) | ||
| 290 | return -EOPNOTSUPP; | ||
| 274 | 291 | ||
| 275 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", | 292 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", |
| 276 | (void*)addr, len, val); | 293 | (void*)addr, len, val); |
| 277 | ASSERT(!(addr & 0xf)); /* check alignment */ | 294 | ASSERT(!(addr & 0xf)); /* check alignment */ |
| 295 | |||
| 296 | mutex_lock(&ioapic->kvm->irq_lock); | ||
| 278 | if (len == 4 || len == 8) | 297 | if (len == 4 || len == 8) |
| 279 | data = *(u32 *) val; | 298 | data = *(u32 *) val; |
| 280 | else { | 299 | else { |
| 281 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); | 300 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); |
| 282 | return; | 301 | goto unlock; |
| 283 | } | 302 | } |
| 284 | 303 | ||
| 285 | addr &= 0xff; | 304 | addr &= 0xff; |
| @@ -300,6 +319,9 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
| 300 | default: | 319 | default: |
| 301 | break; | 320 | break; |
| 302 | } | 321 | } |
| 322 | unlock: | ||
| 323 | mutex_unlock(&ioapic->kvm->irq_lock); | ||
| 324 | return 0; | ||
| 303 | } | 325 | } |
| 304 | 326 | ||
| 305 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | 327 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) |
| @@ -314,21 +336,27 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | |||
| 314 | ioapic->id = 0; | 336 | ioapic->id = 0; |
| 315 | } | 337 | } |
| 316 | 338 | ||
| 339 | static const struct kvm_io_device_ops ioapic_mmio_ops = { | ||
| 340 | .read = ioapic_mmio_read, | ||
| 341 | .write = ioapic_mmio_write, | ||
| 342 | }; | ||
| 343 | |||
| 317 | int kvm_ioapic_init(struct kvm *kvm) | 344 | int kvm_ioapic_init(struct kvm *kvm) |
| 318 | { | 345 | { |
| 319 | struct kvm_ioapic *ioapic; | 346 | struct kvm_ioapic *ioapic; |
| 347 | int ret; | ||
| 320 | 348 | ||
| 321 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | 349 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); |
| 322 | if (!ioapic) | 350 | if (!ioapic) |
| 323 | return -ENOMEM; | 351 | return -ENOMEM; |
| 324 | kvm->arch.vioapic = ioapic; | 352 | kvm->arch.vioapic = ioapic; |
| 325 | kvm_ioapic_reset(ioapic); | 353 | kvm_ioapic_reset(ioapic); |
| 326 | ioapic->dev.read = ioapic_mmio_read; | 354 | kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); |
| 327 | ioapic->dev.write = ioapic_mmio_write; | ||
| 328 | ioapic->dev.in_range = ioapic_in_range; | ||
| 329 | ioapic->dev.private = ioapic; | ||
| 330 | ioapic->kvm = kvm; | 355 | ioapic->kvm = kvm; |
| 331 | kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); | 356 | ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); |
| 332 | return 0; | 357 | if (ret < 0) |
| 358 | kfree(ioapic); | ||
| 359 | |||
| 360 | return ret; | ||
| 333 | } | 361 | } |
| 334 | 362 | ||
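
The ioapic_write_indirect() hunk above updates one half of a 64-bit redirection entry per 32-bit register write, with IOREGSEL bit 0 selecting the half. A minimal standalone sketch of that masking, using plain C types and an illustrative name rather than the kernel's union:

	static void redir_write32(unsigned long long *bits, int high_half,
				  unsigned int val)
	{
		if (high_half) {
			*bits &= 0xffffffffULL;			/* keep the low half */
			*bits |= (unsigned long long)val << 32;
		} else {
			*bits &= ~0xffffffffULL;		/* keep the high half */
			*bits |= val;
		}
	}
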
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 55e8846ac3a..12fd3caffd2 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h | |||
| @@ -17,49 +17,54 @@ | |||
| 17 | #define __KVM_IODEV_H__ | 17 | #define __KVM_IODEV_H__ |
| 18 | 18 | ||
| 19 | #include <linux/kvm_types.h> | 19 | #include <linux/kvm_types.h> |
| 20 | #include <asm/errno.h> | ||
| 20 | 21 | ||
| 21 | struct kvm_io_device { | 22 | struct kvm_io_device; |
| 22 | void (*read)(struct kvm_io_device *this, | 23 | |
| 24 | /** | ||
| 25 | * kvm_io_device_ops are called under kvm slots_lock. | ||
| 26 | * read and write handlers return 0 if the transaction has been handled, | ||
| 27 | * or non-zero to have it passed to the next device. | ||
| 28 | **/ | ||
| 29 | struct kvm_io_device_ops { | ||
| 30 | int (*read)(struct kvm_io_device *this, | ||
| 31 | gpa_t addr, | ||
| 32 | int len, | ||
| 33 | void *val); | ||
| 34 | int (*write)(struct kvm_io_device *this, | ||
| 23 | gpa_t addr, | 35 | gpa_t addr, |
| 24 | int len, | 36 | int len, |
| 25 | void *val); | 37 | const void *val); |
| 26 | void (*write)(struct kvm_io_device *this, | ||
| 27 | gpa_t addr, | ||
| 28 | int len, | ||
| 29 | const void *val); | ||
| 30 | int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, | ||
| 31 | int is_write); | ||
| 32 | void (*destructor)(struct kvm_io_device *this); | 38 | void (*destructor)(struct kvm_io_device *this); |
| 39 | }; | ||
| 33 | 40 | ||
| 34 | void *private; | 41 | |
| 42 | struct kvm_io_device { | ||
| 43 | const struct kvm_io_device_ops *ops; | ||
| 35 | }; | 44 | }; |
| 36 | 45 | ||
| 37 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | 46 | static inline void kvm_iodevice_init(struct kvm_io_device *dev, |
| 38 | gpa_t addr, | 47 | const struct kvm_io_device_ops *ops) |
| 39 | int len, | ||
| 40 | void *val) | ||
| 41 | { | 48 | { |
| 42 | dev->read(dev, addr, len, val); | 49 | dev->ops = ops; |
| 43 | } | 50 | } |
| 44 | 51 | ||
| 45 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | 52 | static inline int kvm_iodevice_read(struct kvm_io_device *dev, |
| 46 | gpa_t addr, | 53 | gpa_t addr, int l, void *v) |
| 47 | int len, | ||
| 48 | const void *val) | ||
| 49 | { | 54 | { |
| 50 | dev->write(dev, addr, len, val); | 55 | return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; |
| 51 | } | 56 | } |
| 52 | 57 | ||
| 53 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, | 58 | static inline int kvm_iodevice_write(struct kvm_io_device *dev, |
| 54 | gpa_t addr, int len, int is_write) | 59 | gpa_t addr, int l, const void *v) |
| 55 | { | 60 | { |
| 56 | return dev->in_range(dev, addr, len, is_write); | 61 | return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; |
| 57 | } | 62 | } |
| 58 | 63 | ||
| 59 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | 64 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) |
| 60 | { | 65 | { |
| 61 | if (dev->destructor) | 66 | if (dev->ops->destructor) |
| 62 | dev->destructor(dev); | 67 | dev->ops->destructor(dev); |
| 63 | } | 68 | } |
| 64 | 69 | ||
| 65 | #endif /* __KVM_IODEV_H__ */ | 70 | #endif /* __KVM_IODEV_H__ */ |
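
A minimal sketch of a device written against the ops-based interface declared above; the device name, 16-byte window, and the bus it registers on are illustrative. Handlers claim an access by returning 0 and decline it with -EOPNOTSUPP so the bus can offer it to the next device, the same pattern the coalesced-MMIO and IOAPIC conversions in this series follow (asm/errno.h is pulled in by iodev.h itself now).

	struct demo_dev {
		struct kvm_io_device dev;
		gpa_t base;		/* start of an illustrative 16-byte window */
	};

	static struct demo_dev *to_demo(struct kvm_io_device *d)
	{
		return container_of(d, struct demo_dev, dev);
	}

	static int demo_write(struct kvm_io_device *this, gpa_t addr, int len,
			      const void *val)
	{
		struct demo_dev *demo = to_demo(this);

		if (addr < demo->base || addr + len > demo->base + 16)
			return -EOPNOTSUPP;	/* not ours, try the next device */
		/* ... handle the write ... */
		return 0;
	}

	static const struct kvm_io_device_ops demo_ops = {
		.write = demo_write,
	};

	/* at setup time: kvm_iodevice_init(&demo->dev, &demo_ops); then register
	 * it with kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &demo->dev); */
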
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index ddc17f0e2f3..001663ff401 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | */ | 20 | */ |
| 21 | 21 | ||
| 22 | #include <linux/kvm_host.h> | 22 | #include <linux/kvm_host.h> |
| 23 | #include <trace/events/kvm.h> | ||
| 23 | 24 | ||
| 24 | #include <asm/msidef.h> | 25 | #include <asm/msidef.h> |
| 25 | #ifdef CONFIG_IA64 | 26 | #ifdef CONFIG_IA64 |
| @@ -62,14 +63,14 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, | |||
| 62 | int i, r = -1; | 63 | int i, r = -1; |
| 63 | struct kvm_vcpu *vcpu, *lowest = NULL; | 64 | struct kvm_vcpu *vcpu, *lowest = NULL; |
| 64 | 65 | ||
| 66 | WARN_ON(!mutex_is_locked(&kvm->irq_lock)); | ||
| 67 | |||
| 65 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && | 68 | if (irq->dest_mode == 0 && irq->dest_id == 0xff && |
| 66 | kvm_is_dm_lowest_prio(irq)) | 69 | kvm_is_dm_lowest_prio(irq)) |
| 67 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); | 70 | printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); |
| 68 | 71 | ||
| 69 | for (i = 0; i < KVM_MAX_VCPUS; i++) { | 72 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 70 | vcpu = kvm->vcpus[i]; | 73 | if (!kvm_apic_present(vcpu)) |
| 71 | |||
| 72 | if (!vcpu || !kvm_apic_present(vcpu)) | ||
| 73 | continue; | 74 | continue; |
| 74 | 75 | ||
| 75 | if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, | 76 | if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, |
| @@ -99,6 +100,8 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, | |||
| 99 | { | 100 | { |
| 100 | struct kvm_lapic_irq irq; | 101 | struct kvm_lapic_irq irq; |
| 101 | 102 | ||
| 103 | trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); | ||
| 104 | |||
| 102 | irq.dest_id = (e->msi.address_lo & | 105 | irq.dest_id = (e->msi.address_lo & |
| 103 | MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; | 106 | MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; |
| 104 | irq.vector = (e->msi.data & | 107 | irq.vector = (e->msi.data & |
| @@ -113,7 +116,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, | |||
| 113 | return kvm_irq_delivery_to_apic(kvm, NULL, &irq); | 116 | return kvm_irq_delivery_to_apic(kvm, NULL, &irq); |
| 114 | } | 117 | } |
| 115 | 118 | ||
| 116 | /* This should be called with the kvm->lock mutex held | 119 | /* This should be called with the kvm->irq_lock mutex held |
| 117 | * Return value: | 120 | * Return value: |
| 118 | * < 0 Interrupt was ignored (masked or not delivered for other reasons) | 121 | * < 0 Interrupt was ignored (masked or not delivered for other reasons) |
| 119 | * = 0 Interrupt was coalesced (previous irq is still pending) | 122 | * = 0 Interrupt was coalesced (previous irq is still pending) |
| @@ -125,6 +128,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) | |||
| 125 | unsigned long *irq_state, sig_level; | 128 | unsigned long *irq_state, sig_level; |
| 126 | int ret = -1; | 129 | int ret = -1; |
| 127 | 130 | ||
| 131 | trace_kvm_set_irq(irq, level, irq_source_id); | ||
| 132 | |||
| 133 | WARN_ON(!mutex_is_locked(&kvm->irq_lock)); | ||
| 134 | |||
| 128 | if (irq < KVM_IOAPIC_NUM_PINS) { | 135 | if (irq < KVM_IOAPIC_NUM_PINS) { |
| 129 | irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; | 136 | irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; |
| 130 | 137 | ||
| @@ -134,7 +141,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) | |||
| 134 | else | 141 | else |
| 135 | clear_bit(irq_source_id, irq_state); | 142 | clear_bit(irq_source_id, irq_state); |
| 136 | sig_level = !!(*irq_state); | 143 | sig_level = !!(*irq_state); |
| 137 | } else /* Deal with MSI/MSI-X */ | 144 | } else if (!level) |
| 145 | return ret; | ||
| 146 | else /* Deal with MSI/MSI-X */ | ||
| 138 | sig_level = 1; | 147 | sig_level = 1; |
| 139 | 148 | ||
| 140 | /* Not possible to detect if the guest uses the PIC or the | 149 | /* Not possible to detect if the guest uses the PIC or the |
| @@ -159,6 +168,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) | |||
| 159 | struct hlist_node *n; | 168 | struct hlist_node *n; |
| 160 | unsigned gsi = pin; | 169 | unsigned gsi = pin; |
| 161 | 170 | ||
| 171 | trace_kvm_ack_irq(irqchip, pin); | ||
| 172 | |||
| 162 | list_for_each_entry(e, &kvm->irq_routing, link) | 173 | list_for_each_entry(e, &kvm->irq_routing, link) |
| 163 | if (e->type == KVM_IRQ_ROUTING_IRQCHIP && | 174 | if (e->type == KVM_IRQ_ROUTING_IRQCHIP && |
| 164 | e->irqchip.irqchip == irqchip && | 175 | e->irqchip.irqchip == irqchip && |
| @@ -175,19 +186,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) | |||
| 175 | void kvm_register_irq_ack_notifier(struct kvm *kvm, | 186 | void kvm_register_irq_ack_notifier(struct kvm *kvm, |
| 176 | struct kvm_irq_ack_notifier *kian) | 187 | struct kvm_irq_ack_notifier *kian) |
| 177 | { | 188 | { |
| 189 | mutex_lock(&kvm->irq_lock); | ||
| 178 | hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); | 190 | hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); |
| 191 | mutex_unlock(&kvm->irq_lock); | ||
| 179 | } | 192 | } |
| 180 | 193 | ||
| 181 | void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) | 194 | void kvm_unregister_irq_ack_notifier(struct kvm *kvm, |
| 195 | struct kvm_irq_ack_notifier *kian) | ||
| 182 | { | 196 | { |
| 197 | mutex_lock(&kvm->irq_lock); | ||
| 183 | hlist_del_init(&kian->link); | 198 | hlist_del_init(&kian->link); |
| 199 | mutex_unlock(&kvm->irq_lock); | ||
| 184 | } | 200 | } |
| 185 | 201 | ||
| 186 | /* The caller must hold kvm->lock mutex */ | ||
| 187 | int kvm_request_irq_source_id(struct kvm *kvm) | 202 | int kvm_request_irq_source_id(struct kvm *kvm) |
| 188 | { | 203 | { |
| 189 | unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; | 204 | unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; |
| 190 | int irq_source_id = find_first_zero_bit(bitmap, | 205 | int irq_source_id; |
| 206 | |||
| 207 | mutex_lock(&kvm->irq_lock); | ||
| 208 | irq_source_id = find_first_zero_bit(bitmap, | ||
| 191 | sizeof(kvm->arch.irq_sources_bitmap)); | 209 | sizeof(kvm->arch.irq_sources_bitmap)); |
| 192 | 210 | ||
| 193 | if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { | 211 | if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { |
| @@ -197,6 +215,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) | |||
| 197 | 215 | ||
| 198 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 216 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
| 199 | set_bit(irq_source_id, bitmap); | 217 | set_bit(irq_source_id, bitmap); |
| 218 | mutex_unlock(&kvm->irq_lock); | ||
| 200 | 219 | ||
| 201 | return irq_source_id; | 220 | return irq_source_id; |
| 202 | } | 221 | } |
| @@ -207,6 +226,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | |||
| 207 | 226 | ||
| 208 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 227 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
| 209 | 228 | ||
| 229 | mutex_lock(&kvm->irq_lock); | ||
| 210 | if (irq_source_id < 0 || | 230 | if (irq_source_id < 0 || |
| 211 | irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { | 231 | irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { |
| 212 | printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); | 232 | printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); |
| @@ -215,19 +235,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | |||
| 215 | for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) | 235 | for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) |
| 216 | clear_bit(irq_source_id, &kvm->arch.irq_states[i]); | 236 | clear_bit(irq_source_id, &kvm->arch.irq_states[i]); |
| 217 | clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); | 237 | clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); |
| 238 | mutex_unlock(&kvm->irq_lock); | ||
| 218 | } | 239 | } |
| 219 | 240 | ||
| 220 | void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, | 241 | void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, |
| 221 | struct kvm_irq_mask_notifier *kimn) | 242 | struct kvm_irq_mask_notifier *kimn) |
| 222 | { | 243 | { |
| 244 | mutex_lock(&kvm->irq_lock); | ||
| 223 | kimn->irq = irq; | 245 | kimn->irq = irq; |
| 224 | hlist_add_head(&kimn->link, &kvm->mask_notifier_list); | 246 | hlist_add_head(&kimn->link, &kvm->mask_notifier_list); |
| 247 | mutex_unlock(&kvm->irq_lock); | ||
| 225 | } | 248 | } |
| 226 | 249 | ||
| 227 | void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, | 250 | void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, |
| 228 | struct kvm_irq_mask_notifier *kimn) | 251 | struct kvm_irq_mask_notifier *kimn) |
| 229 | { | 252 | { |
| 253 | mutex_lock(&kvm->irq_lock); | ||
| 230 | hlist_del(&kimn->link); | 254 | hlist_del(&kimn->link); |
| 255 | mutex_unlock(&kvm->irq_lock); | ||
| 231 | } | 256 | } |
| 232 | 257 | ||
| 233 | void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) | 258 | void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) |
| @@ -235,6 +260,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) | |||
| 235 | struct kvm_irq_mask_notifier *kimn; | 260 | struct kvm_irq_mask_notifier *kimn; |
| 236 | struct hlist_node *n; | 261 | struct hlist_node *n; |
| 237 | 262 | ||
| 263 | WARN_ON(!mutex_is_locked(&kvm->irq_lock)); | ||
| 264 | |||
| 238 | hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) | 265 | hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) |
| 239 | if (kimn->irq == irq) | 266 | if (kimn->irq == irq) |
| 240 | kimn->func(kimn, mask); | 267 | kimn->func(kimn, mask); |
| @@ -250,7 +277,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing) | |||
| 250 | 277 | ||
| 251 | void kvm_free_irq_routing(struct kvm *kvm) | 278 | void kvm_free_irq_routing(struct kvm *kvm) |
| 252 | { | 279 | { |
| 280 | mutex_lock(&kvm->irq_lock); | ||
| 253 | __kvm_free_irq_routing(&kvm->irq_routing); | 281 | __kvm_free_irq_routing(&kvm->irq_routing); |
| 282 | mutex_unlock(&kvm->irq_lock); | ||
| 254 | } | 283 | } |
| 255 | 284 | ||
| 256 | static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, | 285 | static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, |
| @@ -325,13 +354,13 @@ int kvm_set_irq_routing(struct kvm *kvm, | |||
| 325 | e = NULL; | 354 | e = NULL; |
| 326 | } | 355 | } |
| 327 | 356 | ||
| 328 | mutex_lock(&kvm->lock); | 357 | mutex_lock(&kvm->irq_lock); |
| 329 | list_splice(&kvm->irq_routing, &tmp); | 358 | list_splice(&kvm->irq_routing, &tmp); |
| 330 | INIT_LIST_HEAD(&kvm->irq_routing); | 359 | INIT_LIST_HEAD(&kvm->irq_routing); |
| 331 | list_splice(&irq_list, &kvm->irq_routing); | 360 | list_splice(&irq_list, &kvm->irq_routing); |
| 332 | INIT_LIST_HEAD(&irq_list); | 361 | INIT_LIST_HEAD(&irq_list); |
| 333 | list_splice(&tmp, &irq_list); | 362 | list_splice(&tmp, &irq_list); |
| 334 | mutex_unlock(&kvm->lock); | 363 | mutex_unlock(&kvm->irq_lock); |
| 335 | 364 | ||
| 336 | r = 0; | 365 | r = 0; |
| 337 | 366 | ||
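
The hunks above move the interrupt-injection path from kvm->lock to the new, finer-grained kvm->irq_lock, and kvm_set_irq() now warns if its caller does not hold it. A hypothetical injection helper written against that contract (mirroring what irqfd_inject() in eventfd.c does) might look like:

	static void inject_guest_edge(struct kvm *kvm, int gsi)
	{
		mutex_lock(&kvm->irq_lock);
		/* raise then lower the line; the WARN_ON in kvm_set_irq()
		 * asserts that irq_lock is held here */
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, gsi, 1);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, gsi, 0);
		mutex_unlock(&kvm->irq_lock);
	}
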
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2884baf1d5f..897bff3b7df 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -59,9 +59,18 @@ | |||
| 59 | #include "irq.h" | 59 | #include "irq.h" |
| 60 | #endif | 60 | #endif |
| 61 | 61 | ||
| 62 | #define CREATE_TRACE_POINTS | ||
| 63 | #include <trace/events/kvm.h> | ||
| 64 | |||
| 62 | MODULE_AUTHOR("Qumranet"); | 65 | MODULE_AUTHOR("Qumranet"); |
| 63 | MODULE_LICENSE("GPL"); | 66 | MODULE_LICENSE("GPL"); |
| 64 | 67 | ||
| 68 | /* | ||
| 69 | * Ordering of locks: | ||
| 70 | * | ||
| 71 | * kvm->slots_lock --> kvm->lock --> kvm->irq_lock | ||
| 72 | */ | ||
| 73 | |||
| 65 | DEFINE_SPINLOCK(kvm_lock); | 74 | DEFINE_SPINLOCK(kvm_lock); |
| 66 | LIST_HEAD(vm_list); | 75 | LIST_HEAD(vm_list); |
| 67 | 76 | ||
| @@ -79,6 +88,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | |||
| 79 | 88 | ||
| 80 | static bool kvm_rebooting; | 89 | static bool kvm_rebooting; |
| 81 | 90 | ||
| 91 | static bool largepages_enabled = true; | ||
| 92 | |||
| 82 | #ifdef KVM_CAP_DEVICE_ASSIGNMENT | 93 | #ifdef KVM_CAP_DEVICE_ASSIGNMENT |
| 83 | static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, | 94 | static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, |
| 84 | int assigned_dev_id) | 95 | int assigned_dev_id) |
| @@ -120,17 +131,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) | |||
| 120 | { | 131 | { |
| 121 | struct kvm_assigned_dev_kernel *assigned_dev; | 132 | struct kvm_assigned_dev_kernel *assigned_dev; |
| 122 | struct kvm *kvm; | 133 | struct kvm *kvm; |
| 123 | int irq, i; | 134 | int i; |
| 124 | 135 | ||
| 125 | assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, | 136 | assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, |
| 126 | interrupt_work); | 137 | interrupt_work); |
| 127 | kvm = assigned_dev->kvm; | 138 | kvm = assigned_dev->kvm; |
| 128 | 139 | ||
| 129 | /* This is taken to safely inject irq inside the guest. When | 140 | mutex_lock(&kvm->irq_lock); |
| 130 | * the interrupt injection (or the ioapic code) uses a | ||
| 131 | * finer-grained lock, update this | ||
| 132 | */ | ||
| 133 | mutex_lock(&kvm->lock); | ||
| 134 | spin_lock_irq(&assigned_dev->assigned_dev_lock); | 141 | spin_lock_irq(&assigned_dev->assigned_dev_lock); |
| 135 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { | 142 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { |
| 136 | struct kvm_guest_msix_entry *guest_entries = | 143 | struct kvm_guest_msix_entry *guest_entries = |
| @@ -143,23 +150,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) | |||
| 143 | kvm_set_irq(assigned_dev->kvm, | 150 | kvm_set_irq(assigned_dev->kvm, |
| 144 | assigned_dev->irq_source_id, | 151 | assigned_dev->irq_source_id, |
| 145 | guest_entries[i].vector, 1); | 152 | guest_entries[i].vector, 1); |
| 146 | irq = assigned_dev->host_msix_entries[i].vector; | ||
| 147 | if (irq != 0) | ||
| 148 | enable_irq(irq); | ||
| 149 | assigned_dev->host_irq_disabled = false; | ||
| 150 | } | 153 | } |
| 151 | } else { | 154 | } else |
| 152 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, | 155 | kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, |
| 153 | assigned_dev->guest_irq, 1); | 156 | assigned_dev->guest_irq, 1); |
| 154 | if (assigned_dev->irq_requested_type & | ||
| 155 | KVM_DEV_IRQ_GUEST_MSI) { | ||
| 156 | enable_irq(assigned_dev->host_irq); | ||
| 157 | assigned_dev->host_irq_disabled = false; | ||
| 158 | } | ||
| 159 | } | ||
| 160 | 157 | ||
| 161 | spin_unlock_irq(&assigned_dev->assigned_dev_lock); | 158 | spin_unlock_irq(&assigned_dev->assigned_dev_lock); |
| 162 | mutex_unlock(&assigned_dev->kvm->lock); | 159 | mutex_unlock(&assigned_dev->kvm->irq_lock); |
| 163 | } | 160 | } |
| 164 | 161 | ||
| 165 | static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) | 162 | static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) |
| @@ -179,8 +176,10 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) | |||
| 179 | 176 | ||
| 180 | schedule_work(&assigned_dev->interrupt_work); | 177 | schedule_work(&assigned_dev->interrupt_work); |
| 181 | 178 | ||
| 182 | disable_irq_nosync(irq); | 179 | if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { |
| 183 | assigned_dev->host_irq_disabled = true; | 180 | disable_irq_nosync(irq); |
| 181 | assigned_dev->host_irq_disabled = true; | ||
| 182 | } | ||
| 184 | 183 | ||
| 185 | out: | 184 | out: |
| 186 | spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); | 185 | spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); |
| @@ -215,7 +214,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) | |||
| 215 | static void deassign_guest_irq(struct kvm *kvm, | 214 | static void deassign_guest_irq(struct kvm *kvm, |
| 216 | struct kvm_assigned_dev_kernel *assigned_dev) | 215 | struct kvm_assigned_dev_kernel *assigned_dev) |
| 217 | { | 216 | { |
| 218 | kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); | 217 | kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); |
| 219 | assigned_dev->ack_notifier.gsi = -1; | 218 | assigned_dev->ack_notifier.gsi = -1; |
| 220 | 219 | ||
| 221 | if (assigned_dev->irq_source_id != -1) | 220 | if (assigned_dev->irq_source_id != -1) |
| @@ -417,6 +416,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, | |||
| 417 | { | 416 | { |
| 418 | dev->guest_irq = irq->guest_irq; | 417 | dev->guest_irq = irq->guest_irq; |
| 419 | dev->ack_notifier.gsi = -1; | 418 | dev->ack_notifier.gsi = -1; |
| 419 | dev->host_irq_disabled = false; | ||
| 420 | return 0; | 420 | return 0; |
| 421 | } | 421 | } |
| 422 | #endif | 422 | #endif |
| @@ -427,6 +427,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, | |||
| 427 | { | 427 | { |
| 428 | dev->guest_irq = irq->guest_irq; | 428 | dev->guest_irq = irq->guest_irq; |
| 429 | dev->ack_notifier.gsi = -1; | 429 | dev->ack_notifier.gsi = -1; |
| 430 | dev->host_irq_disabled = false; | ||
| 430 | return 0; | 431 | return 0; |
| 431 | } | 432 | } |
| 432 | #endif | 433 | #endif |
| @@ -693,11 +694,6 @@ out: | |||
| 693 | } | 694 | } |
| 694 | #endif | 695 | #endif |
| 695 | 696 | ||
| 696 | static inline int valid_vcpu(int n) | ||
| 697 | { | ||
| 698 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
| 699 | } | ||
| 700 | |||
| 701 | inline int kvm_is_mmio_pfn(pfn_t pfn) | 697 | inline int kvm_is_mmio_pfn(pfn_t pfn) |
| 702 | { | 698 | { |
| 703 | if (pfn_valid(pfn)) { | 699 | if (pfn_valid(pfn)) { |
| @@ -745,12 +741,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) | |||
| 745 | if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) | 741 | if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) |
| 746 | cpumask_clear(cpus); | 742 | cpumask_clear(cpus); |
| 747 | 743 | ||
| 748 | me = get_cpu(); | ||
| 749 | spin_lock(&kvm->requests_lock); | 744 | spin_lock(&kvm->requests_lock); |
| 750 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 745 | me = smp_processor_id(); |
| 751 | vcpu = kvm->vcpus[i]; | 746 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 752 | if (!vcpu) | ||
| 753 | continue; | ||
| 754 | if (test_and_set_bit(req, &vcpu->requests)) | 747 | if (test_and_set_bit(req, &vcpu->requests)) |
| 755 | continue; | 748 | continue; |
| 756 | cpu = vcpu->cpu; | 749 | cpu = vcpu->cpu; |
| @@ -764,7 +757,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) | |||
| 764 | else | 757 | else |
| 765 | called = false; | 758 | called = false; |
| 766 | spin_unlock(&kvm->requests_lock); | 759 | spin_unlock(&kvm->requests_lock); |
| 767 | put_cpu(); | ||
| 768 | free_cpumask_var(cpus); | 760 | free_cpumask_var(cpus); |
| 769 | return called; | 761 | return called; |
| 770 | } | 762 | } |
| @@ -986,7 +978,9 @@ static struct kvm *kvm_create_vm(void) | |||
| 986 | spin_lock_init(&kvm->mmu_lock); | 978 | spin_lock_init(&kvm->mmu_lock); |
| 987 | spin_lock_init(&kvm->requests_lock); | 979 | spin_lock_init(&kvm->requests_lock); |
| 988 | kvm_io_bus_init(&kvm->pio_bus); | 980 | kvm_io_bus_init(&kvm->pio_bus); |
| 981 | kvm_eventfd_init(kvm); | ||
| 989 | mutex_init(&kvm->lock); | 982 | mutex_init(&kvm->lock); |
| 983 | mutex_init(&kvm->irq_lock); | ||
| 990 | kvm_io_bus_init(&kvm->mmio_bus); | 984 | kvm_io_bus_init(&kvm->mmio_bus); |
| 991 | init_rwsem(&kvm->slots_lock); | 985 | init_rwsem(&kvm->slots_lock); |
| 992 | atomic_set(&kvm->users_count, 1); | 986 | atomic_set(&kvm->users_count, 1); |
| @@ -1006,19 +1000,25 @@ out: | |||
| 1006 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | 1000 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, |
| 1007 | struct kvm_memory_slot *dont) | 1001 | struct kvm_memory_slot *dont) |
| 1008 | { | 1002 | { |
| 1003 | int i; | ||
| 1004 | |||
| 1009 | if (!dont || free->rmap != dont->rmap) | 1005 | if (!dont || free->rmap != dont->rmap) |
| 1010 | vfree(free->rmap); | 1006 | vfree(free->rmap); |
| 1011 | 1007 | ||
| 1012 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | 1008 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) |
| 1013 | vfree(free->dirty_bitmap); | 1009 | vfree(free->dirty_bitmap); |
| 1014 | 1010 | ||
| 1015 | if (!dont || free->lpage_info != dont->lpage_info) | 1011 | |
| 1016 | vfree(free->lpage_info); | 1012 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
| 1013 | if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { | ||
| 1014 | vfree(free->lpage_info[i]); | ||
| 1015 | free->lpage_info[i] = NULL; | ||
| 1016 | } | ||
| 1017 | } | ||
| 1017 | 1018 | ||
| 1018 | free->npages = 0; | 1019 | free->npages = 0; |
| 1019 | free->dirty_bitmap = NULL; | 1020 | free->dirty_bitmap = NULL; |
| 1020 | free->rmap = NULL; | 1021 | free->rmap = NULL; |
| 1021 | free->lpage_info = NULL; | ||
| 1022 | } | 1022 | } |
| 1023 | 1023 | ||
| 1024 | void kvm_free_physmem(struct kvm *kvm) | 1024 | void kvm_free_physmem(struct kvm *kvm) |
| @@ -1071,6 +1071,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) | |||
| 1071 | { | 1071 | { |
| 1072 | struct kvm *kvm = filp->private_data; | 1072 | struct kvm *kvm = filp->private_data; |
| 1073 | 1073 | ||
| 1074 | kvm_irqfd_release(kvm); | ||
| 1075 | |||
| 1074 | kvm_put_kvm(kvm); | 1076 | kvm_put_kvm(kvm); |
| 1075 | return 0; | 1077 | return 0; |
| 1076 | } | 1078 | } |
| @@ -1089,8 +1091,8 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 1089 | { | 1091 | { |
| 1090 | int r; | 1092 | int r; |
| 1091 | gfn_t base_gfn; | 1093 | gfn_t base_gfn; |
| 1092 | unsigned long npages, ugfn; | 1094 | unsigned long npages; |
| 1093 | unsigned long largepages, i; | 1095 | unsigned long i; |
| 1094 | struct kvm_memory_slot *memslot; | 1096 | struct kvm_memory_slot *memslot; |
| 1095 | struct kvm_memory_slot old, new; | 1097 | struct kvm_memory_slot old, new; |
| 1096 | 1098 | ||
| @@ -1164,31 +1166,51 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 1164 | else | 1166 | else |
| 1165 | new.userspace_addr = 0; | 1167 | new.userspace_addr = 0; |
| 1166 | } | 1168 | } |
| 1167 | if (npages && !new.lpage_info) { | 1169 | if (!npages) |
| 1168 | largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; | 1170 | goto skip_lpage; |
| 1169 | largepages -= base_gfn / KVM_PAGES_PER_HPAGE; | ||
| 1170 | 1171 | ||
| 1171 | new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); | 1172 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
| 1173 | unsigned long ugfn; | ||
| 1174 | unsigned long j; | ||
| 1175 | int lpages; | ||
| 1176 | int level = i + 2; | ||
| 1172 | 1177 | ||
| 1173 | if (!new.lpage_info) | 1178 | /* Avoid unused variable warning if no large pages */ |
| 1179 | (void)level; | ||
| 1180 | |||
| 1181 | if (new.lpage_info[i]) | ||
| 1182 | continue; | ||
| 1183 | |||
| 1184 | lpages = 1 + (base_gfn + npages - 1) / | ||
| 1185 | KVM_PAGES_PER_HPAGE(level); | ||
| 1186 | lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); | ||
| 1187 | |||
| 1188 | new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); | ||
| 1189 | |||
| 1190 | if (!new.lpage_info[i]) | ||
| 1174 | goto out_free; | 1191 | goto out_free; |
| 1175 | 1192 | ||
| 1176 | memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); | 1193 | memset(new.lpage_info[i], 0, |
| 1194 | lpages * sizeof(*new.lpage_info[i])); | ||
| 1177 | 1195 | ||
| 1178 | if (base_gfn % KVM_PAGES_PER_HPAGE) | 1196 | if (base_gfn % KVM_PAGES_PER_HPAGE(level)) |
| 1179 | new.lpage_info[0].write_count = 1; | 1197 | new.lpage_info[i][0].write_count = 1; |
| 1180 | if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) | 1198 | if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) |
| 1181 | new.lpage_info[largepages-1].write_count = 1; | 1199 | new.lpage_info[i][lpages - 1].write_count = 1; |
| 1182 | ugfn = new.userspace_addr >> PAGE_SHIFT; | 1200 | ugfn = new.userspace_addr >> PAGE_SHIFT; |
| 1183 | /* | 1201 | /* |
| 1184 | * If the gfn and userspace address are not aligned wrt each | 1202 | * If the gfn and userspace address are not aligned wrt each |
| 1185 | * other, disable large page support for this slot | 1203 | * other, or if explicitly asked to, disable large page |
| 1204 | * support for this slot | ||
| 1186 | */ | 1205 | */ |
| 1187 | if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) | 1206 | if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || |
| 1188 | for (i = 0; i < largepages; ++i) | 1207 | !largepages_enabled) |
| 1189 | new.lpage_info[i].write_count = 1; | 1208 | for (j = 0; j < lpages; ++j) |
| 1209 | new.lpage_info[i][j].write_count = 1; | ||
| 1190 | } | 1210 | } |
| 1191 | 1211 | ||
| 1212 | skip_lpage: | ||
| 1213 | |||
| 1192 | /* Allocate page dirty bitmap if needed */ | 1214 | /* Allocate page dirty bitmap if needed */ |
| 1193 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | 1215 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { |
| 1194 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | 1216 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; |
| @@ -1200,6 +1222,10 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
| 1200 | if (old.npages) | 1222 | if (old.npages) |
| 1201 | kvm_arch_flush_shadow(kvm); | 1223 | kvm_arch_flush_shadow(kvm); |
| 1202 | } | 1224 | } |
| 1225 | #else /* not defined CONFIG_S390 */ | ||
| 1226 | new.user_alloc = user_alloc; | ||
| 1227 | if (user_alloc) | ||
| 1228 | new.userspace_addr = mem->userspace_addr; | ||
| 1203 | #endif /* not defined CONFIG_S390 */ | 1229 | #endif /* not defined CONFIG_S390 */ |
| 1204 | 1230 | ||
| 1205 | if (!npages) | 1231 | if (!npages) |
| @@ -1299,6 +1325,12 @@ out: | |||
| 1299 | return r; | 1325 | return r; |
| 1300 | } | 1326 | } |
| 1301 | 1327 | ||
| 1328 | void kvm_disable_largepages(void) | ||
| 1329 | { | ||
| 1330 | largepages_enabled = false; | ||
| 1331 | } | ||
| 1332 | EXPORT_SYMBOL_GPL(kvm_disable_largepages); | ||
| 1333 | |||
| 1302 | int is_error_page(struct page *page) | 1334 | int is_error_page(struct page *page) |
| 1303 | { | 1335 | { |
| 1304 | return page == bad_page; | 1336 | return page == bad_page; |
| @@ -1635,9 +1667,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
| 1635 | for (;;) { | 1667 | for (;;) { |
| 1636 | prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); | 1668 | prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); |
| 1637 | 1669 | ||
| 1638 | if ((kvm_arch_interrupt_allowed(vcpu) && | 1670 | if (kvm_arch_vcpu_runnable(vcpu)) { |
| 1639 | kvm_cpu_has_interrupt(vcpu)) || | ||
| 1640 | kvm_arch_vcpu_runnable(vcpu)) { | ||
| 1641 | set_bit(KVM_REQ_UNHALT, &vcpu->requests); | 1671 | set_bit(KVM_REQ_UNHALT, &vcpu->requests); |
| 1642 | break; | 1672 | break; |
| 1643 | } | 1673 | } |
| @@ -1714,24 +1744,18 @@ static struct file_operations kvm_vcpu_fops = { | |||
| 1714 | */ | 1744 | */ |
| 1715 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | 1745 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) |
| 1716 | { | 1746 | { |
| 1717 | int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); | 1747 | return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); |
| 1718 | if (fd < 0) | ||
| 1719 | kvm_put_kvm(vcpu->kvm); | ||
| 1720 | return fd; | ||
| 1721 | } | 1748 | } |
| 1722 | 1749 | ||
| 1723 | /* | 1750 | /* |
| 1724 | * Creates some virtual cpus. Good luck creating more than one. | 1751 | * Creates some virtual cpus. Good luck creating more than one. |
| 1725 | */ | 1752 | */ |
| 1726 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | 1753 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) |
| 1727 | { | 1754 | { |
| 1728 | int r; | 1755 | int r; |
| 1729 | struct kvm_vcpu *vcpu; | 1756 | struct kvm_vcpu *vcpu, *v; |
| 1730 | |||
| 1731 | if (!valid_vcpu(n)) | ||
| 1732 | return -EINVAL; | ||
| 1733 | 1757 | ||
| 1734 | vcpu = kvm_arch_vcpu_create(kvm, n); | 1758 | vcpu = kvm_arch_vcpu_create(kvm, id); |
| 1735 | if (IS_ERR(vcpu)) | 1759 | if (IS_ERR(vcpu)) |
| 1736 | return PTR_ERR(vcpu); | 1760 | return PTR_ERR(vcpu); |
| 1737 | 1761 | ||
| @@ -1742,23 +1766,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | |||
| 1742 | return r; | 1766 | return r; |
| 1743 | 1767 | ||
| 1744 | mutex_lock(&kvm->lock); | 1768 | mutex_lock(&kvm->lock); |
| 1745 | if (kvm->vcpus[n]) { | 1769 | if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { |
| 1746 | r = -EEXIST; | 1770 | r = -EINVAL; |
| 1747 | goto vcpu_destroy; | 1771 | goto vcpu_destroy; |
| 1748 | } | 1772 | } |
| 1749 | kvm->vcpus[n] = vcpu; | 1773 | |
| 1750 | mutex_unlock(&kvm->lock); | 1774 | kvm_for_each_vcpu(r, v, kvm) |
| 1775 | if (v->vcpu_id == id) { | ||
| 1776 | r = -EEXIST; | ||
| 1777 | goto vcpu_destroy; | ||
| 1778 | } | ||
| 1779 | |||
| 1780 | BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); | ||
| 1751 | 1781 | ||
| 1752 | /* Now it's all set up, let userspace reach it */ | 1782 | /* Now it's all set up, let userspace reach it */ |
| 1753 | kvm_get_kvm(kvm); | 1783 | kvm_get_kvm(kvm); |
| 1754 | r = create_vcpu_fd(vcpu); | 1784 | r = create_vcpu_fd(vcpu); |
| 1755 | if (r < 0) | 1785 | if (r < 0) { |
| 1756 | goto unlink; | 1786 | kvm_put_kvm(kvm); |
| 1787 | goto vcpu_destroy; | ||
| 1788 | } | ||
| 1789 | |||
| 1790 | kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; | ||
| 1791 | smp_wmb(); | ||
| 1792 | atomic_inc(&kvm->online_vcpus); | ||
| 1793 | |||
| 1794 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
| 1795 | if (kvm->bsp_vcpu_id == id) | ||
| 1796 | kvm->bsp_vcpu = vcpu; | ||
| 1797 | #endif | ||
| 1798 | mutex_unlock(&kvm->lock); | ||
| 1757 | return r; | 1799 | return r; |
| 1758 | 1800 | ||
| 1759 | unlink: | ||
| 1760 | mutex_lock(&kvm->lock); | ||
| 1761 | kvm->vcpus[n] = NULL; | ||
| 1762 | vcpu_destroy: | 1801 | vcpu_destroy: |
| 1763 | mutex_unlock(&kvm->lock); | 1802 | mutex_unlock(&kvm->lock); |
| 1764 | kvm_arch_vcpu_destroy(vcpu); | 1803 | kvm_arch_vcpu_destroy(vcpu); |
| @@ -2199,6 +2238,7 @@ static long kvm_vm_ioctl(struct file *filp, | |||
| 2199 | vfree(entries); | 2238 | vfree(entries); |
| 2200 | break; | 2239 | break; |
| 2201 | } | 2240 | } |
| 2241 | #endif /* KVM_CAP_IRQ_ROUTING */ | ||
| 2202 | #ifdef __KVM_HAVE_MSIX | 2242 | #ifdef __KVM_HAVE_MSIX |
| 2203 | case KVM_ASSIGN_SET_MSIX_NR: { | 2243 | case KVM_ASSIGN_SET_MSIX_NR: { |
| 2204 | struct kvm_assigned_msix_nr entry_nr; | 2244 | struct kvm_assigned_msix_nr entry_nr; |
| @@ -2221,7 +2261,35 @@ static long kvm_vm_ioctl(struct file *filp, | |||
| 2221 | break; | 2261 | break; |
| 2222 | } | 2262 | } |
| 2223 | #endif | 2263 | #endif |
| 2224 | #endif /* KVM_CAP_IRQ_ROUTING */ | 2264 | case KVM_IRQFD: { |
| 2265 | struct kvm_irqfd data; | ||
| 2266 | |||
| 2267 | r = -EFAULT; | ||
| 2268 | if (copy_from_user(&data, argp, sizeof data)) | ||
| 2269 | goto out; | ||
| 2270 | r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); | ||
| 2271 | break; | ||
| 2272 | } | ||
| 2273 | case KVM_IOEVENTFD: { | ||
| 2274 | struct kvm_ioeventfd data; | ||
| 2275 | |||
| 2276 | r = -EFAULT; | ||
| 2277 | if (copy_from_user(&data, argp, sizeof data)) | ||
| 2278 | goto out; | ||
| 2279 | r = kvm_ioeventfd(kvm, &data); | ||
| 2280 | break; | ||
| 2281 | } | ||
| 2282 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
| 2283 | case KVM_SET_BOOT_CPU_ID: | ||
| 2284 | r = 0; | ||
| 2285 | mutex_lock(&kvm->lock); | ||
| 2286 | if (atomic_read(&kvm->online_vcpus) != 0) | ||
| 2287 | r = -EBUSY; | ||
| 2288 | else | ||
| 2289 | kvm->bsp_vcpu_id = arg; | ||
| 2290 | mutex_unlock(&kvm->lock); | ||
| 2291 | break; | ||
| 2292 | #endif | ||
| 2225 | default: | 2293 | default: |
| 2226 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); | 2294 | r = kvm_arch_vm_ioctl(filp, ioctl, arg); |
| 2227 | } | 2295 | } |
| @@ -2288,6 +2356,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) | |||
| 2288 | case KVM_CAP_USER_MEMORY: | 2356 | case KVM_CAP_USER_MEMORY: |
| 2289 | case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: | 2357 | case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: |
| 2290 | case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: | 2358 | case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: |
| 2359 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
| 2360 | case KVM_CAP_SET_BOOT_CPU_ID: | ||
| 2361 | #endif | ||
| 2291 | return 1; | 2362 | return 1; |
| 2292 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | 2363 | #ifdef CONFIG_HAVE_KVM_IRQCHIP |
| 2293 | case KVM_CAP_IRQ_ROUTING: | 2364 | case KVM_CAP_IRQ_ROUTING: |
| @@ -2335,7 +2406,7 @@ static long kvm_dev_ioctl(struct file *filp, | |||
| 2335 | case KVM_TRACE_ENABLE: | 2406 | case KVM_TRACE_ENABLE: |
| 2336 | case KVM_TRACE_PAUSE: | 2407 | case KVM_TRACE_PAUSE: |
| 2337 | case KVM_TRACE_DISABLE: | 2408 | case KVM_TRACE_DISABLE: |
| 2338 | r = kvm_trace_ioctl(ioctl, arg); | 2409 | r = -EOPNOTSUPP; |
| 2339 | break; | 2410 | break; |
| 2340 | default: | 2411 | default: |
| 2341 | return kvm_arch_dev_ioctl(filp, ioctl, arg); | 2412 | return kvm_arch_dev_ioctl(filp, ioctl, arg); |
| @@ -2449,26 +2520,71 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) | |||
| 2449 | } | 2520 | } |
| 2450 | } | 2521 | } |
| 2451 | 2522 | ||
| 2452 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, | 2523 | /* kvm_io_bus_write - called under kvm->slots_lock */ |
| 2453 | gpa_t addr, int len, int is_write) | 2524 | int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, |
| 2525 | int len, const void *val) | ||
| 2454 | { | 2526 | { |
| 2455 | int i; | 2527 | int i; |
| 2528 | for (i = 0; i < bus->dev_count; i++) | ||
| 2529 | if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) | ||
| 2530 | return 0; | ||
| 2531 | return -EOPNOTSUPP; | ||
| 2532 | } | ||
| 2456 | 2533 | ||
| 2457 | for (i = 0; i < bus->dev_count; i++) { | 2534 | /* kvm_io_bus_read - called under kvm->slots_lock */ |
| 2458 | struct kvm_io_device *pos = bus->devs[i]; | 2535 | int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) |
| 2536 | { | ||
| 2537 | int i; | ||
| 2538 | for (i = 0; i < bus->dev_count; i++) | ||
| 2539 | if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) | ||
| 2540 | return 0; | ||
| 2541 | return -EOPNOTSUPP; | ||
| 2542 | } | ||
| 2459 | 2543 | ||
| 2460 | if (pos->in_range(pos, addr, len, is_write)) | 2544 | int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, |
| 2461 | return pos; | 2545 | struct kvm_io_device *dev) |
| 2462 | } | 2546 | { |
| 2547 | int ret; | ||
| 2463 | 2548 | ||
| 2464 | return NULL; | 2549 | down_write(&kvm->slots_lock); |
| 2550 | ret = __kvm_io_bus_register_dev(bus, dev); | ||
| 2551 | up_write(&kvm->slots_lock); | ||
| 2552 | |||
| 2553 | return ret; | ||
| 2465 | } | 2554 | } |
| 2466 | 2555 | ||
| 2467 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) | 2556 | /* An unlocked version. Caller must have write lock on slots_lock. */ |
| 2557 | int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
| 2558 | struct kvm_io_device *dev) | ||
| 2468 | { | 2559 | { |
| 2469 | BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); | 2560 | if (bus->dev_count > NR_IOBUS_DEVS-1) |
| 2561 | return -ENOSPC; | ||
| 2470 | 2562 | ||
| 2471 | bus->devs[bus->dev_count++] = dev; | 2563 | bus->devs[bus->dev_count++] = dev; |
| 2564 | |||
| 2565 | return 0; | ||
| 2566 | } | ||
| 2567 | |||
| 2568 | void kvm_io_bus_unregister_dev(struct kvm *kvm, | ||
| 2569 | struct kvm_io_bus *bus, | ||
| 2570 | struct kvm_io_device *dev) | ||
| 2571 | { | ||
| 2572 | down_write(&kvm->slots_lock); | ||
| 2573 | __kvm_io_bus_unregister_dev(bus, dev); | ||
| 2574 | up_write(&kvm->slots_lock); | ||
| 2575 | } | ||
| 2576 | |||
| 2577 | /* An unlocked version. Caller must have write lock on slots_lock. */ | ||
| 2578 | void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, | ||
| 2579 | struct kvm_io_device *dev) | ||
| 2580 | { | ||
| 2581 | int i; | ||
| 2582 | |||
| 2583 | for (i = 0; i < bus->dev_count; i++) | ||
| 2584 | if (bus->devs[i] == dev) { | ||
| 2585 | bus->devs[i] = bus->devs[--bus->dev_count]; | ||
| 2586 | break; | ||
| 2587 | } | ||
| 2472 | } | 2588 | } |
| 2473 | 2589 | ||
| 2474 | static struct notifier_block kvm_cpu_notifier = { | 2590 | static struct notifier_block kvm_cpu_notifier = { |
| @@ -2501,11 +2617,9 @@ static int vcpu_stat_get(void *_offset, u64 *val) | |||
| 2501 | *val = 0; | 2617 | *val = 0; |
| 2502 | spin_lock(&kvm_lock); | 2618 | spin_lock(&kvm_lock); |
| 2503 | list_for_each_entry(kvm, &vm_list, vm_list) | 2619 | list_for_each_entry(kvm, &vm_list, vm_list) |
| 2504 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 2620 | kvm_for_each_vcpu(i, vcpu, kvm) |
| 2505 | vcpu = kvm->vcpus[i]; | 2621 | *val += *(u32 *)((void *)vcpu + offset); |
| 2506 | if (vcpu) | 2622 | |
| 2507 | *val += *(u32 *)((void *)vcpu + offset); | ||
| 2508 | } | ||
| 2509 | spin_unlock(&kvm_lock); | 2623 | spin_unlock(&kvm_lock); |
| 2510 | return 0; | 2624 | return 0; |
| 2511 | } | 2625 | } |
| @@ -2679,15 +2793,15 @@ out_free_0: | |||
| 2679 | __free_page(bad_page); | 2793 | __free_page(bad_page); |
| 2680 | out: | 2794 | out: |
| 2681 | kvm_arch_exit(); | 2795 | kvm_arch_exit(); |
| 2682 | kvm_exit_debug(); | ||
| 2683 | out_fail: | 2796 | out_fail: |
| 2797 | kvm_exit_debug(); | ||
| 2684 | return r; | 2798 | return r; |
| 2685 | } | 2799 | } |
| 2686 | EXPORT_SYMBOL_GPL(kvm_init); | 2800 | EXPORT_SYMBOL_GPL(kvm_init); |
| 2687 | 2801 | ||
| 2688 | void kvm_exit(void) | 2802 | void kvm_exit(void) |
| 2689 | { | 2803 | { |
| 2690 | kvm_trace_cleanup(); | 2804 | tracepoint_synchronize_unregister(); |
| 2691 | misc_deregister(&kvm_dev); | 2805 | misc_deregister(&kvm_dev); |
| 2692 | kmem_cache_destroy(kvm_vcpu_cache); | 2806 | kmem_cache_destroy(kvm_vcpu_cache); |
| 2693 | sysdev_unregister(&kvm_sysdev); | 2807 | sysdev_unregister(&kvm_sysdev); |
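
Editor's note on the kvm_main.c hunks above: the old kvm_io_bus_find_dev()/in_range() lookup is replaced by kvm_io_bus_read()/kvm_io_bus_write(), which simply walk the bus and let each device accept or reject an access, and registration gains locked and unlocked variants guarded by kvm->slots_lock. Below is a minimal sketch of how an in-kernel device could sit on the reworked mmio bus; the kvm_io_device_ops callback layout is assumed from the iodev.h of this series, and the device name, register layout and DUMMY_* constants are invented purely for illustration.

	#include <linux/kvm_host.h>
	#include "iodev.h"		/* struct kvm_io_device, kvm_iodevice_init() (assumed) */

	#define DUMMY_BASE	0xd0000000UL	/* hypothetical MMIO window */
	#define DUMMY_LEN	0x100

	struct dummy_dev {
		struct kvm_io_device dev;
		u32 reg;			/* single 32-bit register */
	};

	static inline struct dummy_dev *to_dummy(struct kvm_io_device *d)
	{
		return container_of(d, struct dummy_dev, dev);
	}

	/* Return 0 when the access is handled; a non-zero return makes
	 * kvm_io_bus_write()/kvm_io_bus_read() try the next device. */
	static int dummy_write(struct kvm_io_device *this, gpa_t addr,
			       int len, const void *val)
	{
		struct dummy_dev *d = to_dummy(this);

		if (addr < DUMMY_BASE || addr + len > DUMMY_BASE + DUMMY_LEN)
			return -EOPNOTSUPP;
		memcpy(&d->reg, val, min(len, 4));
		return 0;
	}

	static int dummy_read(struct kvm_io_device *this, gpa_t addr,
			      int len, void *val)
	{
		struct dummy_dev *d = to_dummy(this);

		if (addr < DUMMY_BASE || addr + len > DUMMY_BASE + DUMMY_LEN)
			return -EOPNOTSUPP;
		memcpy(val, &d->reg, min(len, 4));
		return 0;
	}

	static const struct kvm_io_device_ops dummy_ops = {
		.read	= dummy_read,
		.write	= dummy_write,
	};

	static int dummy_attach(struct kvm *kvm, struct dummy_dev *d)
	{
		kvm_iodevice_init(&d->dev, &dummy_ops);
		/* Takes slots_lock itself; fails with -ENOSPC once the fixed
		 * NR_IOBUS_DEVS array is full (the old void version BUG()ed). */
		return kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &d->dev);
	}

Dispatch then becomes a single kvm_io_bus_write() or kvm_io_bus_read() call under slots_lock, with -EOPNOTSUPP meaning no device claimed the access. The same hunks also add the per-VM KVM_SET_BOOT_CPU_ID ioctl, which must be issued before any vcpu exists (online_vcpus must still be zero, otherwise -EBUSY).
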
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c deleted file mode 100644 index f5987444644..00000000000 --- a/virt/kvm/kvm_trace.c +++ /dev/null | |||
| @@ -1,285 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * kvm trace | ||
| 3 | * | ||
| 4 | * It is designed to allow debugging traces of kvm to be generated | ||
| 5 | * on UP / SMP machines. Each trace entry can be timestamped so that | ||
| 6 | * it's possible to reconstruct a chronological record of trace events. | ||
| 7 | * The implementation refers to blktrace kernel support. | ||
| 8 | * | ||
| 9 | * Copyright (c) 2008 Intel Corporation | ||
| 10 | * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> | ||
| 11 | * | ||
| 12 | * Authors: Feng(Eric) Liu, eric.e.liu@intel.com | ||
| 13 | * | ||
| 14 | * Date: Feb 2008 | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/relay.h> | ||
| 19 | #include <linux/debugfs.h> | ||
| 20 | #include <linux/ktime.h> | ||
| 21 | |||
| 22 | #include <linux/kvm_host.h> | ||
| 23 | |||
| 24 | #define KVM_TRACE_STATE_RUNNING (1 << 0) | ||
| 25 | #define KVM_TRACE_STATE_PAUSE (1 << 1) | ||
| 26 | #define KVM_TRACE_STATE_CLEARUP (1 << 2) | ||
| 27 | |||
| 28 | struct kvm_trace { | ||
| 29 | int trace_state; | ||
| 30 | struct rchan *rchan; | ||
| 31 | struct dentry *lost_file; | ||
| 32 | atomic_t lost_records; | ||
| 33 | }; | ||
| 34 | static struct kvm_trace *kvm_trace; | ||
| 35 | |||
| 36 | struct kvm_trace_probe { | ||
| 37 | const char *name; | ||
| 38 | const char *format; | ||
| 39 | u32 timestamp_in; | ||
| 40 | marker_probe_func *probe_func; | ||
| 41 | }; | ||
| 42 | |||
| 43 | static inline int calc_rec_size(int timestamp, int extra) | ||
| 44 | { | ||
| 45 | int rec_size = KVM_TRC_HEAD_SIZE; | ||
| 46 | |||
| 47 | rec_size += extra; | ||
| 48 | return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; | ||
| 49 | } | ||
| 50 | |||
| 51 | static void kvm_add_trace(void *probe_private, void *call_data, | ||
| 52 | const char *format, va_list *args) | ||
| 53 | { | ||
| 54 | struct kvm_trace_probe *p = probe_private; | ||
| 55 | struct kvm_trace *kt = kvm_trace; | ||
| 56 | struct kvm_trace_rec rec; | ||
| 57 | struct kvm_vcpu *vcpu; | ||
| 58 | int i, size; | ||
| 59 | u32 extra; | ||
| 60 | |||
| 61 | if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) | ||
| 62 | return; | ||
| 63 | |||
| 64 | rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); | ||
| 65 | vcpu = va_arg(*args, struct kvm_vcpu *); | ||
| 66 | rec.pid = current->tgid; | ||
| 67 | rec.vcpu_id = vcpu->vcpu_id; | ||
| 68 | |||
| 69 | extra = va_arg(*args, u32); | ||
| 70 | WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); | ||
| 71 | extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); | ||
| 72 | |||
| 73 | rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) | ||
| 74 | | TRACE_REC_NUM_DATA_ARGS(extra); | ||
| 75 | |||
| 76 | if (p->timestamp_in) { | ||
| 77 | rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); | ||
| 78 | |||
| 79 | for (i = 0; i < extra; i++) | ||
| 80 | rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); | ||
| 81 | } else { | ||
| 82 | for (i = 0; i < extra; i++) | ||
| 83 | rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); | ||
| 84 | } | ||
| 85 | |||
| 86 | size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); | ||
| 87 | relay_write(kt->rchan, &rec, size); | ||
| 88 | } | ||
| 89 | |||
| 90 | static struct kvm_trace_probe kvm_trace_probes[] = { | ||
| 91 | { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace }, | ||
| 92 | { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace }, | ||
| 93 | }; | ||
| 94 | |||
| 95 | static int lost_records_get(void *data, u64 *val) | ||
| 96 | { | ||
| 97 | struct kvm_trace *kt = data; | ||
| 98 | |||
| 99 | *val = atomic_read(&kt->lost_records); | ||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n"); | ||
| 104 | |||
| 105 | /* | ||
| 106 | * The relay channel is used in "no-overwrite" mode, it keeps trace of how | ||
| 107 | * many times we encountered a full subbuffer, to tell user space app the | ||
| 108 | * lost records there were. | ||
| 109 | */ | ||
| 110 | static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, | ||
| 111 | void *prev_subbuf, size_t prev_padding) | ||
| 112 | { | ||
| 113 | struct kvm_trace *kt; | ||
| 114 | |||
| 115 | if (!relay_buf_full(buf)) { | ||
| 116 | if (!prev_subbuf) { | ||
| 117 | /* | ||
| 118 | * executed only once when the channel is opened | ||
| 119 | * save metadata as first record | ||
| 120 | */ | ||
| 121 | subbuf_start_reserve(buf, sizeof(u32)); | ||
| 122 | *(u32 *)subbuf = 0x12345678; | ||
| 123 | } | ||
| 124 | |||
| 125 | return 1; | ||
| 126 | } | ||
| 127 | |||
| 128 | kt = buf->chan->private_data; | ||
| 129 | atomic_inc(&kt->lost_records); | ||
| 130 | |||
| 131 | return 0; | ||
| 132 | } | ||
| 133 | |||
| 134 | static struct dentry *kvm_create_buf_file_callack(const char *filename, | ||
| 135 | struct dentry *parent, | ||
| 136 | int mode, | ||
| 137 | struct rchan_buf *buf, | ||
| 138 | int *is_global) | ||
| 139 | { | ||
| 140 | return debugfs_create_file(filename, mode, parent, buf, | ||
| 141 | &relay_file_operations); | ||
| 142 | } | ||
| 143 | |||
| 144 | static int kvm_remove_buf_file_callback(struct dentry *dentry) | ||
| 145 | { | ||
| 146 | debugfs_remove(dentry); | ||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | |||
| 150 | static struct rchan_callbacks kvm_relay_callbacks = { | ||
| 151 | .subbuf_start = kvm_subbuf_start_callback, | ||
| 152 | .create_buf_file = kvm_create_buf_file_callack, | ||
| 153 | .remove_buf_file = kvm_remove_buf_file_callback, | ||
| 154 | }; | ||
| 155 | |||
| 156 | static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts) | ||
| 157 | { | ||
| 158 | struct kvm_trace *kt; | ||
| 159 | int i, r = -ENOMEM; | ||
| 160 | |||
| 161 | if (!kuts->buf_size || !kuts->buf_nr) | ||
| 162 | return -EINVAL; | ||
| 163 | |||
| 164 | kt = kzalloc(sizeof(*kt), GFP_KERNEL); | ||
| 165 | if (!kt) | ||
| 166 | goto err; | ||
| 167 | |||
| 168 | r = -EIO; | ||
| 169 | atomic_set(&kt->lost_records, 0); | ||
| 170 | kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir, | ||
| 171 | kt, &kvm_trace_lost_ops); | ||
| 172 | if (!kt->lost_file) | ||
| 173 | goto err; | ||
| 174 | |||
| 175 | kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size, | ||
| 176 | kuts->buf_nr, &kvm_relay_callbacks, kt); | ||
| 177 | if (!kt->rchan) | ||
| 178 | goto err; | ||
| 179 | |||
| 180 | kvm_trace = kt; | ||
| 181 | |||
| 182 | for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { | ||
| 183 | struct kvm_trace_probe *p = &kvm_trace_probes[i]; | ||
| 184 | |||
| 185 | r = marker_probe_register(p->name, p->format, p->probe_func, p); | ||
| 186 | if (r) | ||
| 187 | printk(KERN_INFO "Unable to register probe %s\n", | ||
| 188 | p->name); | ||
| 189 | } | ||
| 190 | |||
| 191 | kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING; | ||
| 192 | |||
| 193 | return 0; | ||
| 194 | err: | ||
| 195 | if (kt) { | ||
| 196 | if (kt->lost_file) | ||
| 197 | debugfs_remove(kt->lost_file); | ||
| 198 | if (kt->rchan) | ||
| 199 | relay_close(kt->rchan); | ||
| 200 | kfree(kt); | ||
| 201 | } | ||
| 202 | return r; | ||
| 203 | } | ||
| 204 | |||
| 205 | static int kvm_trace_enable(char __user *arg) | ||
| 206 | { | ||
| 207 | struct kvm_user_trace_setup kuts; | ||
| 208 | int ret; | ||
| 209 | |||
| 210 | ret = copy_from_user(&kuts, arg, sizeof(kuts)); | ||
| 211 | if (ret) | ||
| 212 | return -EFAULT; | ||
| 213 | |||
| 214 | ret = do_kvm_trace_enable(&kuts); | ||
| 215 | if (ret) | ||
| 216 | return ret; | ||
| 217 | |||
| 218 | return 0; | ||
| 219 | } | ||
| 220 | |||
| 221 | static int kvm_trace_pause(void) | ||
| 222 | { | ||
| 223 | struct kvm_trace *kt = kvm_trace; | ||
| 224 | int r = -EINVAL; | ||
| 225 | |||
| 226 | if (kt == NULL) | ||
| 227 | return r; | ||
| 228 | |||
| 229 | if (kt->trace_state == KVM_TRACE_STATE_RUNNING) { | ||
| 230 | kt->trace_state = KVM_TRACE_STATE_PAUSE; | ||
| 231 | relay_flush(kt->rchan); | ||
| 232 | r = 0; | ||
| 233 | } | ||
| 234 | |||
| 235 | return r; | ||
| 236 | } | ||
| 237 | |||
| 238 | void kvm_trace_cleanup(void) | ||
| 239 | { | ||
| 240 | struct kvm_trace *kt = kvm_trace; | ||
| 241 | int i; | ||
| 242 | |||
| 243 | if (kt == NULL) | ||
| 244 | return; | ||
| 245 | |||
| 246 | if (kt->trace_state == KVM_TRACE_STATE_RUNNING || | ||
| 247 | kt->trace_state == KVM_TRACE_STATE_PAUSE) { | ||
| 248 | |||
| 249 | kt->trace_state = KVM_TRACE_STATE_CLEARUP; | ||
| 250 | |||
| 251 | for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { | ||
| 252 | struct kvm_trace_probe *p = &kvm_trace_probes[i]; | ||
| 253 | marker_probe_unregister(p->name, p->probe_func, p); | ||
| 254 | } | ||
| 255 | marker_synchronize_unregister(); | ||
| 256 | |||
| 257 | relay_close(kt->rchan); | ||
| 258 | debugfs_remove(kt->lost_file); | ||
| 259 | kfree(kt); | ||
| 260 | } | ||
| 261 | } | ||
| 262 | |||
| 263 | int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) | ||
| 264 | { | ||
| 265 | void __user *argp = (void __user *)arg; | ||
| 266 | long r = -EINVAL; | ||
| 267 | |||
| 268 | if (!capable(CAP_SYS_ADMIN)) | ||
| 269 | return -EPERM; | ||
| 270 | |||
| 271 | switch (ioctl) { | ||
| 272 | case KVM_TRACE_ENABLE: | ||
| 273 | r = kvm_trace_enable(argp); | ||
| 274 | break; | ||
| 275 | case KVM_TRACE_PAUSE: | ||
| 276 | r = kvm_trace_pause(); | ||
| 277 | break; | ||
| 278 | case KVM_TRACE_DISABLE: | ||
| 279 | r = 0; | ||
| 280 | kvm_trace_cleanup(); | ||
| 281 | break; | ||
| 282 | } | ||
| 283 | |||
| 284 | return r; | ||
| 285 | } | ||
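
Editor's note: with kvm_trace.c deleted, the relay/marker based tracer is retired in favor of the generic kvm tracepoints (kvm_exit() above now only calls tracepoint_synchronize_unregister(), and kvm_dev_ioctl() answers the legacy trace ioctls with -EOPNOTSUPP). A small userspace sketch of the now-dead interface follows; it assumes the KVM_TRACE_* ioctl numbers and struct kvm_user_trace_setup are still exported by <linux/kvm.h> on the build host, and the buf_size/buf_nr values are arbitrary examples.

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		/* buf_size/buf_nr mirror what the removed do_kvm_trace_enable()
		 * fed to relay_open(): bytes per sub-buffer and sub-buffer count. */
		struct kvm_user_trace_setup kuts = { .buf_size = 4096, .buf_nr = 8 };
		int fd = open("/dev/kvm", O_RDWR);

		if (fd < 0) {
			perror("open /dev/kvm");
			return 1;
		}
		if (ioctl(fd, KVM_TRACE_ENABLE, &kuts) < 0 && errno == EOPNOTSUPP)
			fprintf(stderr, "kvmtrace removed; use the kvm tracepoints (ftrace/perf) instead\n");
		return 0;
	}

Dropping the relay channel also removes the debugfs "trace" and "lost_records" files the deleted code created; the generic tracepoint infrastructure replaces the private relay ring-buffer format.
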
